Annotate audio in realtime


Test the latency of realtime annotations on a GPU (if you have one) or a CPU.

Disabling eager execution is crucial for low latency annotation - eager execution increases latencies 6-fold, from 5 ms to 30 ms.

import os
import tensorflow as tf
USE_GPU = False  # 
# disable existing GPUs if testing CPU
os.environ['CUDA_VISIBLE_DEVICES'] = '0' if USE_GPU else '-1'
# disabling eager execution in tensorflow speeds up annotations by ~6x
    from tensorflow.python.framework.ops import disable_eager_execution


    physical_devices = tf.config.list_physical_devices('GPU') 
    print('GPUs:', physical_devices)
    physical_devices = tf.config.list_physical_devices('CPU') 
    print('CPUs:', physical_devices)
except Exception as e:
GPUs: []
CPUs: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
%config InlineBackend.figure_format = 'jpg'  # smaller mem footprint for page

import numpy as np
import dss.utils
import dss.predict
import scipy.signal
import matplotlib.pyplot as plt
import time
import subprocess'ncb.mplstyle')

Load the model and runtime parameters

model_filename = 'models/dmel_single_rt/20200430_201821'
model, params = dss.utils.load_model_and_params(model_filename)

pulse_index = params['class_names'].index('pulse')
sine_index = params['class_names'].index('sine')

samplerate = params['samplerate_x_Hz']
interval = params['nb_hist'] / samplerate

print(f"Model expects input shape {model.input_shape[1]} samples and {model.input_shape[2]} channel(s).")
WARNING:tensorflow:From /Users/clemens10/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/ calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model expects input shape 256 samples and 1 channel(s).

Load a recording

Samplerates of the recording and of the data the model was trained with should match. Otherwise resample the recording to match the model using scipy.signal.resample_poly(x=recording, up=int(recording_samplerate), down=int(samplerate), axis=0)

recording_samplerate, recording ='dat/dmel_song_rt.wav')
recording = np.atleast_2d(recording).T

plt.figure(figsize=(30, 4))
plt.plot(recording[:100_000], c='k')  # plot the first 10 seconds

print(f"Sample rate of the recording ({recording_samplerate}Hz), sample rate expected by the model ({samplerate}Hz). Both should match.")
Sample rate of the recording (10000Hz), sample rate expected by the model (10000Hz). Both should match.

Define a virtual microphone

The microphone takes a full recording and returns chunks of data at a rate determined by the chunk duration. In a real experimental setup, this would be a function that runs the data acquisition or a multi-processing queue that gets fed by the acquisition system.

def virtual_microphone(data: np.array, interval: float, samplerate: float, block: bool = True):
    RUN = True
    current_position = 0
    t1 = time.time() - 0.025
    t0 = time.time()    
    while RUN:
        current_position += interval * samplerate 
        start_sample = int(current_position)
        end_sample = int(current_position + interval * samplerate)
        chunk = data[start_sample:end_sample]
        if block:
            while (t1-t0)<interval:  # wait til interval has elapsed
                t1 = time.time()
        if end_sample >= data.shape[0]:
            RUN = False
            yield None
            yield chunk
        t0 = time.time()

mike = virtual_microphone(recording, interval, samplerate, block=True)
rec = next(mike)
print(f"Microphone produces {rec.shape[0]} samples and {rec.shape[1]} channel(s) per chunk.")

print('Consuming and plotting 100 chunks...')
recording_latencies = []
for _ in range(100):
    with dss.utils.Timer() as t:
        rec = next(mike)
    recording_latencies.append(t.elapsed * 1000)
    plt.ylim(-1, 1)
plt.title('Individal chunks')
plt.xlim(0, rec.shape[0])
print(f"Takes {np.median(recording_latencies):1.1f} ms to return {rec.shape[0] / samplerate * 1000:1.1f} ms of audio data.")
Microphone produces 256 samples and 1 channel(s) per chunk.
Consuming and plotting 100 chunks...
Takes 26.4 ms to return 25.6 ms of audio data.

Test annotation

Test the model and a simple song detection routing with a chunk that contains song. A specific song type is detect if the corresponding confidence is exceeds 0.7 during a chunk.

def report_song(prediction, pulse_index, sine_index):
    report = 'pulse? ' + ('yes' if np.max(out[:, pulse_index]) > 0.7 else ' no')
    report += ' sine? ' + ('yes' if np.max(out[:, sine_index]) > 0.7 else ' no')
    return report

# consume a couple of chunks until there is song
for _ in range(9):
    d = next(mike)

d = d[np.newaxis, ...]
out = model.predict(d, verbose=0)

plt.plot(d[0, ...])
plt.ylabel('Audio [V]')
<matplotlib.legend.Legend at 0x7ff3cbde67d0>

Run annotation

Run the virtual microphone for 100 chunks to estimate annotation latencies.

latencies = []
runs = 0

while runs < 100:
    runs += 1
    with dss.utils.Timer() as dt:
        d = next(mike)  # get a new chunk
        d = d[np.newaxis, ...]
        out = model.predict(d, verbose=0)  # run inference on the chunk
        # detect pulse and sine
        result = report_song(out[0, ...], pulse_index, sine_index)
        # print(result)
    latencies.append(dt.elapsed * 1000)

Evaluate latencies

Timing on GPUs can be more variable than on CPUs since a major contributor to latency is the transfer of data from/to the GPU.

Subtract the recording latency to get an estimate of the annotation latency.

annotation_latencies = np.array(latencies) - np.median(recording_latencies)
print('Annotation latencies')
print(f'  5, 50, 95 percentiles: {np.percentile(annotation_latencies, [5, 50, 95])} ms')
print(f'  min {np.min(annotation_latencies):1.2f} ms, max {np.max(annotation_latencies):1.2f} ms')

plt.figure(figsize=(5, 5))
plt.ylabel('Annotation latency [ms]')
plt.ylim(0, np.max(annotation_latencies))
Annotation latencies
  5, 50, 95 percentiles: [4.7993752  5.7063365  6.88310785] ms
  min 4.60 ms, max 7.66 ms