Source code for das.kapre.backend

"""

Kapre backend functions
=======================\

|  Some backend functions that mainly use numpy.
|  Functions with Keras' backend is in ``backend_keras.py``.

Notes
-----
    * Don't forget to use ``K.float()``! Otherwise numpy uses float64.
    * Some functions are copied-and-pasted from librosa (to reduce dependency), but
        later I realised it'd be better to just use it.
    * TODO: remove copied code and use librosa.
"""
from tensorflow.keras import backend as K
import numpy as np
import librosa

# Forward compatability to replace xrange
from builtins import range

EPS = 1e-7


def eps():
    return EPS


[docs]def mel(sr, n_dft, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1): """[np] create a filterbank matrix to combine stft bins into mel-frequency bins use Slaney (said Librosa) n_mels: numbre of mel bands fmin : lowest frequency [Hz] fmax : highest frequency [Hz] If `None`, use `sr / 2.0` """ return librosa.filters.mel(sr=sr, n_fft=n_dft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm).astype(K.floatx())
[docs]def get_stft_kernels(n_dft): """[np] Return dft kernels for real/imagnary parts assuming the input . is real. An asymmetric hann window is used (scipy.signal.hann). Parameters ---------- n_dft : int > 0 and power of 2 [scalar] Number of dft components. Returns ------- | dft_real_kernels : np.ndarray [shape=(nb_filter, 1, 1, n_win)] | dft_imag_kernels : np.ndarray [shape=(nb_filter, 1, 1, n_win)] * nb_filter = n_dft/2 + 1 * n_win = n_dft """ assert n_dft > 1 and ((n_dft & (n_dft - 1)) == 0), "n_dft should be > 1 and power of 2, but n_dft == %d" % n_dft nb_filter = int(n_dft // 2 + 1) # prepare DFT filters timesteps = np.array(range(n_dft)) w_ks = np.arange(nb_filter) * 2 * np.pi / float(n_dft) dft_real_kernels = np.cos(w_ks.reshape(-1, 1) * timesteps.reshape(1, -1)) dft_imag_kernels = -np.sin(w_ks.reshape(-1, 1) * timesteps.reshape(1, -1)) # windowing DFT filters dft_window = librosa.filters.get_window("hann", n_dft, fftbins=True) # _hann(n_dft, sym=False) dft_window = dft_window.astype(K.floatx()) dft_window = dft_window.reshape((1, -1)) dft_real_kernels = np.multiply(dft_real_kernels, dft_window) dft_imag_kernels = np.multiply(dft_imag_kernels, dft_window) dft_real_kernels = dft_real_kernels.transpose() dft_imag_kernels = dft_imag_kernels.transpose() dft_real_kernels = dft_real_kernels[:, np.newaxis, np.newaxis, :] dft_imag_kernels = dft_imag_kernels[:, np.newaxis, np.newaxis, :] return dft_real_kernels.astype(K.floatx()), dft_imag_kernels.astype(K.floatx())
[docs]def filterbank_mel(sr, n_freq, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1): """[np]""" return mel(sr, (n_freq - 1) * 2, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm).astype(K.floatx())
[docs]def filterbank_log(sr, n_freq, n_bins=84, bins_per_octave=12, fmin=None, spread=0.125): # pragma: no cover """[np] Approximate a constant-Q filter bank for a fixed-window STFT. Each filter is a log-normal window centered at the corresponding frequency. Note: `logfrequency` in librosa 0.4 (deprecated), so copy-and-pasted, `tuning` was removed, `n_freq` instead of `n_fft`. Parameters ---------- sr : number > 0 [scalar] audio sampling rate n_freq : int > 0 [scalar] number of frequency bins n_bins : int > 0 [scalar] Number of bins. Defaults to 84 (7 octaves). bins_per_octave : int > 0 [scalar] Number of bins per octave. Defaults to 12 (semitones). fmin : float > 0 [scalar] Minimum frequency bin. Defaults to `C1 ~= 32.70` spread : float > 0 [scalar] Spread of each filter, as a fraction of a bin. Returns ------- C : np.ndarray [shape=(n_bins, 1 + n_fft/2)] log-frequency filter bank. """ if fmin is None: fmin = 32.70319566 # What's the shape parameter for our log-normal filters? sigma = float(spread) / bins_per_octave # Construct the output matrix basis = np.zeros((n_bins, n_freq)) # Get log frequencies of bins log_freqs = np.log2(librosa.fft_frequencies(sr, (n_freq - 1) * 2)[1:]) for i in range(n_bins): # What's the center (median) frequency of this filter? c_freq = fmin * (2.0 ** (float(i) / bins_per_octave)) # Place a log-normal window around c_freq basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma) ** 2 - np.log2(sigma) - log_freqs) # Normalize the filters basis = librosa.util.normalize(basis, norm=1, axis=1) return basis.astype(K.floatx())