Source code for das.kapre.backend

"""

Kapre backend functions
=======================\

|  Some backend functions that mainly use numpy.
|  Functions with Keras' backend is in ``backend_keras.py``.

Notes
-----
    * Don't forget to use ``K.floatx()``! Otherwise numpy uses float64.
    * Some functions are copied-and-pasted from librosa (to reduce dependency), but
        later I realised it'd be better to just use it.
    * TODO: remove copied code and use librosa.
"""

from tensorflow.keras import backend as K
import numpy as np
import librosa

# Forward compatibility to replace xrange
from builtins import range

# Module-wide epsilon value; exposed to callers through ``eps()``.
# NOTE(review): presumably a small constant for numerical stability -- it is
# not used anywhere else in this part of the file, confirm against callers.
EPS = 1e-7


def eps():
    """Return the module-level epsilon constant ``EPS``."""
    return EPS


def mel(sr, n_dft, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1):
    """[np] Create a filterbank matrix that combines STFT bins into mel bins.

    Thin wrapper around ``librosa.filters.mel``: every argument is forwarded
    by keyword (``n_dft`` is passed as librosa's ``n_fft``) and the returned
    matrix is cast to the Keras float type.

    Parameters
    ----------
    sr : number
        Sampling rate, forwarded as ``sr``.
    n_dft : int
        Number of DFT components, forwarded as ``n_fft``.
    n_mels : int
        Number of mel bands.
    fmin : float
        Lowest frequency [Hz].
    fmax : float or None
        Highest frequency [Hz]. If `None`, use `sr / 2.0`.
    htk : bool
        Forwarded as ``htk``. The original author's note says the default
        corresponds to the Slaney formulation used by librosa.
    norm :
        Forwarded as ``norm``.
        NOTE(review): how librosa interprets a numeric ``norm`` appears to
        differ between librosa releases -- confirm for the installed version.

    Returns
    -------
    np.ndarray
        The librosa filterbank converted with ``astype(K.floatx())``.
    """
    librosa_kwargs = dict(
        sr=sr,
        n_fft=n_dft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        htk=htk,
        norm=norm,
    )
    filterbank = librosa.filters.mel(**librosa_kwargs)
    return filterbank.astype(K.floatx())


def get_stft_kernels(n_dft):
    """[np] Return windowed DFT kernels for the real/imaginary parts,
    assuming the input signal is real.

    The kernels are multiplied by a Hann window obtained from
    ``librosa.filters.get_window("hann", n_dft, fftbins=True)``.

    Parameters
    ----------
    n_dft : int > 1 and power of 2 [scalar]
        Number of dft components.

    Returns
    -------
        |  dft_real_kernels : np.ndarray [shape=(n_win, 1, 1, nb_filter)]
        |  dft_imag_kernels : np.ndarray [shape=(n_win, 1, 1, nb_filter)]

    * nb_filter = n_dft // 2 + 1
    * n_win = n_dft
    * both arrays have dtype ``K.floatx()``

    Raises
    ------
    AssertionError
        If ``n_dft`` is not a power of 2 greater than 1. The check is an
        ``assert`` (kept so the exception type seen by callers does not
        change), which means it is skipped when Python runs with ``-O``.
    """
    assert n_dft > 1 and ((n_dft & (n_dft - 1)) == 0), "n_dft should be > 1 and power of 2, but n_dft == %d" % n_dft

    nb_filter = int(n_dft // 2 + 1)

    # Phase matrix shared by the cosine and sine kernels:
    # entry [k, t] = 2 * pi * k * t / n_dft, shape (nb_filter, n_dft).
    timesteps = np.arange(n_dft)
    w_ks = np.arange(nb_filter) * 2 * np.pi / float(n_dft)
    phase = w_ks.reshape(-1, 1) * timesteps.reshape(1, -1)

    # The window is cast to the Keras float type *before* the multiplication,
    # exactly as the kernels were windowed previously, then shaped (1, n_dft)
    # so that it broadcasts over the frequency axis.
    dft_window = librosa.filters.get_window("hann", n_dft, fftbins=True)  # _hann(n_dft, sym=False)
    dft_window = dft_window.astype(K.floatx()).reshape((1, -1))

    dft_real_kernels = np.multiply(np.cos(phase), dft_window)
    dft_imag_kernels = np.multiply(-np.sin(phase), dft_window)

    # (nb_filter, n_dft) -> (n_dft, nb_filter) -> (n_dft, 1, 1, nb_filter)
    dft_real_kernels = dft_real_kernels.transpose()[:, np.newaxis, np.newaxis, :]
    dft_imag_kernels = dft_imag_kernels.transpose()[:, np.newaxis, np.newaxis, :]

    return dft_real_kernels.astype(K.floatx()), dft_imag_kernels.astype(K.floatx())


def filterbank_mel(sr, n_freq, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1):
    """[np] Mel filterbank specified by a number of frequency bins.

    Converts ``n_freq`` into a DFT size of ``(n_freq - 1) * 2`` and delegates
    to `mel`, forwarding the remaining arguments unchanged. The result is
    cast with ``astype(K.floatx())``.

    Parameters
    ----------
    sr : number
        Sampling rate, passed to `mel`.
    n_freq : int
        Number of frequency bins; the DFT size handed to `mel` is
        ``(n_freq - 1) * 2``.
    n_mels, fmin, fmax, htk, norm :
        Passed to `mel` as keyword arguments of the same name.

    Returns
    -------
    np.ndarray
        The filterbank produced by `mel`.
    """
    n_dft = (n_freq - 1) * 2
    filterbank = mel(sr, n_dft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm)
    return filterbank.astype(K.floatx())


def filterbank_log(sr, n_freq, n_bins=84, bins_per_octave=12, fmin=None, spread=0.125):  # pragma: no cover
    """[np] Approximate a constant-Q filter bank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding frequency.

    Note: `logfrequency` in librosa 0.4 (deprecated), so copy-and-pasted,
        `tuning` was removed, `n_freq` instead of `n_fft`.

    Parameters
    ----------
    sr : number > 0 [scalar]
        audio sampling rate

    n_freq : int > 0 [scalar]
        number of frequency bins; the FFT size used to compute the bin
        frequencies is ``(n_freq - 1) * 2``

    n_bins : int > 0 [scalar]
        Number of bins.  Defaults to 84 (7 octaves).

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave. Defaults to 12 (semitones).

    fmin : float > 0 [scalar]
        Minimum frequency bin. Defaults to `C1 ~= 32.70`

    spread : float > 0 [scalar]
        Spread of each filter, as a fraction of a bin.

    Returns
    -------
    C : np.ndarray [shape=(n_bins, n_freq)]
        log-frequency filter bank, cast with ``astype(K.floatx())``.
        The first column is never written and stays zero before the
        normalisation step.
    """

    if fmin is None:
        fmin = 32.70319566

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave
    # Loop-invariant term of the exponent, computed once.
    log_sigma = np.log2(sigma)

    # Construct the output matrix
    basis = np.zeros((n_bins, n_freq))

    # Get log frequencies of bins. The first frequency is dropped ([1:]),
    # presumably because it is 0 Hz and has no finite log2.
    # Arguments are passed by keyword, matching how `mel` calls librosa in this
    # module; recent librosa releases reject positional arguments here.
    log_freqs = np.log2(librosa.fft_frequencies(sr=sr, n_fft=(n_freq - 1) * 2)[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = fmin * (2.0 ** (float(i) / bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma) ** 2 - log_sigma - log_freqs)

    # Normalize the filters
    basis = librosa.util.normalize(basis, norm=1, axis=1)

    return basis.astype(K.floatx())