# first our usual bookkeeping
%pylab inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import IPython
from scipy import signal
from scipy.io import wavfile

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib

# default figure size and font size applied to every plot below
plt.rcParams["figure.figsize"] = (14,4)
plt.rcParams["font.size"] = 20

# read piano.wav: Fs is the sampling rate, x the sample data
Fs, x = wavfile.read("piano.wav")
# let's remove any DC component
x = x - np.mean(x)
# audio player for the mean-removed signal
IPython.display.Audio(x, rate=Fs)

# let's introduce here a handy function to plot spectral magnitudes using real-world frequency labeling
def plotSpecMag(Fs, x, fmax=None):
    """Plot the DFT magnitude of ``x`` against frequency in Hz.

    Fs   -- sampling rate of ``x`` in Hz
    x    -- 1-D sequence of samples
    fmax -- highest frequency shown, in Hz; when omitted it is Fs / 2 for
            the ``Fs`` passed to this call (not for whatever global ``Fs``
            existed when the function was defined)

    The y axis tick labels are hidden: only the shape of the spectrum and
    the position of its peaks are meant to be read from the plot.
    """
    if fmax is None:
        fmax = Fs / 2
    # DFT bin k of a length-len(x) transform sits at k * Fs / len(x) Hz,
    # so the band [0, fmax) is covered by the first len(x) * fmax / Fs bins
    N = int(len(x) * fmax / Fs)
    X = np.abs(np.fft.fft(x))[0:N]
    # label every bin with its true frequency
    f = Fs * np.arange(0, N) / len(x)
    plt.plot(f, X)
    plt.xlabel("Frequency [Hz]")
    ax = plt.gca()
    ax.axes.yaxis.set_ticklabels([])
    plt.xlim([0, fmax])
    plt.grid();

# magnitude spectrum of the piano recording with the default fmax
plotSpecMag(Fs, x)

# same signal, frequency axis limited to 1500 Hz
plotSpecMag(Fs, x, fmax=1500)

# read a6.wav; note that this rebinds the global Fs
Fs, s = wavfile.read("a6.wav")
IPython.display.Audio(s, rate=Fs)

# subtract mean to remove DC offset
plotSpecMag(Fs, s-np.mean(s))

# read ppA.wav, the recording transcribed further below (rebinds Fs again)
Fs, ppA = wavfile.read("ppA.wav")
IPython.display.Audio(ppA, rate=Fs)

plt.subplot(1, 2, 1)    # left panel: spectrum of the first 1024 samples
plotSpecMag(Fs, ppA[:1024])

plt.subplot(1, 2, 2)    # right panel: spectrum of samples 80000 to 80999
plotSpecMag(Fs, ppA[80000:81000])

import builtins
def transcribe(Fs, x, whistle_min_hz=700, whistle_max_hz=3000,
               win_len_ms=24, peak_width=3, e_ratio=0.8):
    """Return the pitch track (in Hz) of the whistled frames in ``x``.

    The signal is cut into consecutive, non-overlapping frames. For each
    frame the squared DFT magnitude is examined over the whistling band; a
    frame counts as "whistled" when the energy in the few bins around the
    strongest bin exceeds ``e_ratio`` times the energy of the whole band.
    Whistled frames contribute the frequency of their strongest bin; all
    other frames are dropped, so the result has no time gaps.

    Fs             -- sampling rate of ``x`` in Hz
    x              -- 1-D sequence of samples (integer or float)
    whistle_min_hz -- lower edge of the whistling band, in Hz
    whistle_max_hz -- upper edge of the whistling band, in Hz
    win_len_ms     -- frame length in milliseconds
    peak_width     -- number of bins on each side of the peak counted as
                      belonging to it
    e_ratio        -- minimum peak-to-band energy ratio for a whistled frame

    Returns a 1-D numpy array with one frequency per whistled frame.
    """
    # length of the segment in samples
    win_len = int(win_len_ms * Fs / 1000.0)
    # frequency resolution of the DFT
    freq_res = float(Fs) / win_len
    # first and last frequency indices for the whistling range
    min_bin = int(whistle_min_hz / freq_res)
    max_bin = int(whistle_max_hz / freq_res)

    # normalize the peak magnitude to 1; work in float so that integer PCM
    # (where abs(-32768) overflows) is handled, and leave an all-zero
    # signal untouched instead of dividing by zero
    x = np.asarray(x, dtype=float)
    peak = np.max(np.abs(x)) if x.size else 0.0
    if peak > 0:
        x = x / peak

    pitch = []
    # the "+ 1" keeps the last frame when len(x) is a multiple of win_len
    for n in range(0, len(x) - win_len + 1, win_len):
        # DFT square magnitude for the segment, cut over freq interval of interest
        s = np.square(np.abs(np.fft.fft(x[n:n + win_len])[min_bin:max_bin]))
        # location of the max
        p = int(np.argmax(s))
        # energy around the peak vs total energy; the lower index is clamped
        # by hand (a negative start would wrap around), the upper one is
        # clamped by the slice itself
        lo = p - peak_width
        if lo < 0:
            lo = 0
        k = np.sum(s[lo:p + peak_width + 1])
        # if whistled frame, append pitch (ignore all other segments)
        if k > e_ratio * np.sum(s):
            pitch.append(freq_res * (p + min_bin))
    return np.trim_zeros(np.array(pitch))

# pitch track of ppA: one frequency value per frame kept by transcribe
p = transcribe(Fs, ppA)
# plot the track against the index of the kept frame
plt.plot(p, '*-');

def synthesize(p, fs=16000, win_len_ms=25):
    """Turn a pitch track into a phase-continuous cosine signal.

    p          -- sequence of frequencies in Hz, one per frame; a value of 0
                  produces a silent frame
    fs         -- output sampling rate in Hz
    win_len_ms -- duration of each frame in milliseconds

    Returns a 1-D float array of ``len(p) * int(win_len_ms * fs / 1000)``
    samples. The oscillator phase is carried over from one frame to the
    next and is not advanced during silent frames.
    """
    p = np.asarray(p, dtype=float)
    win_len = int(win_len_ms * fs / 1000.0)
    # instantaneous frequency of every output sample
    freq = np.repeat(p, win_len)
    if freq.size == 0:
        return np.zeros(0)
    # per-sample phase increment; zero inside silent frames
    step = 2 * np.pi * freq / fs
    # phase *before* each sample: running sum of all previous increments
    theta = np.concatenate(([0.0], np.cumsum(step[:-1])))
    return np.where(freq != 0, np.cos(theta), 0.0)

# listen to the pitch track p rendered by synthesize
IPython.display.Audio(synthesize(p), rate=16000)

# read ppAup.wav (rebinds the global Fs)
Fs, ppAup = wavfile.read("ppAup.wav")
IPython.display.Audio(ppAup, rate=Fs)

def estimateBaseFreq(p):
    """Estimate a reference frequency for a pitch track.

    The track (with leading and trailing zeros removed) is split into notes:
    a new note starts wherever the pitch moves up or down by more than a
    whole tone from one frame to the next. Each note is summarised by its
    mean pitch, and the reference frequency is the mean over all notes,
    including the last one.

    p -- sequence of frequencies in Hz

    Returns a float; NaN when the trimmed track is empty.
    """
    # a whole tone is two equal-tempered semitones, i.e. (2 ** (1/12)) ** 2
    THRESHOLD = (1.05946309435929526456182529494634) ** 2
    s = np.trim_zeros(np.asarray(p, dtype=float))
    if s.size == 0:
        return np.nan
    prev, cur = s[:-1], s[1:]
    # add a new note only if we're jumping up or down at least a whole tone
    jumps = (cur > prev * THRESHOLD) | (cur < prev / THRESHOLD)
    # index of the first frame of every note after the first
    starts = np.flatnonzero(jumps) + 1
    notes = [segment.mean() for segment in np.split(s, starts)]
    return np.mean(notes)

# raw pitch tracks of the two recordings drawn on the same axes
# NOTE(review): both are transcribed with the Fs of the file read last --
# confirm the two files share a sampling rate
for k in (ppA, ppAup):
    plt.plot(transcribe(Fs, k), '*-')

# same tracks, each divided by its own estimated base frequency
# (these loops rebind the globals k and p)
for k in (ppA, ppAup):
    p = transcribe(Fs, k)
    p = p / estimateBaseFreq(p)
    plt.plot(p,'*-')

def dtw(a, b):
    """Return the dynamic-time-warping distance between two 1-D sequences.

    The local cost is the absolute difference ``|a[i] - b[j]|`` and the
    allowed steps are the three classic ones (advance in ``a``, advance in
    ``b``, advance in both). The warping path is anchored at both ends: it
    starts at the first element of each sequence and ends at the last.

    a, b -- sequences of numbers

    Returns the accumulated cost of the cheapest path: 0.0 when both
    sequences are empty and ``inf`` when exactly one of them is.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    n, m = len(a), len(b)
    # D[i, j] is the cost of aligning a[:i] with b[:j]. The extra border
    # row and column hold the base cases, so the recursion never has to
    # index "before" the matrix (a negative index would silently wrap).
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = np.abs(a[i - 1] - b[j - 1])
            D[i, j] = d + np.min([D[i - 1, j], D[i, j - 1], D[i - 1, j - 1]])
    return D[n, m]

# file names of the first set of recordings
groupA = ["ppA.wav", "ppAup.wav", "ppAslow.wav", "ppAfast.wav", ]

# file names of the second set of recordings
groupB = ["ppB.wav", "ppBslow.wav", "ppBdown.wav"]

# A[i] is the pitch track of groupA[i] divided by its estimated base frequency
# (the loop rebinds the globals Fs, xx, t and k on every iteration)
A = []
for filename in groupA:
    Fs, xx = wavfile.read(filename)
    t = transcribe(Fs, xx)
    k = estimateBaseFreq(t)
    A.append(t / k)
    plt.plot(A[-1],'*-')

# B[i] is built the same way from groupB[i]
B = []
for filename in groupB:
    Fs, xx = wavfile.read(filename)
    t = transcribe(Fs, xx)
    k = estimateBaseFreq(t)
    B.append(t / k)
    plt.plot(B[-1],'*-')

# dtw distance for every unordered pair of tracks in A
# (printed as: index n, index m, distance)
for n in range(0, len(A)):
    for m in range(n+1, len(A)):
        print(n, m, dtw(A[n], A[m]))

0 1 1.2092478059523077
0 2 1.7949879374579107
0 3 1.50774223273961
1 2 2.5840783603458997
1 3 1.6151682301573282
2 3 1.5160234564298953

# dtw distance for every unordered pair of tracks in B
# (printed as: index n, index m, distance)
for n in range(0, len(B)):
    for m in range(n+1, len(B)):
        print(n, m, dtw(B[n], B[m]))

0 1 1.832957526516897
0 2 2.4222526717515445
1 2 1.9116748119219695

# dtw distance between every track in A and every track in B
# (printed as: index into A, index into B, distance)
for n in range(0, len(A)):
    for m in range(0, len(B)):
        print(n, m, dtw(A[n], B[m]))

0 0 4.627191280779358
0 1 9.068512558850596
0 2 12.939545286284336
1 0 5.135250122035303
1 1 10.23160184333025
1 2 14.778468246184833
2 0 4.888663217031719
2 1 8.896543218942568
2 2 12.5813078833648
3 0 4.540512290609645
3 1 8.939647164831374
3 2 11.003313294414045

import sounddevice as sd

def keylock(sec, Fs, original_key):
    """Record a whistle and compare it with a reference passphrase.

    sec          -- duration of the recording, in seconds
    Fs           -- sampling rate in Hz, used both for the new recording and
                    to interpret ``original_key``
    original_key -- samples of the reference passphrase audio

    Both signals are transcribed to pitch tracks, each track is divided by
    its own estimated base frequency, and the two are compared with ``dtw``.
    Prints "OPEN SESAME!" when the distance is below THRESHOLD and a
    "no dice" message carrying the distance otherwise. Returns None.
    """
    # largest dtw distance still accepted as a match
    THRESHOLD = 3
    # sd.rec is called with blocking=True and a single channel; flatten()
    # turns its result into a 1-D array
    in_key = sd.rec(int(sec * Fs), samplerate=Fs, channels=1, blocking=True).flatten()
    k = []
    for audio in (in_key, original_key):
        t = transcribe(Fs, audio)
        if len(t) == 0:
            # no whistled frame was found, so there is no melody to compare;
            # report a failed match rather than crashing further down
            print("no dice {}".format(float("inf")))
            return
        k.append(t / estimateBaseFreq(t))
    d = dtw(k[0], k[1])
    if d < THRESHOLD:
        print("OPEN SESAME!")
    else:
        print("no dice {}".format(d))

# reload ppA.wav to use it as the reference key (also resets the global Fs)
Fs, ppA = wavfile.read("ppA.wav")
IPython.display.Audio(ppA, rate=Fs)

# record for 3 seconds and compare the recording against ppA
keylock(3, Fs, ppA)

# now record a key of our own instead of using a file
KEY_LEN = 3 # seconds
Fs = 16000
passkey = sd.rec(int(KEY_LEN * Fs), samplerate=Fs, channels=1, blocking=True).flatten()

# record again and compare against the key recorded just above
keylock(KEY_LEN, Fs, passkey)

OPEN SESAME!

# listen to your recording in case you forgot it ;)
# (passkey is the array recorded with sd.rec above, played back at the same Fs)
IPython.display.Audio(passkey, rate=Fs)

Whistling Keylock

1. Extracting the pitch

1.1. The fundamental frequency

1.2. Time-frequency resolution and temperament

2. Extracting the melody

2.1. Multiple notes

3. Comparing two passphrases

3.1. Estimating the reference frequency

3.2. Dynamic Time Warping

4. Testing it live!