%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal as sp
import IPython
from scipy.io import wavfile

# Notebook-wide matplotlib defaults: a wide, short figure size for every plot
# and 'tab10' as the default image colormap.
plt.rcParams["figure.figsize"] = (14,4)
plt.rcParams['image.cmap'] = 'tab10'

# Load the speech sample; wavfile.read returns (sampling rate, sample array).
Fs, s = wavfile.read('speech.wav')
# NOTE(review): dividing by 32767 assumes 16-bit PCM samples -- confirm for this file.
s = s / 32767.0 # scale the signal to floats in [-1, 1]
print('sampling rate: {}Hz'.format(Fs))
# Last expression of the cell: shows an audio player for the scaled signal.
IPython.display.Audio(s, rate=Fs)

sampling rate: 16000Hz

def ms2smp(ms, Fs):
    """Convert a duration in milliseconds to a whole number of samples.

    Args:
        ms: duration in milliseconds.
        Fs: sampling rate in Hz.

    Returns:
        The sample count as an int; the fractional part is truncated.
    """
    samples = float(Fs) * float(ms) / 1000.0
    return int(samples)

def plot_spec(x, Fs, max_freq=None, do_fft=True):
    """Plot the first half of a spectrum against frequency in Hz.

    Args:
        x: the signal to analyze, or (when do_fft is False) values that are
            plotted as they are.
        Fs: sampling rate in Hz, used to label the frequency axis.
        max_freq: optional upper frequency limit in Hz; when falsy, the first
            len(x)/2 bins are shown.
        do_fft: if True, plot the magnitude of the DFT of x; otherwise plot
            the leading values of x directly.

    Returns:
        A (frequencies, values) tuple holding exactly what was plotted.
    """
    # positive frequencies only
    num_bins = int(len(x) / 2)
    if max_freq:
        # shrink (or grow) the bin count in proportion to max_freq / (Fs / 2)
        num_bins = int(num_bins * max_freq / float(Fs) * 2)
    if do_fft:
        values = np.abs(np.fft.fft(x)[0:num_bins])
    else:
        values = x[0:num_bins]
    # bin k of a len(x)-point DFT corresponds to k * Fs / len(x) Hz
    freqs = Fs * np.arange(0, num_bins) / len(x)
    plt.plot(freqs, values)
    return freqs, values

# Magnitude spectrum of the speech signal up to 8000 Hz; the trailing ';'
# keeps the notebook from echoing the returned (frequencies, values) tuple.
plot_spec(s, Fs, 8000);

def alien_voice(x, mod_freq, Fs):
    """Multiply x by a cosine carrier (sinusoidal modulation).

    Args:
        x: input signal.
        mod_freq: carrier frequency in Hz.
        Fs: sampling rate in Hz.

    Returns:
        An array of len(x) samples equal to 2 * x[n] * cos(omega * n), where
        omega = 2 * pi * mod_freq / Fs.
    """
    # normalized modulation frequency, in radians per sample
    omega = (float(mod_freq) / Fs) * 2 * np.pi
    sample_index = np.arange(0, len(x))
    carrier = np.cos(omega * sample_index)
    return 2 * np.multiply(x, carrier)

# Listen to the speech modulated by a 500 Hz cosine.
IPython.display.Audio(alien_voice(s, 500, Fs), rate=Fs)

# Spectrum of the modulated speech, over the same 0-8000 Hz range used for the original.
plot_spec(alien_voice(s, 500, Fs), Fs, 8000);

def get_subsample(x, t):
    """Linearly interpolate x at the fractional index t.

    Args:
        x: indexable sequence of samples.
        t: (possibly fractional) index into x.

    Returns:
        The weighted mix of x[int(t)] and x[int(t) + 1]. If the right-hand
        neighbour is past the end of x it contributes nothing; if x[int(t)]
        itself is out of range the result is 0.
    """
    idx = int(t)
    # weight of the left sample: 1 when t falls exactly on it
    left_weight = 1.0 - (t - idx)
    try:
        left = x[idx]
    except IndexError:
        # t lies entirely outside the signal
        return 0
    try:
        right = x[idx + 1]
    except IndexError:
        # last sample of the signal: no right neighbour to blend with
        return left_weight * left
    return left_weight * left + (1 - left_weight) * right

def resample(x, factor):
    """Resample x by linear interpolation at the positions n * factor.

    Output sample n is the linear interpolation of x at the fractional index
    n * factor, with samples past the end of x treated as zero. The whole
    signal is interpolated in one vectorized NumPy expression instead of a
    per-sample Python loop.

    Args:
        x: 1-D input signal (array or sequence).
        factor: step between consecutive read positions; values above 1
            shorten the signal, values below 1 lengthen it. Must be positive.

    Returns:
        An array with floor(len(x) / factor) samples.

    Raises:
        ValueError: if factor is not strictly positive.
    """
    if factor <= 0:
        raise ValueError("resampling factor must be positive")
    x = np.asarray(x)
    # length of the output signal after resampling
    n_out = int(np.floor(len(x) / factor))
    # fractional read positions in the input signal
    t = np.arange(n_out, dtype=float) * factor
    idx = t.astype(int)
    # weight of the left neighbour; the right one gets the complement
    left_weight = 1.0 - (t - idx)
    # zero padding lets the final samples interpolate against silence
    padded = np.concatenate((x, [0, 0]))
    return left_weight * padded[idx] + (1 - left_weight) * padded[idx + 1]

# Factor 0.6 produces more output samples than input samples; playback stays at Fs.
IPython.display.Audio(resample(s, 0.6), rate=Fs)

# Factor 1.5 produces fewer output samples than input samples; playback stays at Fs.
IPython.display.Audio(resample(s, 1.5), rate=Fs)

def double_len(x, grain_size):
    """Stretch x to twice its length by playing every grain twice in a row.

    Args:
        x: 1-D input signal.
        grain_size: grain length in samples.

    Returns:
        An array of 2 * len(x) samples in which every complete grain of x
        appears twice back to back. A trailing partial grain (fewer than
        grain_size samples) is not copied, so the matching tail of the
        output stays at zero.
    """
    N = len(x)
    y = np.zeros(2 * N)
    # "+ 1" keeps the final grain when it ends exactly on the last sample
    for n in range(0, N - grain_size + 1, grain_size):
        grain = x[n:n + grain_size]
        y[2 * n:2 * n + grain_size] = grain
        y[2 * n + grain_size:2 * n + 2 * grain_size] = grain
    return y

# Grain length: 30 ms expressed in samples at rate Fs.
grain_size = ms2smp(30, Fs)
IPython.display.Audio(double_len(s, grain_size), rate=Fs)

# Two seconds of silence with a unit impulse every 2*grain_size samples,
# i.e. at the spacing of the grain pairs written by double_len.
y = np.zeros(Fs * 2)
y[0:-1:2*grain_size] = 1
IPython.display.Audio(y, rate=Fs)

def win_taper(N, overlap):
    """Build a flat window with linear tapers, plus the matching hop size.

    Args:
        N: window length in samples.
        overlap: fraction of the window occupied by the two tapers together
            (half on each side).

    Returns:
        A (win, stride) tuple. win is an array of N weights that ramps up
        linearly from 0, stays at 1, and ramps back down to 0. stride is the
        hop between consecutive windows for which the overlapping tapers add
        up to 1. When the taper is empty the window is all ones and the
        stride is N, so consecutive windows abut without sharing a sample.
    """
    R = int(N * overlap / 2)
    if R == 0:
        # no taper: windows must not overlap at all, otherwise the shared
        # boundary sample would be counted twice
        return np.ones(N), N
    r = np.arange(0, R) / float(R)
    win = np.r_[r, np.ones(N - 2 * R), r[::-1]]
    # the tapers start/end on an exact 0, hence the extra sample of overlap
    stride = N - R - 1
    return win, stride

# generate a window with 40% taper (20% left and 20% right)
win, stride = win_taper(100, .4)
# align two windows using the given stride and sum them
# (zero-pad each one so that both have length len(win) + stride)
win1 = np.r_[win, np.zeros(stride)]
win2 = np.r_[np.zeros(stride), win]
plt.plot(win1);
plt.plot(win2);
# if the windows are properly aligned, the tapered areas compensate
# and the sum stays flat across the overlap
plt.plot(win1 + win2);
# leave some headroom above 1 so the flat top of the sum is visible
plt.gca().set_ylim([0, 1.1]);

def double_len2(x, grain_size):
    """Time-stretch x by repeating each grain twice with crossfading.

    Every grain is shaped by a tapered window (40% total taper) and
    overlap-added twice into the output, with the write position advancing
    by the window's stride after each copy.

    Args:
        x: 1-D input signal.
        grain_size: grain length in samples.

    Returns:
        An array of 2 * len(x) samples. Because the write position advances
        by the stride, which is shorter than a grain, the written content
        does not reach the end of the buffer; the remainder stays at zero.
    """
    # NOTE(review): the loop bound skips a final grain that ends exactly at
    # len(x); kept as in the original.
    taper, hop = win_taper(grain_size, 0.4)
    out = np.zeros(2 * len(x))
    write_pos = 0
    for start in range(0, len(x) - grain_size, grain_size):
        shaped = x[start:start + grain_size] * taper
        # emit the same windowed grain twice, one hop apart
        for _ in range(2):
            out[write_pos:write_pos + grain_size] += shaped
            write_pos += hop
    return out

# Crossfaded time stretch of the speech using 30 ms grains.
IPython.display.Audio(double_len2(s, ms2smp(30, Fs)), rate=Fs)

# Same stretched signal, then resampled with factor 2 (which halves its length).
IPython.display.Audio(resample(double_len2(s, ms2smp(30, Fs)), 2), rate=Fs)

# Resampling a ramp of indices shows, for each factor, which input index
# every output sample is read from.
n = np.arange(0, 100)
for ix, f in enumerate([1, 1.6, 0.6]):
    plt.subplot(1, 3, ix+1)
    plt.plot(resample(n, f))
    plt.gca().set_xlim([0, 100])

def gs_map(n, factor, grain_size):
    """Map output indices to input indices for granular resampling.

    Within each grain of grain_size output samples, the input position
    starts at the grain's first index and then advances by factor per
    output sample.

    Args:
        n: output index, or an array of output indices.
        factor: resampling factor applied inside each grain.
        grain_size: grain length in samples.

    Returns:
        The (fractional) input index for each entry of n.
    """
    # index at which the grain containing n begins
    grain_start = np.floor(n / grain_size) * grain_size
    # distance travelled inside the grain, scaled by the resampling factor
    offset_in_grain = n - grain_start
    return grain_start + offset_in_grain * factor

# Plot the output-index -> input-index map of gs_map for three factors,
# using grains of 15 samples; both axes are limited to [0, 100].
n = np.arange(0, 100)
for ix, f in enumerate([1, 1.6, 0.6]):
    plt.subplot(1, 3, ix+1)
    plt.plot(gs_map(n, f, 15))
    plt.gca().set_xlim([0, 100])
    plt.gca().set_ylim([0, 100])
    plt.xlabel('output index')
    plt.ylabel('input index')

def GS_pshift(x, factor, grain_size, overlap=0.5):
    """Pitch-shift x with granular synthesis, keeping its length.

    For every output grain, an input chunk of about grain_size * factor
    samples is resampled by factor, windowed, and overlap-added into the
    output at the position the chunk was read from.

    Args:
        x: 1-D input signal.
        factor: resampling factor applied to each grain.
        grain_size: target output grain length in samples.
        overlap: taper fraction passed to win_taper.

    Returns:
        An array of len(x) samples.
    """
    N = len(x)
    y = np.zeros(N)
    # size of input buffer given target output grain size and resampling factor
    input_chunk_size = int(grain_size * factor + 0.5)
    win, stride = win_taper(grain_size, overlap)
    for n in range(0, N - max(input_chunk_size, grain_size), stride):
        w = resample(x[n:n + input_chunk_size], factor)
        # Rounding the chunk size can make the resampled grain a sample or
        # more longer or shorter than grain_size; use only the common part
        # so the product with the window never has mismatched shapes.
        m = min(len(w), grain_size)
        y[n:n + m] += w[:m] * win[:m]
    return y

# Granular pitch shift with factor 1.5, 40 ms grains, 25% taper.
IPython.display.Audio(GS_pshift(s, 1.5, ms2smp(40, Fs), .25), rate=Fs)

# Granular pitch shift with factor 0.6, 31 ms grains, 25% taper.
IPython.display.Audio(GS_pshift(s, 0.6, ms2smp(31, Fs), .25), rate=Fs)

# Load a second test sound. Unlike the speech signal, the samples are used
# as returned by wavfile.read, without rescaling.
Fs_y, y = wavfile.read('clarinet.wav')
IPython.display.Audio(y, rate=Fs_y)

# Magnitude spectrum of the clarinet sound up to 4000 Hz.
plot_spec(y, Fs_y, 4000);

def DFT_translate(x, k):
    """Shift the spectrum of x upward by k DFT bins.

    The mean is removed from x, the low positive-frequency bins of its DFT
    are moved up by k positions (the lowest k bins are zeroed and the bins
    pushed past the half-spectrum are dropped), and a real signal is rebuilt
    from the conjugate-mirrored result.

    Args:
        x: 1-D real input signal.
        k: number of DFT bins to shift by.

    Returns:
        The real part of the inverse DFT, truncated to at most len(x)
        samples. The rebuilt spectrum has 2 * len(half) - 1 bins, so the
        result is slightly shorter than x.
    """
    n_samples = len(x)
    spectrum = np.fft.fft(x - np.mean(x))
    kept_bins = spectrum[0:int(n_samples / 2 - k)]
    half = np.concatenate((np.zeros(k), kept_bins))
    # mirror everything except bin 0 to obtain a Hermitian-symmetric spectrum
    mirrored = np.conj(half[-1:0:-1])
    rebuilt = np.fft.ifft(np.concatenate((half, mirrored)))
    return np.real(rebuilt[0:n_samples])

# Listen to the clarinet with its spectrum shifted up by 210 DFT bins.
IPython.display.Audio(DFT_translate(y, 210), rate=Fs_y)

def DFT_rescale(x, factor):
    """Rescale the frequency axis of x by moving DFT bin n to bin n * factor.

    Args:
        x: 1-D real input signal (even or odd length).
        factor: frequency scaling factor; bin n is accumulated into bin
            int(n * factor), and bins mapped beyond the half-spectrum are
            discarded.

    Returns:
        A real array of len(x) samples.
    """
    X = np.fft.fft(x)
    # separate even and odd lengths
    is_even = (len(X) % 2 == 0)
    # Number of non-redundant bins of a real signal's DFT. Integer division
    # yields the right count for both parities and, unlike "/", always gives
    # an int that np.zeros and range accept.
    N = len(X) // 2 + 1
    Y = np.zeros(N, dtype=complex)
    # work only in the first half of the DFT vector since input is real
    for n in range(0, N):
        # accumulate original frequency bins into rescaled bins
        ix = int(n * factor)
        if ix < N:
            Y[ix] += X[n]
    # now rebuild a Hermitian-symmetric DFT; for even lengths the last bin
    # is the Nyquist bin and must not be mirrored
    mirror = Y[-2:0:-1] if is_even else Y[-1:0:-1]
    return np.real(np.fft.ifft(np.r_[Y, np.conj(mirror)]))

# Listen to the clarinet with its whole spectrum rescaled by 1.4 in a single DFT.
IPython.display.Audio(DFT_rescale(y, 1.4), rate=Fs_y)

def DFT_pshift(x, factor, chunk_size, overlap=0):
    """Pitch-shift x by rescaling the DFT of overlapping chunks.

    Each chunk is windowed, frequency-rescaled with DFT_rescale, windowed
    again, and overlap-added into the output at the chunk's position.

    Args:
        x: 1-D input signal.
        factor: frequency scaling factor passed to DFT_rescale.
        chunk_size: chunk length in samples.
        overlap: taper fraction passed to win_taper.

    Returns:
        An array of len(x) samples.
    """
    taper, hop = win_taper(chunk_size, overlap)
    out = np.zeros(len(x))
    for start in range(0, len(x) - chunk_size, hop):
        stop = start + chunk_size
        # the window is applied both before and after the rescaling
        shifted = DFT_rescale(x[start:stop] * taper, factor)
        out[start:stop] += shifted * taper
    return out

# Chunked DFT pitch shift with factor 0.6, 40 ms chunks, 20% taper.
IPython.display.Audio(DFT_pshift(s, 0.6, ms2smp(40, Fs), 0.2), rate=Fs)

# Chunked DFT pitch shift with factor 1.5, 40 ms chunks, 40% taper.
IPython.display.Audio(DFT_pshift(s, 1.5, ms2smp(40, Fs), 0.4), rate=Fs)

# Load another test sound and plot its spectrum.
Fs_y, y = wavfile.read('voiced.wav')
y = y / 32767.0 # cast to floats in [-1, 1]
plot_spec(y, Fs_y)
# DFT of a hard-coded 21-coefficient filter, zero-padded to len(y).
# NOTE(review): presumably precomputed order-20 LPC coefficients for this
# sound -- confirm.
Y = np.fft.fft([1.0, -2.1793, 2.4140, -1.6790, 0.3626, 0.5618, -0.7047,
                0.1956, 0.1872, -0.2878, 0.2354, -0.0577, -0.0815, 0.0946,
                0.1242, -0.1360, 0.0677, -0.0622, -0.0306, 0.0430, -0.0169], len(y))
# Overlay the magnitude response 1/|Y| of the matching all-pole filter;
# do_fft=False because the values are already in the frequency domain.
plot_spec(np.abs(np.divide(1.0, Y)), Fs_y, do_fft=False);

def bac(x, p):
    """Compute the biased autocorrelation of x for lags 0..p.

    For each lag m, r[m] = (1 / L) * sum_{n=0}^{L-1-m} x[n] * x[n+m], with
    L = len(x). Each lag is computed with a single dot product instead of a
    Python loop over the samples.

    Args:
        x: 1-D real signal (array or sequence); it is converted to floats so
            that integer samples cannot overflow in the products.
        p: highest lag to compute.

    Returns:
        An array of p + 1 values. Lags at or beyond len(x) are 0; an empty
        x yields all zeros.
    """
    x = np.asarray(x, dtype=float)
    L = len(x)
    r = np.zeros(p + 1)
    # lags m >= L have no overlapping samples and keep their initial 0
    for m in range(min(p + 1, L)):
        r[m] = np.dot(x[:L - m], x[m:]) / float(L)
    return r

def ld(r, p):
    """Solve the Toeplitz normal equations with the Levinson-Durbin recursion.

    Args:
        r: autocorrelation values for lags 0..p.
        p: prediction order.

    Returns:
        The p + 1 coefficients [1, -a_1, ..., -a_p] of the filter A(z). If
        r[0] is zero (an all-zero segment) the recursion is undefined, and
        the identity filter [1, 0, ..., 0] is returned instead of NaNs.
    """
    r = np.asarray(r, dtype=float)
    if r[0] == 0:
        # silent segment: dividing by r[0] would give NaN coefficients
        return np.r_[1.0, np.zeros(p)]
    g = r[1] / r[0]
    # a holds the predictor coefficients in reverse order (highest first)
    a = np.array([g])
    # v is the prediction error energy at the current order
    v = (1. - g * g) * r[0]
    for i in range(1, p):
        # reflection coefficient for order i + 1
        g = (r[i+1] - np.dot(a, r[1:i+1])) / v
        a = np.r_[g, a - g * a[i-1::-1]]
        v *= 1. - g*g
    # return the coefficients of the A(z) filter
    return np.r_[1, -a[::-1]]

def lpc(x, p):
    """Compute the order-p LPC filter for a speech segment.

    Args:
        x: the signal segment to analyze.
        p: prediction order.

    Returns:
        Whatever ld returns for the biased autocorrelation of x up to lag p
        (the coefficients of the A(z) filter).
    """
    autocorr = bac(x, p)
    return ld(autocorr, p)

# Spectrum of the sound, overlaid with the magnitude response 1/|A| of the
# order-20 LPC filter estimated from it (A is zero-padded to len(y)).
plot_spec(y, Fs_y)
A = np.fft.fft(lpc(y, 20), len(y))
# do_fft=False: the values passed in are already in the frequency domain
plot_spec(np.abs(np.divide(1.0, A)), Fs_y, do_fft=False);

def LPC_DFT_pshift(x, factor, chunk_size, overlap=0.2, LPC_order=20):
    """Pitch-shift x chunk by chunk, rescaling only the LPC excitation.

    For every chunk, the LPC filter is estimated, the chunk is inverse-
    filtered to obtain the excitation, the excitation is frequency-rescaled
    with DFT_rescale, and the result is passed back through the all-pole
    filter before being windowed and overlap-added.

    Args:
        x: 1-D input signal.
        factor: frequency scaling factor for the excitation.
        chunk_size: chunk length in samples.
        overlap: taper fraction passed to win_taper.
        LPC_order: order of the LPC analysis.

    Returns:
        An array of len(x) samples.
    """
    taper, hop = win_taper(chunk_size, overlap)
    out = np.zeros(len(x))
    for start in range(0, len(x) - chunk_size, hop):
        stop = start + chunk_size
        segment = x[start:stop]
        coeffs = lpc(segment, LPC_order)
        # analysis: filtering by A(z) leaves the excitation
        excitation = sp.lfilter(coeffs, [1], segment)
        shifted = DFT_rescale(excitation, factor)
        # synthesis: filter the shifted excitation by 1/A(z)
        rebuilt = sp.lfilter([1], coeffs, shifted)
        out[start:stop] += rebuilt * taper
    return out

# LPC + DFT pitch shift with factor 0.6 and 40 ms chunks.
IPython.display.Audio(LPC_DFT_pshift(s, 0.6, ms2smp(40, Fs)), rate=Fs)

# LPC + DFT pitch shift with factor 1.5 and 40 ms chunks.
IPython.display.Audio(LPC_DFT_pshift(s, 1.5, ms2smp(40, Fs)), rate=Fs)

def LPC_GS_pshift(x, factor, grain_size, overlap=0.2, LPC_order=20):
    """Pitch-shift x with granular synthesis applied to the LPC excitation.

    For every grain, the LPC filter of the input chunk is estimated, the
    chunk is inverse-filtered to obtain the excitation, the excitation is
    resampled by factor, and the result is passed back through the all-pole
    filter before being windowed and overlap-added.

    Args:
        x: 1-D input signal.
        factor: resampling factor applied to each excitation grain.
        grain_size: target output grain length in samples.
        overlap: taper fraction passed to win_taper.
        LPC_order: order of the LPC analysis.

    Returns:
        An array of len(x) samples.
    """
    N = len(x)
    y = np.zeros(N)
    # input chunk size will be dependent on grain_size and shifting factor
    chunk_size = int(grain_size * factor + 0.5)
    win, stride = win_taper(grain_size, overlap)
    for n in range(0, N - max(chunk_size, grain_size), stride):
        chunk = x[n:n + chunk_size]
        a = lpc(chunk, LPC_order)
        # analysis: filtering by A(z) leaves the excitation
        exc = sp.lfilter(a, [1], chunk)
        # this changes the length of exc from chunk_size to (about) grain_size
        exc = resample(exc, factor)
        # synthesis: filter the resampled excitation by 1/A(z)
        grain = sp.lfilter([1], a, exc)
        # Rounding the chunk size can leave the grain a sample or more off
        # grain_size; use only the common part so the product with the
        # window never has mismatched shapes.
        m = min(len(grain), grain_size)
        y[n:n + m] += grain[:m] * win[:m]
    return y

# LPC + granular pitch shift with factor 0.6 and 40 ms grains (default 20% taper).
IPython.display.Audio(LPC_GS_pshift(s, 0.6, ms2smp(40, Fs)), rate=Fs)

# LPC + granular pitch shift with factor 1.5, 40 ms grains and 30% taper.
IPython.display.Audio(LPC_GS_pshift(s, 1.5, ms2smp(40, Fs), overlap=0.3), rate=Fs)

def LPC_daft(x, exc_pitch, chunk_size, Fs, overlap=0.2, LPC_order=20):
    """Resynthesize x by driving its LPC filters with a square wave.

    For every chunk, the LPC filter of the input is estimated and used as an
    all-pole filter on a fixed square-wave excitation; the filtered chunks
    are windowed and overlap-added.

    Args:
        x: 1-D input signal.
        exc_pitch: frequency of the square-wave excitation in Hz.
        chunk_size: chunk length in samples.
        Fs: sampling rate in Hz.
        overlap: taper fraction passed to win_taper.
        LPC_order: order of the LPC analysis.

    Returns:
        An array of len(x) samples.
    """
    # normalized excitation frequency, in radians per sample
    omega = (float(exc_pitch) / Fs) * 2 * np.pi
    # the sign of a cosine gives a square wave; the same chunk-long
    # excitation is reused for every chunk
    square_wave = np.sign(np.cos(omega * np.arange(0, chunk_size)))
    taper, hop = win_taper(chunk_size, overlap)
    out = np.zeros(len(x))
    for start in range(0, len(x) - chunk_size, hop):
        stop = start + chunk_size
        coeffs = lpc(x[start:stop], LPC_order)
        voiced = sp.lfilter([1], coeffs, square_wave)
        out[start:stop] += voiced * taper
    return out

# Vocoder demo: 140 Hz square-wave excitation shaped by the speech's LPC filters, 40 ms chunks.
IPython.display.Audio(LPC_daft(s, 140, ms2smp(40, Fs), Fs), rate=Fs)

A Gallery of Voice Transformers

1. The "Alien Voice"

2. "Turntable" pitch shifting

3. Pitch shift via Granular Synthesis

3.1. Granular synthesis for time stretching

3.2. Crossfading

3.3. From time stretching to pitch shifting

4. DFT-based pitch shift

5. LPC and the Vocoder

5.1. LPC analysis

5.2. LPC-based pitch shifting

5.3. A simple vocoder