File size: 9,558 Bytes
b8cc2bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/**
 * Keet - Mel Spectrogram Math
 * 
 * Pure computation functions for mel spectrogram feature extraction.
 * Matches NeMo / onnx-asr / parakeet.js mel.js exactly.
 * 
 * Designed to be self-contained and reusable:
 *   - No external dependencies
 *   - All functions are pure (no side effects)
 *   - Can be imported by workers, tests, or bundled as a standalone package
 */

// ═══════════════════════════════════════════════════════════════════════════
// Constants
// ═══════════════════════════════════════════════════════════════════════════

export const MEL_CONSTANTS = {
    /** Sample rate in Hz used to derive the Nyquist frequency of the filterbank. */
    SAMPLE_RATE: 16000,
    /** FFT size in samples. */
    N_FFT: 512,
    /** Hann window length in samples (25 ms at 16 kHz). */
    WIN_LENGTH: 400,
    /** Distance between consecutive frames in samples (10 ms at 16 kHz). */
    HOP_LENGTH: 160,
    /** Default pre-emphasis coefficient. */
    PREEMPH: 0.97,
    /** Added to the mel energy before taking the log; 2^-24 ≈ 5.96e-8. */
    LOG_ZERO_GUARD: 2 ** -24,
    /** Number of one-sided spectrum bins: N_FFT / 2 + 1 = 257. */
    N_FREQ_BINS: 512 / 2 + 1,
    /** Default number of mel bands. */
    DEFAULT_N_MELS: 128,
} as const;

// Slaney mel scale constants. The scale is linear below MIN_LOG_HZ and
// logarithmic at and above it (see hzToMel / melToHz).
const F_SP = 200.0 / 3; // Hz per mel in the linear region (~66.667)
const MIN_LOG_HZ = 1000.0; // frequency in Hz where the logarithmic region starts
const MIN_LOG_MEL = MIN_LOG_HZ / F_SP; // mel value at MIN_LOG_HZ (= 15.0)
const LOG_STEP = Math.log(6.4) / 27.0; // natural-log frequency increment per mel in the log region

// ═══════════════════════════════════════════════════════════════════════════
// Mel Scale Helpers
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Map a frequency in Hz onto the Slaney mel scale.
 *
 * Frequencies at or above MIN_LOG_HZ use the logarithmic segment; everything
 * else uses the linear segment.
 * @param freq Frequency in Hz
 * @returns Corresponding mel value
 */
export function hzToMel(freq: number): number {
    if (freq >= MIN_LOG_HZ) {
        // Logarithmic segment, anchored at (MIN_LOG_HZ, MIN_LOG_MEL).
        return MIN_LOG_MEL + Math.log(freq / MIN_LOG_HZ) / LOG_STEP;
    }
    // Linear segment.
    return freq / F_SP;
}

/**
 * Map a Slaney mel value back to a frequency in Hz (inverse of hzToMel).
 *
 * Mel values at or above MIN_LOG_MEL use the exponential segment; everything
 * else uses the linear segment.
 * @param mel Mel value
 * @returns Corresponding frequency in Hz
 */
export function melToHz(mel: number): number {
    if (mel >= MIN_LOG_MEL) {
        // Exponential segment, anchored at (MIN_LOG_MEL, MIN_LOG_HZ).
        return MIN_LOG_HZ * Math.exp(LOG_STEP * (mel - MIN_LOG_MEL));
    }
    // Linear segment.
    return mel * F_SP;
}

/**
 * Build the triangular mel filterbank with Slaney (area) normalization.
 *
 * The result is a flat, row-major Float32Array of shape [nMels × N_FREQ_BINS]:
 * the weight of FFT bin `k` in mel band `m` lives at index `m * N_FREQ_BINS + k`.
 * @param nMels Number of mel bands (rows)
 * @returns Filterbank weights
 */
export function createMelFilterbank(nMels: number): Float32Array {
    const binCount = MEL_CONSTANTS.N_FREQ_BINS;
    const nyquist = MEL_CONSTANTS.SAMPLE_RATE / 2; // 8000

    // Frequency in Hz of every FFT bin, evenly spaced from 0 to Nyquist.
    const binHz = new Float64Array(binCount);
    for (let k = 0; k < binCount; k++) {
        binHz[k] = (nyquist * k) / (binCount - 1);
    }

    // nMels + 2 band edges, evenly spaced on the mel axis, converted back to Hz.
    const edgeCount = nMels + 2;
    const melLow = hzToMel(0);
    const melHigh = hzToMel(nyquist);
    const edgeHz = new Float64Array(edgeCount);
    for (let e = 0; e < edgeCount; e++) {
        edgeHz[e] = melToHz(melLow + ((melHigh - melLow) * e) / (edgeCount - 1));
    }

    // Width in Hz between each pair of neighbouring edges.
    const edgeGap = new Float64Array(edgeCount - 1);
    for (let e = 0; e < edgeCount - 1; e++) {
        edgeGap[e] = edgeHz[e + 1] - edgeHz[e];
    }

    const weights = new Float32Array(nMels * binCount);
    for (let band = 0; band < nMels; band++) {
        const leftEdge = edgeHz[band];
        const rightEdge = edgeHz[band + 2];
        const risingWidth = edgeGap[band];
        const fallingWidth = edgeGap[band + 1];
        const areaNorm = 2.0 / (rightEdge - leftEdge); // slaney normalization
        const rowStart = band * binCount;
        for (let k = 0; k < binCount; k++) {
            const rising = (binHz[k] - leftEdge) / risingWidth;
            const falling = (rightEdge - binHz[k]) / fallingWidth;
            // Triangle = lower of the two ramps, clamped at zero outside the band.
            weights[rowStart + k] = Math.max(0, Math.min(rising, falling)) * areaNorm;
        }
    }
    return weights;
}

/**
 * Build a Hann window of WIN_LENGTH samples centred inside an N_FFT-long
 * buffer; the samples on either side of the window remain zero.
 * @returns Window of length N_FFT
 */
export function createPaddedHannWindow(): Float64Array {
    const fftSize = MEL_CONSTANTS.N_FFT;
    const winLen = MEL_CONSTANTS.WIN_LENGTH;
    const offset = (fftSize - winLen) >> 1; // 56 leading zeros
    const padded = new Float64Array(fftSize);
    for (let i = 0; i < winLen; i++) {
        const phase = (2 * Math.PI * i) / (winLen - 1);
        padded[offset + i] = 0.5 * (1 - Math.cos(phase));
    }
    return padded;
}

/**
 * Build the twiddle-factor tables for an FFT of size N.
 *
 * Entry `i` (for `0 <= i < N >> 1`) holds the cosine and sine of `-2πi / N`.
 * @param N FFT size
 * @returns Cosine and sine tables, each of length `N >> 1`
 */
export function precomputeTwiddles(N: number): { cos: Float64Array; sin: Float64Array } {
    const count = N >> 1;
    const cosTable = new Float64Array(count);
    const sinTable = new Float64Array(count);
    for (let k = 0; k < count; k++) {
        const theta = (-2 * Math.PI * k) / N;
        cosTable[k] = Math.cos(theta);
        sinTable[k] = Math.sin(theta);
    }
    return { cos: cosTable, sin: sinTable };
}

/**
 * Iterative radix-2 Cooley-Tukey FFT that overwrites its input arrays.
 * @param re Real parts; replaced by the real parts of the transform
 * @param im Imaginary parts; replaced by the imaginary parts of the transform
 * @param n FFT size (must be power of 2)
 * @param tw Twiddle tables where entry `i` is the cosine / sine of `-2πi / n`
 */
export function fft(re: Float64Array, im: Float64Array, n: number, tw: { cos: Float64Array; sin: Float64Array }): void {
    // Stage 1: reorder both arrays into bit-reversed index order.
    let rev = 0;
    for (let i = 1; i < n; i++) {
        // Increment `rev` as a bit-reversed counter: clear leading 1s, set the next 0.
        let mask = n >> 1;
        while (rev & mask) {
            rev ^= mask;
            mask >>= 1;
        }
        rev ^= mask;
        // Swap each pair once only.
        if (i < rev) {
            const realTmp = re[i];
            re[i] = re[rev];
            re[rev] = realTmp;
            const imagTmp = im[i];
            im[i] = im[rev];
            im[rev] = imagTmp;
        }
    }

    // Stage 2: butterflies over blocks of doubling size.
    for (let span = 2; span <= n; span <<= 1) {
        const halfSpan = span >> 1;
        const stride = n / span; // step through the twiddle table for this stage
        for (let start = 0; start < n; start += span) {
            for (let j = 0; j < halfSpan; j++) {
                const lo = start + j;
                const hi = lo + halfSpan;
                const wRe = tw.cos[j * stride];
                const wIm = tw.sin[j * stride];
                const hiRe = re[hi];
                const hiIm = im[hi];
                // Upper element multiplied by the twiddle factor.
                const tRe = hiRe * wRe - hiIm * wIm;
                const tIm = hiRe * wIm + hiIm * wRe;
                re[hi] = re[lo] - tRe;
                im[hi] = im[lo] - tIm;
                re[lo] += tRe;
                im[lo] += tIm;
            }
        }
    }
}

/**
 * Run the first-order pre-emphasis filter `y[i] = x[i] - coeff * x[i - 1]`
 * over a chunk of audio, without modifying the input.
 * @param chunk Raw audio chunk
 * @param lastSample Sample that precedes `chunk[0]`, i.e. the final sample of the previous chunk (default 0)
 * @param coeff Pre-emphasis coefficient (default MEL_CONSTANTS.PREEMPH)
 * @returns New array of the same length holding the filtered samples
 */
export function preemphasize(chunk: Float32Array, lastSample: number = 0, coeff: number = MEL_CONSTANTS.PREEMPH): Float32Array {
    const count = chunk.length;
    const filtered = new Float32Array(count);
    // Carry the previous input sample so the first element needs no special case.
    let previous = lastSample;
    for (let i = 0; i < count; i++) {
        const current = chunk[i];
        filtered[i] = current - coeff * previous;
        previous = current;
    }
    return filtered;
}

/**
 * Compute the log-mel values of one frame.
 *
 * The frame spans N_FFT samples centred on sample `frameIdx * HOP_LENGTH`;
 * positions that fall outside `preemphAudio` are treated as zeros.
 * @param preemphAudio Full pre-emphasized audio buffer
 * @param frameIdx Frame index
 * @param hannWindow Window of length N_FFT applied to the frame
 * @param twiddles FFT twiddle tables for size N_FFT
 * @param melFilterbank Flat row-major filterbank with N_FREQ_BINS weights per mel band
 * @param nMels Number of mel bins
 * @returns Raw (un-normalized) log-mel values for this frame, length `nMels`
 */
export function computeMelFrame(
    preemphAudio: Float32Array,
    frameIdx: number,
    hannWindow: Float64Array,
    twiddles: { cos: Float64Array; sin: Float64Array },
    melFilterbank: Float32Array,
    nMels: number,
): Float32Array {
    const { N_FFT, HOP_LENGTH, N_FREQ_BINS, LOG_ZERO_GUARD } = MEL_CONSTANTS;
    const audioLen = preemphAudio.length;
    // Shift back by half an FFT so the frame is centred on frameIdx * HOP_LENGTH.
    const firstSample = frameIdx * HOP_LENGTH - (N_FFT >> 1);

    // Windowed frame; the imaginary buffer is freshly allocated and therefore all zeros.
    const real = new Float64Array(N_FFT);
    const imag = new Float64Array(N_FFT);
    for (let i = 0; i < N_FFT; i++) {
        const src = firstSample + i;
        let value = 0;
        if (src >= 0 && src < audioLen) {
            value = preemphAudio[src];
        }
        real[i] = value * hannWindow[i];
    }

    fft(real, imag, N_FFT, twiddles);

    // One-sided power spectrum |X[k]|^2.
    const spectrum = new Float32Array(N_FREQ_BINS);
    for (let bin = 0; bin < N_FREQ_BINS; bin++) {
        const r = real[bin];
        const q = imag[bin];
        spectrum[bin] = r * r + q * q;
    }

    // Project onto each mel band, then take the guarded natural log.
    const logMel = new Float32Array(nMels);
    for (let band = 0; band < nMels; band++) {
        const rowStart = band * N_FREQ_BINS;
        let energy = 0;
        for (let bin = 0; bin < N_FREQ_BINS; bin++) {
            energy += spectrum[bin] * melFilterbank[rowStart + bin];
        }
        logMel[band] = Math.log(energy + LOG_ZERO_GUARD);
    }
    return logMel;
}

/**
 * Standardize every mel band over time: subtract the band's mean and divide
 * by its Bessel-corrected standard deviation plus 1e-5.
 *
 * When `T` is 1 the scale factor is 0, so every output value of a finite
 * input is 0. The input array is not modified.
 * @param features Flat array [nMels × T], mel-major layout
 * @param nMels Number of mel bins
 * @param T Number of time frames
 * @returns Normalized features (new array, same length as `features`)
 */
export function normalizeMelFeatures(features: Float32Array, nMels: number, T: number): Float32Array {
    const normalized = new Float32Array(features.length);

    for (let band = 0; band < nMels; band++) {
        const rowStart = band * T;
        const rowEnd = rowStart + T;

        // Mean of this band across all frames.
        let total = 0;
        for (let i = rowStart; i < rowEnd; i++) {
            total += features[i];
        }
        const mean = total / T;

        // Sum of squared deviations from the mean.
        let squaredDev = 0;
        for (let i = rowStart; i < rowEnd; i++) {
            const dev = features[i] - mean;
            squaredDev += dev * dev;
        }

        // Sample standard deviation (divide by T - 1); undefined for a single frame.
        let scale = 0;
        if (T > 1) {
            scale = 1.0 / (Math.sqrt(squaredDev / (T - 1)) + 1e-5);
        }

        for (let i = rowStart; i < rowEnd; i++) {
            normalized[i] = (features[i] - mean) * scale;
        }
    }

    return normalized;
}

/**
 * Translate a sample offset into a frame index by dividing by HOP_LENGTH
 * and rounding down.
 * @param sampleOffset Offset in samples
 * @returns Frame index
 */
export function sampleToFrame(sampleOffset: number): number {
    const { HOP_LENGTH } = MEL_CONSTANTS;
    const framePosition = sampleOffset / HOP_LENGTH;
    return Math.floor(framePosition);
}