|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Default mel-spectrogram front-end settings.
 *
 * Frozen so the shared defaults cannot be modified by accident; `melConfig`
 * below starts as a mutable shallow copy of this object.
 */
const DEFAULT_MEL_CONFIG = Object.freeze({
  // Sample rate (Hz) that input audio is resampled to before analysis.
  sample_rate: 16000,
  // FFT size; every analysis frame is this many samples long.
  n_fft: 512,
  // Length of the Hann window, which is centered inside the FFT frame.
  win_length: 400,
  // Step between the starts of successive frames, in samples.
  hop_length: 160,
  // Number of mel bands produced per frame.
  n_mels: 128,
  // Lower and upper edge (Hz) handed to the mel filterbank builder.
  fmin: 0,
  fmax: 8000,
  // Pre-emphasis coefficient: y[i] = x[i] - preemph * x[i - 1].
  preemph: 0.97,
  // Floor applied to mel energies before taking the natural log (2^-24).
  log_zero_guard: 5.960464477539063e-08,
  // 'per_feature' enables per-mel-band mean/variance normalization over time.
  normalize: 'per_feature',
  // NOTE(review): not read by any code in this file - confirm whether it is
  // meant to select the filterbank normalization.
  mel_norm: 'slaney',
});

/** Active configuration; reassigned by loadMelConfig. */
let melConfig = { ...DEFAULT_MEL_CONFIG };

/** Mel filterbank for `melConfig`; null until it is first built. */
let melFilterbank = null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Loads `${modelPath}/onnx/mel_config.json` and rebuilds the mel filterbank.
 *
 * Fields from the fetched file are layered on top of DEFAULT_MEL_CONFIG, so a
 * partial file cannot leave a required setting undefined. If the request
 * fails, returns a non-OK status, or yields something other than a JSON
 * object, the configuration is reset to the defaults (matching the logged
 * "using defaults" message) instead of keeping a previously loaded model's
 * settings.
 *
 * @param {string} modelPath - Base URL/path of the model directory.
 * @returns {Promise<void>} Settles after `melConfig` and `melFilterbank` are
 *   updated. Fetch and parse failures are logged, not thrown.
 */
export async function loadMelConfig(modelPath) {
  let overrides = null;

  try {
    const response = await fetch(`${modelPath}/onnx/mel_config.json`, {
      mode: 'cors',
      credentials: 'omit',
    });
    if (response.ok) {
      overrides = await response.json();
    } else {
      console.warn(`mel_config.json request failed (HTTP ${response.status}), using defaults`);
    }
  } catch (e) {
    // Keep the failure reason visible instead of discarding it.
    console.warn('Could not load mel_config.json, using defaults', e);
  }

  const hasOverrides =
    overrides !== null && typeof overrides === 'object' && !Array.isArray(overrides);

  melConfig = hasOverrides
    ? { ...DEFAULT_MEL_CONFIG, ...overrides }
    : { ...DEFAULT_MEL_CONFIG };

  if (hasOverrides) {
    console.log('Loaded mel config:', melConfig);
  }

  // Always rebuild: the filterbank depends on the (possibly new) settings.
  melFilterbank = createMelFilterbank(
    melConfig.sample_rate,
    melConfig.n_fft,
    melConfig.n_mels,
    melConfig.fmin,
    melConfig.fmax
  );
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Builds a bank of triangular mel filters over the bins of a real FFT.
 *
 * Filter weights are evaluated at the exact center frequency of every FFT bin
 * (k * sr / nFft) rather than by flooring band edges to bin indices. Flooring
 * collapses neighbouring band edges onto the same bin when bands are narrower
 * than one bin, which leaves the lowest filters entirely zero (e.g. 128 bands
 * on a 512-point FFT at 16 kHz). Each filter is scaled by
 * 2 / (upperEdgeHz - lowerEdgeHz) so all bands have comparable area.
 *
 * By default the Slaney mel scale is used (linear below 1 kHz, logarithmic
 * above); pass `htk = true` for the 2595 * log10(1 + f / 700) scale.
 *
 * @param {number} sr - Sample rate in Hz.
 * @param {number} nFft - FFT size.
 * @param {number} nMels - Number of mel bands.
 * @param {number} fmin - Lowest band edge in Hz.
 * @param {number} fmax - Highest band edge in Hz.
 * @param {boolean} [htk=false] - Use the HTK mel scale instead of Slaney's.
 * @returns {Float32Array[]} `nMels` filters, each of length floor(nFft / 2) + 1.
 */
function createMelFilterbank(sr, nFft, nMels, fmin, fmax, htk = false) {
  const nFreqs = Math.floor(nFft / 2) + 1;

  // Slaney scale: 200/3 Hz per mel up to 1 kHz, then 27 mels per factor 6.4.
  const LINEAR_HZ_PER_MEL = 200 / 3;
  const LOG_START_HZ = 1000;
  const LOG_START_MEL = LOG_START_HZ / LINEAR_HZ_PER_MEL;
  const LOG_STEP = Math.log(6.4) / 27;

  const hzToMel = htk
    ? (hz) => 2595 * Math.log10(1 + hz / 700)
    : (hz) =>
        hz >= LOG_START_HZ
          ? LOG_START_MEL + Math.log(hz / LOG_START_HZ) / LOG_STEP
          : hz / LINEAR_HZ_PER_MEL;

  const melToHz = htk
    ? (mel) => 700 * (Math.pow(10, mel / 2595) - 1)
    : (mel) =>
        mel >= LOG_START_MEL
          ? LOG_START_HZ * Math.exp(LOG_STEP * (mel - LOG_START_MEL))
          : LINEAR_HZ_PER_MEL * mel;

  // nMels + 2 band edges, equally spaced on the mel axis, kept in double
  // precision so narrow low-frequency bands are not distorted by rounding.
  const melMin = hzToMel(fmin);
  const melMax = hzToMel(fmax);
  const edgesHz = new Float64Array(nMels + 2);
  for (let i = 0; i < nMels + 2; i++) {
    edgesHz[i] = melToHz(melMin + ((melMax - melMin) * i) / (nMels + 1));
  }

  const binWidthHz = sr / nFft;
  const filterbank = [];

  for (let m = 0; m < nMels; m++) {
    const filter = new Float32Array(nFreqs);
    const lowHz = edgesHz[m];
    const centerHz = edgesHz[m + 1];
    const highHz = edgesHz[m + 2];
    const riseWidth = centerHz - lowHz;
    const fallWidth = highHz - centerHz;

    // Degenerate bands (fmax <= fmin, or non-finite edges) stay all-zero
    // instead of producing NaN/Infinity weights from a zero-width slope.
    if (riseWidth > 0 && fallWidth > 0) {
      const areaNorm = 2.0 / (highHz - lowHz);
      for (let k = 0; k < nFreqs; k++) {
        const hz = k * binWidthHz;
        // Triangle = lower of the rising and falling ramps, clipped at 0.
        const weight = Math.min((hz - lowHz) / riseWidth, (highHz - hz) / fallWidth);
        if (weight > 0) {
          filter[k] = weight * areaNorm;
        }
      }
    }

    filterbank.push(filter);
  }

  return filterbank;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Creates a symmetric Hann window: w[i] = 0.5 * (1 - cos(2*pi*i / (length - 1))).
 *
 * @param {number} length - Number of window samples.
 * @returns {Float32Array} Window of `length` samples. A one-sample window is
 *   [1]; the general formula would divide 0 by 0 there and yield NaN.
 */
function createHannWindow(length) {
  const samples = new Float32Array(length);

  if (length === 1) {
    samples[0] = 1;
    return samples;
  }

  for (let i = 0; i < length; i++) {
    samples[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / (length - 1)));
  }
  return samples;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Converts audio from one sample rate to another by linear interpolation
 * between neighbouring input samples.
 *
 * @param {Float32Array} audio - Input samples.
 * @param {number} srcSr - Sample rate of `audio`.
 * @param {number} dstSr - Desired sample rate.
 * @returns {Float32Array} `audio` itself when the rates already match,
 *   otherwise a new array of floor(audio.length / (srcSr / dstSr)) samples.
 */
function resampleAudio(audio, srcSr, dstSr) {
  if (srcSr === dstSr) {
    return audio;
  }

  // Input samples advanced per output sample.
  const step = srcSr / dstSr;
  const outLength = Math.floor(audio.length / step);
  const output = new Float32Array(outLength);
  const lastInputIndex = audio.length - 1;

  let outPos = 0;
  while (outPos < outLength) {
    const position = outPos * step;
    const left = Math.floor(position);
    // Clamp so the final output samples reuse the last input sample.
    const right = Math.min(left + 1, lastInputIndex);
    const weight = position - left;
    output[outPos] = audio[left] * (1 - weight) + audio[right] * weight;
    outPos += 1;
  }

  return output;
}
|
|
|
|
|
|
|
|
/** Tables and scratch buffers for the most recently requested FFT size. */
let _fftCache = null;

/**
 * Returns the twiddle factors, bit-reversal permutation and work buffers for
 * a radix-2 FFT of size `n`, building them only when `n` differs from the
 * cached size.
 *
 * @param {number} n - FFT size; must be a positive power of two.
 * @returns {{n: number, twiddleRe: Float32Array, twiddleIm: Float32Array,
 *   bitrev: Uint32Array, workRe: Float32Array, workIm: Float32Array}}
 *   Cache object; `workRe`/`workIm` are reused across calls.
 * @throws {RangeError} If `n` is not a positive power of two. The tables
 *   below are only meaningful for such sizes, and any other size would
 *   silently produce a wrong spectrum.
 */
function initFFT(n) {
  if (_fftCache && _fftCache.n === n) return _fftCache;

  if (!Number.isInteger(n) || n < 1 || (n & (n - 1)) !== 0) {
    throw new RangeError(`FFT size must be a positive power of two, got ${n}`);
  }

  // Twiddle factors exp(-2*pi*i*k / n) for k in [0, n / 2).
  const half = n >> 1;
  const twiddleRe = new Float32Array(half);
  const twiddleIm = new Float32Array(half);
  for (let i = 0; i < half; i++) {
    const angle = -2 * Math.PI * i / n;
    twiddleRe[i] = Math.cos(angle);
    twiddleIm[i] = Math.sin(angle);
  }

  // bitrev[i] is i with its log2(n) low bits reversed.
  const bitrev = new Uint32Array(n);
  for (let i = 0; i < n; i++) {
    let reversed = 0;
    let remaining = i;
    for (let k = 1; k < n; k <<= 1) {
      reversed = (reversed << 1) | (remaining & 1);
      remaining >>= 1;
    }
    bitrev[i] = reversed;
  }

  const workRe = new Float32Array(n);
  const workIm = new Float32Array(n);

  _fftCache = { n, twiddleRe, twiddleIm, bitrev, workRe, workIm };
  return _fftCache;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Computes the magnitude spectrum of a real-valued frame with an in-place,
 * iterative radix-2 FFT that runs in the work buffers supplied by initFFT.
 *
 * @param {Float32Array} frame - Time-domain samples; its length is the FFT
 *   size handed to initFFT.
 * @returns {Float32Array} Magnitudes of bins 0 .. floor(n / 2), in a newly
 *   allocated array.
 */
function computeRfftMagnitude(frame) {
  const size = frame.length;
  const binCount = Math.floor(size / 2) + 1;
  const { twiddleRe, twiddleIm, bitrev, workRe, workIm } = initFFT(size);

  // Scatter the input into bit-reversed order; the imaginary part is zero.
  for (let src = 0; src < size; src++) {
    const dst = bitrev[src];
    workRe[dst] = frame[src];
    workIm[dst] = 0;
  }

  // Butterfly passes over spans of 2, 4, ..., size.
  for (let span = 2; span <= size; span <<= 1) {
    const halfSpan = span >> 1;
    const twiddleStride = size / span;

    for (let base = 0; base < size; base += span) {
      for (let j = 0; j < halfSpan; j++) {
        const twiddleIndex = j * twiddleStride;
        const wRe = twiddleRe[twiddleIndex];
        const wIm = twiddleIm[twiddleIndex];

        const lo = base + j;
        const hi = lo + halfSpan;

        const hiRe = workRe[hi];
        const hiIm = workIm[hi];
        const tRe = wRe * hiRe - wIm * hiIm;
        const tIm = wRe * hiIm + wIm * hiRe;

        const loRe = workRe[lo];
        const loIm = workIm[lo];
        workRe[hi] = loRe - tRe;
        workIm[hi] = loIm - tIm;
        workRe[lo] = loRe + tRe;
        workIm[lo] = loIm + tIm;
      }
    }
  }

  // Only the non-redundant half of the spectrum is returned.
  const magnitude = new Float32Array(binCount);
  for (let bin = 0; bin < binCount; bin++) {
    const re = workRe[bin];
    const im = workIm[bin];
    magnitude[bin] = Math.sqrt(re * re + im * im);
  }

  return magnitude;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Converts raw audio into log-mel features using the active `melConfig`.
 *
 * Steps: resample to the configured rate, apply pre-emphasis, zero-pad by
 * floor(n_fft / 2) samples on both sides, window each frame with a Hann
 * window centered in the FFT frame, take the power spectrum, project it
 * through the mel filterbank, take the floored natural log and, when
 * `normalize` is 'per_feature', normalize every mel band to zero mean and
 * unit variance over time.
 *
 * Assumes win_length <= n_fft; window samples that fall outside the FFT
 * frame are dropped.
 *
 * @param {Float32Array} audioData - Mono audio samples.
 * @param {number} sampleRate - Sample rate of `audioData` in Hz.
 * @returns {{melFeatures: Float32Array, numFrames: number}} `melFeatures`
 *   holds numFrames * n_mels values, frame-major (index = frame * n_mels + band).
 */
export function computeMelSpectrogram(audioData, sampleRate) {
  const {
    sample_rate: targetSr,
    n_fft: nFft,
    win_length: winLength,
    hop_length: hopLength,
    preemph,
    log_zero_guard: logZeroGuard,
    n_mels: nMels,
  } = melConfig;

  // Build the filterbank lazily if loadMelConfig has not done so yet.
  if (!melFilterbank) {
    melFilterbank = createMelFilterbank(targetSr, nFft, nMels, melConfig.fmin, melConfig.fmax);
  }

  const audio = resampleAudio(audioData, sampleRate, targetSr);

  // Pre-emphasis is written straight into the zero-padded buffer, avoiding an
  // intermediate copy. The first sample has no predecessor and is kept as is.
  const padAmount = Math.floor(nFft / 2);
  const audioPadded = new Float32Array(audio.length + 2 * padAmount);
  if (audio.length > 0) {
    audioPadded[padAmount] = audio[0];
  }
  for (let i = 1; i < audio.length; i++) {
    audioPadded[padAmount + i] = audio[i] - preemph * audio[i - 1];
  }

  const numFrames = 1 + Math.floor((audioPadded.length - nFft) / hopLength);
  const nFreqs = Math.floor(nFft / 2) + 1;

  // Hann window of winLength samples, centered in an nFft-sample frame.
  const hannWindow = createHannWindow(winLength);
  const padLeft = Math.floor((nFft - winLength) / 2);
  const paddedWindow = new Float32Array(nFft);
  for (let i = 0; i < winLength; i++) {
    paddedWindow[padLeft + i] = hannWindow[i];
  }

  const melFeatures = new Float32Array(numFrames * nMels);

  // One frame buffer is reused: every element is overwritten per frame and
  // computeRfftMagnitude does not keep a reference to it.
  const frame = new Float32Array(nFft);

  for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) {
    const start = frameIdx * hopLength;
    for (let i = 0; i < nFft; i++) {
      frame[i] = audioPadded[start + i] * paddedWindow[i];
    }

    const magnitude = computeRfftMagnitude(frame);

    const rowOffset = frameIdx * nMels;
    for (let m = 0; m < nMels; m++) {
      const weights = melFilterbank[m];
      let melVal = 0;
      for (let k = 0; k < nFreqs; k++) {
        // Filter weight times power (squared magnitude).
        melVal += weights[k] * magnitude[k] * magnitude[k];
      }
      // The floor keeps log() finite for silent bands.
      melFeatures[rowOffset + m] = Math.log(Math.max(melVal, logZeroGuard));
    }
  }

  if (melConfig.normalize === 'per_feature') {
    for (let m = 0; m < nMels; m++) {
      let mean = 0;
      for (let t = 0; t < numFrames; t++) {
        mean += melFeatures[t * nMels + m];
      }
      mean /= numFrames;

      let sumSquares = 0;
      for (let t = 0; t < numFrames; t++) {
        const diff = melFeatures[t * nMels + m] - mean;
        sumSquares += diff * diff;
      }
      // The 1e-5 term keeps the divisor non-zero for constant bands.
      const std = Math.sqrt(sumSquares / numFrames + 1e-5);

      for (let t = 0; t < numFrames; t++) {
        const idx = t * nMels + m;
        melFeatures[idx] = (melFeatures[idx] - mean) / std;
      }
    }
  }

  return { melFeatures, numFrames };
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Decodes an audio file (or Blob) with the Web Audio API and downmixes it to
 * mono.
 *
 * @param {Blob} file - Anything exposing `arrayBuffer()` with encoded audio.
 * @returns {Promise<{audioData: Float32Array, sampleRate: number}>} A fresh
 *   mono sample array (the average of all channels) and the sample rate
 *   reported by the decoded buffer.
 */
export async function loadAudioFile(file) {
  const arrayBuffer = await file.arrayBuffer();
  const audioContext = new (window.AudioContext || window.webkitAudioContext)();

  try {
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    const channelCount = audioBuffer.numberOfChannels;

    let audioData;
    if (channelCount === 1) {
      // Copy so the result does not alias the decoder's channel storage.
      audioData = new Float32Array(audioBuffer.getChannelData(0));
    } else {
      // Average every channel, not just the first two, so surround and other
      // multi-channel sources do not lose the content of channels 2+.
      const channels = [];
      for (let c = 0; c < channelCount; c++) {
        channels.push(audioBuffer.getChannelData(c));
      }
      const first = channels[0];
      audioData = new Float32Array(first.length);
      for (let i = 0; i < first.length; i++) {
        let sum = first[i];
        for (let c = 1; c < channelCount; c++) {
          sum += channels[c][i];
        }
        audioData[i] = sum / channelCount;
      }
    }

    return {
      audioData,
      sampleRate: audioBuffer.sampleRate,
    };
  } finally {
    // Awaited so the context is released before the caller continues and the
    // promise returned by close() is not left floating.
    await audioContext.close();
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Records from the default microphone for up to `maxDurationMs` and decodes
 * the captured audio with loadAudioFile.
 *
 * The microphone tracks are stopped on every exit path, including when the
 * MediaRecorder cannot be constructed or started, so the capture device is
 * not left open after a failure.
 *
 * @param {number} [maxDurationMs=30000] - Time after which recording is
 *   stopped automatically.
 * @returns {Promise<*>} Resolves with whatever loadAudioFile returns for the
 *   recorded Blob. Rejects if microphone access, the recorder, or decoding
 *   fails.
 */
export async function recordAudio(maxDurationMs = 30000) {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const releaseStream = () => stream.getTracks().forEach((track) => track.stop());

  let mediaRecorder;
  try {
    mediaRecorder = new MediaRecorder(stream);
  } catch (err) {
    // Without this the microphone would stay open after the failure.
    releaseStream();
    throw err;
  }

  const chunks = [];

  return new Promise((resolve, reject) => {
    let stopTimer = null;

    mediaRecorder.ondataavailable = (e) => chunks.push(e.data);

    mediaRecorder.onstop = async () => {
      clearTimeout(stopTimer);
      releaseStream();
      // Label the Blob with the container the recorder actually produced;
      // fall back to the previous hard-coded type when it reports none.
      const blob = new Blob(chunks, { type: mediaRecorder.mimeType || 'audio/webm' });
      try {
        const result = await loadAudioFile(blob);
        resolve(result);
      } catch (e) {
        reject(e);
      }
    };

    mediaRecorder.onerror = (e) => {
      clearTimeout(stopTimer);
      releaseStream();
      reject(e);
    };

    try {
      mediaRecorder.start();
    } catch (err) {
      releaseStream();
      reject(err);
      return;
    }

    // Hard cap on the recording length.
    stopTimer = setTimeout(() => {
      if (mediaRecorder.state === 'recording') {
        mediaRecorder.stop();
      }
    }, maxDurationMs);
  });
}
|
|
|
|
|
// Exported as a live binding: loadMelConfig reassigns `melConfig`, and
// importers observe the new object on their next read.
export { melConfig };
|
|
|