Spaces:

KevinAHM
/

vox-upscaler-web

Running

App Files Files Community

KevinAHM committed 21 days ago

Commit

ea8d726

verified ·

1 Parent(s): 59b2bdd

Fix deterministic audio preprocessing on Windows

Browse files

Files changed (1) hide show

index.html +156 -22

index.html CHANGED Viewed

@@ -367,6 +367,10 @@ function handleFile(file) {
 // Keep the process button disabled until both a file buffer and a model session exist.
 function updateBtn() {
   const ready = Boolean(fileBuffer && session);
   processBtn.disabled = !ready;
 }
 // -- Detect backend & load model --
 async function init() {
   // Detect WebGPU and patch device creation to raise storage buffer limits
@@ -411,7 +415,12 @@ async function init() {
   const ep = backend === 'webgpu' ? 'webgpu' : 'wasm';
   const opts = { executionProviders: [ep] };
-  if (ep === 'wasm') {
     opts.executionProviders = [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }];
   }
@@ -445,25 +454,148 @@ async function init() {
   updateBtn();
 }
 // -- Decode audio to mono 16kHz Float32 --
 async function decodeToMono16k(arrayBuffer) {
-  const audioCtx = new OfflineAudioContext(1, 1, INPUT_SR);
-  const decoded = await audioCtx.decodeAudioData(arrayBuffer.slice(0));
-  const origSr = decoded.sampleRate;
-  const origData = decoded.getChannelData(0);
-  // Resample to 16kHz
-  const ratio = INPUT_SR / origSr;
-  const outLen = Math.round(origData.length * ratio);
-  const ctx2 = new OfflineAudioContext(1, outLen, INPUT_SR);
-  const src = ctx2.createBufferSource();
-  const buf = ctx2.createBuffer(1, origData.length, origSr);
-  buf.getChannelData(0).set(origData);
-  src.buffer = buf;
-  src.connect(ctx2.destination);
-  src.start();
-  const rendered = await ctx2.startRendering();
-  return rendered.getChannelData(0);
 }
 // -- Process --
@@ -480,7 +612,7 @@ processBtn.addEventListener('click', async () => {
   const totalSamples = audio16k.length;
   const audioDuration = totalSamples / INPUT_SR;
-  // Chunk sizing: CPU=1000ms, GPU=30s
   const chunkMs = backend === 'webgpu' ? 5000 : 1000;
   const chunkHops = Math.max(1, Math.floor(chunkMs / 1000 * INPUT_SR / HOP));
   const chunkSamples = chunkHops * HOP;
@@ -522,8 +654,10 @@ processBtn.addEventListener('click', async () => {
       state_in: stateTensor,
     });
-    outputs.push(new Float32Array(result.audio_out.data));
-    state = new Float32Array(result.state_out.data);
     chunkIdx++;
     const pct = Math.round(chunkIdx / numChunks * 100);
@@ -595,7 +729,7 @@ function encodeWav(samples, sr) {
 // Load ORT and init
 const script = document.createElement('script');
-script.src = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/ort.min.js';
 script.crossOrigin = 'anonymous';
 script.onload = () => {
   ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;

 // Keep the process button disabled until both a file buffer and a model session exist.
 function updateBtn() {
   const ready = Boolean(fileBuffer && session);
   processBtn.disabled = !ready;
 }
+async function readTensorData(tensor) { // Resolve a tensor's values regardless of which accessor the tensor exposes.
+  return typeof tensor.getData === 'function' ? await tensor.getData() : tensor.data; // prefer the async getData() when it exists, else fall back to the .data property
+}
 // -- Detect backend & load model --
 async function init() {
   // Detect WebGPU and patch device creation to raise storage buffer limits
   const ep = backend === 'webgpu' ? 'webgpu' : 'wasm';
   const opts = { executionProviders: [ep] };
+  if (ep === 'webgpu') {
+    opts.preferredOutputLocation = {
+      audio_out: 'cpu',
+      state_out: 'cpu',
+    };
+  } else {
     opts.executionProviders = [{ name: 'wasm', options: { numThreads: navigator.hardwareConcurrency || 4 } }];
   }
   updateBtn();
 }
+function mixToMono(audioBuffer) { // Average every channel of audioBuffer into one new Float32Array of the same length.
+  const len = audioBuffer.length;
+  const mono = new Float32Array(len);
+  for (let ch = 0; ch < audioBuffer.numberOfChannels; ch++) { // pass 1: accumulate the per-sample sum over all channels
+    const data = audioBuffer.getChannelData(ch);
+    for (let i = 0; i < len; i++) mono[i] += data[i];
+  }
+  const gain = 1 / audioBuffer.numberOfChannels; // pass 2: scale the sum down to the mean
+  for (let i = 0; i < len; i++) mono[i] *= gain;
+  return mono;
+}
+function readFourCc(view, offset) { // Read the 4 bytes starting at offset in a DataView and return them as a 4-character string.
+  return String.fromCharCode( // each byte becomes one character code
+    view.getUint8(offset),
+    view.getUint8(offset + 1),
+    view.getUint8(offset + 2),
+    view.getUint8(offset + 3)
+  );
+}
/**
 * Parse an uncompressed RIFF/WAVE file directly from its bytes and average
 * all channels into a single mono track, without going through Web Audio.
 *
 * Supported sample encodings: integer PCM (8/16/24/32 bit) and IEEE float
 * (32/64 bit), either as plain format tags or via WAVE_FORMAT_EXTENSIBLE.
 * Anything else yields null so the caller can fall back to another decoder
 * instead of receiving silence.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw file contents.
 * @returns {{mono: Float32Array, sampleRate: number, channels: number, source: string} | null}
 *   Decoded audio, or null when the buffer is not a WAV file this parser can
 *   read (bad magic, missing/short fmt chunk, empty data, inconsistent header,
 *   unsupported encoding).
 */
function decodeWavToMono(arrayBuffer) {
  // 44 bytes is the size of the smallest canonical RIFF + fmt + data header.
  if (arrayBuffer.byteLength < 44) return null;
  const view = new DataView(arrayBuffer);
  if (readFourCc(view, 0) !== 'RIFF' || readFourCc(view, 8) !== 'WAVE') return null;

  let offset = 12;
  let fmt = null;
  let dataOffset = 0;
  let dataSize = 0;
  while (offset + 8 <= view.byteLength) {
    const id = readFourCc(view, offset);
    const size = view.getUint32(offset + 4, true);
    const chunkStart = offset + 8;
    if (id === 'fmt ') {
      // The 16 mandatory fmt bytes must really be inside the buffer; otherwise
      // the field reads below would throw instead of rejecting the file.
      if (size < 16 || chunkStart + size > view.byteLength) return null;
      const format = view.getUint16(chunkStart, true);
      fmt = {
        // WAVE_FORMAT_EXTENSIBLE keeps the real format tag in the first two
        // bytes of the SubFormat GUID, 24 bytes into the chunk.
        format: format === 0xfffe && size >= 40 ? view.getUint16(chunkStart + 24, true) : format,
        channels: view.getUint16(chunkStart + 2, true),
        sampleRate: view.getUint32(chunkStart + 4, true),
        blockAlign: view.getUint16(chunkStart + 12, true),
        bitsPerSample: view.getUint16(chunkStart + 14, true),
      };
    } else if (id === 'data') {
      dataOffset = chunkStart;
      // Streamed or truncated files declare more data than they contain
      // (often 0xFFFFFFFF); never trust the header beyond the real buffer.
      dataSize = Math.min(size, view.byteLength - chunkStart);
      break;
    }
    // Chunks are word-aligned: an odd-sized chunk is followed by a pad byte.
    offset = chunkStart + size + (size % 2);
  }
  if (!fmt || !dataOffset || !dataSize) return null;

  const { format, channels, sampleRate, blockAlign, bitsPerSample } = fmt;
  if (format !== 1 && format !== 3) return null;
  const bytesPerSample = bitsPerSample / 8;
  if (!Number.isInteger(bytesPerSample) || bytesPerSample < 1) return null;
  // A zero channel count or sample rate, or a frame too small to hold every
  // channel, would cause division by zero or reads across frame boundaries.
  if (channels < 1 || sampleRate < 1 || blockAlign < channels * bytesPerSample) return null;

  // Pick the sample reader once; an unknown encoding rejects the file rather
  // than decoding to all-zero audio.
  let readSample = null;
  if (format === 3) {
    if (bitsPerSample === 32) {
      readSample = (pos) => view.getFloat32(pos, true);
    } else if (bitsPerSample === 64) {
      readSample = (pos) => view.getFloat64(pos, true);
    }
  } else if (bitsPerSample === 8) {
    // 8-bit PCM is unsigned with its midpoint at 128.
    readSample = (pos) => (view.getUint8(pos) - 128) / 128;
  } else if (bitsPerSample === 16) {
    readSample = (pos) => view.getInt16(pos, true) / 32768;
  } else if (bitsPerSample === 24) {
    readSample = (pos) => {
      const raw = view.getUint8(pos) | (view.getUint8(pos + 1) << 8) | (view.getUint8(pos + 2) << 16);
      // Shift up to the int32 sign bit and back down to sign-extend 24 bits.
      return ((raw << 8) >> 8) / 8388608;
    };
  } else if (bitsPerSample === 32) {
    readSample = (pos) => view.getInt32(pos, true) / 2147483648;
  }
  if (!readSample) return null;

  // Only whole frames are decoded; a trailing partial frame is ignored.
  const frames = Math.floor(dataSize / blockAlign);
  const mono = new Float32Array(frames);
  for (let frame = 0; frame < frames; frame++) {
    const frameOffset = dataOffset + frame * blockAlign;
    let sum = 0;
    for (let ch = 0; ch < channels; ch++) {
      sum += readSample(frameOffset + ch * bytesPerSample);
    }
    mono[frame] = sum / channels;
  }
  return {
    mono,
    sampleRate,
    channels,
    source: 'wav',
  };
}
+function sinc(x) { // Normalized sinc: sin(pi * x) / (pi * x).
+  if (Math.abs(x) < 1e-8) return 1; // near the origin the quotient is 0/0; return the limit value 1 instead
+  const pix = Math.PI * x;
+  return Math.sin(pix) / pix;
+}
+function resampleSinc(input, inSr, outSr) { // Resample `input` from rate inSr to rate outSr with a windowed-sinc kernel; returns a new Float32Array.
+  if (inSr === outSr) return new Float32Array(input); // same rate: just hand back a copy
+  const outLen = Math.round(input.length * outSr / inSr);
+  const output = new Float32Array(outLen);
+  const ratio = inSr / outSr; // input samples advanced per output sample
+  const cutoff = Math.min(1, outSr / inSr) * 0.95; // kernel scale: shrinks below 1 when downsampling; NOTE(review): the 0.95 factor presumably leaves margin below Nyquist - confirm
+  const radius = 12; // kernel half-width in cutoff-scaled units (where the window term reaches its first zero)
+  const support = radius / cutoff; // the same half-width expressed in input samples
+  for (let i = 0; i < outLen; i++) {
+    const center = i * ratio; // fractional input position of output sample i
+    const left = Math.max(0, Math.ceil(center - support)); // clamp the tap range to the start of the input
+    const right = Math.min(input.length - 1, Math.floor(center + support)); // ...and to its end
+    let sum = 0;
+    let weightSum = 0;
+    for (let j = left; j <= right; j++) {
+      const x = (center - j) * cutoff;
+      const weight = sinc(x) * sinc(x / radius); // sinc kernel tapered by a wider sinc (Lanczos-style window)
+      sum += input[j] * weight;
+      weightSum += weight;
+    }
+    output[i] = weightSum ? sum / weightSum : 0; // normalize by the weights actually used (the kernel is truncated at the edges); emit 0 if they sum to zero
+  }
+  return output;
+}
 // -- Decode audio to mono 16kHz Float32 --
 /**
  * Decode an audio file to mono Float32 samples at INPUT_SR.
  *
  * The bytes are first handed to decodeWavToMono; only when that returns null
  * is the browser's Web Audio decoder used, with its channels averaged by
  * mixToMono. Either way the final rate conversion is done by resampleSinc.
  *
  * @param {ArrayBuffer} arrayBuffer - Raw contents of the audio file.
  * @returns {Promise<Float32Array>} Mono samples resampled to INPUT_SR.
  * @throws Whatever decodeAudioData rejects with when the fallback decoder
  *   cannot read the data.
  */
 async function decodeToMono16k(arrayBuffer) {
   let decodedAudio = decodeWavToMono(arrayBuffer);
   if (!decodedAudio) {
     const AudioCtx = window.AudioContext || window.webkitAudioContext;
     const audioCtx = new AudioCtx();
     let decoded;
     try {
       // decodeAudioData detaches the buffer it receives, so pass it a copy
       // and leave the caller's ArrayBuffer usable.
       decoded = await audioCtx.decodeAudioData(arrayBuffer.slice(0));
     } finally {
       // Close the context even when decoding fails; browsers cap the number
       // of live AudioContexts, so a leaked one per bad file adds up.
       await audioCtx.close();
     }
     decodedAudio = {
       mono: mixToMono(decoded),
       sampleRate: decoded.sampleRate,
       channels: decoded.numberOfChannels,
       source: 'webaudio',
     };
   }
   return resampleSinc(decodedAudio.mono, decodedAudio.sampleRate, INPUT_SR);
 }
 // -- Process --
   const totalSamples = audio16k.length;
   const audioDuration = totalSamples / INPUT_SR;
+  // Chunk sizing: CPU=1000ms, GPU=5000ms
   const chunkMs = backend === 'webgpu' ? 5000 : 1000;
   const chunkHops = Math.max(1, Math.floor(chunkMs / 1000 * INPUT_SR / HOP));
   const chunkSamples = chunkHops * HOP;
       state_in: stateTensor,
     });
+    const audioOut = await readTensorData(result.audio_out);
+    const stateOut = await readTensorData(result.state_out);
+    outputs.push(new Float32Array(audioOut));
+    state = new Float32Array(stateOut);
     chunkIdx++;
     const pct = Math.round(chunkIdx / numChunks * 100);
 // Load ORT and init
 const script = document.createElement('script');
+script.src = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.22.0/dist/ort.webgpu.min.js';
 script.crossOrigin = 'anonymous';
 script.onload = () => {
   ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;