rjzevallos committed on
Commit
0eb9991
·
1 Parent(s): 3e8e10c

Refactor: Use native Gradio audio component for reliable microphone capture

Browse files
Files changed (1) hide show
  1. app.py +103 -195
app.py CHANGED
@@ -2,10 +2,10 @@ import asyncio
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
5
- from fastapi.staticfiles import StaticFiles
6
  import gradio as gr
7
- import os
8
- import tempfile
 
9
 
10
  import server_wrapper
11
 
@@ -14,6 +14,9 @@ logger = logging.getLogger(__name__)
14
 
15
  app = FastAPI(title="SimulStreaming ASR")
16
 
 
 
 
17
 
18
  @app.on_event("startup")
19
  async def startup_event():
@@ -56,6 +59,8 @@ def _ensure_model_downloaded():
56
  async def api_reset():
57
  try:
58
  server_wrapper.reset()
 
 
59
  return JSONResponse({"status": "ok"})
60
  except Exception as e:
61
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
@@ -66,7 +71,9 @@ async def api_chunk(file: UploadFile = File(...)):
66
  try:
67
  raw = await file.read()
68
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
69
- return JSONResponse(out)
 
 
70
  except Exception as e:
71
  logger.error(f"Error processing chunk: {e}")
72
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
@@ -76,217 +83,117 @@ async def api_chunk(file: UploadFile = File(...)):
76
  async def api_finish():
77
  try:
78
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
79
- return JSONResponse(out)
 
 
 
80
  except Exception as e:
81
  logger.error(f"Error finishing: {e}")
82
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
83
 
84
 
85
- def create_ui():
86
- with gr.Blocks(title="Streaming ASR", theme=gr.themes.Soft()) as demo:
87
- gr.Markdown("""
88
- # 🎙️ Streaming ASR SimulWhisper
89
-
90
- Graba tu voz y verás la transcripción en tiempo real.
91
-
92
- **Instrucciones:**
93
- 1. Haz clic en **"Start Recording"**
94
- 2. Habla en el micrófono
95
- 3. Haz clic en **"Stop Recording"** cuando termines
96
- """)
97
 
98
- with gr.Row():
99
- start_btn = gr.Button("🔴 Start Recording", size="lg", scale=1)
100
- stop_btn = gr.Button("⏹️ Stop Recording", size="lg", scale=1)
101
- reset_btn = gr.Button("🔄 Reset", size="lg", scale=1)
102
 
103
- transcript_output = gr.Textbox(
104
- label="Transcription",
105
- lines=5,
106
- interactive=False,
107
- placeholder="Transcription will appear here..."
108
- )
109
 
110
- status_output = gr.Textbox(
111
- label="Status",
112
- lines=2,
113
- interactive=False,
114
- placeholder="Ready"
115
- )
116
 
117
- # JavaScript para captura de audio
118
- js_code = r"""
119
- <script>
120
- let mediaRecorder, audioCtx, source, processor;
121
- let recording = false;
122
- let transcriptDiv = null;
123
- let statusDiv = null;
124
-
125
- function to16BitPCM(float32Array) {
126
- const l = float32Array.length;
127
- const buffer = new ArrayBuffer(l * 2);
128
- const view = new DataView(buffer);
129
- let offset = 0;
130
- for (let i = 0; i < l; i++) {
131
- let s = Math.max(-1, Math.min(1, float32Array[i]));
132
- view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
133
- offset += 2;
134
- }
135
- return buffer;
136
- }
137
-
138
- function writeWAV(samples, sampleRate) {
139
- const buffer = new ArrayBuffer(44 + samples.byteLength);
140
- const view = new DataView(buffer);
141
- function writeString(view, offset, string) {
142
- for (let i = 0; i < string.length; i++) {
143
- view.setUint8(offset + i, string.charCodeAt(i));
144
- }
145
- }
146
- writeString(view, 0, 'RIFF');
147
- view.setUint32(4, 36 + samples.byteLength, true);
148
- writeString(view, 8, 'WAVE');
149
- writeString(view, 12, 'fmt ');
150
- view.setUint32(16, 16, true);
151
- view.setUint16(20, 1, true);
152
- view.setUint16(22, 1, true);
153
- view.setUint32(24, sampleRate, true);
154
- view.setUint32(28, sampleRate * 2, true);
155
- view.setUint16(32, 2, true);
156
- view.setUint16(34, 16, true);
157
- writeString(view, 36, 'data');
158
- view.setUint32(40, samples.byteLength, true);
159
- const bytes = new Uint8Array(buffer, 44);
160
- bytes.set(new Uint8Array(samples));
161
- return buffer;
162
- }
163
-
164
- async function resampleAudio(float32Array, fromSampleRate, toSampleRate) {
165
- if (fromSampleRate === toSampleRate) {
166
- return float32Array;
167
- }
168
- const offlineCtx = new OfflineAudioContext(1, Math.round(float32Array.length * toSampleRate / fromSampleRate), toSampleRate);
169
- const buffer = offlineCtx.createBuffer(1, float32Array.length, fromSampleRate);
170
- buffer.copyToChannel(float32Array, 0, 0);
171
- const src = offlineCtx.createBufferSource();
172
- src.buffer = buffer;
173
- src.connect(offlineCtx.destination);
174
- src.start(0);
175
- const rendered = await offlineCtx.startRendering();
176
- return rendered.getChannelData(0);
177
- }
178
-
179
- async function sendChunk(float32Array, sampleRate) {
180
- try {
181
- let resampled = await resampleAudio(float32Array, sampleRate, 16000);
182
- const pcm16 = to16BitPCM(resampled);
183
- const wav = writeWAV(pcm16, 16000);
184
- const blob = new Blob([wav], { type: 'audio/wav' });
185
- const fd = new FormData();
186
- fd.append('file', blob, 'chunk.wav');
187
 
188
- const resp = await fetch('/api/chunk', { method: 'POST', body: fd });
189
- if (!resp.ok) {
190
- console.error('Chunk upload failed:', resp.status);
191
- return;
192
- }
193
- const j = await resp.json();
194
- if (j.text && transcriptDiv) {
195
- transcriptDiv.value = j.text;
196
- }
197
- } catch (e) {
198
- console.error('Error sending chunk:', e);
199
- }
200
- }
201
-
202
- async function startRecording() {
203
- try {
204
- if (recording) return;
205
 
206
- recording = true;
207
- if (statusDiv) statusDiv.value = "Recording... listening to audio";
 
208
 
209
- audioCtx = new (window.AudioContext || window.webkitAudioContext)();
210
- const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: false, noiseSuppression: false } });
211
- source = audioCtx.createMediaStreamSource(stream);
212
- processor = audioCtx.createScriptProcessor(4096, 1, 1);
213
 
214
- let chunkBuffer = [];
215
- let bufferLength = 0;
216
- const bufferThreshold = 16000 * 1; // 1 second of audio at 16kHz
217
 
218
- processor.onaudioprocess = function(e) {
219
- const ch = e.inputBuffer.getChannelData(0);
220
- for (let i = 0; i < ch.length; i++) {
221
- chunkBuffer.push(ch[i]);
222
- bufferLength++;
223
- }
224
- // Send chunks of ~1 second
225
- if (bufferLength >= bufferThreshold) {
226
- const chunk = new Float32Array(chunkBuffer);
227
- chunkBuffer = [];
228
- bufferLength = 0;
229
- sendChunk(chunk, audioCtx.sampleRate);
230
- }
231
- };
232
 
233
- source.connect(processor);
234
- processor.connect(audioCtx.destination);
235
- } catch (e) {
236
- console.error('Error starting recording:', e);
237
- recording = false;
238
- if (statusDiv) statusDiv.value = "Error: " + e.message;
239
- }
240
- }
241
 
242
- function stopRecording() {
243
- try {
244
- if (!recording) return;
245
-
246
- recording = false;
247
- if (statusDiv) statusDiv.value = "Stopping...";
248
 
249
- // Notify server
250
- fetch('/api/finish', { method: 'POST' }).then(() => {
251
- if (statusDiv) statusDiv.value = "Done";
252
- console.log('Recording finished');
253
- }).catch(e => console.error('Error finishing:', e));
254
 
255
- if (source && source.mediaStream) {
256
- const tracks = source.mediaStream.getTracks();
257
- tracks.forEach(t => t.stop());
258
- }
259
- if (processor) processor.disconnect();
260
- if (source) source.disconnect();
261
- } catch (e) {
262
- console.error('Error stopping recording:', e);
263
- }
264
- }
265
-
266
- document.addEventListener('DOMContentLoaded', () => {
267
- // Esperar a que Gradio cargue completamente
268
- setTimeout(() => {
269
- // Encontrar textboxes por label
270
- const textboxes = document.querySelectorAll('textarea');
271
- if (textboxes.length >= 2) {
272
- transcriptDiv = textboxes[0];
273
- statusDiv = textboxes[1];
274
- console.log('UI elements found');
275
- }
276
 
277
- // Encontrar botones
278
- const buttons = document.querySelectorAll('button');
279
- if (buttons.length >= 2) {
280
- buttons[0].addEventListener('click', startRecording);
281
- buttons[1].addEventListener('click', stopRecording);
282
- console.log('Button listeners attached');
283
- }
284
- }, 500);
285
- });
286
- </script>
287
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- gr.HTML(js_code)
 
 
 
 
 
290
 
291
  return demo
292
 
@@ -301,3 +208,4 @@ if __name__ == "__main__":
301
  import uvicorn
302
  uvicorn.run(app, host="0.0.0.0", port=7860)
303
 
 
 
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
 
5
  import gradio as gr
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import io
9
 
10
  import server_wrapper
11
 
 
14
 
15
  app = FastAPI(title="SimulStreaming ASR")
16
 
17
+ # Global state for streaming
18
+ _transcription_state = {"text": "", "final": False}
19
+
20
 
21
  @app.on_event("startup")
22
  async def startup_event():
 
59
  async def api_reset():
60
  try:
61
  server_wrapper.reset()
62
+ _transcription_state["text"] = ""
63
+ _transcription_state["final"] = False
64
  return JSONResponse({"status": "ok"})
65
  except Exception as e:
66
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
 
71
  try:
72
  raw = await file.read()
73
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
74
+ if out and out.get("text"):
75
+ _transcription_state["text"] = out["text"]
76
+ return JSONResponse(out or {})
77
  except Exception as e:
78
  logger.error(f"Error processing chunk: {e}")
79
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
 
83
  async def api_finish():
84
  try:
85
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
86
+ if out and out.get("text"):
87
+ _transcription_state["text"] = out["text"]
88
+ _transcription_state["final"] = True
89
+ return JSONResponse(out or {})
90
  except Exception as e:
91
  logger.error(f"Error finishing: {e}")
92
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
93
 
94
 
95
def process_audio(audio_data):
    """
    Transcribe a recording captured by the Gradio audio component.

    Args:
        audio_data: Tuple of (sample_rate, audio_array) as produced by
            gr.Audio(type="numpy"), or None if nothing was recorded.

    Returns:
        The transcription text, or a human-readable status/error message.
    """
    if audio_data is None:
        return "Please record audio first."

    try:
        sample_rate, audio_array = audio_data

        # Gradio's numpy audio is integer PCM (usually int16). Normalize to
        # float32 in [-1, 1]; a bare astype(np.float32) would keep the raw
        # integer range (±32768) and produce garbage transcriptions.
        if np.issubdtype(audio_array.dtype, np.integer):
            peak = float(np.iinfo(audio_array.dtype).max)
            audio_array = audio_array.astype(np.float32) / peak
        elif audio_array.dtype != np.float32:
            audio_array = audio_array.astype(np.float32)

        # Downmix stereo (samples, channels) to mono.
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # The ASR backend expects 16 kHz input; resample if needed.
        if sample_rate != 16000:
            import librosa
            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        # Serialize to an in-memory WAV for the server wrapper.
        bio = io.BytesIO()
        sf.write(bio, audio_array, sample_rate, format='WAV')
        wav_bytes = bio.getvalue()

        logger.info(f"Processing audio: {len(wav_bytes)} bytes, {sample_rate}Hz")

        # Each recording is transcribed independently: reset the streaming
        # state, feed the whole clip, then flush the decoder.
        server_wrapper.reset()
        _transcription_state["text"] = ""

        # Process the chunk
        result = server_wrapper.process_chunk_from_bytes(wav_bytes)

        # Finish processing
        final_result = server_wrapper.finish()

        # Prefer the flushed (final) hypothesis over the partial one.
        if final_result and final_result.get("text"):
            return final_result["text"]
        elif result and result.get("text"):
            return result["text"]
        else:
            return "No transcription available"

    except Exception as e:
        logger.error(f"Error processing audio: {e}")
        return f"Error: {str(e)}"
 
 
 
 
 
147
 
148
+
149
def create_ui():
    """Build the Gradio interface: a microphone recorder wired to process_audio."""
    with gr.Blocks(title="Streaming ASR", theme=gr.themes.Soft()) as ui:
        gr.Markdown("""
        # 🎙️ Streaming ASR — SimulWhisper

        Graba tu voz y verás la transcripción en tiempo real.

        **Instrucciones:**
        1. Haz clic en el botón de **Record** (rojo)
        2. Habla en el micrófono
        3. Haz clic en el botón de **Stop** (cuadrado) cuando termines
        4. Verás la transcripción automáticamente
        """)

        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎤 Record Audio")
                audio_input = gr.Audio(
                    label="Record your voice",
                    type="numpy",
                    sources=["microphone"],
                )

            with gr.Column():
                gr.Markdown("### 📝 Transcription")
                transcript_output = gr.Textbox(
                    label="Transcription Result",
                    lines=8,
                    interactive=False,
                    placeholder="Transcription will appear here..."
                )

        # Explicit trigger for transcription.
        process_btn = gr.Button("🚀 Transcribe", size="lg", variant="primary")

        # Transcribe either on button press...
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[transcript_output]
        )

        # ...or automatically whenever a new recording/upload lands.
        audio_input.change(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[transcript_output]
        )

    return ui
199
 
 
208
  import uvicorn
209
  uvicorn.run(app, host="0.0.0.0", port=7860)
210
 
211
+