rjzevallos committed on
Commit
0eb9991
·
1 Parent(s): 3e8e10c

Refactor: Use native Gradio audio component for reliable microphone capture

Browse files
Files changed (1) hide show
  1. app.py +103 -195
app.py CHANGED
@@ -2,10 +2,10 @@ import asyncio
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
5
- from fastapi.staticfiles import StaticFiles
6
  import gradio as gr
7
- import os
8
- import tempfile
 
9
 
10
  import server_wrapper
11
 
@@ -14,6 +14,9 @@ logger = logging.getLogger(__name__)
14
 
15
  app = FastAPI(title="SimulStreaming ASR")
16
 
 
 
 
17
 
18
  @app.on_event("startup")
19
  async def startup_event():
@@ -56,6 +59,8 @@ def _ensure_model_downloaded():
56
  async def api_reset():
57
  try:
58
  server_wrapper.reset()
 
 
59
  return JSONResponse({"status": "ok"})
60
  except Exception as e:
61
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
@@ -66,7 +71,9 @@ async def api_chunk(file: UploadFile = File(...)):
66
  try:
67
  raw = await file.read()
68
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
69
- return JSONResponse(out)
 
 
70
  except Exception as e:
71
  logger.error(f"Error processing chunk: {e}")
72
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
@@ -76,217 +83,117 @@ async def api_chunk(file: UploadFile = File(...)):
76
  async def api_finish():
77
  try:
78
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
79
- return JSONResponse(out)
 
 
 
80
  except Exception as e:
81
  logger.error(f"Error finishing: {e}")
82
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
83
 
84
 
85
- def create_ui():
86
- with gr.Blocks(title="Streaming ASR", theme=gr.themes.Soft()) as demo:
87
- gr.Markdown("""
88
- # 🎙️ Streaming ASR SimulWhisper
89
-
90
- Graba tu voz y verás la transcripción en tiempo real.
91
-
92
- **Instrucciones:**
93
- 1. Haz clic en **"Start Recording"**
94
- 2. Habla en el micrófono
95
- 3. Haz clic en **"Stop Recording"** cuando termines
96
- """)
97
 
98
- with gr.Row():
99
- start_btn = gr.Button("🔴 Start Recording", size="lg", scale=1)
100
- stop_btn = gr.Button("⏹️ Stop Recording", size="lg", scale=1)
101
- reset_btn = gr.Button("🔄 Reset", size="lg", scale=1)
102
 
103
- transcript_output = gr.Textbox(
104
- label="Transcription",
105
- lines=5,
106
- interactive=False,
107
- placeholder="Transcription will appear here..."
108
- )
109
 
110
- status_output = gr.Textbox(
111
- label="Status",
112
- lines=2,
113
- interactive=False,
114
- placeholder="Ready"
115
- )
116
 
117
- # JavaScript para captura de audio
118
- js_code = r"""
119
- <script>
120
- let mediaRecorder, audioCtx, source, processor;
121
- let recording = false;
122
- let transcriptDiv = null;
123
- let statusDiv = null;
124
-
125
- function to16BitPCM(float32Array) {
126
- const l = float32Array.length;
127
- const buffer = new ArrayBuffer(l * 2);
128
- const view = new DataView(buffer);
129
- let offset = 0;
130
- for (let i = 0; i < l; i++) {
131
- let s = Math.max(-1, Math.min(1, float32Array[i]));
132
- view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
133
- offset += 2;
134
- }
135
- return buffer;
136
- }
137
-
138
- function writeWAV(samples, sampleRate) {
139
- const buffer = new ArrayBuffer(44 + samples.byteLength);
140
- const view = new DataView(buffer);
141
- function writeString(view, offset, string) {
142
- for (let i = 0; i < string.length; i++) {
143
- view.setUint8(offset + i, string.charCodeAt(i));
144
- }
145
- }
146
- writeString(view, 0, 'RIFF');
147
- view.setUint32(4, 36 + samples.byteLength, true);
148
- writeString(view, 8, 'WAVE');
149
- writeString(view, 12, 'fmt ');
150
- view.setUint32(16, 16, true);
151
- view.setUint16(20, 1, true);
152
- view.setUint16(22, 1, true);
153
- view.setUint32(24, sampleRate, true);
154
- view.setUint32(28, sampleRate * 2, true);
155
- view.setUint16(32, 2, true);
156
- view.setUint16(34, 16, true);
157
- writeString(view, 36, 'data');
158
- view.setUint32(40, samples.byteLength, true);
159
- const bytes = new Uint8Array(buffer, 44);
160
- bytes.set(new Uint8Array(samples));
161
- return buffer;
162
- }
163
-
164
- async function resampleAudio(float32Array, fromSampleRate, toSampleRate) {
165
- if (fromSampleRate === toSampleRate) {
166
- return float32Array;
167
- }
168
- const offlineCtx = new OfflineAudioContext(1, Math.round(float32Array.length * toSampleRate / fromSampleRate), toSampleRate);
169
- const buffer = offlineCtx.createBuffer(1, float32Array.length, fromSampleRate);
170
- buffer.copyToChannel(float32Array, 0, 0);
171
- const src = offlineCtx.createBufferSource();
172
- src.buffer = buffer;
173
- src.connect(offlineCtx.destination);
174
- src.start(0);
175
- const rendered = await offlineCtx.startRendering();
176
- return rendered.getChannelData(0);
177
- }
178
-
179
- async function sendChunk(float32Array, sampleRate) {
180
- try {
181
- let resampled = await resampleAudio(float32Array, sampleRate, 16000);
182
- const pcm16 = to16BitPCM(resampled);
183
- const wav = writeWAV(pcm16, 16000);
184
- const blob = new Blob([wav], { type: 'audio/wav' });
185
- const fd = new FormData();
186
- fd.append('file', blob, 'chunk.wav');
187
 
188
- const resp = await fetch('/api/chunk', { method: 'POST', body: fd });
189
- if (!resp.ok) {
190
- console.error('Chunk upload failed:', resp.status);
191
- return;
192
- }
193
- const j = await resp.json();
194
- if (j.text && transcriptDiv) {
195
- transcriptDiv.value = j.text;
196
- }
197
- } catch (e) {
198
- console.error('Error sending chunk:', e);
199
- }
200
- }
201
-
202
- async function startRecording() {
203
- try {
204
- if (recording) return;
205
 
206
- recording = true;
207
- if (statusDiv) statusDiv.value = "Recording... listening to audio";
 
208
 
209
- audioCtx = new (window.AudioContext || window.webkitAudioContext)();
210
- const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: false, noiseSuppression: false } });
211
- source = audioCtx.createMediaStreamSource(stream);
212
- processor = audioCtx.createScriptProcessor(4096, 1, 1);
213
 
214
- let chunkBuffer = [];
215
- let bufferLength = 0;
216
- const bufferThreshold = 16000 * 1; // 1 second of audio at 16kHz
217
 
218
- processor.onaudioprocess = function(e) {
219
- const ch = e.inputBuffer.getChannelData(0);
220
- for (let i = 0; i < ch.length; i++) {
221
- chunkBuffer.push(ch[i]);
222
- bufferLength++;
223
- }
224
- // Send chunks of ~1 second
225
- if (bufferLength >= bufferThreshold) {
226
- const chunk = new Float32Array(chunkBuffer);
227
- chunkBuffer = [];
228
- bufferLength = 0;
229
- sendChunk(chunk, audioCtx.sampleRate);
230
- }
231
- };
232
 
233
- source.connect(processor);
234
- processor.connect(audioCtx.destination);
235
- } catch (e) {
236
- console.error('Error starting recording:', e);
237
- recording = false;
238
- if (statusDiv) statusDiv.value = "Error: " + e.message;
239
- }
240
- }
241
 
242
- function stopRecording() {
243
- try {
244
- if (!recording) return;
245
-
246
- recording = false;
247
- if (statusDiv) statusDiv.value = "Stopping...";
248
 
249
- // Notify server
250
- fetch('/api/finish', { method: 'POST' }).then(() => {
251
- if (statusDiv) statusDiv.value = "Done";
252
- console.log('Recording finished');
253
- }).catch(e => console.error('Error finishing:', e));
254
 
255
- if (source && source.mediaStream) {
256
- const tracks = source.mediaStream.getTracks();
257
- tracks.forEach(t => t.stop());
258
- }
259
- if (processor) processor.disconnect();
260
- if (source) source.disconnect();
261
- } catch (e) {
262
- console.error('Error stopping recording:', e);
263
- }
264
- }
265
-
266
- document.addEventListener('DOMContentLoaded', () => {
267
- // Esperar a que Gradio cargue completamente
268
- setTimeout(() => {
269
- // Encontrar textboxes por label
270
- const textboxes = document.querySelectorAll('textarea');
271
- if (textboxes.length >= 2) {
272
- transcriptDiv = textboxes[0];
273
- statusDiv = textboxes[1];
274
- console.log('UI elements found');
275
- }
276
 
277
- // Encontrar botones
278
- const buttons = document.querySelectorAll('button');
279
- if (buttons.length >= 2) {
280
- buttons[0].addEventListener('click', startRecording);
281
- buttons[1].addEventListener('click', stopRecording);
282
- console.log('Button listeners attached');
283
- }
284
- }, 500);
285
- });
286
- </script>
287
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- gr.HTML(js_code)
 
 
 
 
 
290
 
291
  return demo
292
 
@@ -301,3 +208,4 @@ if __name__ == "__main__":
301
  import uvicorn
302
  uvicorn.run(app, host="0.0.0.0", port=7860)
303
 
 
 
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
 
5
  import gradio as gr
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import io
9
 
10
  import server_wrapper
11
 
 
14
 
15
  app = FastAPI(title="SimulStreaming ASR")
16
 
17
+ # Global state for streaming
18
+ _transcription_state = {"text": "", "final": False}
19
+
20
 
21
  @app.on_event("startup")
22
  async def startup_event():
 
59
  async def api_reset():
60
  try:
61
  server_wrapper.reset()
62
+ _transcription_state["text"] = ""
63
+ _transcription_state["final"] = False
64
  return JSONResponse({"status": "ok"})
65
  except Exception as e:
66
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
 
71
  try:
72
  raw = await file.read()
73
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
74
+ if out and out.get("text"):
75
+ _transcription_state["text"] = out["text"]
76
+ return JSONResponse(out or {})
77
  except Exception as e:
78
  logger.error(f"Error processing chunk: {e}")
79
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
 
83
  async def api_finish():
84
  try:
85
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
86
+ if out and out.get("text"):
87
+ _transcription_state["text"] = out["text"]
88
+ _transcription_state["final"] = True
89
+ return JSONResponse(out or {})
90
  except Exception as e:
91
  logger.error(f"Error finishing: {e}")
92
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
93
 
94
 
95
def process_audio(audio_data):
    """
    Transcribe a recording captured by the Gradio audio component.

    Args:
        audio_data: Tuple of (sample_rate, audio_array) as produced by
            gr.Audio(type="numpy"), or None if nothing was recorded.

    Returns:
        The transcription text, or a human-readable status/error message.
    """
    if audio_data is None:
        return "Please record audio first."

    try:
        sample_rate, audio_array = audio_data

        # Gradio's numpy audio is integer PCM (usually int16). Normalize to
        # float32 in [-1, 1]; a bare astype(np.float32) would keep the raw
        # integer range (±32768) and produce garbage transcriptions.
        if np.issubdtype(audio_array.dtype, np.integer):
            peak = float(np.iinfo(audio_array.dtype).max)
            audio_array = audio_array.astype(np.float32) / peak
        elif audio_array.dtype != np.float32:
            audio_array = audio_array.astype(np.float32)

        # Downmix stereo (samples, channels) to mono.
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # The ASR backend expects 16 kHz input; resample if needed.
        if sample_rate != 16000:
            import librosa
            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
            sample_rate = 16000

        # Serialize to an in-memory WAV for the server wrapper.
        bio = io.BytesIO()
        sf.write(bio, audio_array, sample_rate, format='WAV')
        wav_bytes = bio.getvalue()

        logger.info(f"Processing audio: {len(wav_bytes)} bytes, {sample_rate}Hz")

        # Each recording is transcribed independently: reset the streaming
        # state, feed the whole clip, then flush the decoder.
        server_wrapper.reset()
        _transcription_state["text"] = ""

        # Process the chunk
        result = server_wrapper.process_chunk_from_bytes(wav_bytes)

        # Finish processing
        final_result = server_wrapper.finish()

        # Prefer the flushed (final) hypothesis over the partial one.
        if final_result and final_result.get("text"):
            return final_result["text"]
        elif result and result.get("text"):
            return result["text"]
        else:
            return "No transcription available"

    except Exception as e:
        logger.error(f"Error processing audio: {e}")
        return f"Error: {str(e)}"
 
 
 
 
 
147
 
148
+
149
def create_ui():
    """Build the Gradio interface: a microphone recorder wired to process_audio."""
    with gr.Blocks(title="Streaming ASR", theme=gr.themes.Soft()) as ui:
        gr.Markdown("""
        # 🎙️ Streaming ASR — SimulWhisper

        Graba tu voz y verás la transcripción en tiempo real.

        **Instrucciones:**
        1. Haz clic en el botón de **Record** (rojo)
        2. Habla en el micrófono
        3. Haz clic en el botón de **Stop** (cuadrado) cuando termines
        4. Verás la transcripción automáticamente
        """)

        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎤 Record Audio")
                audio_input = gr.Audio(
                    label="Record your voice",
                    type="numpy",
                    sources=["microphone"],
                )

            with gr.Column():
                gr.Markdown("### 📝 Transcription")
                transcript_output = gr.Textbox(
                    label="Transcription Result",
                    lines=8,
                    interactive=False,
                    placeholder="Transcription will appear here..."
                )

        # Explicit trigger for transcription.
        process_btn = gr.Button("🚀 Transcribe", size="lg", variant="primary")

        # Transcribe either on button press...
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[transcript_output]
        )

        # ...or automatically whenever a new recording/upload lands.
        audio_input.change(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[transcript_output]
        )

    return ui
199
 
 
208
  import uvicorn
209
  uvicorn.run(app, host="0.0.0.0", port=7860)
210
 
211
+