rjzevallos committed on
Commit
f8ce589
·
1 Parent(s): 3b07c9e

Improve UI: Fix audio recording with better button detection and feedback

Browse files
Files changed (1) hide show
  1. app.py +194 -103
app.py CHANGED
@@ -2,7 +2,10 @@ import asyncio
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
 
5
  import gradio as gr
 
 
6
 
7
  import server_wrapper
8
 
@@ -40,6 +43,7 @@ async def api_chunk(file: UploadFile = File(...)):
40
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
41
  return JSONResponse(out)
42
  except Exception as e:
 
43
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
44
 
45
 
@@ -49,139 +53,226 @@ async def api_finish():
49
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
50
  return JSONResponse(out)
51
  except Exception as e:
 
52
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
53
 
54
 
55
  def create_ui():
56
- with gr.Blocks() as demo:
57
- gr.Markdown("# Streaming ASR — SimulWhisper\nGraba tu voz y verás la transcripción incrementalmente.")
 
 
 
 
 
 
 
 
 
 
58
  with gr.Row():
59
- start_btn = gr.Button("Start Recording")
60
- stop_btn = gr.Button("Stop Recording")
61
- transcript = gr.HTML("<div id='transcript' style='white-space:pre-wrap; border:1px solid #ddd; padding:10px; min-height:120px'></div>")
62
- # include JS responsible for recording and sending chunks to /api/chunk
63
- js = r"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  <script>
65
- let mediaRecorder, audioCtx, source;
66
  let recording = false;
67
- let startBtn = null, stopBtn = null;
68
- function to16BitPCM(float32Array){
69
- const l = float32Array.length;
70
- const buffer = new ArrayBuffer(l*2);
71
- const view = new DataView(buffer);
72
- let offset=0;
73
- for(let i=0;i<l;i++){
74
- let s = Math.max(-1, Math.min(1, float32Array[i]));
75
- view.setInt16(offset, s<0 ? s*0x8000 : s*0x7FFF, true);
76
- offset += 2;
77
- }
78
- return buffer;
 
 
79
  }
80
 
81
- function writeWAV(samples, sampleRate){
82
- const buffer = new ArrayBuffer(44 + samples.byteLength);
83
- const view = new DataView(buffer);
84
- function writeString(view, offset, string){
85
- for(let i=0;i<string.length;i++) view.setUint8(offset+i, string.charCodeAt(i));
86
- }
87
- writeString(view,0,'RIFF');
88
- view.setUint32(4,36 + samples.byteLength, true);
89
- writeString(view,8,'WAVE');
90
- writeString(view,12,'fmt ');
91
- view.setUint32(16,16,true);
92
- view.setUint16(20,1,true);
93
- view.setUint16(22,1,true);
94
- view.setUint32(24,sampleRate,true);
95
- view.setUint32(28,sampleRate*2,true);
96
- view.setUint16(32,2,true);
97
- view.setUint16(34,16,true);
98
- writeString(view,36,'data');
99
- view.setUint32(40,samples.byteLength,true);
100
- // write samples
101
- const bytes = new Uint8Array(buffer,44);
102
- bytes.set(new Uint8Array(samples));
103
- return buffer;
 
104
  }
105
 
106
- async function sendChunk(float32Array, sampleRate){
107
- // resample to 16000 if needed
108
- if(sampleRate !== 16000){
109
- // offline resampling using OfflineAudioContext
110
- let offlineCtx = new OfflineAudioContext(1, Math.round(float32Array.length * 16000 / sampleRate), 16000);
111
- let buffer = offlineCtx.createBuffer(1, float32Array.length, sampleRate);
112
- buffer.copyToChannel(float32Array,0,0);
113
- let src = offlineCtx.createBufferSource();
114
  src.buffer = buffer;
115
  src.connect(offlineCtx.destination);
116
  src.start(0);
117
- let rendered = await offlineCtx.startRendering();
118
- float32Array = rendered.getChannelData(0);
119
- sampleRate = 16000;
120
- }
121
- const pcm16 = to16BitPCM(float32Array);
122
- const wav = writeWAV(pcm16, sampleRate);
123
- const blob = new Blob([wav], {type:'audio/wav'});
124
- const fd = new FormData();
125
- fd.append('file', blob, 'chunk.wav');
126
- try{
127
- const resp = await fetch('/api/chunk', {method:'POST', body: fd});
128
- if(!resp.ok) { console.error('Chunk upload failed', resp.status); return; }
129
- const j = await resp.json();
130
- if(j.text){
131
- const div = document.getElementById('transcript');
132
- div.innerText = j.text;
 
 
 
 
 
 
 
 
133
  }
134
- }catch(e){ console.error(e); }
135
  }
136
 
137
- async function initRecording(){
138
- if(recording) return;
139
- audioCtx = new (window.AudioContext || window.webkitAudioContext)();
140
- const stream = await navigator.mediaDevices.getUserMedia({audio:true});
141
- source = audioCtx.createMediaStreamSource(stream);
142
- const processor = audioCtx.createScriptProcessor(4096, 1, 1);
143
- source.connect(processor);
144
- processor.connect(audioCtx.destination);
145
- processor.onaudioprocess = function(e){
146
- const ch = e.inputBuffer.getChannelData(0);
147
- // copy to a new Float32Array because the buffer is reused
148
- sendChunk(new Float32Array(ch), audioCtx.sampleRate);
149
- }
150
- recording = true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  }
152
 
153
- function stopRecording(){
154
- if(!recording) return;
155
- // notify server that stream finished
156
- fetch('/api/finish', {method:'POST'}).then(()=>console.log('finished'));
157
- if(source && source.mediaStream){
158
- const tracks = source.mediaStream.getTracks();
159
- tracks.forEach(t=>t.stop());
160
- }
161
- recording = false;
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  }
163
 
164
- document.addEventListener('DOMContentLoaded', ()=>{
165
- // find the buttons rendered by Gradio
166
- const btns = document.querySelectorAll('button');
167
- // crude: take first two as start/stop
168
- startBtn = btns[0]; stopBtn = btns[1];
169
- startBtn.addEventListener('click', ()=>{ initRecording(); });
170
- stopBtn.addEventListener('click', ()=>{ stopRecording(); });
 
 
 
 
 
 
 
 
 
 
 
 
171
  });
172
  </script>
173
  """
174
-
175
- gr.HTML(js)
176
-
177
  return demo
178
 
179
 
180
  demo = create_ui()
181
 
182
- # Mount Gradio app on FastAPI root
183
  app = gr.mount_gradio_app(app, demo, path="/")
184
 
185
 
186
  if __name__ == "__main__":
187
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
2
  import logging
3
  from fastapi import FastAPI, UploadFile, File
4
  from fastapi.responses import JSONResponse
5
+ from fastapi.staticfiles import StaticFiles
6
  import gradio as gr
7
+ import os
8
+ import tempfile
9
 
10
  import server_wrapper
11
 
 
43
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.process_chunk_from_bytes, raw)
44
  return JSONResponse(out)
45
  except Exception as e:
46
+ logger.error(f"Error processing chunk: {e}")
47
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
48
 
49
 
 
53
  out = await asyncio.get_event_loop().run_in_executor(None, server_wrapper.finish)
54
  return JSONResponse(out)
55
  except Exception as e:
56
+ logger.error(f"Error finishing: {e}")
57
  return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
58
 
59
 
60
  def create_ui():
61
+ with gr.Blocks(title="Streaming ASR", theme=gr.themes.Soft()) as demo:
62
+ gr.Markdown("""
63
+ # 🎙️ Streaming ASR — SimulWhisper
64
+
65
+ Graba tu voz y verás la transcripción en tiempo real.
66
+
67
+ **Instrucciones:**
68
+ 1. Haz clic en **"Start Recording"**
69
+ 2. Habla en el micrófono
70
+ 3. Haz clic en **"Stop Recording"** cuando termines
71
+ """)
72
+
73
  with gr.Row():
74
+ start_btn = gr.Button("🔴 Start Recording", size="lg", scale=1)
75
+ stop_btn = gr.Button("⏹️ Stop Recording", size="lg", scale=1)
76
+ reset_btn = gr.Button("🔄 Reset", size="lg", scale=1)
77
+
78
+ transcript_output = gr.Textbox(
79
+ label="Transcription",
80
+ lines=5,
81
+ interactive=False,
82
+ placeholder="Transcription will appear here..."
83
+ )
84
+
85
+ status_output = gr.Textbox(
86
+ label="Status",
87
+ lines=2,
88
+ interactive=False,
89
+ placeholder="Ready"
90
+ )
91
+
92
+ # JavaScript para captura de audio
93
+ js_code = r"""
94
  <script>
95
+ let mediaRecorder, audioCtx, source, processor;
96
  let recording = false;
97
+ let transcriptDiv = null;
98
+ let statusDiv = null;
99
+
100
+ function to16BitPCM(float32Array) {
101
+ const l = float32Array.length;
102
+ const buffer = new ArrayBuffer(l * 2);
103
+ const view = new DataView(buffer);
104
+ let offset = 0;
105
+ for (let i = 0; i < l; i++) {
106
+ let s = Math.max(-1, Math.min(1, float32Array[i]));
107
+ view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
108
+ offset += 2;
109
+ }
110
+ return buffer;
111
  }
112
 
113
+ function writeWAV(samples, sampleRate) {
114
+ const buffer = new ArrayBuffer(44 + samples.byteLength);
115
+ const view = new DataView(buffer);
116
+ function writeString(view, offset, string) {
117
+ for (let i = 0; i < string.length; i++) {
118
+ view.setUint8(offset + i, string.charCodeAt(i));
119
+ }
120
+ }
121
+ writeString(view, 0, 'RIFF');
122
+ view.setUint32(4, 36 + samples.byteLength, true);
123
+ writeString(view, 8, 'WAVE');
124
+ writeString(view, 12, 'fmt ');
125
+ view.setUint32(16, 16, true);
126
+ view.setUint16(20, 1, true);
127
+ view.setUint16(22, 1, true);
128
+ view.setUint32(24, sampleRate, true);
129
+ view.setUint32(28, sampleRate * 2, true);
130
+ view.setUint16(32, 2, true);
131
+ view.setUint16(34, 16, true);
132
+ writeString(view, 36, 'data');
133
+ view.setUint32(40, samples.byteLength, true);
134
+ const bytes = new Uint8Array(buffer, 44);
135
+ bytes.set(new Uint8Array(samples));
136
+ return buffer;
137
  }
138
 
139
+ async function resampleAudio(float32Array, fromSampleRate, toSampleRate) {
140
+ if (fromSampleRate === toSampleRate) {
141
+ return float32Array;
142
+ }
143
+ const offlineCtx = new OfflineAudioContext(1, Math.round(float32Array.length * toSampleRate / fromSampleRate), toSampleRate);
144
+ const buffer = offlineCtx.createBuffer(1, float32Array.length, fromSampleRate);
145
+ buffer.copyToChannel(float32Array, 0, 0);
146
+ const src = offlineCtx.createBufferSource();
147
  src.buffer = buffer;
148
  src.connect(offlineCtx.destination);
149
  src.start(0);
150
+ const rendered = await offlineCtx.startRendering();
151
+ return rendered.getChannelData(0);
152
+ }
153
+
154
+ async function sendChunk(float32Array, sampleRate) {
155
+ try {
156
+ let resampled = await resampleAudio(float32Array, sampleRate, 16000);
157
+ const pcm16 = to16BitPCM(resampled);
158
+ const wav = writeWAV(pcm16, 16000);
159
+ const blob = new Blob([wav], { type: 'audio/wav' });
160
+ const fd = new FormData();
161
+ fd.append('file', blob, 'chunk.wav');
162
+
163
+ const resp = await fetch('/api/chunk', { method: 'POST', body: fd });
164
+ if (!resp.ok) {
165
+ console.error('Chunk upload failed:', resp.status);
166
+ return;
167
+ }
168
+ const j = await resp.json();
169
+ if (j.text && transcriptDiv) {
170
+ transcriptDiv.value = j.text;
171
+ }
172
+ } catch (e) {
173
+ console.error('Error sending chunk:', e);
174
  }
 
175
  }
176
 
177
+ async function startRecording() {
178
+ try {
179
+ if (recording) return;
180
+
181
+ recording = true;
182
+ if (statusDiv) statusDiv.value = "Recording... listening to audio";
183
+
184
+ audioCtx = new (window.AudioContext || window.webkitAudioContext)();
185
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: false, noiseSuppression: false } });
186
+ source = audioCtx.createMediaStreamSource(stream);
187
+ processor = audioCtx.createScriptProcessor(4096, 1, 1);
188
+
189
+ let chunkBuffer = [];
190
+ let bufferLength = 0;
191
+ const bufferThreshold = 16000 * 1; // 1 second of audio at 16kHz
192
+
193
+ processor.onaudioprocess = function(e) {
194
+ const ch = e.inputBuffer.getChannelData(0);
195
+ for (let i = 0; i < ch.length; i++) {
196
+ chunkBuffer.push(ch[i]);
197
+ bufferLength++;
198
+ }
199
+ // Send chunks of ~1 second
200
+ if (bufferLength >= bufferThreshold) {
201
+ const chunk = new Float32Array(chunkBuffer);
202
+ chunkBuffer = [];
203
+ bufferLength = 0;
204
+ sendChunk(chunk, audioCtx.sampleRate);
205
+ }
206
+ };
207
+
208
+ source.connect(processor);
209
+ processor.connect(audioCtx.destination);
210
+ } catch (e) {
211
+ console.error('Error starting recording:', e);
212
+ recording = false;
213
+ if (statusDiv) statusDiv.value = "Error: " + e.message;
214
+ }
215
  }
216
 
217
+ function stopRecording() {
218
+ try {
219
+ if (!recording) return;
220
+
221
+ recording = false;
222
+ if (statusDiv) statusDiv.value = "Stopping...";
223
+
224
+ // Notify server
225
+ fetch('/api/finish', { method: 'POST' }).then(() => {
226
+ if (statusDiv) statusDiv.value = "Done";
227
+ console.log('Recording finished');
228
+ }).catch(e => console.error('Error finishing:', e));
229
+
230
+ if (source && source.mediaStream) {
231
+ const tracks = source.mediaStream.getTracks();
232
+ tracks.forEach(t => t.stop());
233
+ }
234
+ if (processor) processor.disconnect();
235
+ if (source) source.disconnect();
236
+ } catch (e) {
237
+ console.error('Error stopping recording:', e);
238
+ }
239
  }
240
 
241
+ document.addEventListener('DOMContentLoaded', () => {
242
+ // Esperar a que Gradio cargue completamente
243
+ setTimeout(() => {
244
+ // Encontrar textboxes por label
245
+ const textboxes = document.querySelectorAll('textarea');
246
+ if (textboxes.length >= 2) {
247
+ transcriptDiv = textboxes[0];
248
+ statusDiv = textboxes[1];
249
+ console.log('UI elements found');
250
+ }
251
+
252
+ // Encontrar botones
253
+ const buttons = document.querySelectorAll('button');
254
+ if (buttons.length >= 2) {
255
+ buttons[0].addEventListener('click', startRecording);
256
+ buttons[1].addEventListener('click', stopRecording);
257
+ console.log('Button listeners attached');
258
+ }
259
+ }, 500);
260
  });
261
  </script>
262
  """
263
+
264
+ gr.HTML(js_code)
265
+
266
  return demo
267
 
268
 
269
  demo = create_ui()
270
 
271
+ # Mount Gradio app on FastAPI
272
  app = gr.mount_gradio_app(app, demo, path="/")
273
 
274
 
275
  if __name__ == "__main__":
276
+ import uvicorn
277
+ uvicorn.run(app, host="0.0.0.0", port=7860)
278
+