gnumanth committed on
Commit
306c2bc
·
verified ·
1 Parent(s): e97881c

Fix UI - simpler Gradio-native approach

Browse files
Files changed (1) hide show
  1. app.py +132 -239
app.py CHANGED
@@ -5,7 +5,6 @@ import spaces
5
  from datetime import datetime
6
  import random
7
  import string
8
- import os
9
 
10
  SESSION_ID = f"LIVE_{''.join(random.choices(string.ascii_uppercase + string.digits, k=4))}"
11
 
@@ -34,15 +33,14 @@ def transcribe(audio, state):
34
  print(f"[SESSION START] {SESSION_ID}", flush=True)
35
 
36
  if audio is None:
37
- return state, "<div class='transcript-text'>Listening...</div>"
38
 
39
  try:
40
  sr, data = audio
41
 
42
- # 1. AUDIO RECEIPT LOG
43
  if len(data) > 0:
44
  peak = np.abs(data).max()
45
- if state['counter'] % 10 == 0: # Log every ~1s
46
  print(f"[AUDIO RECV] Step {state['counter']} | Shape: {data.shape} | Peak: {peak:.4f}", flush=True)
47
 
48
  # Normalize
@@ -60,13 +58,10 @@ def transcribe(audio, state):
60
 
61
  state['counter'] += 1
62
 
63
- # LATENCY FIX: 0.2s Threshold (Fast Response)
64
  if len(state['buffer']) >= 3200:
65
  if ASR_MODEL:
66
  with torch.no_grad():
67
- # LATENCY FIX: Only send 2s of context (32000) instead of 5s.
68
  context = state['buffer'][-32000:]
69
-
70
  results = ASR_MODEL.transcribe([context])
71
  print(f"[INFER] Context: {len(context)} | Raw: {results}", flush=True)
72
 
@@ -83,7 +78,6 @@ def transcribe(audio, state):
83
  if not current_lines: current_lines.append(text)
84
  else: current_lines[-1] = text
85
 
86
- # LATENCY FIX: Aggressive Buffer Cleanup
87
  if len(state['buffer']) > 32000:
88
  state['buffer'] = state['buffer'][-32000:]
89
 
@@ -92,228 +86,163 @@ def transcribe(audio, state):
92
  import traceback
93
  traceback.print_exc()
94
 
95
- # Format output for HTML display
96
  valid = [l for l in state['transcript'] if l]
97
  current = valid[-1] if valid else "Listening..."
98
- history = "<br>".join(valid[:-1]) if len(valid) > 1 else ""
99
-
100
- output_html = f"""
101
- <div class='transcript-current'>{current}</div>
102
- <div class='transcript-history'>{history}</div>
103
- """
104
- return state, output_html
105
 
106
  def clear_session():
107
  print("[SESSION RESET]", flush=True)
108
- return {'transcript': [], 'buffer': None, 'counter': 0}, "<div class='transcript-text'>Listening...</div>"
109
 
110
  def log_connection():
111
  print(">>> CLIENT CONNECTED <<<", flush=True)
112
 
113
- # --- CUSTOM HTML/CSS UI ---
114
- CUSTOM_CSS = """
115
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap');
116
 
117
- * {
118
- margin: 0;
119
- padding: 0;
120
- box-sizing: border-box;
121
  }
122
 
123
- body, .gradio-container {
124
- background: #0a0a0f !important;
125
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
126
- min-height: 100vh;
127
  }
128
 
129
  .gradio-container {
 
 
 
130
  max-width: 100% !important;
131
  padding: 0 !important;
 
132
  }
133
 
134
- /* Hide default Gradio elements */
135
- .contain {
136
- background: transparent !important;
137
- }
138
-
139
- footer {
140
- display: none !important;
141
- }
142
-
143
- /* Main App Container */
144
- #app-container {
145
- position: fixed;
146
- top: 0;
147
- left: 0;
148
- width: 100vw;
149
- height: 100vh;
150
- background: linear-gradient(180deg, #0a0a0f 0%, #12121a 50%, #0a0a0f 100%);
151
  display: flex;
152
  flex-direction: column;
153
- overflow: hidden;
154
  }
155
 
156
  /* Header */
157
- #header {
158
  padding: 20px;
159
- text-align: center;
160
- z-index: 10;
 
161
  }
162
 
163
  #session-badge {
164
- display: inline-flex;
165
- align-items: center;
166
- gap: 8px;
167
- background: rgba(255, 255, 255, 0.05);
168
- border: 1px solid rgba(255, 255, 255, 0.1);
169
- border-radius: 20px;
170
- padding: 8px 16px;
171
- color: rgba(255, 255, 255, 0.7);
172
- font-size: 13px;
173
- font-weight: 500;
174
- }
175
-
176
- #live-dot {
177
- width: 8px;
178
- height: 8px;
179
- background: #22c55e;
180
- border-radius: 50%;
181
- animation: pulse 2s ease-in-out infinite;
182
- }
183
-
184
- @keyframes pulse {
185
- 0%, 100% { opacity: 1; transform: scale(1); }
186
- 50% { opacity: 0.5; transform: scale(0.9); }
187
  }
188
 
189
- /* Transcript Area */
190
- #transcript-area {
191
  flex: 1;
192
  display: flex;
193
- flex-direction: column;
194
- justify-content: center;
195
  align-items: center;
 
 
 
196
  padding: 40px 20px;
197
- text-align: center;
198
- z-index: 10;
199
  }
200
 
201
- .transcript-current {
202
- color: #ffffff;
203
- font-size: clamp(24px, 5vw, 42px);
204
- font-weight: 400;
205
- line-height: 1.4;
206
- max-width: 800px;
207
- text-shadow: 0 4px 20px rgba(0, 0, 0, 0.5);
208
- animation: fadeIn 0.3s ease-out;
209
  }
210
 
211
- .transcript-history {
212
- color: rgba(255, 255, 255, 0.35);
213
- font-size: clamp(14px, 2.5vw, 18px);
214
- font-weight: 300;
215
- line-height: 1.8;
216
- max-width: 700px;
217
- margin-top: 30px;
 
 
 
 
218
  }
219
 
220
- @keyframes fadeIn {
221
- from { opacity: 0; transform: translateY(10px); }
222
- to { opacity: 1; transform: translateY(0); }
223
  }
224
 
225
- /* Ambient Glow */
226
- #ambient-glow {
227
- position: absolute;
228
- bottom: 0;
229
- left: 50%;
230
- transform: translateX(-50%);
231
- width: 150%;
232
- height: 50vh;
233
- background: radial-gradient(ellipse at center bottom, rgba(99, 102, 241, 0.15) 0%, transparent 70%);
234
- pointer-events: none;
235
- z-index: 1;
236
  }
237
 
238
- /* Controls Area */
239
- #controls-area {
240
- padding: 30px 20px 50px;
241
  display: flex;
242
  justify-content: center;
243
  align-items: center;
244
- gap: 40px;
245
- z-index: 20;
246
- }
247
-
248
- /* Mic Button Container */
249
- #mic-container {
250
- position: relative;
251
- }
252
-
253
- /* Style the Gradio audio component to look like our mic button */
254
- #mic-container .wrap {
255
  background: transparent !important;
256
- border: none !important;
257
- padding: 0 !important;
258
  }
259
 
260
- #mic-container audio {
261
- display: none !important;
262
- }
263
-
264
- #mic-container .audio-container {
265
  background: transparent !important;
 
 
 
266
  }
267
 
268
- #mic-container > div {
269
  background: transparent !important;
270
  border: none !important;
271
  box-shadow: none !important;
 
272
  }
273
 
274
- #mic-container label {
 
275
  display: none !important;
276
  }
277
 
278
- #mic-container .controls {
279
- background: transparent !important;
280
  }
281
 
282
- #mic-container button {
283
  width: 80px !important;
284
  height: 80px !important;
285
  border-radius: 50% !important;
286
  background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important;
287
  border: none !important;
288
  cursor: pointer !important;
289
- transition: all 0.2s ease !important;
290
  box-shadow: 0 8px 30px rgba(99, 102, 241, 0.4) !important;
 
291
  }
292
 
293
- #mic-container button:hover {
294
  transform: scale(1.05) !important;
295
  box-shadow: 0 12px 40px rgba(99, 102, 241, 0.5) !important;
296
  }
297
 
298
- #mic-container button:active,
299
- #mic-container button.recording {
300
- transform: scale(0.95) !important;
301
- background: linear-gradient(135deg, #ef4444 0%, #f97316 100%) !important;
302
- box-shadow: 0 8px 30px rgba(239, 68, 68, 0.4) !important;
303
- }
304
-
305
- #mic-container button svg {
306
  width: 32px !important;
307
  height: 32px !important;
308
- fill: white !important;
309
  }
310
 
311
- /* Reset Button */
312
  #reset-btn {
313
  background: rgba(255, 255, 255, 0.05) !important;
314
- border: 1px solid rgba(255, 255, 255, 0.1) !important;
315
- color: rgba(255, 255, 255, 0.6) !important;
316
- padding: 12px 24px !important;
317
  border-radius: 12px !important;
318
  font-size: 14px !important;
319
  font-weight: 500 !important;
@@ -323,21 +252,15 @@ footer {
323
 
324
  #reset-btn:hover {
325
  background: rgba(255, 255, 255, 0.1) !important;
326
- color: rgba(255, 255, 255, 0.9) !important;
327
- }
328
-
329
- /* Transcript display area styling */
330
- #transcript-display {
331
- background: transparent !important;
332
- border: none !important;
333
  }
334
 
335
- #transcript-display > div {
336
- background: transparent !important;
 
337
  }
338
 
339
- /* Hide unnecessary Gradio UI elements */
340
- .svelte-1gfkn6j {
341
  background: transparent !important;
342
  }
343
 
@@ -346,107 +269,77 @@ footer {
346
  border: none !important;
347
  box-shadow: none !important;
348
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  """
350
 
351
  # --- GRADIO APP ---
352
- with gr.Blocks(css=CUSTOM_CSS, title="Nemotron Speech Streaming") as demo:
353
  state = gr.State({'transcript': [], 'buffer': None, 'counter': 0})
354
 
355
- # Main HTML Structure
356
- gr.HTML(f"""
357
- <div id="app-container">
358
- <div id="header">
359
- <div id="session-badge">
360
- <div id="live-dot"></div>
361
- <span>Live Session {SESSION_ID}</span>
362
- </div>
363
- </div>
364
-
365
- <div id="transcript-area">
366
- <!-- Transcript will be injected here via Gradio -->
367
- </div>
368
-
369
- <div id="ambient-glow"></div>
370
-
371
- <div id="controls-area">
372
- <!-- Audio and Reset button will be placed here -->
373
- </div>
374
- </div>
375
- """)
376
-
377
- # Transcript Display (positioned in transcript area via JS)
378
- with gr.Row(elem_id="transcript-row"):
379
- transcript_display = gr.HTML(
380
- value="<div class='transcript-current'>Tap the microphone to start speaking...</div>",
381
- elem_id="transcript-display"
382
- )
383
-
384
- # Controls
385
- with gr.Row(elem_id="controls-row"):
386
- with gr.Column(elem_id="mic-container", scale=0):
387
  audio = gr.Audio(
388
  sources=["microphone"],
389
  streaming=True,
390
  type="numpy",
 
391
  show_label=False
392
  )
393
-
394
- reset_btn = gr.Button("Reset", elem_id="reset-btn")
395
-
396
- # JavaScript to reposition elements into our custom layout
397
- gr.HTML("""
398
- <script>
399
- function setupUI() {
400
- // Move transcript display into our transcript area
401
- const transcriptArea = document.getElementById('transcript-area');
402
- const transcriptDisplay = document.getElementById('transcript-display');
403
- if (transcriptArea && transcriptDisplay) {
404
- transcriptArea.appendChild(transcriptDisplay);
405
- }
406
-
407
- // Move controls into our controls area
408
- const controlsArea = document.getElementById('controls-area');
409
- const micContainer = document.getElementById('mic-container');
410
- const resetBtn = document.getElementById('reset-btn');
411
- if (controlsArea && micContainer) {
412
- controlsArea.appendChild(micContainer);
413
- }
414
- if (controlsArea && resetBtn) {
415
- controlsArea.appendChild(resetBtn);
416
- }
417
-
418
- // Hide original Gradio rows
419
- const transcriptRow = document.getElementById('transcript-row');
420
- const controlsRow = document.getElementById('controls-row');
421
- if (transcriptRow) transcriptRow.style.display = 'none';
422
- if (controlsRow) controlsRow.style.display = 'none';
423
- }
424
-
425
- // Run setup when DOM is ready
426
- if (document.readyState === 'loading') {
427
- document.addEventListener('DOMContentLoaded', setupUI);
428
- } else {
429
- setTimeout(setupUI, 100);
430
- }
431
-
432
- // Re-run on Gradio updates
433
- const observer = new MutationObserver(() => {
434
- setTimeout(setupUI, 50);
435
- });
436
- observer.observe(document.body, { childList: true, subtree: true });
437
- </script>
438
- """)
439
 
440
  # Events
441
  demo.load(fn=log_connection)
442
  audio.stream(
443
  fn=transcribe,
444
  inputs=[audio, state],
445
- outputs=[state, transcript_display],
446
  show_progress="hidden",
447
  trigger_mode="always_last"
448
  )
449
- reset_btn.click(fn=clear_session, outputs=[state, transcript_display])
450
 
451
  if __name__ == "__main__":
452
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
5
  from datetime import datetime
6
  import random
7
  import string
 
8
 
9
  SESSION_ID = f"LIVE_{''.join(random.choices(string.ascii_uppercase + string.digits, k=4))}"
10
 
 
33
  print(f"[SESSION START] {SESSION_ID}", flush=True)
34
 
35
  if audio is None:
36
+ return state, "Listening..."
37
 
38
  try:
39
  sr, data = audio
40
 
 
41
  if len(data) > 0:
42
  peak = np.abs(data).max()
43
+ if state['counter'] % 10 == 0:
44
  print(f"[AUDIO RECV] Step {state['counter']} | Shape: {data.shape} | Peak: {peak:.4f}", flush=True)
45
 
46
  # Normalize
 
58
 
59
  state['counter'] += 1
60
 
 
61
  if len(state['buffer']) >= 3200:
62
  if ASR_MODEL:
63
  with torch.no_grad():
 
64
  context = state['buffer'][-32000:]
 
65
  results = ASR_MODEL.transcribe([context])
66
  print(f"[INFER] Context: {len(context)} | Raw: {results}", flush=True)
67
 
 
78
  if not current_lines: current_lines.append(text)
79
  else: current_lines[-1] = text
80
 
 
81
  if len(state['buffer']) > 32000:
82
  state['buffer'] = state['buffer'][-32000:]
83
 
 
86
  import traceback
87
  traceback.print_exc()
88
 
 
89
  valid = [l for l in state['transcript'] if l]
90
  current = valid[-1] if valid else "Listening..."
91
+ return state, current
 
 
 
 
 
 
92
 
93
  def clear_session():
94
  print("[SESSION RESET]", flush=True)
95
+ return {'transcript': [], 'buffer': None, 'counter': 0}, "Tap the mic to start..."
96
 
97
  def log_connection():
98
  print(">>> CLIENT CONNECTED <<<", flush=True)
99
 
100
+ # --- SIMPLE CLEAN CSS ---
101
+ CSS = """
102
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap');
103
 
104
+ :root {
105
+ --bg-dark: #0a0a0f;
106
+ --bg-gradient: linear-gradient(180deg, #0a0a0f 0%, #151520 50%, #0a0a0f 100%);
 
107
  }
108
 
109
+ body {
110
+ background: var(--bg-dark) !important;
 
 
111
  }
112
 
113
  .gradio-container {
114
+ background: var(--bg-gradient) !important;
115
+ min-height: 100vh !important;
116
+ font-family: 'Inter', sans-serif !important;
117
  max-width: 100% !important;
118
  padding: 0 !important;
119
+ margin: 0 !important;
120
  }
121
 
122
+ #main-container {
123
+ min-height: 100vh;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  display: flex;
125
  flex-direction: column;
126
+ background: var(--bg-gradient);
127
  }
128
 
129
  /* Header */
130
+ #header-row {
131
  padding: 20px;
132
+ display: flex;
133
+ justify-content: center;
134
+ background: transparent !important;
135
  }
136
 
137
  #session-badge {
138
+ background: rgba(255, 255, 255, 0.05) !important;
139
+ border: 1px solid rgba(255, 255, 255, 0.1) !important;
140
+ border-radius: 20px !important;
141
+ padding: 8px 16px !important;
142
+ color: rgba(255, 255, 255, 0.7) !important;
143
+ font-size: 14px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  }
145
 
146
+ /* Transcript area */
147
+ #transcript-row {
148
  flex: 1;
149
  display: flex;
 
 
150
  align-items: center;
151
+ justify-content: center;
152
+ min-height: 50vh;
153
+ background: transparent !important;
154
  padding: 40px 20px;
 
 
155
  }
156
 
157
+ #transcript-display {
158
+ background: transparent !important;
159
+ border: none !important;
160
+ box-shadow: none !important;
161
+ text-align: center;
 
 
 
162
  }
163
 
164
+ #transcript-display textarea {
165
+ background: transparent !important;
166
+ border: none !important;
167
+ color: #ffffff !important;
168
+ font-size: 36px !important;
169
+ font-weight: 400 !important;
170
+ text-align: center !important;
171
+ line-height: 1.4 !important;
172
+ text-shadow: 0 4px 20px rgba(0, 0, 0, 0.5);
173
+ min-height: 150px !important;
174
+ resize: none !important;
175
  }
176
 
177
+ #transcript-display label {
178
+ display: none !important;
 
179
  }
180
 
181
+ #transcript-display .wrap {
182
+ background: transparent !important;
 
 
 
 
 
 
 
 
 
183
  }
184
 
185
+ /* Controls */
186
+ #controls-row {
187
+ padding: 30px 20px 60px;
188
  display: flex;
189
  justify-content: center;
190
  align-items: center;
191
+ gap: 30px;
 
 
 
 
 
 
 
 
 
 
192
  background: transparent !important;
 
 
193
  }
194
 
195
+ /* Mic button styling */
196
+ #mic-input {
 
 
 
197
  background: transparent !important;
198
+ border: none !important;
199
+ box-shadow: none !important;
200
+ max-width: 120px;
201
  }
202
 
203
+ #mic-input > div {
204
  background: transparent !important;
205
  border: none !important;
206
  box-shadow: none !important;
207
+ padding: 0 !important;
208
  }
209
 
210
+ #mic-input label,
211
+ #mic-input .wrap > div:first-child {
212
  display: none !important;
213
  }
214
 
215
+ #mic-input audio {
216
+ display: none !important;
217
  }
218
 
219
+ #mic-input button {
220
  width: 80px !important;
221
  height: 80px !important;
222
  border-radius: 50% !important;
223
  background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important;
224
  border: none !important;
225
  cursor: pointer !important;
 
226
  box-shadow: 0 8px 30px rgba(99, 102, 241, 0.4) !important;
227
+ transition: all 0.2s ease !important;
228
  }
229
 
230
+ #mic-input button:hover {
231
  transform: scale(1.05) !important;
232
  box-shadow: 0 12px 40px rgba(99, 102, 241, 0.5) !important;
233
  }
234
 
235
+ #mic-input button svg {
 
 
 
 
 
 
 
236
  width: 32px !important;
237
  height: 32px !important;
 
238
  }
239
 
240
+ /* Reset button */
241
  #reset-btn {
242
  background: rgba(255, 255, 255, 0.05) !important;
243
+ border: 1px solid rgba(255, 255, 255, 0.15) !important;
244
+ color: rgba(255, 255, 255, 0.7) !important;
245
+ padding: 14px 28px !important;
246
  border-radius: 12px !important;
247
  font-size: 14px !important;
248
  font-weight: 500 !important;
 
252
 
253
  #reset-btn:hover {
254
  background: rgba(255, 255, 255, 0.1) !important;
255
+ color: #ffffff !important;
 
 
 
 
 
 
256
  }
257
 
258
+ /* Hide Gradio footer and other elements */
259
+ footer {
260
+ display: none !important;
261
  }
262
 
263
+ .contain {
 
264
  background: transparent !important;
265
  }
266
 
 
269
  border: none !important;
270
  box-shadow: none !important;
271
  }
272
+
273
+ /* Ambient glow at bottom */
274
+ #controls-row::before {
275
+ content: '';
276
+ position: fixed;
277
+ bottom: 0;
278
+ left: 50%;
279
+ transform: translateX(-50%);
280
+ width: 200%;
281
+ height: 40vh;
282
+ background: radial-gradient(ellipse at center bottom, rgba(99, 102, 241, 0.12) 0%, transparent 70%);
283
+ pointer-events: none;
284
+ z-index: 0;
285
+ }
286
  """
287
 
288
  # --- GRADIO APP ---
289
+ with gr.Blocks(css=CSS, title="Nemotron Speech Streaming", theme=gr.themes.Base()) as demo:
290
  state = gr.State({'transcript': [], 'buffer': None, 'counter': 0})
291
 
292
+ with gr.Column(elem_id="main-container"):
293
+ # Header
294
+ with gr.Row(elem_id="header-row"):
295
+ gr.HTML(f"""
296
+ <div id="session-badge" style="display: inline-flex; align-items: center; gap: 10px;
297
+ background: rgba(255,255,255,0.05); border: 1px solid rgba(255,255,255,0.1);
298
+ border-radius: 20px; padding: 10px 20px;">
299
+ <span style="width: 10px; height: 10px; background: #22c55e; border-radius: 50%;
300
+ animation: pulse 2s infinite;"></span>
301
+ <span style="color: rgba(255,255,255,0.8); font-size: 14px;">Live Session • {SESSION_ID}</span>
302
+ </div>
303
+ <style>
304
+ @keyframes pulse {{
305
+ 0%, 100% {{ opacity: 1; }}
306
+ 50% {{ opacity: 0.5; }}
307
+ }}
308
+ </style>
309
+ """)
310
+
311
+ # Transcript display
312
+ with gr.Row(elem_id="transcript-row"):
313
+ transcript = gr.Textbox(
314
+ value="Tap the mic to start...",
315
+ elem_id="transcript-display",
316
+ show_label=False,
317
+ lines=4,
318
+ max_lines=6,
319
+ interactive=False
320
+ )
321
+
322
+ # Controls
323
+ with gr.Row(elem_id="controls-row"):
324
  audio = gr.Audio(
325
  sources=["microphone"],
326
  streaming=True,
327
  type="numpy",
328
+ elem_id="mic-input",
329
  show_label=False
330
  )
331
+ reset_btn = gr.Button("Reset", elem_id="reset-btn")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
 
333
  # Events
334
  demo.load(fn=log_connection)
335
  audio.stream(
336
  fn=transcribe,
337
  inputs=[audio, state],
338
+ outputs=[state, transcript],
339
  show_progress="hidden",
340
  trigger_mode="always_last"
341
  )
342
+ reset_btn.click(fn=clear_session, outputs=[state, transcript])
343
 
344
  if __name__ == "__main__":
345
  demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)