nemotron-speech-streaming

Runtime error

App Files Files Community

gnumanth committed 22 days ago

Commit

da8393f

verified ·

1 Parent(s): 306c2bc

Simplified working Gradio UI with standard components

Browse files

Files changed (1) hide show

app.py +73 -197

app.py CHANGED Viewed

@@ -62,6 +62,7 @@ def transcribe(audio, state):
             if ASR_MODEL:
                 with torch.no_grad():
                     context = state['buffer'][-32000:]
                     results = ASR_MODEL.transcribe([context])
                     print(f"[INFER] Context: {len(context)} | Raw: {results}", flush=True)
@@ -92,254 +93,129 @@ def transcribe(audio, state):
 def clear_session():
     """Reset the session: log a marker and return a fresh state dict plus the
     placeholder text, in the order of outputs=[state, transcript] on the Reset click."""
     print("[SESSION RESET]", flush=True)
-    return {'transcript': [], 'buffer': None, 'counter': 0}, "Tap the mic to start..."
 def log_connection():
     """Print a connection marker to stdout; registered as the demo.load handler."""
     print(">>> CLIENT CONNECTED <<<", flush=True)
-# --- SIMPLE CLEAN CSS ---
-# Stylesheet passed to gr.Blocks(css=CSS). The #id selectors target the elem_id
-# values set on the layout components (and the session-badge id in the header HTML).
-CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap');
-:root {
-    --bg-dark: #0a0a0f;
-    --bg-gradient: linear-gradient(180deg, #0a0a0f 0%, #151520 50%, #0a0a0f 100%);
-}
-body {
-    background: var(--bg-dark) !important;
-}
 .gradio-container {
-    background: var(--bg-gradient) !important;
-    min-height: 100vh !important;
-    font-family: 'Inter', sans-serif !important;
-    max-width: 100% !important;
-    padding: 0 !important;
-    margin: 0 !important;
-}
-#main-container {
     min-height: 100vh;
-    display: flex;
-    flex-direction: column;
-    background: var(--bg-gradient);
 }
-/* Header */
-#header-row {
-    padding: 20px;
-    display: flex;
-    justify-content: center;
-    background: transparent !important;
 }
-#session-badge {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border: 1px solid rgba(255, 255, 255, 0.1) !important;
-    border-radius: 20px !important;
-    padding: 8px 16px !important;
-    color: rgba(255, 255, 255, 0.7) !important;
-    font-size: 14px !important;
 }
-/* Transcript area */
-#transcript-row {
-    flex: 1;
-    display: flex;
-    align-items: center;
-    justify-content: center;
-    min-height: 50vh;
-    background: transparent !important;
-    padding: 40px 20px;
 }
-#transcript-display {
-    background: transparent !important;
-    border: none !important;
-    box-shadow: none !important;
     text-align: center;
 }
-#transcript-display textarea {
     background: transparent !important;
-    border: none !important;
     color: #ffffff !important;
-    font-size: 36px !important;
-    font-weight: 400 !important;
     text-align: center !important;
-    line-height: 1.4 !important;
-    text-shadow: 0 4px 20px rgba(0, 0, 0, 0.5);
-    min-height: 150px !important;
-    resize: none !important;
-}
-#transcript-display label {
-    display: none !important;
-}
-#transcript-display .wrap {
-    background: transparent !important;
-}
-/* Controls */
-#controls-row {
-    padding: 30px 20px 60px;
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    gap: 30px;
-    background: transparent !important;
-}
-/* Mic button styling */
-#mic-input {
-    background: transparent !important;
-    border: none !important;
-    box-shadow: none !important;
-    max-width: 120px;
-}
-#mic-input > div {
-    background: transparent !important;
     border: none !important;
-    box-shadow: none !important;
-    padding: 0 !important;
 }
-#mic-input label,
-#mic-input .wrap > div:first-child {
-    display: none !important;
 }
-#mic-input audio {
-    display: none !important;
-}
-#mic-input button {
-    width: 80px !important;
-    height: 80px !important;
-    border-radius: 50% !important;
-    background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%) !important;
-    border: none !important;
-    cursor: pointer !important;
-    box-shadow: 0 8px 30px rgba(99, 102, 241, 0.4) !important;
-    transition: all 0.2s ease !important;
-}
-#mic-input button:hover {
-    transform: scale(1.05) !important;
-    box-shadow: 0 12px 40px rgba(99, 102, 241, 0.5) !important;
-}
-#mic-input button svg {
-    width: 32px !important;
-    height: 32px !important;
-}
-/* Reset button */
-#reset-btn {
-    background: rgba(255, 255, 255, 0.05) !important;
-    border: 1px solid rgba(255, 255, 255, 0.15) !important;
-    color: rgba(255, 255, 255, 0.7) !important;
-    padding: 14px 28px !important;
-    border-radius: 12px !important;
-    font-size: 14px !important;
-    font-weight: 500 !important;
-    cursor: pointer !important;
-    transition: all 0.2s ease !important;
-}
-#reset-btn:hover {
     background: rgba(255, 255, 255, 0.1) !important;
-    color: #ffffff !important;
 }
-/* Hide Gradio footer and other elements */
 footer {
     display: none !important;
 }
-.contain {
-    background: transparent !important;
-}
-.block {
-    background: transparent !important;
-    border: none !important;
-    box-shadow: none !important;
-}
-/* Ambient glow at bottom */
-#controls-row::before {
-    content: '';
-    position: fixed;
-    bottom: 0;
-    left: 50%;
-    transform: translateX(-50%);
-    width: 200%;
-    height: 40vh;
-    background: radial-gradient(ellipse at center bottom, rgba(99, 102, 241, 0.12) 0%, transparent 70%);
-    pointer-events: none;
-    z-index: 0;
-}
 """
-# --- GRADIO APP ---
-# Layout: header badge, read-only transcript textbox, then a controls row with
-# the streaming microphone input and a Reset button; event wiring follows.
-with gr.Blocks(css=CSS, title="Nemotron Speech Streaming", theme=gr.themes.Base()) as demo:
     # State dict passed into transcribe() and replaced by clear_session().
     state = gr.State({'transcript': [], 'buffer': None, 'counter': 0})
-    with gr.Column(elem_id="main-container"):
-        # Header
-        # Badge shows SESSION_ID (defined outside this view); doubled braces in the
-        # <style> block are literal braces because the string is an f-string.
-        with gr.Row(elem_id="header-row"):
-            gr.HTML(f"""
-                <div id="session-badge" style="display: inline-flex; align-items: center; gap: 10px;
-                     background: rgba(255,255,255,0.05); border: 1px solid rgba(255,255,255,0.1);
-                     border-radius: 20px; padding: 10px 20px;">
-                    <span style="width: 10px; height: 10px; background: #22c55e; border-radius: 50%;
-                          animation: pulse 2s infinite;"></span>
-                    <span style="color: rgba(255,255,255,0.8); font-size: 14px;">Live Session • {SESSION_ID}</span>
-                </div>
-                <style>
-                    @keyframes pulse {{
-                        0%, 100% {{ opacity: 1; }}
-                        50% {{ opacity: 0.5; }}
-                    }}
-                </style>
-            """)
-        # Transcript display
-        with gr.Row(elem_id="transcript-row"):
-            transcript = gr.Textbox(
-                value="Tap the mic to start...",
-                elem_id="transcript-display",
-                show_label=False,
-                lines=4,
-                max_lines=6,
-                interactive=False
             )
-        # Controls
-        with gr.Row(elem_id="controls-row"):
             audio = gr.Audio(
                 sources=["microphone"],
                 streaming=True,
                 type="numpy",
-                elem_id="mic-input",
-                show_label=False
             )
-            reset_btn = gr.Button("Reset", elem_id="reset-btn")
     # Events
     # Print a connection marker via log_connection (registered with demo.load).
     demo.load(fn=log_connection)
     # Streamed microphone audio goes to transcribe(audio, state); its outputs are
     # routed to the state and the transcript textbox.
     audio.stream(
         fn=transcribe,
         inputs=[audio, state],
-        outputs=[state, transcript],
         show_progress="hidden",
         trigger_mode="always_last"
     )
-    # Reset swaps in the values returned by clear_session().
-    reset_btn.click(fn=clear_session, outputs=[state, transcript])
 if __name__ == "__main__":
     # Start the app only when run as a script: bind to 0.0.0.0 on port 7860.
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)

             if ASR_MODEL:
                 with torch.no_grad():
                     context = state['buffer'][-32000:]
                     results = ASR_MODEL.transcribe([context])
                     print(f"[INFER] Context: {len(context)} | Raw: {results}", flush=True)
 def clear_session():
     """Reset the session: log a marker and return a fresh state dict plus the
     placeholder text, in the order of outputs=[state, transcript_display] on the Reset click."""
     print("[SESSION RESET]", flush=True)
+    return {'transcript': [], 'buffer': None, 'counter': 0}, "Listening..."
 def log_connection():
     """Print a connection marker to stdout; registered as the demo.load handler."""
     print(">>> CLIENT CONNECTED <<<", flush=True)
+# --- CUSTOM THEME CSS ---
+# Stylesheet passed to gr.Blocks(css=custom_css). The #id selectors match the
+# HTML ids in the header (title-text, subtitle-text, session-info) and the
+# elem_id values on the components (transcript-box, mic-button, reset-button).
+custom_css = """
 .gradio-container {
+    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f0f23 100%) !important;
     min-height: 100vh;
 }
+#title-text {
+    text-align: center;
+    color: #76b900;
+    font-size: 2em;
+    font-weight: bold;
+    margin-bottom: 10px;
 }
+#subtitle-text {
+    text-align: center;
+    color: #888;
+    font-size: 1em;
+    margin-bottom: 30px;
 }
+#session-info {
+    text-align: center;
+    color: #76b900;
+    font-size: 0.9em;
+    padding: 10px;
+    background: rgba(118, 185, 0, 0.1);
+    border-radius: 20px;
+    display: inline-block;
 }
+#transcript-box {
+    min-height: 200px;
+    font-size: 1.5em;
     text-align: center;
+    padding: 40px 20px;
+    background: rgba(255, 255, 255, 0.05);
+    border-radius: 15px;
+    border: 1px solid rgba(255, 255, 255, 0.1);
 }
+#transcript-box textarea {
     background: transparent !important;
     color: #ffffff !important;
+    font-size: 1.5em !important;
     text-align: center !important;
     border: none !important;
 }
+#mic-button {
+    margin: 20px auto;
+    display: block;
 }
+#reset-button {
     background: rgba(255, 255, 255, 0.1) !important;
+    border: 1px solid rgba(255, 255, 255, 0.2) !important;
 }
 footer {
     display: none !important;
 }
 """
+# --- GRADIO UI ---
+# Layout: HTML header, read-only transcript box, a row holding the streaming
+# microphone input and the Reset button, an HTML footer note, then event wiring.
+with gr.Blocks(css=custom_css, title="Nemotron Speech Streaming", theme=gr.themes.Soft(primary_hue="green")) as demo:
     # State dict passed into transcribe() and replaced by clear_session().
     state = gr.State({'transcript': [], 'buffer': None, 'counter': 0})
+    # Header: title, subtitle and a badge showing SESSION_ID (defined outside this view).
+    gr.HTML(f"""
+        <div id="title-text">Nemotron Speech Streaming</div>
+        <div id="subtitle-text">Real-time speech recognition powered by NVIDIA NeMo</div>
+        <div style="text-align: center; margin-bottom: 20px;">
+            <span id="session-info">Session: {SESSION_ID}</span>
+        </div>
+    """)
+    # Transcript output: non-interactive textbox with a copy button.
+    with gr.Row():
+        with gr.Column():
+            transcript_display = gr.Textbox(
+                value="Listening...",
+                label="Transcript",
+                elem_id="transcript-box",
+                lines=6,
+                max_lines=10,
+                interactive=False,
+                show_copy_button=True
             )
+    # Controls: the microphone column gets scale=2, the Reset column scale=1.
+    with gr.Row():
+        with gr.Column(scale=2):
             audio = gr.Audio(
                 sources=["microphone"],
                 streaming=True,
                 type="numpy",
+                label="Click to Start Recording",
+                elem_id="mic-button"
             )
+        with gr.Column(scale=1):
+            reset_btn = gr.Button("Reset", elem_id="reset-button", variant="secondary")
+    # Footer: usage hint and the model name shown to the user.
+    gr.HTML("""
+        <div style="text-align: center; margin-top: 30px; color: #666; font-size: 0.85em;">
+            <p>Click the microphone to start speaking. Your speech will be transcribed in real-time.</p>
+            <p>Model: <strong>nvidia/nemotron-speech-streaming-en-0.6b</strong></p>
+        </div>
+    """)
     # Events
     # Print a connection marker via log_connection (registered with demo.load).
     demo.load(fn=log_connection)
     # Streamed microphone audio goes to transcribe(audio, state); its outputs are
     # routed to the state and the transcript textbox.
     audio.stream(
         fn=transcribe,
         inputs=[audio, state],
+        outputs=[state, transcript_display],
         show_progress="hidden",
         trigger_mode="always_last"
     )
+    # Reset swaps in the values returned by clear_session().
+    reset_btn.click(fn=clear_session, outputs=[state, transcript_display])
 if __name__ == "__main__":
     # Start the app only when run as a script: bind to 0.0.0.0 on port 7860.
     demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)