import gradio as gr
import torch
import numpy as np
import spaces
from datetime import datetime
import random
import string

# Short random tag so concurrent sessions can be told apart in the server logs.
SESSION_ID = f"LIVE_{''.join(random.choices(string.ascii_uppercase + string.digits, k=4))}"

# --- BACKEND LOGIC ---
print(f"[{datetime.now().strftime('%H:%M:%S')}] --- SYSTEM STARTUP ---", flush=True)

try:
    print("Loading NeMo ASR Model...", flush=True)
    import nemo.collections.asr as nemo_asr

    ASR_MODEL = nemo_asr.models.ASRModel.from_pretrained(
        model_name="nvidia/nemotron-speech-streaming-en-0.6b"
    )
    ASR_MODEL.eval()
    if torch.cuda.is_available():
        print("Moving model to CUDA...", flush=True)
        ASR_MODEL = ASR_MODEL.cuda()
    else:
        print("WARNING: CUDA not available, running on CPU (Slow)", flush=True)
    print("Model Loaded Successfully.", flush=True)
except Exception as e:
    # Keep the app serving even if the model fails to load;
    # transcribe() checks ASR_MODEL for None before inferring.
    print(f"CRITICAL MODEL LOAD ERROR: {e}", flush=True)
    ASR_MODEL = None


def _hyp_text(hyp):
    """Extract plain text from a NeMo transcription result.

    NeMo may return a bare string or a Hypothesis-like object exposing
    ``.text`` or ``.pred_text`` depending on version/config; normalize
    all three shapes to a plain ``str`` ("" if none apply).
    """
    if isinstance(hyp, str):
        return hyp
    if hasattr(hyp, 'text'):
        return hyp.text
    if hasattr(hyp, 'pred_text'):
        return hyp.pred_text
    return ""


@spaces.GPU(duration=120)
def transcribe(audio, state):
    """Streaming callback: fold one audio chunk into state, return latest text.

    Args:
        audio: ``(sample_rate, np.ndarray)`` tuple from ``gr.Audio`` streaming,
            or None when no audio has arrived yet.
        state: per-session dict with keys ``transcript`` (list[str]),
            ``buffer`` (np.ndarray | None, 16 kHz mono float32) and
            ``counter`` (int, chunks received); None on first call.

    Returns:
        (state, display_text) — the updated state and the current line to show.
    """
    if state is None:
        state = {'transcript': [], 'buffer': None, 'counter': 0}
        print(f"[SESSION START] {SESSION_ID}", flush=True)
    if audio is None:
        return state, "Listening..."
    try:
        sr, data = audio
        if len(data) > 0:
            peak = np.abs(data).max()
            # Log every 10th chunk only, to keep the log readable.
            if state['counter'] % 10 == 0:
                print(f"[AUDIO RECV] Step {state['counter']} | Shape: {data.shape} | Peak: {peak:.4f}", flush=True)
        # Normalize to float32 in [-1, 1] regardless of incoming PCM dtype.
        if data.dtype == np.int16:
            data = data.astype(np.float32) / 32768.0
        elif data.dtype == np.int32:
            data = data.astype(np.float32) / 2147483648.0
        else:
            data = data.astype(np.float32)
        # Down-mix multi-channel audio to mono.
        if len(data.shape) > 1:
            data = data.mean(axis=1)
        # Model expects 16 kHz; resample lazily (librosa import deferred so
        # the app starts even when the mic already delivers 16 kHz).
        if sr != 16000:
            import librosa
            data = librosa.resample(data, orig_sr=sr, target_sr=16000)
        # Accumulate the rolling audio buffer.
        if state['buffer'] is None:
            state['buffer'] = data
        else:
            state['buffer'] = np.concatenate((state['buffer'], data))
        state['counter'] += 1
        # Wait for at least 3200 samples (0.2 s @ 16 kHz) before inferring.
        if len(state['buffer']) >= 3200:
            if ASR_MODEL is not None:  # model object truthiness is not the contract; None means load failed
                with torch.no_grad():
                    # Re-transcribe only the trailing 2 s (32000 samples) of context.
                    context = state['buffer'][-32000:]
                    results = ASR_MODEL.transcribe([context])
                    print(f"[INFER] Context: {len(context)} | Raw: {results}", flush=True)
                    if results and len(results) > 0:
                        text = _hyp_text(results[0])
                        if text and text.strip():
                            print(f" >>> [TXT] {text}", flush=True)
                            # The hypothesis covers the whole context window, so it
                            # replaces the current line rather than appending to it.
                            current_lines = state['transcript']
                            if not current_lines:
                                current_lines.append(text)
                            else:
                                current_lines[-1] = text
            # Cap the buffer at the 2 s context window to bound memory.
            if len(state['buffer']) > 32000:
                state['buffer'] = state['buffer'][-32000:]
    except Exception as e:
        # Best-effort streaming: log and keep the session alive rather than
        # crashing the event loop on one bad chunk.
        print(f"[CRITICAL PROCESSING ERROR] {e}", flush=True)
        import traceback
        traceback.print_exc()
    valid = [l for l in state['transcript'] if l]
    current = valid[-1] if valid else "Listening..."
    return state, current


def clear_session():
    """Reset handler: return a fresh state dict and the placeholder text."""
    print("[SESSION RESET]", flush=True)
    return {'transcript': [], 'buffer': None, 'counter': 0}, "Listening..."
def log_connection():
    """Log that a browser client attached to the app (wired to demo.load)."""
    print(">>> CLIENT CONNECTED <<<", flush=True)


# --- CUSTOM THEME CSS ---
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f0f23 100%) !important;
    min-height: 100vh;
}
#title-text {
    text-align: center;
    color: #76b900;
    font-size: 2em;
    font-weight: bold;
    margin-bottom: 10px;
}
#subtitle-text {
    text-align: center;
    color: #888;
    font-size: 1em;
    margin-bottom: 30px;
}
#session-info {
    text-align: center;
    color: #76b900;
    font-size: 0.9em;
    padding: 10px;
    background: rgba(118, 185, 0, 0.1);
    border-radius: 20px;
    display: inline-block;
}
#transcript-box {
    min-height: 200px;
    font-size: 1.5em;
    text-align: center;
    padding: 40px 20px;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 15px;
    border: 1px solid rgba(255, 255, 255, 0.1);
}
#transcript-box textarea {
    background: transparent !important;
    color: #ffffff !important;
    font-size: 1.5em !important;
    text-align: center !important;
    border: none !important;
}
#mic-button {
    margin: 20px auto;
    display: block;
}
#reset-button {
    background: rgba(255, 255, 255, 0.1) !important;
    border: 1px solid rgba(255, 255, 255, 0.2) !important;
}
footer {
    display: none !important;
}
"""

# --- GRADIO UI ---
with gr.Blocks(css=custom_css, title="Nemotron Speech Streaming",
               theme=gr.themes.Soft(primary_hue="green")) as demo:
    # Per-browser-session state mirrored by transcribe()/clear_session().
    state = gr.State({'transcript': [], 'buffer': None, 'counter': 0})

    # NOTE(review): the HTML tags were lost in extraction; markup reconstructed
    # from the element ids declared in custom_css — confirm against the original.
    gr.HTML(f"""
        <div id="title-text">Nemotron Speech Streaming</div>
        <div id="subtitle-text">Real-time speech recognition powered by NVIDIA NeMo</div>
        <div style="text-align: center;"><span id="session-info">Session: {SESSION_ID}</span></div>
    """)

    with gr.Row():
        with gr.Column():
            transcript_display = gr.Textbox(
                value="Listening...",
                label="Transcript",
                elem_id="transcript-box",
                lines=6,
                max_lines=10,
                interactive=False,
                show_copy_button=True,
            )

    with gr.Row():
        with gr.Column(scale=2):
            audio = gr.Audio(
                sources=["microphone"],
                streaming=True,
                type="numpy",
                label="Click to Start Recording",
                elem_id="mic-button",
            )
        with gr.Column(scale=1):
            reset_btn = gr.Button("Reset", elem_id="reset-button", variant="secondary")

    # NOTE(review): markup reconstructed here too (see note above the header HTML).
    gr.HTML("""
        <div style="text-align: center; color: #888;">
            <p>Click the microphone to start speaking. Your speech will be transcribed in real-time.</p>
            <p>Model: nvidia/nemotron-speech-streaming-en-0.6b</p>
        </div>
    """)

    # Events
    demo.load(fn=log_connection)
    audio.stream(
        fn=transcribe,
        inputs=[audio, state],
        outputs=[state, transcript_display],
        show_progress="hidden",
        trigger_mode="always_last",
    )
    reset_btn.click(fn=clear_session, outputs=[state, transcript_display])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)