"""Gradio front-end that transcribes uploaded or recorded audio with a NeMo ASR model."""

import contextlib
import os
import tempfile
import time

import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import soundfile as sf
from huggingface_hub import hf_hub_download

# Sample rate (Hz) the audio is resampled to before it is handed to the model.
TARGET_SAMPLE_RATE = 16000

# ─────────────────────────────────────────────
# 1. MODEL LOADING (Runs once when server starts)
# ─────────────────────────────────────────────
print("Downloading your Full Custom Model from the Hub...")
# Fetch the packaged .nemo checkpoint from the Hugging Face Hub.
custom_model_path = hf_hub_download(
    repo_id="harphool17/parakeet-asr-adapter",
    filename="ASR-Adapter.nemo",
)

print("Booting up the model engine...")
# Restore the model from the .nemo archive and switch it to inference mode.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(custom_model_path)
model.eval()
print("✅ Custom Parakeet Engine Online! Server Ready.")


# ─────────────────────────────────────────────
# 2. INFERENCE FUNCTION
# ─────────────────────────────────────────────
def _extract_text(transcription):
    """Return the text of the first result produced by ``model.transcribe``.

    The result may be a tuple whose first element is the list of results, or
    the list of results itself. Each result may be an object exposing a
    ``.text`` attribute or something that is simply converted with ``str``.
    """
    if isinstance(transcription, tuple):
        raw_result = transcription[0][0]
    else:
        raw_result = transcription[0]

    if hasattr(raw_result, "text"):
        return raw_result.text
    return str(raw_result)


def transcribe_audio(file_upload, mic_upload):
    """Transcribe the uploaded file, or the microphone recording if no file is given.

    Args:
        file_upload: Path of the uploaded audio file, or ``None``.
        mic_upload: Path of the microphone recording, or ``None``.

    Returns:
        A ``(text, timing)`` tuple of strings for the two output textboxes.
        On failure ``text`` carries the error message and ``timing`` is
        ``"Error"``; no exception propagates to the UI.
    """
    # The uploaded file wins when both inputs hold audio.
    audio_filepath = file_upload if file_upload is not None else mic_upload

    if audio_filepath is None:
        return "Please upload or record an audio file.", "0.00s"

    clean_audio_path = None
    try:
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        # --- AUDIO SANITIZER ---
        # Force the audio to mono at TARGET_SAMPLE_RATE before inference.
        y, sr = librosa.load(audio_filepath, sr=TARGET_SAMPLE_RATE, mono=True)

        # Each request gets its own temporary file: a single shared file name
        # would let concurrent requests overwrite each other's audio.
        fd, clean_audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # soundfile reopens the path itself
        sf.write(clean_audio_path, y, sr)

        # --- RUN INFERENCE ---
        # Pass the sanitized file to the model, not the raw upload.
        transcription = model.transcribe([clean_audio_path])
        result_text = _extract_text(transcription)

        process_time = time.perf_counter() - start_time
        time_str = f"{process_time:.2f} seconds"

        return result_text, time_str

    except Exception as e:
        # UI boundary: report the failure in the output box instead of crashing.
        return f"An error occurred: {str(e)}", "Error"

    finally:
        # Always remove the temporary file so repeated requests do not leak disk space.
        if clean_audio_path is not None:
            with contextlib.suppress(OSError):
                os.remove(clean_audio_path)


# ─────────────────────────────────────────────
# 3. THE "PRO" DASHBOARD UI
# ─────────────────────────────────────────────
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Inter"), "sans-serif"]
)

with gr.Blocks(theme=theme, title="Parakeet ASR") as demo:

    # ── HEADER ──
    gr.Markdown(
        """
        # 🎙️ Next-Gen Speech Recognition
        ### Built with NVIDIA Parakeet & Custom Fine-Tuning
        *This model was fine-tuned offline to achieve a highly competitive **0.29 Word Error Rate** on a rigorous test dataset.*
        """
    )

    # ── MAIN LAYOUT (Two Columns) ──
    with gr.Row():

        # LEFT COLUMN: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### 1. Input Audio")

            with gr.Tabs():
                with gr.TabItem("Upload File"):
                    audio_upload = gr.Audio(sources=["upload"], type="filepath", label="Audio File")
                with gr.TabItem("Record Microphone"):
                    audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak into Mic")

            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
            clear_btn = gr.ClearButton([audio_upload, audio_mic])

        # RIGHT COLUMN: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 2. Transcription Result")

            output_text = gr.Textbox(
                label="Transcribed Text",
                lines=8,
                placeholder="Your transcription will appear here..."
            )

            with gr.Row():
                metrics = gr.Textbox(label="Processing Time", value="0.00s", interactive=False)

    # ── FOOTER ──
    gr.Markdown("---")
    gr.Markdown(
        """
        **System Specs:** `Parakeet-tdt-0.6b-v2` Base | `Custom LoRA Adapter` | `Greedy Decoding`
        """
    )

    # ── EVENT WIRING ──
    # One click handler receives both audio inputs; transcribe_audio picks the one that is set.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_upload, audio_mic],
        outputs=[output_text, metrics]
    )

# ─────────────────────────────────────────────
# 4. LAUNCH
# ─────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()