Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import nemo.collections.asr as nemo_asr | |
| import time | |
| from huggingface_hub import hf_hub_download | |
| import librosa | |
| import soundfile as sf | |
# ─────────────────────────────────────────────
# 1. MODEL LOADING (Runs once when server starts)
# ─────────────────────────────────────────────
# Fetch the packaged .nemo checkpoint from the Hugging Face Hub. The value
# returned by hf_hub_download is handed straight to restore_from below.
print("Downloading your Full Custom Model from the Hub...")
custom_model_path = hf_hub_download(
    filename="ASR-Adapter.nemo",
    repo_id="harphool17/parakeet-asr-adapter",
)

# Rebuild the RNNT-BPE model from the checkpoint and put it in eval mode.
# This happens at import time, so every request reuses the same `model`.
print("Booting up the model engine...")
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(custom_model_path)
model.eval()
print("β Custom Parakeet Engine Online! Server Ready.")
# ─────────────────────────────────────────────
# 2. INFERENCE FUNCTION
# ─────────────────────────────────────────────
def transcribe_audio(file_upload, mic_upload):
    """Transcribe one audio clip with the module-level ``model``.

    The uploaded file takes precedence; the microphone recording is used
    only when no file was uploaded.

    Args:
        file_upload: Path of the uploaded audio file, or ``None``.
        mic_upload: Path of the microphone recording, or ``None``.

    Returns:
        A ``(text, timing)`` tuple of strings. On success ``text`` is the
        transcription and ``timing`` is the elapsed time formatted as
        ``"<seconds> seconds"``. When neither input is set, a prompt message
        and ``"0.00s"`` are returned. On any failure ``text`` carries the
        error message and ``timing`` is ``"Error"``; the function does not
        raise, so the UI always receives two displayable strings.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # Pick whichever input actually holds audio (upload wins over mic).
    audio_filepath = file_upload if file_upload is not None else mic_upload
    if audio_filepath is None:
        return "Please upload or record an audio file.", "0.00s"

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        start_time = time.perf_counter()

        # Force the audio to mono at 16 kHz before handing it to the model.
        y, sr = librosa.load(audio_filepath, sr=16000, mono=True)

        # Write the cleaned audio into a per-call scratch directory. A fixed
        # file name in the working directory would be overwritten by
        # concurrent requests and would never be cleaned up.
        with tempfile.TemporaryDirectory() as scratch_dir:
            clean_audio_path = os.path.join(scratch_dir, "clean_temp.wav")
            sf.write(clean_audio_path, y, sr)
            # Transcribe the cleaned file, not the raw upload.
            transcription = model.transcribe([clean_audio_path])

        # The result may be a tuple of lists or a plain list.
        # NOTE(review): presumably this depends on the NeMo version - confirm.
        if isinstance(transcription, tuple):
            raw_result = transcription[0][0]
        else:
            raw_result = transcription[0]

        # Entries may be objects exposing `.text` or plain strings.
        if hasattr(raw_result, "text"):
            result_text = raw_result.text
        else:
            result_text = str(raw_result)

        process_time = time.perf_counter() - start_time
        return result_text, f"{process_time:.2f} seconds"
    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the request crash.
        return f"An error occurred: {str(e)}", "Error"
# ─────────────────────────────────────────────
# 3. THE "PRO" DASHBOARD UI
# ─────────────────────────────────────────────
# Dashboard theme: Gradio's Soft theme with custom hues and the Inter font,
# falling back to a generic sans-serif.
_dashboard_fonts = [gr.themes.GoogleFont("Inter"), "sans-serif"]
theme = gr.themes.Soft(
    font=_dashboard_fonts,
    neutral_hue="slate",
    secondary_hue="blue",
    primary_hue="indigo",
)
with gr.Blocks(theme=theme, title="Parakeet ASR") as demo:
    # ── HEADER ──
    gr.Markdown(
        "\n"
        "# ποΈ Next-Gen Speech Recognition\n"
        "### Built with NVIDIA Parakeet & Custom Fine-Tuning\n"
        "*This model was fine-tuned offline to achieve a highly competitive "
        "**0.29 Word Error Rate** on a rigorous test dataset.*\n"
    )

    # ── MAIN LAYOUT (two columns) ──
    with gr.Row():
        # Left column: audio inputs and action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Input Audio")
            with gr.Tabs():
                with gr.TabItem("Upload File"):
                    audio_upload = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Audio File",
                    )
                with gr.TabItem("Record Microphone"):
                    audio_mic = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Speak into Mic",
                    )
            submit_btn = gr.Button(
                "π Transcribe Audio",
                variant="primary",
                size="lg",
            )
            # Resets both audio inputs.
            clear_btn = gr.ClearButton([audio_upload, audio_mic])

        # Right column: transcription text and timing.
        with gr.Column(scale=1):
            gr.Markdown("### 2. Transcription Result")
            output_text = gr.Textbox(
                label="Transcribed Text",
                lines=8,
                placeholder="Your transcription will appear here...",
            )
            with gr.Row():
                metrics = gr.Textbox(
                    label="Processing Time",
                    value="0.00s",
                    interactive=False,
                )

    # ── FOOTER ──
    gr.Markdown("---")
    gr.Markdown(
        "\n"
        "**System Specs:** `Parakeet-tdt-0.6b-v2` Base | "
        "`Custom LoRA Adapter` | `Greedy Decoding`\n"
    )

    # ── EVENT WIRING ──
    # A single click handler receives both audio inputs; transcribe_audio
    # decides which one to use.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_upload, audio_mic],
        outputs=[output_text, metrics],
    )
# ─────────────────────────────────────────────
# 4. LAUNCH
# ─────────────────────────────────────────────
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()