Spaces:

harphool17
/

Parakeet-ASR-Competition-Winner

Runtime error

File size: 5,535 Bytes

import os
import tempfile
import time

import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import soundfile as sf
from huggingface_hub import hf_hub_download

# ─────────────────────────────────────────────
#  1. MODEL LOADING (Runs once when server starts)
# ─────────────────────────────────────────────
# Location of the packaged .nemo checkpoint on the Hugging Face Hub.
_MODEL_REPO_ID = "harphool17/parakeet-asr-adapter"
_MODEL_FILENAME = "ASR-Adapter.nemo"

# Step 1: fetch the checkpoint file; hf_hub_download returns its local path.
print("Downloading your Full Custom Model from the Hub...")
custom_model_path = hf_hub_download(
    filename=_MODEL_FILENAME,
    repo_id=_MODEL_REPO_ID,
)

# Step 2: restore the RNNT-BPE model from that file and switch it to eval
# mode so it is ready for inference requests.
print("Booting up the model engine...")
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(custom_model_path)
model.eval()

print("✅ Custom Parakeet Engine Online! Server Ready.")

# ─────────────────────────────────────────────
#  2. INFERENCE FUNCTION
# ─────────────────────────────────────────────
def transcribe_audio(file_upload, mic_upload):
    """Transcribe one audio clip and report how long processing took.

    The uploaded file takes priority; the microphone recording is used only
    when no file was uploaded. The clip is converted to 16 kHz mono and
    written to a unique temporary WAV file, which is passed to the
    module-level ``model`` and deleted again before returning.

    Args:
        file_upload: Filesystem path of the uploaded audio, or ``None``.
        mic_upload: Filesystem path of the microphone recording, or ``None``.

    Returns:
        A ``(text, timing)`` tuple of strings. On success ``text`` is the
        transcription and ``timing`` looks like ``"1.23 seconds"``. When no
        audio was supplied, a prompt message and ``"0.00s"`` are returned.
        On any failure ``text`` describes the error and ``timing`` is
        ``"Error"``; exceptions are never propagated to the caller.
    """
    # Pick whichever input actually has audio in it (upload wins).
    audio_filepath = file_upload if file_upload is not None else mic_upload

    if audio_filepath is None:
        return "Please upload or record an audio file.", "0.00s"

    clean_audio_path = None
    try:
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the system clock is adjusted mid-request.
        start_time = time.perf_counter()

        # --- AUDIO SANITIZER ---
        # Force the audio to mono (1 channel) at 16,000 Hz.
        y, sr = librosa.load(audio_filepath, sr=16000, mono=True)

        # Write the sanitized audio to a unique per-request file. A fixed
        # filename would let concurrent requests overwrite each other's
        # audio and return the wrong user's transcription.
        fd, clean_audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # soundfile reopens the path itself
        sf.write(clean_audio_path, y, sr)

        # --- RUN INFERENCE ---
        # Pass the sanitized file to the model, not the raw upload.
        transcription = model.transcribe([clean_audio_path])

        # The result may be a tuple whose first element is the list of
        # hypotheses, or the list itself; take the first hypothesis.
        if isinstance(transcription, tuple):
            raw_result = transcription[0][0]
        else:
            raw_result = transcription[0]

        # A hypothesis may be an object exposing ``.text`` or a plain string.
        if hasattr(raw_result, 'text'):
            result_text = raw_result.text
        else:
            result_text = str(raw_result)

        process_time = time.perf_counter() - start_time
        time_str = f"{process_time:.2f} seconds"

        return result_text, time_str

    except Exception as e:
        # UI boundary: surface the failure in the output box instead of
        # raising out of the event handler.
        return f"An error occurred: {str(e)}", "Error"

    finally:
        # Always remove the temporary file so repeated requests do not
        # accumulate files on disk.
        if clean_audio_path is not None:
            try:
                os.remove(clean_audio_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the transcription result or the original error.
                pass

# ─────────────────────────────────────────────
#  3. THE "PRO" DASHBOARD UI
# ─────────────────────────────────────────────
# Typography: prefer the Inter Google font, falling back to a generic sans-serif.
_dashboard_fonts = [gr.themes.GoogleFont("Inter"), "sans-serif"]

# Soft base theme with the dashboard's colour hues and font stack applied.
theme = gr.themes.Soft(
    font=_dashboard_fonts,
    neutral_hue="slate",
    secondary_hue="blue",
    primary_hue="indigo",
)

with gr.Blocks(title="Parakeet ASR", theme=theme) as demo:

    # Page header: title, tagline and the headline accuracy statement.
    gr.Markdown(
        """
        # 🎙️ Next-Gen Speech Recognition
        ### Built with NVIDIA Parakeet & Custom Fine-Tuning
        *This model was fine-tuned offline to achieve a highly competitive **0.29 Word Error Rate** on a rigorous test dataset.*
        """
    )

    # Body: two equally weighted columns, inputs on the left, results on the right.
    with gr.Row():

        # Left column: audio sources and the action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Input Audio")

            # One tab per audio source; both deliver a file path to the handler.
            with gr.Tabs():
                with gr.TabItem("Upload File"):
                    audio_upload = gr.Audio(
                        label="Audio File",
                        type="filepath",
                        sources=["upload"],
                    )
                with gr.TabItem("Record Microphone"):
                    audio_mic = gr.Audio(
                        label="Speak into Mic",
                        type="filepath",
                        sources=["microphone"],
                    )

            submit_btn = gr.Button(
                "🚀 Transcribe Audio",
                size="lg",
                variant="primary",
            )
            # Resets both audio inputs.
            clear_btn = gr.ClearButton(components=[audio_upload, audio_mic])

        # Right column: transcription text plus the timing readout.
        with gr.Column(scale=1):
            gr.Markdown("### 2. Transcription Result")
            output_text = gr.Textbox(
                placeholder="Your transcription will appear here...",
                lines=8,
                label="Transcribed Text",
            )

            with gr.Row():
                metrics = gr.Textbox(
                    interactive=False,
                    value="0.00s",
                    label="Processing Time",
                )

    # Footer: horizontal rule followed by the system description line.
    gr.Markdown("---")
    gr.Markdown(
        """
        **System Specs:** `Parakeet-tdt-0.6b-v2` Base | `Custom LoRA Adapter` | `Greedy Decoding`
        """
    )

    # Event wiring: a single click handler receives both audio inputs at once;
    # transcribe_audio decides which one to use and fills both output boxes.
    submit_btn.click(
        transcribe_audio,
        inputs=[audio_upload, audio_mic],
        outputs=[output_text, metrics],
    )

# ─────────────────────────────────────────────
#  4. LAUNCH
# ─────────────────────────────────────────────
# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch a server.
if __name__ == "__main__":
    demo.launch()