File size: 5,535 Bytes
528af16
 
 
 
099407f
 
528af16
 
4648c99
528af16
61b8b5c
4648c99
61b8b5c
528af16
61b8b5c
4648c99
61b8b5c
528af16
61b8b5c
 
 
528af16
4648c99
528af16
4648c99
 
 
 
528af16
 
 
 
 
 
099407f
 
 
528af16
099407f
 
 
 
 
 
 
 
 
528af16
099407f
 
 
 
 
 
528af16
099407f
528af16
 
 
 
 
 
 
 
 
 
4648c99
528af16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4648c99
528af16
 
 
 
 
 
 
 
 
 
 
 
099407f
528af16
 
4648c99
528af16
 
 
4648c99
 
 
528af16
f735115
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import tempfile
import time

import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import soundfile as sf
from huggingface_hub import hf_hub_download

# ─────────────────────────────────────────────
#  1. MODEL LOADING (Runs once when server starts)
# ─────────────────────────────────────────────
# Hub coordinates of the packaged .nemo checkpoint. Kept as named constants so
# the model can be swapped without hunting through the loading code.
MODEL_REPO_ID = "harphool17/parakeet-asr-adapter"
MODEL_FILENAME = "ASR-Adapter.nemo"

print("Downloading your Full Custom Model from the Hub...")
# Fetch the checkpoint from the Hub model repository; the call hands back the
# path that is passed to restore_from() below.
custom_model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

print("Booting up the model engine...")
# Unpack the .nemo archive and rebuild the RNNT-BPE model stored inside it.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(custom_model_path)
# This server only runs inference, so switch the model to evaluation mode.
model.eval()

print("✅ Custom Parakeet Engine Online! Server Ready.")

# ─────────────────────────────────────────────
#  2. INFERENCE FUNCTION
# ─────────────────────────────────────────────
def _extract_text(transcription):
    """Return the plain text of the first result of a ``model.transcribe`` call."""
    # Some model versions return a tuple whose first element is the result
    # list; others return the list directly.
    if isinstance(transcription, tuple):
        first_result = transcription[0][0]
    else:
        first_result = transcription[0]

    # Results may be Hypothesis-like objects carrying `.text`, or plain strings.
    if hasattr(first_result, "text"):
        return first_result.text
    return str(first_result)


def transcribe_audio(file_upload, mic_upload):
    """Transcribe one audio clip with the module-level ``model``.

    The clip is re-read as 16 kHz mono and written to a private temporary WAV
    file, and that sanitized file (not the raw upload) is handed to the model.
    Each call uses its own temporary file, so concurrent requests cannot
    overwrite each other's audio, and the file is removed before returning.

    Args:
        file_upload: Path of the uploaded audio file, or ``None``. Takes
            priority when both inputs are provided.
        mic_upload: Path of the microphone recording, or ``None``. Used only
            when ``file_upload`` is ``None``.

    Returns:
        A ``(text, elapsed)`` tuple of strings. With no input, ``text`` is a
        prompt to provide audio and ``elapsed`` is ``"0.00s"``. On failure,
        ``text`` is ``"An error occurred: <message>"`` and ``elapsed`` is
        ``"Error"``; exceptions are reported this way rather than raised.
    """
    # Pick whichever tab actually has audio in it.
    audio_filepath = file_upload if file_upload is not None else mic_upload

    if audio_filepath is None:
        return "Please upload or record an audio file.", "0.00s"

    clean_audio_path = None
    try:
        # perf_counter is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        # --- AUDIO SANITIZER (fixes the stereo/shape crash) ---
        # Force the audio to mono (1 channel) at 16,000 Hz.
        y, sr = librosa.load(audio_filepath, sr=16000, mono=True)

        # Write the clean audio to a unique temp file instead of a shared
        # fixed name, which concurrent requests would clobber.
        fd, clean_audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # soundfile reopens the file by path
        sf.write(clean_audio_path, y, sr)

        # --- RUN INFERENCE ---
        # Pass the CLEAN file to the model, not the raw upload.
        transcription = model.transcribe([clean_audio_path])
        result_text = _extract_text(transcription)

        process_time = time.perf_counter() - start_time
        return result_text, f"{process_time:.2f} seconds"

    except Exception as e:
        # UI boundary: show the failure in the output box instead of letting
        # the request crash.
        return f"An error occurred: {str(e)}", "Error"

    finally:
        # Best-effort cleanup so temp files do not accumulate on the server.
        if clean_audio_path is not None:
            try:
                os.remove(clean_audio_path)
            except OSError:
                pass

# ─────────────────────────────────────────────
#  3. THE "PRO" DASHBOARD UI
# ─────────────────────────────────────────────
# Visual theme shared by the whole dashboard.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Inter"), "sans-serif"]
)

# Build the dashboard: a header, a two-column body (inputs on the left,
# results on the right), a footer, and one click handler for transcription.
with gr.Blocks(theme=theme, title="Parakeet ASR") as demo:

    # ── HEADER ──
    gr.Markdown(
        """
        # 🎙️ Next-Gen Speech Recognition
        ### Built with NVIDIA Parakeet & Custom Fine-Tuning
        *This model was fine-tuned offline to achieve a highly competitive **0.29 Word Error Rate** on a rigorous test dataset.*
        """
    )

    # ── MAIN LAYOUT (Two Columns) ──
    with gr.Row():

        # LEFT COLUMN: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### 1. Input Audio")

            # Both audio widgets use type="filepath", so the handler receives
            # file paths (or None for an empty tab).
            with gr.Tabs():
                with gr.TabItem("Upload File"):
                    audio_upload = gr.Audio(sources=["upload"], type="filepath", label="Audio File")
                with gr.TabItem("Record Microphone"):
                    audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak into Mic")

            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
            # Resets both audio inputs.
            clear_btn = gr.ClearButton([audio_upload, audio_mic])

        # RIGHT COLUMN: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 2. Transcription Result")
            output_text = gr.Textbox(
                label="Transcribed Text",
                lines=8,
                placeholder="Your transcription will appear here..."
            )

            with gr.Row():
                # Read-only field showing how long the last request took.
                metrics = gr.Textbox(label="Processing Time", value="0.00s", interactive=False)

    # ── FOOTER ──
    gr.Markdown("---")
    gr.Markdown(
        """
        **System Specs:** `Parakeet-tdt-0.6b-v2` Base | `Custom LoRA Adapter` | `Greedy Decoding`
        """
    )

    # ── EVENT WIRING ──
    # Single click event that checks both inputs simultaneously to stop the ghost-click bug
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_upload, audio_mic],
        outputs=[output_text, metrics]
    )

# ─────────────────────────────────────────────
#  4. LAUNCH
# ─────────────────────────────────────────────
# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()