# harphool17's picture
# Update app.py
# 099407f verified
import os
import tempfile
import time

import gradio as gr
import librosa
import nemo.collections.asr as nemo_asr
import soundfile as sf
from huggingface_hub import hf_hub_download
# ─────────────────────────────────────────────
# 1. MODEL LOADING (Runs once when server starts)
# ─────────────────────────────────────────────
print("Downloading your Full Custom Model from the Hub...")
# The packaged .nemo checkpoint (about 2.5 GB per the original note) is hosted
# in a Hub model repository; hf_hub_download returns the path of the local copy.
custom_model_path = hf_hub_download(
    repo_id="harphool17/parakeet-asr-adapter",
    filename="ASR-Adapter.nemo",
)

print("Booting up the model engine...")
# Restore the model from the .nemo archive and put it in evaluation mode.
# This runs at import time, so the model is loaded once and shared by every
# call to the inference function.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(custom_model_path)
model.eval()
print("✅ Custom Parakeet Engine Online! Server Ready.")
# ─────────────────────────────────────────────
# 2. INFERENCE FUNCTION
# ─────────────────────────────────────────────
def _extract_text(transcription):
    """Return the text of the first result in a ``model.transcribe`` return value."""
    # A tuple return carries the result list in its first element; otherwise
    # the return value is the result list itself.
    if isinstance(transcription, tuple):
        first_result = transcription[0][0]
    else:
        first_result = transcription[0]
    # Results may be objects exposing ``.text`` or plain values; handle both.
    if hasattr(first_result, "text"):
        return first_result.text
    return str(first_result)


def transcribe_audio(file_upload, mic_upload):
    """Transcribe one audio clip and report how long the processing took.

    The uploaded file takes precedence; the microphone recording is used only
    when no file was uploaded.

    Args:
        file_upload: Path of the uploaded audio file, or ``None``.
        mic_upload: Path of the microphone recording, or ``None``.

    Returns:
        A ``(text, elapsed)`` tuple of strings. ``text`` is the transcription,
        a prompt when neither input holds audio, or an error message when
        processing fails. ``elapsed`` is the processing time formatted as
        ``"<seconds> seconds"``, ``"0.00s"`` when there was nothing to
        transcribe, or ``"Error"`` on failure.
    """
    audio_filepath = file_upload if file_upload is not None else mic_upload
    if audio_filepath is None:
        return "Please upload or record an audio file.", "0.00s"

    clean_audio_path = None
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Normalise the input to mono, 16 kHz before handing it to the model.
        samples, sample_rate = librosa.load(audio_filepath, sr=16000, mono=True)

        # Each request gets its own scratch file: a fixed file name would be
        # overwritten when two requests are processed at the same time.
        fd, clean_audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # soundfile writes to the file by path
        sf.write(clean_audio_path, samples, sample_rate)

        # Pass the normalised file to the model, not the raw upload.
        transcription = model.transcribe([clean_audio_path])
        result_text = _extract_text(transcription)

        elapsed = time.perf_counter() - start_time
        return result_text, f"{elapsed:.2f} seconds"
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"An error occurred: {e}", "Error"
    finally:
        # Remove the scratch file on success and on failure alike.
        if clean_audio_path is not None:
            try:
                os.remove(clean_audio_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the result.
                pass
# ─────────────────────────────────────────────
# 3. THE "PRO" DASHBOARD UI
# ─────────────────────────────────────────────
# Font stack for the UI: the "Inter" Google Font first, then "sans-serif".
theme_fonts = [gr.themes.GoogleFont("Inter"), "sans-serif"]

# Soft base theme with an indigo / blue / slate colour palette.
theme = gr.themes.Soft(
    font=theme_fonts,
    neutral_hue="slate",
    primary_hue="indigo",
    secondary_hue="blue",
)
# Page header text. Kept flush-left at module level so the Markdown source
# carries no leading indentation.
HEADER_MARKDOWN = """
# 🎙️ Next-Gen Speech Recognition
### Built with NVIDIA Parakeet & Custom Fine-Tuning
*This model was fine-tuned offline to achieve a highly competitive **0.29 Word Error Rate** on a rigorous test dataset.*
"""

# Footer text listing the system specs.
FOOTER_MARKDOWN = """
**System Specs:** `Parakeet-tdt-0.6b-v2` Base | `Custom LoRA Adapter` | `Greedy Decoding`
"""

with gr.Blocks(theme=theme, title="Parakeet ASR") as demo:
    # ── HEADER ──
    gr.Markdown(HEADER_MARKDOWN)

    # ── MAIN LAYOUT (Two Columns) ──
    with gr.Row():
        # LEFT COLUMN: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### 1. Input Audio")
            # One tab per audio source; both components hand over a file path.
            with gr.Tabs():
                with gr.TabItem("Upload File"):
                    audio_upload = gr.Audio(
                        sources=["upload"],
                        type="filepath",
                        label="Audio File",
                    )
                with gr.TabItem("Record Microphone"):
                    audio_mic = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Speak into Mic",
                    )
            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
            # Resets both audio inputs.
            clear_btn = gr.ClearButton([audio_upload, audio_mic])

        # RIGHT COLUMN: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 2. Transcription Result")
            output_text = gr.Textbox(
                label="Transcribed Text",
                lines=8,
                placeholder="Your transcription will appear here...",
            )
            with gr.Row():
                metrics = gr.Textbox(
                    label="Processing Time",
                    value="0.00s",
                    interactive=False,
                )

    # ── FOOTER ──
    gr.Markdown("---")
    gr.Markdown(FOOTER_MARKDOWN)

    # ── EVENT WIRING ──
    # A single click handler receives both audio inputs and picks whichever
    # one holds audio (the original note says this avoids a "ghost-click" bug).
    submit_btn.click(
        fn=transcribe_audio,
        inputs=[audio_upload, audio_mic],
        outputs=[output_text, metrics],
    )
# ─────────────────────────────────────────────
# 4. LAUNCH
# ─────────────────────────────────────────────
def main():
    """Start the Gradio app."""
    demo.launch()


# Launch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()