Spaces:

harphool17
/

Parakeet-ASR-Competition-Winner

Runtime error

App Files Files Community

Parakeet-ASR-Competition-Winner / app.py

harphool17

Update app.py

099407f verified about 1 month ago

raw

history blame contribute delete

5.54 kB

	import gradio as gr
	import nemo.collections.asr as nemo_asr
	import time
	from huggingface_hub import hf_hub_download
	import librosa
	import soundfile as sf

	# ─────────────────────────────────────────────
	# 1. MODEL LOADING (Runs once when server starts)
	# ─────────────────────────────────────────────
	print("Downloading your Full Custom Model from the Hub...")
	# Fetch the packaged .nemo checkpoint from the Hub repository; the value
	# returned by hf_hub_download is handed straight to restore_from below.
	custom_model_path = hf_hub_download(
	    filename="ASR-Adapter.nemo",
	    repo_id="harphool17/parakeet-asr-adapter",
	)

	print("Booting up the model engine...")
	# Rebuild the RNNT-BPE model from the downloaded archive, then switch it to
	# evaluation mode before any request is served.
	model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(
	    restore_path=custom_model_path,
	)
	model.eval()

	print("✅ Custom Parakeet Engine Online! Server Ready.")

	# ─────────────────────────────────────────────
	# 2. INFERENCE FUNCTION
	# ─────────────────────────────────────────────
	def transcribe_audio(file_upload, mic_upload):
	    """Transcribe one audio clip with the module-level ``model``.

	    Args:
	        file_upload: File path coming from the "Upload File" input, or
	            ``None`` when nothing was uploaded.
	        mic_upload: File path coming from the "Record Microphone" input, or
	            ``None``. Only used when ``file_upload`` is ``None``.

	    Returns:
	        A ``(text, timing)`` tuple of two strings. On success ``text`` is the
	        transcription and ``timing`` is the elapsed wall-clock time formatted
	        as ``"<seconds> seconds"``. When no audio is supplied a prompt message
	        and ``"0.00s"`` are returned. Any failure is reported as
	        ``("An error occurred: ...", "Error")`` instead of raising, so the UI
	        always receives two strings.
	    """
	    # Imported here so the function stays self-contained; these names are not
	    # part of the file's top-level import block.
	    import os
	    import tempfile

	    # Prefer the uploaded file; fall back to the microphone recording.
	    audio_filepath = file_upload if file_upload is not None else mic_upload
	    if audio_filepath is None:
	        return "Please upload or record an audio file.", "0.00s"

	    clean_audio_path = None
	    try:
	        start_time = time.time()

	        # Normalise the input to mono, 16 kHz before it reaches the model.
	        y, sr = librosa.load(audio_filepath, sr=16000, mono=True)

	        # Write the normalised audio to a per-request temporary file. A fixed
	        # file name would be shared by overlapping requests, letting one
	        # request overwrite the audio another is about to transcribe.
	        fd, clean_audio_path = tempfile.mkstemp(suffix=".wav")
	        os.close(fd)  # soundfile reopens the path itself
	        sf.write(clean_audio_path, y, sr)

	        # Transcribe the normalised file, not the raw upload.
	        transcription = model.transcribe([clean_audio_path])

	        # The result may be a tuple whose first element is the list of
	        # results, or the list itself; unwrap to the list either way.
	        if isinstance(transcription, tuple):
	            transcription = transcription[0]
	        if len(transcription) == 0:
	            # Report an empty result explicitly instead of letting an
	            # IndexError surface as "list index out of range".
	            return "An error occurred: the model returned no transcription", "Error"
	        raw_result = transcription[0]

	        # Entries may be plain strings or objects carrying a ``text``
	        # attribute (the original code refers to these as Hypothesis objects).
	        if hasattr(raw_result, 'text'):
	            result_text = raw_result.text
	        else:
	            result_text = str(raw_result)

	        process_time = time.time() - start_time
	        return result_text, f"{process_time:.2f} seconds"

	    except Exception as e:
	        # UI boundary: show the failure in the output box rather than raising.
	        return f"An error occurred: {str(e)}", "Error"

	    finally:
	        # Always remove the temporary file so repeated requests do not leave
	        # audio files behind; cleanup is best-effort.
	        if clean_audio_path is not None:
	            try:
	                os.remove(clean_audio_path)
	            except OSError:
	                pass

	# ─────────────────────────────────────────────
	# 3. THE "PRO" DASHBOARD UI
	# ─────────────────────────────────────────────
	# Dashboard theme: Gradio's Soft theme with custom hues and a font list of
	# the "Inter" Google font followed by the generic "sans-serif" family.
	theme = gr.themes.Soft(
	    font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
	    neutral_hue="slate",
	    secondary_hue="blue",
	    primary_hue="indigo",
	)

	# Builds the whole page: header, a two-column row (inputs | outputs), footer,
	# and the click handler that connects the button to transcribe_audio.
	with gr.Blocks(theme=theme, title="Parakeet ASR") as demo:

	    # ── HEADER ──
	    # NOTE(review): the asterisks in the last header line look unbalanced —
	    # confirm the intended Markdown emphasis.
	    gr.Markdown(
	        """
	# 🎙️ Next-Gen Speech Recognition
	### Built with NVIDIA Parakeet & Custom Fine-Tuning
	This model was fine-tuned offline to achieve a highly competitive 0.29 Word Error Rate* on a rigorous test dataset.*
	"""
	    )

	    # ── MAIN LAYOUT (Two Columns) ──
	    with gr.Row():

	        # LEFT COLUMN: Inputs
	        with gr.Column(scale=1):
	            gr.Markdown("### 1. Input Audio")

	            # One tab per audio source; both components yield a file path.
	            with gr.Tabs():
	                with gr.TabItem("Upload File"):
	                    audio_upload = gr.Audio(sources=["upload"], type="filepath", label="Audio File")
	                with gr.TabItem("Record Microphone"):
	                    audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak into Mic")

	            submit_btn = gr.Button("🚀 Transcribe Audio", variant="primary", size="lg")
	            # Clears both audio inputs; the output boxes are not attached.
	            clear_btn = gr.ClearButton([audio_upload, audio_mic])

	        # RIGHT COLUMN: Outputs
	        with gr.Column(scale=1):
	            gr.Markdown("### 2. Transcription Result")
	            output_text = gr.Textbox(
	                label="Transcribed Text",
	                lines=8,
	                placeholder="Your transcription will appear here..."
	            )

	            with gr.Row():
	                metrics = gr.Textbox(label="Processing Time", value="0.00s", interactive=False)

	    # ── FOOTER ──
	    gr.Markdown("---")
	    # The backslashes are doubled so the string still contains a literal "\|"
	    # without relying on an invalid escape sequence in a non-raw literal.
	    gr.Markdown(
	        """
	System Specs: `Parakeet-tdt-0.6b-v2` Base \\| `Custom LoRA Adapter` \\| `Greedy Decoding`
	"""
	    )

	    # ── EVENT WIRING ──
	    # A single click handler receives both audio inputs; transcribe_audio
	    # decides which one to use and fills both output boxes.
	    submit_btn.click(
	        fn=transcribe_audio,
	        inputs=[audio_upload, audio_mic],
	        outputs=[output_text, metrics]
	    )

	# ─────────────────────────────────────────────
	# 4. LAUNCH
	# ─────────────────────────────────────────────
	# Start the Gradio app only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()