# MoonshineASR / app.py
# Author: D3vShoaib β€” commit 9e1dec8:
# "Enhance UI layout by adjusting header and disclaimer styles, and add
# microphone input option for audio upload"
import gradio as gr
import os
import time
import threading
import traceback
import queue
from moonshine_voice import (
Transcriber,
load_wav_file,
TranscriptEventListener,
get_model_for_language,
string_to_model_arch,
ModelArch,
)
# Model catalogs: internal model id -> human-readable label shown in the UI.
STREAMING_MODELS = {
    "medium-streaming": "Medium Streaming (245M params)",
    "small-streaming": "Small Streaming (123M params)",
    "tiny-streaming": "Tiny Streaming (34M params)",
}
NON_STREAMING_MODELS = {
    "base": "Base (61M params)",
    "tiny": "Tiny (39M params)",
}


def _as_choices(catalog):
    """Turn a {model_id: label} catalog into (label, model_id) dropdown pairs."""
    return [(label, model_id) for model_id, label in catalog.items()]


STREAMING_CHOICES = _as_choices(STREAMING_MODELS)
NON_STREAMING_CHOICES = _as_choices(NON_STREAMING_MODELS)
ALL_CHOICES = STREAMING_CHOICES + NON_STREAMING_CHOICES
ALL_MODELS = [*STREAMING_MODELS, *NON_STREAMING_MODELS]
# Preload every English model once at startup so individual requests never
# pay model-load latency.
_BANNER = "=" * 60
print(_BANNER)
print(" Preloading all English Moonshine models...")
print(_BANNER)
transcriber_cache: dict[str, Transcriber] = {}
for _model in ALL_MODELS:
    print(f" -> Loading '{_model}'...")
    _arch = string_to_model_arch(_model)
    _path, _resolved_arch = get_model_for_language("en", _arch)
    transcriber_cache[_model] = Transcriber(model_path=_path, model_arch=_resolved_arch)
    print(f" OK '{_model}' ready")
print(_BANNER)
print(" All models loaded!")
print(_BANNER)
# Paths to bundled demo assets (example WAV and custom theme JSON).
ASSETS_DIR = os.path.join(os.path.dirname(__file__), "assets")
EXAMPLE_AUDIO = os.path.join(ASSETS_DIR, "Aiden.wav")
# ---------------------------------------------------------------------------
# Queue system β€” serializes transcription requests (critical for 2 vCPU)
# ---------------------------------------------------------------------------
# Each waiting request enqueues a threading.Event "ticket"; the finishing job
# pops and sets the next ticket to hand over the single execution slot.
transcription_queue: queue.Queue = queue.Queue()
stop_event = threading.Event()  # raised to cancel current job
active_transcriber: Transcriber | None = None  # so stop can call .stop()
active_transcriber_lock = threading.Lock()  # guards active_transcriber
queue_position_lock = threading.Lock()  # guards current_queue_size and job_active
current_queue_size = 0  # approximate position indicator
job_active = False  # is a transcription currently running?
def request_generation_stop():
    """Signal a request to stop the current generation."""
    stop_event.set()
    with active_transcriber_lock:
        transcriber = active_transcriber
        if transcriber is not None:
            try:
                transcriber.stop()
            except Exception:
                # Best effort: the transcriber may already be stopped/closing.
                pass
    # Disable the Stop button while the cancellation is being processed.
    return gr.update(interactive=False)
def update_model_choices(mode):
    """Return ONLY the relevant models for the selected mode."""
    streaming_selected = mode == "Streaming"
    choices = STREAMING_CHOICES if streaming_selected else NON_STREAMING_CHOICES
    default = "tiny-streaming" if streaming_selected else "tiny"
    return gr.Dropdown(choices=choices, value=default)
def transcribe(audio_path, mode, model_name):
    """Run transcription with queue system and stop support.

    Generator: yields progressive status / transcript strings for the output
    textbox. Requests are serialized through a ticket queue so only one
    transcription runs at a time; `stop_event` cancels a running or queued job.

    Args:
        audio_path: Filesystem path to the uploaded WAV file, or None.
        mode: "Streaming" for chunked live transcription; anything else
            (the UI sends "Non-Streaming") transcribes in one shot.
        model_name: Key into `transcriber_cache` (preloaded at startup).

    Raises:
        gr.Error: On missing/unreadable audio, unknown model, or any
            unexpected failure during transcription.
    """
    global active_transcriber, current_queue_size, job_active
    if audio_path is None:
        raise gr.Error("Please upload an audio file.")
    try:
        audio_data, sample_rate = load_wav_file(audio_path)
    except Exception as e:
        raise gr.Error(f"Error loading audio: {e}")
    transcriber = transcriber_cache.get(model_name)
    if transcriber is None:
        raise gr.Error(f"Model '{model_name}' not loaded.")
    # --- Queue gate: wait for our turn ---
    # Our ticket is set either immediately (no job running) or later by the
    # finishing job's `finally` block.
    ticket = threading.Event()
    with queue_position_lock:
        current_queue_size += 1
        pos = current_queue_size
        if not job_active:
            # No one is running β€” we go immediately
            job_active = True
            ticket.set()
        else:
            # Someone is running β€” queue up
            transcription_queue.put(ticket)
    if pos > 1 and not ticket.is_set():
        yield f"⏳ Queued β€” position {pos - 1} in line. Please wait..."
    # Block until it's our turn
    while not ticket.wait(timeout=0.5):
        if stop_event.is_set():
            with queue_position_lock:
                current_queue_size = max(0, current_queue_size - 1)
            # NOTE(review): this job's ticket is left inside
            # `transcription_queue`; the running job's `finally` may later pop
            # and set it with no one waiting, leaving `job_active` True with
            # no job running β€” looks like a potential stall. Confirm.
            yield "πŸ›‘ Cancelled while queued."
            return
    # Reset stop event for this job
    stop_event.clear()
    with active_transcriber_lock:
        # Publish so request_generation_stop() can call .stop() on us.
        active_transcriber = transcriber
    is_streaming = mode == "Streaming"
    try:
        if not is_streaming:
            # One-shot path: single blocking call, stop checked before/after.
            yield "⏳ Transcribing..."
            if stop_event.is_set():
                yield "πŸ›‘ Stopped."
                return
            transcript = transcriber.transcribe_without_streaming(
                audio_data, sample_rate=sample_rate, flags=0
            )
            if stop_event.is_set():
                yield "πŸ›‘ Stopped."
                return
            # Format each transcript line with its time span.
            lines = []
            for line in transcript.lines:
                end = line.start_time + line.duration
                lines.append(f"[{line.start_time:.2f}s β†’ {end:.2f}s] {line.text}")
            yield "\n".join(lines) if lines else "No speech detected."
        else:
            yield "⏳ Streaming..."
            transcriber.start()
            completed_lines: list[str] = []
            current_partial = ""

            # Listener collects finished lines and tracks the in-progress one;
            # callbacks close over the two locals above.
            class _Listener(TranscriptEventListener):
                def on_line_started(self, event):
                    nonlocal current_partial
                    current_partial = (
                        f"⏺ {event.line.start_time:.2f}s: {event.line.text}"
                    )

                def on_line_text_changed(self, event):
                    nonlocal current_partial
                    current_partial = (
                        f"⏺ {event.line.start_time:.2f}s: {event.line.text}"
                    )

                def on_line_completed(self, event):
                    nonlocal current_partial
                    completed_lines.append(
                        f"βœ” {event.line.start_time:.2f}s: {event.line.text}"
                    )
                    current_partial = ""

            listener = _Listener()
            # Transcribers are cached and shared across requests β€” drop any
            # listeners left over from a previous run before attaching ours.
            transcriber.remove_all_listeners()
            transcriber.add_listener(listener)
            # Feed audio in 0.25 s chunks, yielding the accumulated display
            # after each chunk so the textbox updates look live.
            chunk_duration = 0.25
            chunk_size = int(chunk_duration * sample_rate)
            for i in range(0, len(audio_data), chunk_size):
                if stop_event.is_set():
                    display = "\n".join(completed_lines)
                    display += "\nπŸ›‘ Stopped."
                    yield display
                    try:
                        transcriber.stop()
                    except Exception:
                        pass
                    break
                chunk = audio_data[i : i + chunk_size]
                transcriber.add_audio(chunk, sample_rate)
                time.sleep(0.05)
                display = "\n".join(completed_lines)
                if current_partial:
                    display += "\n" + current_partial
                yield display.strip() or "⏳ Streaming..."
            else:
                # Normal completion (loop didn't break)
                transcriber.stop()
                time.sleep(0.5)  # let trailing listener events land first
                display = "\n".join(completed_lines)
                if current_partial:
                    display += "\n" + current_partial
                yield display.strip() or "No speech detected."
    except gr.Error:
        raise
    except Exception as e:
        full_error = traceback.format_exc()
        print(f"Unexpected error: {full_error}")
        raise gr.Error(f"An unexpected error occurred: {str(e)}")
    finally:
        # Unpublish ourselves, then hand the execution slot to the next
        # waiter (or mark the system idle if the queue is empty).
        with active_transcriber_lock:
            active_transcriber = None
        with queue_position_lock:
            current_queue_size = max(0, current_queue_size - 1)
        # Release next job in the queue
        try:
            next_ticket = transcription_queue.get_nowait()
            next_ticket.set()
        except queue.Empty:
            with queue_position_lock:
                job_active = False
# Load custom theme with fallback to Gradio's built-in Soft theme.
theme_path = os.path.join(ASSETS_DIR, "theme.json")
try:
    theme = gr.Theme.load(theme_path)
except Exception as exc:
    print(f"Warning: Could not load custom theme: {exc}. Using default Soft theme.")
    theme = gr.themes.Soft()
css = """
footer {visibility: hidden}
.gradio-container {
max-width: 100% !important;
padding: 0 !important;
}
.header-section {
text-align: left;
margin-bottom: 0;
}
#app-header {
margin: 0 !important;
padding: 0 !important;
}
#app-header > div {
margin: 0 !important;
padding: 0 !important;
}
.logo-container {
display: flex;
justify-content: flex-start;
align-items: center;
gap: 8px;
margin-bottom: 0;
}
.logo-img {
height: 34px;
border-radius: 8px;
}
.main-title {
color: #2c8afa;
font-weight: 800;
font-size: 1.7rem;
margin: 0;
}
.description {
max-width: 900px;
margin: 0;
font-size: 0.9rem;
line-height: 1.35;
color: #4b5563;
}
.links-row {
display: flex;
flex-wrap: wrap;
justify-content: flex-start;
gap: 8px;
margin: 0;
font-size: 0.85rem;
}
.links-row a {
color: #2c8afa;
text-decoration: none;
padding: 3px 12px;
border: 1px solid #2c8afa;
border-radius: 15px;
transition: all 0.2s;
white-space: nowrap;
}
.links-row a:hover {
background-color: #2c8afa;
color: white;
}
.disclaimer {
text-align: center;
font-size: 0.8rem;
color: #9ca3af;
margin-top: 30px;
padding: 20px;
border: 2px dashed #4b5563;
border-radius: 12px;
}
#app-disclaimer {
margin: 0 !important;
padding: 0 !important;
}
#app-disclaimer > div {
margin: 0 !important;
}
#app-disclaimer .html-container {
margin: 0 !important;
padding: 0 !important;
}
#app-disclaimer .html-container .disclaimer {
margin-left: 0 !important;
}
.social-handles {
display: flex;
justify-content: center;
gap: 20px;
margin: 15px 0;
}
.social-icon {
width: 28px;
height: 28px;
transition: all 0.3s ease;
}
.social-icon:hover {
transform: scale(1.1) translateY(-3px);
}
#transcription-mode .wrap {
display: flex !important;
flex-direction: row !important;
width: 100% !important;
}
#transcription-mode .wrap label {
flex: 1 !important;
justify-content: center !important;
text-align: center !important;
}
"""
# ---------------------------------------------------------------------------
# UI definition β€” the Blocks context builds the page top-to-bottom.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=theme) as demo:
    # --- Header: logo, title, blurb, GitHub link ---
    with gr.Column(elem_classes="header-section"):
        gr.HTML("""
            <div style="gap: 12px; display: flex; flex-direction: column; align-items: flex-start;">
                <div class="logo-container">
                    <img src="https://raw.githubusercontent.com/moonshine-ai/moonshine/main/images/logo.png" class="logo-img" alt="Moonshine Web Logo">
                    <h1 class="main-title">Moonshine ASR</h1>
                </div>
                <div class="description">
                    <b>Fast, accurate, on-device speech recognition.</b><br>
                    Moonshine delivers real-time transcription on edge devices &mdash; from laptops to Raspberry Pi.
                </div>
                <div class="links-row">
                    <a href="https://github.com/moonshine-ai/moonshine" target="_blank">⭐ Star on GitHub</a>
                </div>
            </div>
        """, elem_id="app-header")
    # --- Main layout: inputs/controls on the left, transcript on the right ---
    with gr.Row():
        with gr.Column(scale=1):
            audio_file = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Audio (.wav)",
            )
            mode_radio = gr.Radio(
                choices=["Streaming", "Non-Streaming"],
                value="Streaming",
                label="Transcription Mode",
                elem_id="transcription-mode",
            )
            model_dropdown = gr.Dropdown(
                choices=ALL_CHOICES,
                value="tiny-streaming",
                label="Select from Moonshine Models",
            )
            # Narrow the model dropdown to the models valid for the mode.
            mode_radio.change(
                fn=update_model_choices,
                inputs=mode_radio,
                outputs=model_dropdown,
            )
            with gr.Row():
                clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                transcribe_btn = gr.Button("⚑ Transcribe", variant="primary")
                # Stop starts hidden; shown only while a job is running.
                stop_btn = gr.Button("πŸ”΄ Stop", variant="stop", visible=False)
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Transcription Output", lines=6)
    # One clickable example row per preloaded model.
    gr.Examples(
        examples=[
            [EXAMPLE_AUDIO, "Streaming", "medium-streaming"],
            [EXAMPLE_AUDIO, "Streaming", "small-streaming"],
            [EXAMPLE_AUDIO, "Streaming", "tiny-streaming"],
            [EXAMPLE_AUDIO, "Non-Streaming", "base"],
            [EXAMPLE_AUDIO, "Non-Streaming", "tiny"],
        ],
        inputs=[audio_file, mode_radio, model_dropdown],
    )
    # --- Footer: author social links + non-affiliation disclaimer ---
    gr.HTML("""
        <div class="disclaimer">
            <div class="social-handles">
                <a href="https://github.com/D3vShoaib" target="_blank" style="color: inherit;" aria-label="GitHub">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"/></svg>
                </a>
                <a href="https://linkedin.com/in/D3vShoaib" target="_blank" style="color: inherit;" aria-label="LinkedIn">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M19 0h-14c-2.761 0-5 2.239-5 5v14c0 2.761 2.239 5 5 5h14c2.762 0 5-2.239 5-5v-14c0-2.761-2.238-5-5-5zm-11 19h-3v-11h3v11zm-1.5-12.268c-.966 0-1.75-.79-1.75-1.764s.784-1.764 1.75-1.764 1.75.79 1.75 1.764-.783 1.764-1.75 1.764zm13.5 12.268h-3v-5.604c0-3.368-4-3.113-4 0v5.604h-3v-11h3v1.765c1.396-2.586 7-2.777 7 2.476v6.759z"/></svg>
                </a>
                <a href="https://twitter.com/D3vShoaib" target="_blank" style="color: inherit;" aria-label="Twitter">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M13.682 10.621L20.216 3h-1.549l-5.674 6.624-4.53-6.624H2.433l6.85 10.007-6.85 7.993h1.549l6.014-7.022 4.811 7.022h6.03L13.68 10.62zm-2.091 2.441l-.683-.98L5.342 4.144H7.72l4.475 6.417.683.981 5.8 8.32h-2.378l-4.71-6.8z"/></svg>
                </a>
                <a href="https://instagram.com/d3vshoaib" target="_blank" style="color: inherit;" aria-label="Instagram">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M12 2.163c3.204 0 3.584.012 4.85.07 3.252.148 4.771 1.691 4.919 4.919.058 1.265.069 1.645.069 4.849 0 3.205-.012 3.584-.069 4.849-.149 3.225-1.664 4.771-4.919 4.919-1.266.058-1.644.07-4.85.07-3.204 0-3.584-.012-4.849-.07-3.26-.149-4.771-1.699-4.919-4.92-.058-1.265-.07-1.644-.07-4.849 0-3.204.013-3.583.07-4.849.149-3.227 1.664-4.771 4.919-4.919 1.266-.057 1.645-.069 4.849-.069zm0-2.163c-3.259 0-3.667.014-4.947.072-4.358.2-6.78 2.618-6.98 6.98-.059 1.281-.073 1.689-.073 4.948 0 3.259.014 3.668.072 4.948.2 4.358 2.618 6.78 6.98 6.98 1.281.058 1.689.072 4.948.072 3.259 0 3.668-.014 4.948-.072 4.354-.2 6.782-2.618 6.979-6.98.059-1.28.073-1.689.073-4.948 0-3.259-.014-3.667-.072-4.947-.196-4.354-2.617-6.78-6.979-6.98-1.281-.059-1.69-.073-4.949-.073zm0 5.838c-3.403 0-6.162 2.759-6.162 6.162s2.759 6.163 6.162 6.163 6.162-2.759 6.162-6.163c0-3.403-2.759-6.162-6.162-6.162zm0 10.162c-2.209 0-4-1.79-4-4 0-2.209 1.791-4 4-4s4 1.791 4 4c0 2.21-1.791 4-4 4zm6.406-11.845c-.796 0-1.441.645-1.441 1.44s.645 1.44 1.441 1.44c.795 0 1.439-.645 1.439-1.44s-.644-1.44-1.439-1.44z"/></svg>
                </a>
            </div>
            <p>Built with ❀️ by <a href="https://github.com/D3vShoaib" style="color: #2c8afa; text-decoration: none; font-weight: 500;">D3vShoaib</a></p>
            <p>⚠️ I am not associated with Moonshine and this is only for demonstration purposes.</p>
        </div>
    """, elem_id="app-disclaimer")

    # UI state management functions
    def switch_to_generating_state():
        """Hide the Transcribe button and show an enabled Stop button."""
        return (
            gr.update(visible=False),  # Hide transcribe button
            gr.update(visible=True, interactive=True),  # Show stop button
        )

    def switch_to_idle_state():
        """Show the Transcribe button and hide the Stop button."""
        return (
            gr.update(visible=True),  # Show transcribe button
            gr.update(visible=False),  # Hide stop button
        )

    # Event handlers
    # Chain: show Stop -> run transcribe (generator streams into the
    # textbox) -> restore idle buttons when it finishes.
    transcribe_event = (
        transcribe_btn.click(
            fn=switch_to_generating_state, outputs=[transcribe_btn, stop_btn]
        )
        .then(
            fn=transcribe,
            inputs=[audio_file, mode_radio, model_dropdown],
            outputs=output_text,
        )
        .then(fn=switch_to_idle_state, outputs=[transcribe_btn, stop_btn])
    )
    # Stop button handler β€” cancels the Gradio event AND sets the Python stop flag
    stop_btn.click(
        fn=request_generation_stop, outputs=[stop_btn], cancels=[transcribe_event]
    ).then(fn=switch_to_idle_state, outputs=[transcribe_btn, stop_btn])

    # Clear button handler
    def perform_clear_action():
        """Reset all inputs and the output to their startup defaults."""
        return (
            None,  # audio_file
            "Streaming",  # mode_radio
            "tiny-streaming",  # model_dropdown
            "",  # output_text
        )

    clear_btn.click(
        fn=perform_clear_action,
        outputs=[audio_file, mode_radio, model_dropdown, output_text],
    )
if __name__ == "__main__":
    # default_concurrency_limit=1 matches the single-slot job queue above:
    # only one transcription event runs at a time on the 2-vCPU host.
    # FIX: `theme` and `css` are gr.Blocks() constructor arguments and are
    # already applied there; Blocks.launch() has no such parameters, so
    # passing them raises TypeError on current Gradio releases.
    demo.queue(default_concurrency_limit=1).launch(allowed_paths=[ASSETS_DIR])