# MoonshineASR / app.py
# Author: D3vShoaib β€” commit 9e1dec8:
# "Enhance UI layout by adjusting header and disclaimer styles, and add
# microphone input option for audio upload"
import gradio as gr
import os
import time
import threading
import traceback
import queue
from moonshine_voice import (
Transcriber,
load_wav_file,
TranscriptEventListener,
get_model_for_language,
string_to_model_arch,
ModelArch,
)
# Model catalogs: internal model id -> human-readable label shown in the UI.
STREAMING_MODELS = {
    "medium-streaming": "Medium Streaming (245M params)",
    "small-streaming": "Small Streaming (123M params)",
    "tiny-streaming": "Tiny Streaming (34M params)",
}
NON_STREAMING_MODELS = {
    "base": "Base (61M params)",
    "tiny": "Tiny (39M params)",
}


def _as_choices(catalog):
    """Turn a {model_id: label} catalog into (label, model_id) dropdown pairs."""
    return [(label, model_id) for model_id, label in catalog.items()]


STREAMING_CHOICES = _as_choices(STREAMING_MODELS)
NON_STREAMING_CHOICES = _as_choices(NON_STREAMING_MODELS)
ALL_CHOICES = STREAMING_CHOICES + NON_STREAMING_CHOICES
ALL_MODELS = [*STREAMING_MODELS, *NON_STREAMING_MODELS]
# Preload every English model once at startup so individual requests never
# pay model-load latency.
_BANNER = "=" * 60
print(_BANNER)
print(" Preloading all English Moonshine models...")
print(_BANNER)
transcriber_cache: dict[str, Transcriber] = {}
for _model in ALL_MODELS:
    print(f" -> Loading '{_model}'...")
    _arch = string_to_model_arch(_model)
    _path, _resolved_arch = get_model_for_language("en", _arch)
    transcriber_cache[_model] = Transcriber(model_path=_path, model_arch=_resolved_arch)
    print(f" OK '{_model}' ready")
print(_BANNER)
print(" All models loaded!")
print(_BANNER)
# Paths to bundled demo assets (example WAV and custom theme JSON).
ASSETS_DIR = os.path.join(os.path.dirname(__file__), "assets")
EXAMPLE_AUDIO = os.path.join(ASSETS_DIR, "Aiden.wav")
# ---------------------------------------------------------------------------
# Queue system β€” serializes transcription requests (critical for 2 vCPU)
# ---------------------------------------------------------------------------
# Each waiting request enqueues a threading.Event "ticket"; the finishing job
# pops and sets the next ticket to hand over the single execution slot.
transcription_queue: queue.Queue = queue.Queue()
stop_event = threading.Event()  # raised to cancel current job
active_transcriber: Transcriber | None = None  # so stop can call .stop()
active_transcriber_lock = threading.Lock()  # guards active_transcriber
queue_position_lock = threading.Lock()  # guards current_queue_size and job_active
current_queue_size = 0  # approximate position indicator
job_active = False  # is a transcription currently running?
def request_generation_stop():
    """Signal a request to stop the current generation."""
    stop_event.set()
    with active_transcriber_lock:
        transcriber = active_transcriber
        if transcriber is not None:
            try:
                transcriber.stop()
            except Exception:
                # Best effort: the transcriber may already be stopped/closing.
                pass
    # Disable the Stop button while the cancellation is being processed.
    return gr.update(interactive=False)
def update_model_choices(mode):
    """Return ONLY the relevant models for the selected mode."""
    streaming_selected = mode == "Streaming"
    choices = STREAMING_CHOICES if streaming_selected else NON_STREAMING_CHOICES
    default = "tiny-streaming" if streaming_selected else "tiny"
    return gr.Dropdown(choices=choices, value=default)
def transcribe(audio_path, mode, model_name):
    """Run transcription with queue system and stop support.

    Generator: yields progressive status / transcript strings for the output
    textbox. Requests are serialized through a ticket queue so only one
    transcription runs at a time; `stop_event` cancels a running or queued job.

    Args:
        audio_path: Filesystem path to the uploaded WAV file, or None.
        mode: "Streaming" for chunked live transcription; anything else
            (the UI sends "Non-Streaming") transcribes in one shot.
        model_name: Key into `transcriber_cache` (preloaded at startup).

    Raises:
        gr.Error: On missing/unreadable audio, unknown model, or any
            unexpected failure during transcription.
    """
    global active_transcriber, current_queue_size, job_active
    if audio_path is None:
        raise gr.Error("Please upload an audio file.")
    try:
        audio_data, sample_rate = load_wav_file(audio_path)
    except Exception as e:
        raise gr.Error(f"Error loading audio: {e}")
    transcriber = transcriber_cache.get(model_name)
    if transcriber is None:
        raise gr.Error(f"Model '{model_name}' not loaded.")
    # --- Queue gate: wait for our turn ---
    # Our ticket is set either immediately (no job running) or later by the
    # finishing job's `finally` block.
    ticket = threading.Event()
    with queue_position_lock:
        current_queue_size += 1
        pos = current_queue_size
        if not job_active:
            # No one is running β€” we go immediately
            job_active = True
            ticket.set()
        else:
            # Someone is running β€” queue up
            transcription_queue.put(ticket)
    if pos > 1 and not ticket.is_set():
        yield f"⏳ Queued β€” position {pos - 1} in line. Please wait..."
    # Block until it's our turn
    while not ticket.wait(timeout=0.5):
        if stop_event.is_set():
            with queue_position_lock:
                current_queue_size = max(0, current_queue_size - 1)
            # NOTE(review): this job's ticket is left inside
            # `transcription_queue`; the running job's `finally` may later pop
            # and set it with no one waiting, leaving `job_active` True with
            # no job running β€” looks like a potential stall. Confirm.
            yield "πŸ›‘ Cancelled while queued."
            return
    # Reset stop event for this job
    stop_event.clear()
    with active_transcriber_lock:
        # Publish so request_generation_stop() can call .stop() on us.
        active_transcriber = transcriber
    is_streaming = mode == "Streaming"
    try:
        if not is_streaming:
            # One-shot path: single blocking call, stop checked before/after.
            yield "⏳ Transcribing..."
            if stop_event.is_set():
                yield "πŸ›‘ Stopped."
                return
            transcript = transcriber.transcribe_without_streaming(
                audio_data, sample_rate=sample_rate, flags=0
            )
            if stop_event.is_set():
                yield "πŸ›‘ Stopped."
                return
            # Format each transcript line with its time span.
            lines = []
            for line in transcript.lines:
                end = line.start_time + line.duration
                lines.append(f"[{line.start_time:.2f}s β†’ {end:.2f}s] {line.text}")
            yield "\n".join(lines) if lines else "No speech detected."
        else:
            yield "⏳ Streaming..."
            transcriber.start()
            completed_lines: list[str] = []
            current_partial = ""

            # Listener collects finished lines and tracks the in-progress one;
            # callbacks close over the two locals above.
            class _Listener(TranscriptEventListener):
                def on_line_started(self, event):
                    nonlocal current_partial
                    current_partial = (
                        f"⏺ {event.line.start_time:.2f}s: {event.line.text}"
                    )

                def on_line_text_changed(self, event):
                    nonlocal current_partial
                    current_partial = (
                        f"⏺ {event.line.start_time:.2f}s: {event.line.text}"
                    )

                def on_line_completed(self, event):
                    nonlocal current_partial
                    completed_lines.append(
                        f"βœ” {event.line.start_time:.2f}s: {event.line.text}"
                    )
                    current_partial = ""

            listener = _Listener()
            # Transcribers are cached and shared across requests β€” drop any
            # listeners left over from a previous run before attaching ours.
            transcriber.remove_all_listeners()
            transcriber.add_listener(listener)
            # Feed audio in 0.25 s chunks, yielding the accumulated display
            # after each chunk so the textbox updates look live.
            chunk_duration = 0.25
            chunk_size = int(chunk_duration * sample_rate)
            for i in range(0, len(audio_data), chunk_size):
                if stop_event.is_set():
                    display = "\n".join(completed_lines)
                    display += "\nπŸ›‘ Stopped."
                    yield display
                    try:
                        transcriber.stop()
                    except Exception:
                        pass
                    break
                chunk = audio_data[i : i + chunk_size]
                transcriber.add_audio(chunk, sample_rate)
                time.sleep(0.05)
                display = "\n".join(completed_lines)
                if current_partial:
                    display += "\n" + current_partial
                yield display.strip() or "⏳ Streaming..."
            else:
                # Normal completion (loop didn't break)
                transcriber.stop()
                time.sleep(0.5)  # let trailing listener events land first
                display = "\n".join(completed_lines)
                if current_partial:
                    display += "\n" + current_partial
                yield display.strip() or "No speech detected."
    except gr.Error:
        raise
    except Exception as e:
        full_error = traceback.format_exc()
        print(f"Unexpected error: {full_error}")
        raise gr.Error(f"An unexpected error occurred: {str(e)}")
    finally:
        # Unpublish ourselves, then hand the execution slot to the next
        # waiter (or mark the system idle if the queue is empty).
        with active_transcriber_lock:
            active_transcriber = None
        with queue_position_lock:
            current_queue_size = max(0, current_queue_size - 1)
        # Release next job in the queue
        try:
            next_ticket = transcription_queue.get_nowait()
            next_ticket.set()
        except queue.Empty:
            with queue_position_lock:
                job_active = False
# Load custom theme with fallback to Gradio's built-in Soft theme.
theme_path = os.path.join(ASSETS_DIR, "theme.json")
try:
    theme = gr.Theme.load(theme_path)
except Exception as exc:
    print(f"Warning: Could not load custom theme: {exc}. Using default Soft theme.")
    theme = gr.themes.Soft()
css = """
footer {visibility: hidden}
.gradio-container {
max-width: 100% !important;
padding: 0 !important;
}
.header-section {
text-align: left;
margin-bottom: 0;
}
#app-header {
margin: 0 !important;
padding: 0 !important;
}
#app-header > div {
margin: 0 !important;
padding: 0 !important;
}
.logo-container {
display: flex;
justify-content: flex-start;
align-items: center;
gap: 8px;
margin-bottom: 0;
}
.logo-img {
height: 34px;
border-radius: 8px;
}
.main-title {
color: #2c8afa;
font-weight: 800;
font-size: 1.7rem;
margin: 0;
}
.description {
max-width: 900px;
margin: 0;
font-size: 0.9rem;
line-height: 1.35;
color: #4b5563;
}
.links-row {
display: flex;
flex-wrap: wrap;
justify-content: flex-start;
gap: 8px;
margin: 0;
font-size: 0.85rem;
}
.links-row a {
color: #2c8afa;
text-decoration: none;
padding: 3px 12px;
border: 1px solid #2c8afa;
border-radius: 15px;
transition: all 0.2s;
white-space: nowrap;
}
.links-row a:hover {
background-color: #2c8afa;
color: white;
}
.disclaimer {
text-align: center;
font-size: 0.8rem;
color: #9ca3af;
margin-top: 30px;
padding: 20px;
border: 2px dashed #4b5563;
border-radius: 12px;
}
#app-disclaimer {
margin: 0 !important;
padding: 0 !important;
}
#app-disclaimer > div {
margin: 0 !important;
}
#app-disclaimer .html-container {
margin: 0 !important;
padding: 0 !important;
}
#app-disclaimer .html-container .disclaimer {
margin-left: 0 !important;
}
.social-handles {
display: flex;
justify-content: center;
gap: 20px;
margin: 15px 0;
}
.social-icon {
width: 28px;
height: 28px;
transition: all 0.3s ease;
}
.social-icon:hover {
transform: scale(1.1) translateY(-3px);
}
#transcription-mode .wrap {
display: flex !important;
flex-direction: row !important;
width: 100% !important;
}
#transcription-mode .wrap label {
flex: 1 !important;
justify-content: center !important;
text-align: center !important;
}
"""
# ---------------------------------------------------------------------------
# UI definition β€” the Blocks context builds the page top-to-bottom.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=theme) as demo:
    # --- Header: logo, title, blurb, GitHub link ---
    with gr.Column(elem_classes="header-section"):
        gr.HTML("""
            <div style="gap: 12px; display: flex; flex-direction: column; align-items: flex-start;">
                <div class="logo-container">
                    <img src="https://raw.githubusercontent.com/moonshine-ai/moonshine/main/images/logo.png" class="logo-img" alt="Moonshine Web Logo">
                    <h1 class="main-title">Moonshine ASR</h1>
                </div>
                <div class="description">
                    <b>Fast, accurate, on-device speech recognition.</b><br>
                    Moonshine delivers real-time transcription on edge devices &mdash; from laptops to Raspberry Pi.
                </div>
                <div class="links-row">
                    <a href="https://github.com/moonshine-ai/moonshine" target="_blank">⭐ Star on GitHub</a>
                </div>
            </div>
        """, elem_id="app-header")
    # --- Main layout: inputs/controls on the left, transcript on the right ---
    with gr.Row():
        with gr.Column(scale=1):
            audio_file = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Audio (.wav)",
            )
            mode_radio = gr.Radio(
                choices=["Streaming", "Non-Streaming"],
                value="Streaming",
                label="Transcription Mode",
                elem_id="transcription-mode",
            )
            model_dropdown = gr.Dropdown(
                choices=ALL_CHOICES,
                value="tiny-streaming",
                label="Select from Moonshine Models",
            )
            # Narrow the model dropdown to the models valid for the mode.
            mode_radio.change(
                fn=update_model_choices,
                inputs=mode_radio,
                outputs=model_dropdown,
            )
            with gr.Row():
                clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
                transcribe_btn = gr.Button("⚑ Transcribe", variant="primary")
                # Stop starts hidden; shown only while a job is running.
                stop_btn = gr.Button("πŸ”΄ Stop", variant="stop", visible=False)
        with gr.Column(scale=1):
            output_text = gr.Textbox(label="Transcription Output", lines=6)
    # One clickable example row per preloaded model.
    gr.Examples(
        examples=[
            [EXAMPLE_AUDIO, "Streaming", "medium-streaming"],
            [EXAMPLE_AUDIO, "Streaming", "small-streaming"],
            [EXAMPLE_AUDIO, "Streaming", "tiny-streaming"],
            [EXAMPLE_AUDIO, "Non-Streaming", "base"],
            [EXAMPLE_AUDIO, "Non-Streaming", "tiny"],
        ],
        inputs=[audio_file, mode_radio, model_dropdown],
    )
    # --- Footer: author social links + non-affiliation disclaimer ---
    gr.HTML("""
        <div class="disclaimer">
            <div class="social-handles">
                <a href="https://github.com/D3vShoaib" target="_blank" style="color: inherit;" aria-label="GitHub">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"/></svg>
                </a>
                <a href="https://linkedin.com/in/D3vShoaib" target="_blank" style="color: inherit;" aria-label="LinkedIn">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M19 0h-14c-2.761 0-5 2.239-5 5v14c0 2.761 2.239 5 5 5h14c2.762 0 5-2.239 5-5v-14c0-2.761-2.238-5-5-5zm-11 19h-3v-11h3v11zm-1.5-12.268c-.966 0-1.75-.79-1.75-1.764s.784-1.764 1.75-1.764 1.75.79 1.75 1.764-.783 1.764-1.75 1.764zm13.5 12.268h-3v-5.604c0-3.368-4-3.113-4 0v5.604h-3v-11h3v1.765c1.396-2.586 7-2.777 7 2.476v6.759z"/></svg>
                </a>
                <a href="https://twitter.com/D3vShoaib" target="_blank" style="color: inherit;" aria-label="Twitter">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M13.682 10.621L20.216 3h-1.549l-5.674 6.624-4.53-6.624H2.433l6.85 10.007-6.85 7.993h1.549l6.014-7.022 4.811 7.022h6.03L13.68 10.62zm-2.091 2.441l-.683-.98L5.342 4.144H7.72l4.475 6.417.683.981 5.8 8.32h-2.378l-4.71-6.8z"/></svg>
                </a>
                <a href="https://instagram.com/d3vshoaib" target="_blank" style="color: inherit;" aria-label="Instagram">
                    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor"><path d="M12 2.163c3.204 0 3.584.012 4.85.07 3.252.148 4.771 1.691 4.919 4.919.058 1.265.069 1.645.069 4.849 0 3.205-.012 3.584-.069 4.849-.149 3.225-1.664 4.771-4.919 4.919-1.266.058-1.644.07-4.85.07-3.204 0-3.584-.012-4.849-.07-3.26-.149-4.771-1.699-4.919-4.92-.058-1.265-.07-1.644-.07-4.849 0-3.204.013-3.583.07-4.849.149-3.227 1.664-4.771 4.919-4.919 1.266-.057 1.645-.069 4.849-.069zm0-2.163c-3.259 0-3.667.014-4.947.072-4.358.2-6.78 2.618-6.98 6.98-.059 1.281-.073 1.689-.073 4.948 0 3.259.014 3.668.072 4.948.2 4.358 2.618 6.78 6.98 6.98 1.281.058 1.689.072 4.948.072 3.259 0 3.668-.014 4.948-.072 4.354-.2 6.782-2.618 6.979-6.98.059-1.28.073-1.689.073-4.948 0-3.259-.014-3.667-.072-4.947-.196-4.354-2.617-6.78-6.979-6.98-1.281-.059-1.69-.073-4.949-.073zm0 5.838c-3.403 0-6.162 2.759-6.162 6.162s2.759 6.163 6.162 6.163 6.162-2.759 6.162-6.163c0-3.403-2.759-6.162-6.162-6.162zm0 10.162c-2.209 0-4-1.79-4-4 0-2.209 1.791-4 4-4s4 1.791 4 4c0 2.21-1.791 4-4 4zm6.406-11.845c-.796 0-1.441.645-1.441 1.44s.645 1.44 1.441 1.44c.795 0 1.439-.645 1.439-1.44s-.644-1.44-1.439-1.44z"/></svg>
                </a>
            </div>
            <p>Built with ❀️ by <a href="https://github.com/D3vShoaib" style="color: #2c8afa; text-decoration: none; font-weight: 500;">D3vShoaib</a></p>
            <p>⚠️ I am not associated with Moonshine and this is only for demonstration purposes.</p>
        </div>
    """, elem_id="app-disclaimer")

    # UI state management functions
    def switch_to_generating_state():
        """Hide the Transcribe button and show an enabled Stop button."""
        return (
            gr.update(visible=False),  # Hide transcribe button
            gr.update(visible=True, interactive=True),  # Show stop button
        )

    def switch_to_idle_state():
        """Show the Transcribe button and hide the Stop button."""
        return (
            gr.update(visible=True),  # Show transcribe button
            gr.update(visible=False),  # Hide stop button
        )

    # Event handlers
    # Chain: show Stop -> run transcribe (generator streams into the
    # textbox) -> restore idle buttons when it finishes.
    transcribe_event = (
        transcribe_btn.click(
            fn=switch_to_generating_state, outputs=[transcribe_btn, stop_btn]
        )
        .then(
            fn=transcribe,
            inputs=[audio_file, mode_radio, model_dropdown],
            outputs=output_text,
        )
        .then(fn=switch_to_idle_state, outputs=[transcribe_btn, stop_btn])
    )
    # Stop button handler β€” cancels the Gradio event AND sets the Python stop flag
    stop_btn.click(
        fn=request_generation_stop, outputs=[stop_btn], cancels=[transcribe_event]
    ).then(fn=switch_to_idle_state, outputs=[transcribe_btn, stop_btn])

    # Clear button handler
    def perform_clear_action():
        """Reset all inputs and the output to their startup defaults."""
        return (
            None,  # audio_file
            "Streaming",  # mode_radio
            "tiny-streaming",  # model_dropdown
            "",  # output_text
        )

    clear_btn.click(
        fn=perform_clear_action,
        outputs=[audio_file, mode_radio, model_dropdown, output_text],
    )
if __name__ == "__main__":
    # default_concurrency_limit=1 matches the single-slot job queue above:
    # only one transcription event runs at a time on the 2-vCPU host.
    # FIX: `theme` and `css` are gr.Blocks() constructor arguments and are
    # already applied there; Blocks.launch() has no such parameters, so
    # passing them raises TypeError on current Gradio releases.
    demo.queue(default_concurrency_limit=1).launch(allowed_paths=[ASSETS_DIR])