# Sign2Voice / app.py
# Hosting-page metadata captured with this file (not part of the program):
# uploaded by lilblueyes, commit dfed012 "Update requirements", 11.3 kB.
from __future__ import annotations
from pathlib import Path
import gradio as gr
from signspeak.llm import generate_subtitle_and_instruction
from signspeak.live_debug import process_live_debug_frame
from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
from signspeak.tts import generate_tts
# Directory that contains this file; bundled assets are looked up relative to
# it rather than to the current working directory.
APP_DIR = Path(__file__).resolve().parents[0]

# Contents of assets/styles.css, read once at import time and later passed to
# gr.Blocks(css=...). A missing file raises here, before the UI is built.
CUSTOM_CSS = APP_DIR.joinpath("assets", "styles.css").read_text(encoding="utf-8")
# Options shown in the "Language" dropdowns; the selected value is forwarded
# unchanged to the TTS step.
LANGUAGE_CHOICES = [
    "Auto",
    "Chinese", "English", "Japanese", "Korean",
    "German", "French", "Russian",
    "Portuguese", "Spanish", "Italian",
]

# Options shown in the "Speaker" dropdowns; the selected value is forwarded
# unchanged to the TTS step.
SPEAKER_CHOICES = [
    "Vivian", "Serena", "Uncle_Fu",
    "Dylan", "Eric", "Ryan",
    "Aiden", "Ono_Anna", "Sohee",
]
def run_asl_brick(video_file: str | None, gloss_override: str | None = None) -> tuple[str, dict, str, str]:
    """Run the ASL analysis step for a Gradio event handler.

    Forwards both arguments to ``run_asl_video`` and hands its result back
    unchanged. The click handlers bind the four returned values, in order, to
    the intent JSON, structured output, summary and debug video components.

    Args:
        video_file: Value of the video input component, or ``None``.
        gloss_override: Optional text from the debug override box.

    Raises:
        gr.Error: If ``run_asl_video`` raises any ``Exception``; the original
            exception is chained as the cause.
    """
    try:
        asl_outputs = run_asl_video(video_file, gloss_override)
    except Exception as exc:
        # Re-raise as gr.Error so the failure is reported through Gradio.
        raise gr.Error(f"ASL pipeline failed: {type(exc).__name__}: {exc}") from exc
    else:
        return asl_outputs
def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
    """Run the subtitle / voice-instruction generation step for a Gradio handler.

    Forwards ``intent_json_text`` to ``generate_subtitle_and_instruction`` and
    hands its result back unchanged. The click handlers bind the three
    returned values, in order, to the subtitle, voice instruction and LLM
    structured-output components.

    Args:
        intent_json_text: Text of the "Intent JSON" code component.

    Raises:
        gr.Error: If the generation call raises any ``Exception``; the
            original exception is chained as the cause.
    """
    try:
        llm_outputs = generate_subtitle_and_instruction(intent_json_text)
    except Exception as exc:
        # Re-raise as gr.Error so the failure is reported through Gradio.
        raise gr.Error(f"llama.cpp generation failed: {type(exc).__name__}: {exc}") from exc
    else:
        return llm_outputs
def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
    """Run the speech-synthesis step for a Gradio event handler.

    Rejects the "no words detected" placeholder subtitle up front, then
    forwards all four arguments to ``generate_tts`` and returns its result
    (bound to an audio component declared with ``type="filepath"``).

    Args:
        text: Subtitle text to speak.
        language: Value selected in the "Language" dropdown.
        speaker: Value selected in the "Speaker" dropdown.
        instruction: Voice instruction text.

    Raises:
        gr.Error: If ``text`` is the placeholder subtitle (guidance message
            only), or if ``generate_tts`` raises any ``Exception`` (prefixed
            with "Qwen3-TTS generation failed", original exception chained).
    """
    # Validate before entering the try block: this is an input problem, not a
    # TTS failure, so it must not be relabelled by the handler below as
    # "Qwen3-TTS generation failed: ValueError: ...".
    if (text or "").strip() == "No ASL words were detected yet.":
        raise gr.Error(
            "Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override."
        )

    try:
        return generate_tts(text, language, speaker, instruction)
    except Exception as exc:
        # Re-raise as gr.Error so the failure is reported through Gradio.
        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
def build_video_input(label: str) -> gr.Video:
    """Create a ``gr.Video`` input with the settings shared by every capture widget.

    Args:
        label: Caption shown on the component.

    Returns:
        A ``gr.Video`` with ``sources=["upload", "webcam"]`` and
        ``format="mp4"``.
    """
    shared_options = {
        "sources": ["upload", "webcam"],
        "format": "mp4",
    }
    return gr.Video(label=label, **shared_options)
# Declare the whole UI inside one Blocks context bound to ``demo``: a hero
# banner, three tabs, a footer note, and the event wiring for every button.
# NOTE(review): this file was reviewed from a copy with flattened indentation,
# so the container nesting below is a reconstruction - confirm it against the
# repository, in particular whether "1 Analyze ASL" sits inside the
# "control-row" Row and whether "Live status" sits inside the overlay Column.
with gr.Blocks(title="Sign2Voice", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo:
    # Static hero banner (brand, tagline, pipeline stage labels).
    gr.HTML(
        """
<main id="hero">
<div class="brand-lockup">
<span class="brand-mark" aria-hidden="true"></span>
<div>
<p class="eyebrow">Local-first sign-to-speech console</p>
<h1>Sign2Voice</h1>
</div>
</div>
<div class="hero-grid">
<div>
<p class="hero-copy">
Translate camera or uploaded signing clips into natural speech with
visible intent, expression, and confidence diagnostics.
</p>
<div class="pipeline-rail" aria-label="Pipeline stages">
<span>Capture</span>
<span>Signs</span>
<span>Intent</span>
<span>Voice</span>
</div>
</div>
<div class="system-strip" aria-label="System capabilities">
<span>Camera ready</span>
<span>llama.cpp local</span>
<span>Expressive voice</span>
</div>
</div>
</main>
"""
    )
    with gr.Tabs():
        # Tab 1: the three steps chained through shared components - the ASL
        # step fills the intent JSON, which feeds the LLM step, whose subtitle
        # and instruction feed the TTS step (see the wiring further down).
        with gr.Tab("Run demo"):
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
                    gr.HTML('<div class="section-kicker">01 Capture</div>')
                    full_video_input = build_video_input("Video or camera capture")
                    with gr.Accordion("Advanced debug controls", open=False):
                        full_gloss_override_input = gr.Textbox(
                            label="Manual gloss override",
                            value="",
                            lines=1,
                            info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.",
                        )
                    with gr.Row(elem_classes=["control-row"]):
                        full_language_input = gr.Dropdown(
                            label="Language",
                            choices=LANGUAGE_CHOICES,
                            value="English",
                        )
                        full_speaker_input = gr.Dropdown(
                            label="Speaker",
                            choices=SPEAKER_CHOICES,
                            value="Ryan",
                        )
                    run_demo_asl_button = gr.Button("1 Analyze ASL", elem_id="run_demo_asl")
                with gr.Column(scale=5, elem_classes=["panel-shell", "output-panel"]):
                    gr.HTML('<div class="section-kicker">02 Live debug</div>')
                    full_debug_video_output = gr.Video(label="Debug overlay playback")
                    full_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    full_intent_output = gr.Code(label="Intent JSON", language="json", lines=8)
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">03 llama.cpp</div>')
                    run_demo_llm_button = gr.Button("2 Generate subtitle", elem_id="run_demo_llm")
                    full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">04 Qwen3-TTS</div>')
                    run_demo_tts_button = gr.Button("3 Generate speech", elem_id="run_demo_tts")
                    full_audio_output = gr.Audio(label="Generated audio", type="filepath")
            with gr.Accordion("Pipeline diagnostics", open=False):
                with gr.Row(elem_classes=["diagnostic-grid"]):
                    full_asl_json_output = gr.JSON(label="ASL structured output")
                    full_llm_json_output = gr.JSON(label="LLM structured output")
        # Tab 2: the same three steps exposed individually. The LLM step takes
        # its own editable intent JSON, pre-filled from DEFAULT_INTENT.
        with gr.Tab("Inspect bricks"):
            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">ASL brick</div>')
                    asl_video_input = build_video_input("Video or camera capture")
                    asl_gloss_override_input = gr.Textbox(
                        label="Debug gloss override",
                        value="",
                        lines=1,
                    )
                    run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
                    asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
                with gr.Column(scale=1):
                    asl_debug_video_output = gr.Video(label="Debug overlay playback")
                    asl_json_output = gr.JSON(label="ASL structured output")
            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">llama.cpp brick</div>')
                    intent_input = gr.Code(
                        label="Intent JSON",
                        value=json_text(DEFAULT_INTENT),
                        language="json",
                        lines=14,
                    )
                    run_llm_button = gr.Button(
                        "Generate subtitle",
                        elem_id="run_llm",
                    )
                with gr.Column(scale=1):
                    subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    instruction_output = gr.Textbox(label="Voice instruction", lines=3)
                    llm_json_output = gr.JSON(label="LLM structured output")
            # The TTS brick has no text box of its own: its handler reads the
            # subtitle and instruction boxes of the llama.cpp brick above.
            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Qwen3-TTS brick</div>')
                    tts_language_input = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES,
                        value="English",
                    )
                    tts_speaker_input = gr.Dropdown(
                        label="Speaker",
                        choices=SPEAKER_CHOICES,
                        value="Ryan",
                    )
                    run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated audio", type="filepath")
        # Tab 3: streaming webcam frames in, overlay image and status text out.
        with gr.Tab("Live camera debug"):
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Camera stream</div>')
                    live_camera_input = gr.Image(
                        label="Live camera frame",
                        sources=["webcam"],
                        streaming=True,
                        type="numpy",
                    )
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Live overlay</div>')
                    live_camera_output = gr.Image(label="Overlay preview", type="numpy")
                    live_camera_status = gr.Textbox(label="Live status", lines=3)
    # Footer note rendered below the tabs.
    gr.HTML(
        """
<p class="footer-note">
Build Small badges targeted: Off the Grid, Llama Champion, Off-Brand.
</p>
"""
    )

    # --- Event wiring ------------------------------------------------------
    # Listeners are registered inside the Blocks context. Each handler's
    # return values are assigned to ``outputs`` positionally, so the order of
    # every ``outputs`` list has to match the handler's return tuple.

    # "Run demo" tab: ASL -> LLM -> TTS, chained through the output components.
    run_demo_asl_button.click(
        fn=run_asl_brick,
        inputs=[full_video_input, full_gloss_override_input],
        outputs=[full_intent_output, full_asl_json_output, full_summary_output, full_debug_video_output],
    )
    run_demo_llm_button.click(
        fn=run_llm_brick,
        inputs=[full_intent_output],
        outputs=[full_subtitle_output, full_instruction_output, full_llm_json_output],
    )
    run_demo_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            full_subtitle_output,
            full_language_input,
            full_speaker_input,
            full_instruction_output,
        ],
        outputs=[full_audio_output],
    )

    # "Inspect bricks" tab: same handlers, bound to that tab's components.
    run_asl_button.click(
        fn=run_asl_brick,
        inputs=[asl_video_input, asl_gloss_override_input],
        outputs=[asl_intent_output, asl_json_output, asl_summary_output, asl_debug_video_output],
    )
    run_llm_button.click(
        fn=run_llm_brick,
        inputs=[intent_input],
        outputs=[subtitle_output, instruction_output, llm_json_output],
    )
    run_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            subtitle_output,
            tts_language_input,
            tts_speaker_input,
            instruction_output,
        ],
        outputs=[audio_output],
    )

    # "Live camera debug" tab: each streamed frame goes through
    # process_live_debug_frame, which supplies the overlay image and status.
    live_camera_input.stream(
        fn=process_live_debug_frame,
        inputs=[live_camera_input],
        outputs=[live_camera_output, live_camera_status],
    )

# Start the app only when executed as a script; importing this module just
# builds ``demo``.
if __name__ == "__main__":
    demo.queue().launch()