"""Gradio front end for the Sign2Voice demo.

The UI exposes three pipeline stages ("bricks") -- ASL analysis, llama.cpp
subtitle generation and Qwen3-TTS speech synthesis -- both as a guided demo and
as individually inspectable steps, plus a live camera debug view.
"""

from __future__ import annotations

from pathlib import Path

import gradio as gr

from signspeak.llm import generate_subtitle_and_instruction
from signspeak.live_debug import process_live_debug_frame
from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
from signspeak.tts import generate_tts

APP_DIR = Path(__file__).resolve().parent
# Read once at import time; a missing stylesheet fails fast on startup.
CUSTOM_CSS = (APP_DIR / "assets" / "styles.css").read_text(encoding="utf-8")

LANGUAGE_CHOICES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
]

SPEAKER_CHOICES = [
    "Vivian",
    "Serena",
    "Uncle_Fu",
    "Dylan",
    "Eric",
    "Ryan",
    "Aiden",
    "Ono_Anna",
    "Sohee",
]

# Subtitle text that run_tts_brick refuses to synthesize.
# NOTE(review): presumably the placeholder emitted upstream when no signs were
# recognized -- confirm it matches the producer's exact wording.
_NO_WORDS_PLACEHOLDER = "No ASL words were detected yet."


def run_asl_brick(
    video_file: str | None,
    gloss_override: str | None = None,
) -> tuple[str, dict, str, str]:
    """Run ASL analysis on a clip and surface failures in the UI.

    Args:
        video_file: Value of the video input component, or None when empty.
        gloss_override: Optional manual gloss text from the debug textbox.

    Returns:
        The 4-tuple produced by ``run_asl_video``. The event wiring routes it,
        in order, to the intent JSON editor, the structured ASL JSON view, the
        summary textbox and the debug overlay video.

    Raises:
        gr.Error: If the underlying pipeline raises.
    """
    try:
        return run_asl_video(video_file, gloss_override)
    except gr.Error:
        # Already user-facing; wrapping again would only stack prefixes.
        raise
    except Exception as exc:
        raise gr.Error(f"ASL pipeline failed: {type(exc).__name__}: {exc}") from exc


def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
    """Generate a subtitle and voice instruction from intent JSON text.

    Args:
        intent_json_text: Contents of the intent JSON editor.

    Returns:
        The 3-tuple produced by ``generate_subtitle_and_instruction``. The
        event wiring routes it, in order, to the subtitle textbox, the voice
        instruction textbox and the structured LLM JSON view.

    Raises:
        gr.Error: If generation raises.
    """
    try:
        return generate_subtitle_and_instruction(intent_json_text)
    except gr.Error:
        raise
    except Exception as exc:
        raise gr.Error(f"llama.cpp generation failed: {type(exc).__name__}: {exc}") from exc


def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
    """Synthesize speech for a subtitle and surface failures in the UI.

    Args:
        text: Subtitle text to speak.
        language: Selected entry of the language dropdown.
        speaker: Selected entry of the speaker dropdown.
        instruction: Voice instruction text.

    Returns:
        The value produced by ``generate_tts``; it is bound to an audio
        component configured with ``type="filepath"``.

    Raises:
        gr.Error: If the subtitle is the "no words detected" placeholder, or
            if synthesis raises.
    """
    # Precondition check lives outside the try block so the user is not told
    # that TTS failed when TTS was never invoked.
    if (text or "").strip() == _NO_WORDS_PLACEHOLDER:
        raise gr.Error(
            "Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override."
        )
    try:
        return generate_tts(text, language, speaker, instruction)
    except gr.Error:
        raise
    except Exception as exc:
        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc


def build_video_input(label: str) -> gr.Video:
    """Create an MP4 video input that accepts uploads and webcam capture."""
    return gr.Video(
        label=label,
        sources=["upload", "webcam"],
        format="mp4",
    )


# NOTE(review): container nesting below was inferred from component order and
# section labels -- confirm against the intended layout. The gr.HTML snippets
# carry plain text only; if they previously held markup (wrapper elements,
# CSS classes), restore it from version control.
with gr.Blocks(title="Sign2Voice", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo:
    gr.HTML(
        """

Local-first sign-to-speech console

Sign2Voice

Translate camera or uploaded signing clips into natural speech with visible intent, expression, and confidence diagnostics.

Capture Signs Intent Voice
Camera ready llama.cpp local Expressive voice
"""
    )

    with gr.Tabs():
        # ---- Guided three-step demo -------------------------------------
        with gr.Tab("Run demo"):
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
                    gr.HTML("\n01 Capture\n")
                    full_video_input = build_video_input("Video or camera capture")
                    with gr.Accordion("Advanced debug controls", open=False):
                        full_gloss_override_input = gr.Textbox(
                            label="Manual gloss override",
                            value="",
                            lines=1,
                            info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.",
                        )
                    with gr.Row(elem_classes=["control-row"]):
                        full_language_input = gr.Dropdown(
                            label="Language",
                            choices=LANGUAGE_CHOICES,
                            value="English",
                        )
                        full_speaker_input = gr.Dropdown(
                            label="Speaker",
                            choices=SPEAKER_CHOICES,
                            value="Ryan",
                        )
                    run_demo_asl_button = gr.Button("1 Analyze ASL", elem_id="run_demo_asl")

                with gr.Column(scale=5, elem_classes=["panel-shell", "output-panel"]):
                    gr.HTML("\n02 Live debug\n")
                    full_debug_video_output = gr.Video(label="Debug overlay playback")
                    full_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    full_intent_output = gr.Code(label="Intent JSON", language="json", lines=8)

            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\n03 llama.cpp\n")
                    run_demo_llm_button = gr.Button("2 Generate subtitle", elem_id="run_demo_llm")
                    full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)

                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\n04 Qwen3-TTS\n")
                    run_demo_tts_button = gr.Button("3 Generate speech", elem_id="run_demo_tts")
                    full_audio_output = gr.Audio(label="Generated audio", type="filepath")

            with gr.Accordion("Pipeline diagnostics", open=False):
                with gr.Row(elem_classes=["diagnostic-grid"]):
                    full_asl_json_output = gr.JSON(label="ASL structured output")
                    full_llm_json_output = gr.JSON(label="LLM structured output")

        # ---- Each stage runnable on its own -----------------------------
        with gr.Tab("Inspect bricks"):
            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\nASL brick\n")
                    asl_video_input = build_video_input("Video or camera capture")
                    asl_gloss_override_input = gr.Textbox(
                        label="Debug gloss override",
                        value="",
                        lines=1,
                    )
                    run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
                    asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
                with gr.Column(scale=1):
                    asl_debug_video_output = gr.Video(label="Debug overlay playback")
                    asl_json_output = gr.JSON(label="ASL structured output")

            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\nllama.cpp brick\n")
                    intent_input = gr.Code(
                        label="Intent JSON",
                        value=json_text(DEFAULT_INTENT),
                        language="json",
                        lines=14,
                    )
                    run_llm_button = gr.Button(
                        "Generate subtitle",
                        elem_id="run_llm",
                    )
                with gr.Column(scale=1):
                    subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    instruction_output = gr.Textbox(label="Voice instruction", lines=3)
                    llm_json_output = gr.JSON(label="LLM structured output")

            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\nQwen3-TTS brick\n")
                    tts_language_input = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES,
                        value="English",
                    )
                    tts_speaker_input = gr.Dropdown(
                        label="Speaker",
                        choices=SPEAKER_CHOICES,
                        value="Ryan",
                    )
                    run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated audio", type="filepath")

        # ---- Frame-by-frame webcam overlay ------------------------------
        with gr.Tab("Live camera debug"):
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\nCamera stream\n")
                    live_camera_input = gr.Image(
                        label="Live camera frame",
                        sources=["webcam"],
                        streaming=True,
                        type="numpy",
                    )
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML("\nLive overlay\n")
                    live_camera_output = gr.Image(label="Overlay preview", type="numpy")
                    live_camera_status = gr.Textbox(label="Live status", lines=3)

    gr.HTML(""" """)

    # ---- Event wiring: guided demo --------------------------------------
    run_demo_asl_button.click(
        fn=run_asl_brick,
        inputs=[full_video_input, full_gloss_override_input],
        outputs=[full_intent_output, full_asl_json_output, full_summary_output, full_debug_video_output],
    )
    run_demo_llm_button.click(
        fn=run_llm_brick,
        inputs=[full_intent_output],
        outputs=[full_subtitle_output, full_instruction_output, full_llm_json_output],
    )
    run_demo_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            full_subtitle_output,
            full_language_input,
            full_speaker_input,
            full_instruction_output,
        ],
        outputs=[full_audio_output],
    )

    # ---- Event wiring: individual bricks --------------------------------
    run_asl_button.click(
        fn=run_asl_brick,
        inputs=[asl_video_input, asl_gloss_override_input],
        outputs=[asl_intent_output, asl_json_output, asl_summary_output, asl_debug_video_output],
    )
    run_llm_button.click(
        fn=run_llm_brick,
        inputs=[intent_input],
        outputs=[subtitle_output, instruction_output, llm_json_output],
    )
    # The TTS brick has no text box of its own; it speaks whatever the
    # llama.cpp brick above last produced.
    run_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            subtitle_output,
            tts_language_input,
            tts_speaker_input,
            instruction_output,
        ],
        outputs=[audio_output],
    )

    # ---- Event wiring: live camera --------------------------------------
    live_camera_input.stream(
        fn=process_live_debug_frame,
        inputs=[live_camera_input],
        outputs=[live_camera_output, live_camera_status],
    )


if __name__ == "__main__":
    demo.queue().launch()