Spaces:
Build error
Build error
| from __future__ import annotations | |
| from pathlib import Path | |
| import gradio as gr | |
| from signspeak.llm import generate_subtitle_and_instruction | |
| from signspeak.live_debug import process_live_debug_frame | |
| from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video | |
| from signspeak.tts import generate_tts | |
# Directory that contains this module; bundled assets are resolved against it.
APP_DIR = Path(__file__).resolve().parent

# Stylesheet text passed to gr.Blocks(css=...). Read eagerly at import time,
# so a missing assets/styles.css fails immediately with the OS error.
CUSTOM_CSS = APP_DIR.joinpath("assets", "styles.css").read_text(encoding="utf-8")
# Options shown in the "Language" dropdowns; the selected value is handed to
# run_tts_brick as its `language` argument.
LANGUAGE_CHOICES = [
    "Auto", "Chinese", "English", "Japanese", "Korean", "German",
    "French", "Russian", "Portuguese", "Spanish", "Italian",
]
# Options shown in the "Speaker" dropdowns; the selected value is handed to
# run_tts_brick as its `speaker` argument.
SPEAKER_CHOICES = [
    "Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric",
    "Ryan", "Aiden", "Ono_Anna", "Sohee",
]
def run_asl_brick(video_file: str | None, gloss_override: str | None = None) -> tuple[str, dict, str, str]:
    """Run the ASL video pipeline and report any failure as a Gradio error.

    Thin UI adapter around ``run_asl_video``: both arguments are forwarded
    unchanged and the pipeline's result is returned untouched.

    Args:
        video_file: Value of the video input component, or ``None``
            (presumably a path to the recorded/uploaded clip -- confirm
            against ``run_asl_video``).
        gloss_override: Optional text from the gloss-override textbox.

    Returns:
        Whatever ``run_asl_video`` returns. The click handlers in this file
        bind the four items to the intent JSON, structured ASL output,
        summary and debug-overlay video components, in that order.

    Raises:
        gr.Error: If the pipeline raises any ``Exception``. The message holds
            the original exception's type name and text, and the original
            exception is chained as the cause.
    """
    try:
        outputs = run_asl_video(video_file, gloss_override)
    except Exception as exc:
        detail = f"{type(exc).__name__}: {exc}"
        raise gr.Error(f"ASL pipeline failed: {detail}") from exc
    return outputs
def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
    """Generate a subtitle and voice instruction from intent JSON text.

    Thin UI adapter around ``generate_subtitle_and_instruction``; the input
    text is forwarded unchanged.

    Args:
        intent_json_text: Contents of the "Intent JSON" code box.

    Returns:
        Whatever ``generate_subtitle_and_instruction`` returns. The click
        handlers in this file bind the three items to the subtitle textbox,
        the voice-instruction textbox and the LLM structured-output JSON
        view, in that order.

    Raises:
        gr.Error: If generation raises any ``Exception``. The message holds
            the original exception's type name and text, and the original
            exception is chained as the cause.
    """
    try:
        generated = generate_subtitle_and_instruction(intent_json_text)
    except Exception as exc:
        failure = f"llama.cpp generation failed: {type(exc).__name__}: {exc}"
        raise gr.Error(failure) from exc
    else:
        return generated
def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
    """Synthesize speech for ``text`` and report any failure as a Gradio error.

    Thin UI adapter around ``generate_tts``. Before synthesis it rejects the
    "no words detected" placeholder subtitle so that the placeholder sentence
    is never spoken.

    Args:
        text: Subtitle text to speak (contents of the subtitle textbox).
        language: Selected entry of the "Language" dropdown.
        speaker: Selected entry of the "Speaker" dropdown.
        instruction: Voice instruction text (contents of the instruction box).

    Returns:
        Whatever ``generate_tts`` returns. The click handlers bind it to a
        ``gr.Audio(type="filepath")`` component, so it is expected to be the
        path of the generated audio file.

    Raises:
        gr.Error: If ``text`` is the placeholder subtitle, or if
            ``generate_tts`` raises any ``Exception`` (the original exception
            is then chained as the cause).
    """
    # Validate outside the try block: this is a problem with the input, not
    # with the TTS engine, so it must not be relabelled as a
    # "Qwen3-TTS generation failed" error by the handler below.
    # `text or ""` tolerates a None value coming from an empty textbox.
    if (text or "").strip() == "No ASL words were detected yet.":
        raise gr.Error(
            "Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override."
        )

    try:
        return generate_tts(text, language, speaker, instruction)
    except Exception as exc:
        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
def build_video_input(label: str) -> gr.Video:
    """Create a video input component with the app's standard settings.

    Args:
        label: Caption displayed on the component.

    Returns:
        A ``gr.Video`` that accepts uploads and webcam capture and is
        configured with ``format="mp4"``.
    """
    # Settings shared by every video input in the UI.
    shared_options = {
        "sources": ["upload", "webcam"],
        "format": "mp4",
    }
    return gr.Video(label=label, **shared_options)
# Top-level UI definition: the Blocks app is bound to `demo`, titled
# "Sign2Voice", styled with CUSTOM_CSS on top of the Base theme.
# NOTE(review): this copy of the file has lost its indentation, so the exact
# nesting of the Row/Column/Accordion/Tab containers below cannot be read off
# the text -- confirm against the original source before restructuring.
| with gr.Blocks(title="Sign2Voice", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo: | |
# Hero banner: static HTML with the product name, tagline, the four pipeline
# stage labels (Capture, Signs, Intent, Voice) and three capability chips.
| gr.HTML( | |
| """ | |
| <main id="hero"> | |
| <div class="brand-lockup"> | |
| <span class="brand-mark" aria-hidden="true"></span> | |
| <div> | |
| <p class="eyebrow">Local-first sign-to-speech console</p> | |
| <h1>Sign2Voice</h1> | |
| </div> | |
| </div> | |
| <div class="hero-grid"> | |
| <div> | |
| <p class="hero-copy"> | |
| Translate camera or uploaded signing clips into natural speech with | |
| visible intent, expression, and confidence diagnostics. | |
| </p> | |
| <div class="pipeline-rail" aria-label="Pipeline stages"> | |
| <span>Capture</span> | |
| <span>Signs</span> | |
| <span>Intent</span> | |
| <span>Voice</span> | |
| </div> | |
| </div> | |
| <div class="system-strip" aria-label="System capabilities"> | |
| <span>Camera ready</span> | |
| <span>llama.cpp local</span> | |
| <span>Expressive voice</span> | |
| </div> | |
| </div> | |
| </main> | |
| """ | |
| ) | |
# Tabbed body. Three tabs are declared: "Run demo", "Inspect bricks" and
# "Live camera debug".
| with gr.Tabs(): | |
# ---- Tab "Run demo": the three pipeline steps as numbered buttons ----
| with gr.Tab("Run demo"): | |
| with gr.Row(elem_classes=["demo-grid"]): | |
# Step 01: capture inputs (video, optional gloss override, language, speaker).
| with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]): | |
| gr.HTML('<div class="section-kicker">01 Capture</div>') | |
| full_video_input = build_video_input("Video or camera capture") | |
# Collapsed by default; holds the manual gloss override textbox.
| with gr.Accordion("Advanced debug controls", open=False): | |
| full_gloss_override_input = gr.Textbox( | |
| label="Manual gloss override", | |
| value="", | |
| lines=1, | |
| info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.", | |
| ) | |
# Language/speaker selection; these feed the "3 Generate speech" step.
| with gr.Row(elem_classes=["control-row"]): | |
| full_language_input = gr.Dropdown( | |
| label="Language", | |
| choices=LANGUAGE_CHOICES, | |
| value="English", | |
| ) | |
| full_speaker_input = gr.Dropdown( | |
| label="Speaker", | |
| choices=SPEAKER_CHOICES, | |
| value="Ryan", | |
| ) | |
| run_demo_asl_button = gr.Button("1 Analyze ASL", elem_id="run_demo_asl") | |
# Step 02: outputs of the ASL step (debug video, summary, intent JSON).
| with gr.Column(scale=5, elem_classes=["panel-shell", "output-panel"]): | |
| gr.HTML('<div class="section-kicker">02 Live debug</div>') | |
| full_debug_video_output = gr.Video(label="Debug overlay playback") | |
| full_summary_output = gr.Textbox(label="ASL summary", lines=4) | |
# Written by the ASL step and read back as the input of the LLM step.
| full_intent_output = gr.Code(label="Intent JSON", language="json", lines=8) | |
| with gr.Row(elem_classes=["demo-grid"]): | |
# Step 03: subtitle and voice-instruction generation.
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">03 llama.cpp</div>') | |
| run_demo_llm_button = gr.Button("2 Generate subtitle", elem_id="run_demo_llm") | |
| full_subtitle_output = gr.Textbox(label="Subtitle", lines=3) | |
| full_instruction_output = gr.Textbox(label="Voice instruction", lines=3) | |
# Step 04: speech synthesis; the audio component works with file paths.
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">04 Qwen3-TTS</div>') | |
| run_demo_tts_button = gr.Button("3 Generate speech", elem_id="run_demo_tts") | |
| full_audio_output = gr.Audio(label="Generated audio", type="filepath") | |
# Collapsed by default; raw structured outputs of the ASL and LLM steps.
| with gr.Accordion("Pipeline diagnostics", open=False): | |
| with gr.Row(elem_classes=["diagnostic-grid"]): | |
| full_asl_json_output = gr.JSON(label="ASL structured output") | |
| full_llm_json_output = gr.JSON(label="LLM structured output") | |
# ---- Tab "Inspect bricks": each stage with its own inputs and outputs ----
| with gr.Tab("Inspect bricks"): | |
# ASL brick.
| with gr.Row(elem_classes=["brick-grid"]): | |
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">ASL brick</div>') | |
| asl_video_input = build_video_input("Video or camera capture") | |
| asl_gloss_override_input = gr.Textbox( | |
| label="Debug gloss override", | |
| value="", | |
| lines=1, | |
| ) | |
| run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl") | |
| asl_summary_output = gr.Textbox(label="ASL summary", lines=4) | |
| asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12) | |
| with gr.Column(scale=1): | |
| asl_debug_video_output = gr.Video(label="Debug overlay playback") | |
| asl_json_output = gr.JSON(label="ASL structured output") | |
# llama.cpp brick. The intent box is pre-filled with DEFAULT_INTENT rendered
# by json_text, and is separate from asl_intent_output above.
| with gr.Row(elem_classes=["brick-grid"]): | |
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">llama.cpp brick</div>') | |
| intent_input = gr.Code( | |
| label="Intent JSON", | |
| value=json_text(DEFAULT_INTENT), | |
| language="json", | |
| lines=14, | |
| ) | |
| run_llm_button = gr.Button( | |
| "Generate subtitle", | |
| elem_id="run_llm", | |
| ) | |
| with gr.Column(scale=1): | |
| subtitle_output = gr.Textbox(label="Subtitle", lines=3) | |
| instruction_output = gr.Textbox(label="Voice instruction", lines=3) | |
| llm_json_output = gr.JSON(label="LLM structured output") | |
# Qwen3-TTS brick. It declares no text input of its own: the click wiring
# below reads subtitle_output and instruction_output from the llama.cpp brick.
| with gr.Row(elem_classes=["brick-grid"]): | |
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">Qwen3-TTS brick</div>') | |
| tts_language_input = gr.Dropdown( | |
| label="Language", | |
| choices=LANGUAGE_CHOICES, | |
| value="English", | |
| ) | |
| tts_speaker_input = gr.Dropdown( | |
| label="Speaker", | |
| choices=SPEAKER_CHOICES, | |
| value="Ryan", | |
| ) | |
| run_tts_button = gr.Button("Generate speech", elem_id="run_tts") | |
| with gr.Column(scale=1): | |
| audio_output = gr.Audio(label="Generated audio", type="filepath") | |
# ---- Tab "Live camera debug": streaming webcam frames with an overlay ----
| with gr.Tab("Live camera debug"): | |
| with gr.Row(elem_classes=["demo-grid"]): | |
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">Camera stream</div>') | |
# Webcam-only, streaming image input delivered as numpy data.
| live_camera_input = gr.Image( | |
| label="Live camera frame", | |
| sources=["webcam"], | |
| streaming=True, | |
| type="numpy", | |
| ) | |
| with gr.Column(scale=1, elem_classes=["panel-shell"]): | |
| gr.HTML('<div class="section-kicker">Live overlay</div>') | |
| live_camera_output = gr.Image(label="Overlay preview", type="numpy") | |
| live_camera_status = gr.Textbox(label="Live status", lines=3) | |
# Footer note (static HTML).
| gr.HTML( | |
| """ | |
| <p class="footer-note"> | |
| Build Small badges targeted: Off the Grid, Llama Champion, Off-Brand. | |
| </p> | |
| """ | |
| ) | |
# ---- Event wiring, "Run demo" tab ----
# ASL step: the four return values of run_asl_brick are written, in order, to
# the intent JSON box, the ASL JSON view, the summary box and the debug video.
| run_demo_asl_button.click( | |
| fn=run_asl_brick, | |
| inputs=[full_video_input, full_gloss_override_input], | |
| outputs=[full_intent_output, full_asl_json_output, full_summary_output, full_debug_video_output], | |
| ) | |
# LLM step: reads the current contents of the intent JSON box.
| run_demo_llm_button.click( | |
| fn=run_llm_brick, | |
| inputs=[full_intent_output], | |
| outputs=[full_subtitle_output, full_instruction_output, full_llm_json_output], | |
| ) | |
# TTS step: input order matches run_tts_brick(text, language, speaker,
# instruction).
| run_demo_tts_button.click( | |
| fn=run_tts_brick, | |
| inputs=[ | |
| full_subtitle_output, | |
| full_language_input, | |
| full_speaker_input, | |
| full_instruction_output, | |
| ], | |
| outputs=[full_audio_output], | |
| ) | |
# ---- Event wiring, "Inspect bricks" tab (same handlers, separate widgets) ----
| run_asl_button.click( | |
| fn=run_asl_brick, | |
| inputs=[asl_video_input, asl_gloss_override_input], | |
| outputs=[asl_intent_output, asl_json_output, asl_summary_output, asl_debug_video_output], | |
| ) | |
# Reads intent_input, not asl_intent_output: the ASL brick's result is not
# wired into this brick's input.
| run_llm_button.click( | |
| fn=run_llm_brick, | |
| inputs=[intent_input], | |
| outputs=[subtitle_output, instruction_output, llm_json_output], | |
| ) | |
| run_tts_button.click( | |
| fn=run_tts_brick, | |
| inputs=[ | |
| subtitle_output, | |
| tts_language_input, | |
| tts_speaker_input, | |
| instruction_output, | |
| ], | |
| outputs=[audio_output], | |
| ) | |
# ---- Event wiring, "Live camera debug" tab ----
# The stream event passes the camera image to process_live_debug_frame; its
# two outputs go to the overlay preview image and the status textbox.
| live_camera_input.stream( | |
| fn=process_live_debug_frame, | |
| inputs=[live_camera_input], | |
| outputs=[live_camera_output, live_camera_status], | |
| ) | |
if __name__ == "__main__":
    # Script entry point: enable the request queue on the app, then launch it.
    queued_demo = demo.queue()
    queued_demo.launch()