"""Sign2Voice: Gradio demo UI wiring the ASL, llama.cpp and Qwen3-TTS bricks together."""

from __future__ import annotations

from pathlib import Path

import gradio as gr

from signspeak.llm import generate_subtitle_and_instruction
from signspeak.live_debug import process_live_debug_frame
from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
from signspeak.tts import generate_tts


# Directory that contains this module; bundled assets are resolved relative to it.
APP_DIR = Path(__file__).resolve().parents[0]
# Stylesheet text is read once at import time and passed to gr.Blocks(css=...).
CUSTOM_CSS = APP_DIR.joinpath("assets", "styles.css").read_text(encoding="utf-8")

# Options shown in the "Language" dropdowns; the selected value is forwarded
# unchanged to run_tts_brick.
LANGUAGE_CHOICES: list[str] = [
    "Auto",
    "Chinese", "English", "Japanese", "Korean",
    "German", "French", "Russian",
    "Portuguese", "Spanish", "Italian",
]

# Options shown in the "Speaker" dropdowns; the selected value is forwarded
# unchanged to run_tts_brick.
SPEAKER_CHOICES: list[str] = [
    "Vivian", "Serena", "Uncle_Fu",
    "Dylan", "Eric", "Ryan",
    "Aiden", "Ono_Anna", "Sohee",
]


def run_asl_brick(video_file: str | None, gloss_override: str | None = None) -> tuple[str, dict, str, str]:
    """Run the ASL stage for a Gradio click handler.

    Delegates to ``run_asl_video`` and returns its result untouched.

    Args:
        video_file: Value of the video input component, or ``None``.
        gloss_override: Optional manual gloss text from the debug textbox.

    Returns:
        The 4-tuple produced by ``run_asl_video``.

    Raises:
        gr.Error: If ``run_asl_video`` raises any ``Exception``; the message
            carries the original exception type and text.
    """
    try:
        asl_outputs = run_asl_video(video_file, gloss_override)
    except Exception as err:
        # Re-raise as gr.Error so the failure is surfaced in the UI.
        failure = f"ASL pipeline failed: {type(err).__name__}: {err}"
        raise gr.Error(failure) from err
    return asl_outputs


def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
    """Run the llama.cpp stage for a Gradio click handler.

    Delegates to ``generate_subtitle_and_instruction`` and returns its result
    untouched.

    Args:
        intent_json_text: Contents of the "Intent JSON" code component.

    Returns:
        The 3-tuple produced by ``generate_subtitle_and_instruction``.

    Raises:
        gr.Error: If the helper raises any ``Exception``; the message carries
            the original exception type and text.
    """
    try:
        llm_outputs = generate_subtitle_and_instruction(intent_json_text)
    except Exception as err:
        # Re-raise as gr.Error so the failure is surfaced in the UI.
        failure = f"llama.cpp generation failed: {type(err).__name__}: {err}"
        raise gr.Error(failure) from err
    return llm_outputs


def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
    """Run the Qwen3-TTS stage for a Gradio click handler.

    Args:
        text: Subtitle text to speak.
        language: Selected entry of the "Language" dropdown.
        speaker: Selected entry of the "Speaker" dropdown.
        instruction: Voice instruction text passed through to ``generate_tts``.

    Returns:
        Whatever ``generate_tts`` returns (annotated as ``str``).

    Raises:
        gr.Error: If ``text`` is the "no words detected" placeholder, or if
            ``generate_tts`` raises any ``Exception``.
    """
    # Validate before entering the try block: this is an input problem, not a
    # TTS failure, so it must not be reported with the "generation failed"
    # prefix that the handler below adds.
    # NOTE(review): the sentinel is presumably the placeholder subtitle emitted
    # upstream when no signs were recognized -- confirm against the LLM stage.
    if (text or "").strip() == "No ASL words were detected yet.":
        raise gr.Error(
            "Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override."
        )

    try:
        return generate_tts(text, language, speaker, instruction)
    except Exception as exc:
        # Re-raise as gr.Error so the failure is surfaced in the UI.
        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc


def build_video_input(label: str) -> gr.Video:
    """Create a video input that accepts uploads or webcam capture as mp4.

    Args:
        label: Caption shown on the component.

    Returns:
        A new ``gr.Video`` component.
    """
    video_options = {
        "label": label,
        "sources": ["upload", "webcam"],
        "format": "mp4",
    }
    return gr.Video(**video_options)


# Build the whole UI inside a single Blocks context. Components are declared
# first (hero banner, three tabs, footer); event handlers are registered at the
# bottom of the context, once every component they reference exists.
with gr.Blocks(title="Sign2Voice", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo:
    # Static hero banner markup.
    gr.HTML(
        """
        <main id="hero">
          <div class="brand-lockup">
            <span class="brand-mark" aria-hidden="true"></span>
            <div>
              <p class="eyebrow">Local-first sign-to-speech console</p>
              <h1>Sign2Voice</h1>
            </div>
          </div>
          <div class="hero-grid">
            <div>
              <p class="hero-copy">
                Translate camera or uploaded signing clips into natural speech with
                visible intent, expression, and confidence diagnostics.
              </p>
              <div class="pipeline-rail" aria-label="Pipeline stages">
                <span>Capture</span>
                <span>Signs</span>
                <span>Intent</span>
                <span>Voice</span>
              </div>
            </div>
            <div class="system-strip" aria-label="System capabilities">
              <span>Camera ready</span>
              <span>llama.cpp local</span>
              <span>Expressive voice</span>
            </div>
          </div>
        </main>
        """
    )

    with gr.Tabs():
        # Tab 1: end-to-end flow driven by three buttons (ASL -> LLM -> TTS).
        # Components here use the "full_" prefix.
        with gr.Tab("Run demo"):
            with gr.Row(elem_classes=["demo-grid"]):
                # Left column: capture input, debug override and voice settings.
                with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
                    gr.HTML('<div class="section-kicker">01 Capture</div>')
                    full_video_input = build_video_input("Video or camera capture")
                    with gr.Accordion("Advanced debug controls", open=False):
                        full_gloss_override_input = gr.Textbox(
                            label="Manual gloss override",
                            value="",
                            lines=1,
                            info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.",
                        )
                    with gr.Row(elem_classes=["control-row"]):
                        full_language_input = gr.Dropdown(
                            label="Language",
                            choices=LANGUAGE_CHOICES,
                            value="English",
                        )
                        full_speaker_input = gr.Dropdown(
                            label="Speaker",
                            choices=SPEAKER_CHOICES,
                            value="Ryan",
                        )
                    run_demo_asl_button = gr.Button("1 Analyze ASL", elem_id="run_demo_asl")

                # Right column: outputs of the ASL step.
                with gr.Column(scale=5, elem_classes=["panel-shell", "output-panel"]):
                    gr.HTML('<div class="section-kicker">02 Live debug</div>')
                    full_debug_video_output = gr.Video(label="Debug overlay playback")
                    full_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    # Also serves as the input of the LLM step (see wiring below).
                    full_intent_output = gr.Code(label="Intent JSON", language="json", lines=8)

            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">03 llama.cpp</div>')
                    run_demo_llm_button = gr.Button("2 Generate subtitle", elem_id="run_demo_llm")
                    # Both textboxes are later read back as inputs of the TTS step.
                    full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">04 Qwen3-TTS</div>')
                    run_demo_tts_button = gr.Button("3 Generate speech", elem_id="run_demo_tts")
                    full_audio_output = gr.Audio(label="Generated audio", type="filepath")

            # Structured outputs of the ASL and LLM steps, collapsed by default.
            with gr.Accordion("Pipeline diagnostics", open=False):
                with gr.Row(elem_classes=["diagnostic-grid"]):
                    full_asl_json_output = gr.JSON(label="ASL structured output")
                    full_llm_json_output = gr.JSON(label="LLM structured output")

        # Tab 2: each stage ("brick") exposed on its own with its own button.
        with gr.Tab("Inspect bricks"):
            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">ASL brick</div>')
                    asl_video_input = build_video_input("Video or camera capture")
                    asl_gloss_override_input = gr.Textbox(
                        label="Debug gloss override",
                        value="",
                        lines=1,
                    )
                    run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
                    asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
                    asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
                with gr.Column(scale=1):
                    asl_debug_video_output = gr.Video(label="Debug overlay playback")
                    asl_json_output = gr.JSON(label="ASL structured output")

            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">llama.cpp brick</div>')
                    # Pre-filled with DEFAULT_INTENT so this brick can run without
                    # the ASL brick; it is not wired to asl_intent_output.
                    intent_input = gr.Code(
                        label="Intent JSON",
                        value=json_text(DEFAULT_INTENT),
                        language="json",
                        lines=14,
                    )
                    run_llm_button = gr.Button(
                        "Generate subtitle",
                        elem_id="run_llm",
                    )
                with gr.Column(scale=1):
                    subtitle_output = gr.Textbox(label="Subtitle", lines=3)
                    instruction_output = gr.Textbox(label="Voice instruction", lines=3)
                    llm_json_output = gr.JSON(label="LLM structured output")

            with gr.Row(elem_classes=["brick-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Qwen3-TTS brick</div>')
                    # No text input of its own: the TTS brick reads subtitle_output
                    # and instruction_output from the llama.cpp brick above.
                    tts_language_input = gr.Dropdown(
                        label="Language",
                        choices=LANGUAGE_CHOICES,
                        value="English",
                    )
                    tts_speaker_input = gr.Dropdown(
                        label="Speaker",
                        choices=SPEAKER_CHOICES,
                        value="Ryan",
                    )
                    run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
                with gr.Column(scale=1):
                    audio_output = gr.Audio(label="Generated audio", type="filepath")

        # Tab 3: streaming webcam frames with an overlay preview and a status box.
        with gr.Tab("Live camera debug"):
            with gr.Row(elem_classes=["demo-grid"]):
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Camera stream</div>')
                    live_camera_input = gr.Image(
                        label="Live camera frame",
                        sources=["webcam"],
                        streaming=True,
                        type="numpy",
                    )
                with gr.Column(scale=1, elem_classes=["panel-shell"]):
                    gr.HTML('<div class="section-kicker">Live overlay</div>')
                    live_camera_output = gr.Image(label="Overlay preview", type="numpy")
                    live_camera_status = gr.Textbox(label="Live status", lines=3)

    # Static footer note.
    gr.HTML(
        """
        <p class="footer-note">
          Build Small badges targeted: Off the Grid, Llama Champion, Off-Brand.
        </p>
        """
    )

    # --- Event wiring: "Run demo" tab -------------------------------------
    # The four values returned by run_asl_brick are assigned positionally to
    # the outputs listed here: intent code, structured JSON, summary, debug video.
    run_demo_asl_button.click(
        fn=run_asl_brick,
        inputs=[full_video_input, full_gloss_override_input],
        outputs=[full_intent_output, full_asl_json_output, full_summary_output, full_debug_video_output],
    )

    # The LLM step reads the intent JSON written by the ASL step.
    run_demo_llm_button.click(
        fn=run_llm_brick,
        inputs=[full_intent_output],
        outputs=[full_subtitle_output, full_instruction_output, full_llm_json_output],
    )

    # Input order matches run_tts_brick(text, language, speaker, instruction).
    run_demo_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            full_subtitle_output,
            full_language_input,
            full_speaker_input,
            full_instruction_output,
        ],
        outputs=[full_audio_output],
    )

    # --- Event wiring: "Inspect bricks" tab -------------------------------
    # Same handlers as above, bound to this tab's own components.
    run_asl_button.click(
        fn=run_asl_brick,
        inputs=[asl_video_input, asl_gloss_override_input],
        outputs=[asl_intent_output, asl_json_output, asl_summary_output, asl_debug_video_output],
    )

    run_llm_button.click(
        fn=run_llm_brick,
        inputs=[intent_input],
        outputs=[subtitle_output, instruction_output, llm_json_output],
    )

    run_tts_button.click(
        fn=run_tts_brick,
        inputs=[
            subtitle_output,
            tts_language_input,
            tts_speaker_input,
            instruction_output,
        ],
        outputs=[audio_output],
    )

    # --- Event wiring: "Live camera debug" tab ----------------------------
    # Streamed webcam frames are passed to process_live_debug_frame, whose two
    # return values fill the overlay image and the status textbox.
    live_camera_input.stream(
        fn=process_live_debug_frame,
        inputs=[live_camera_input],
        outputs=[live_camera_output, live_camera_status],
    )


# Script entry point: enable the request queue, then start the Gradio server.
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch()