Spaces:

build-small-hackathon
/

Sign2Voice

Build error

App Files Files Community

lilblueyes committed 25 days ago

Commit

b0558d5

1 Parent(s): 57d04e5

Refactor app into pipeline bricks

Browse files

Files changed (5) hide show

app.py +191 -401
assets/styles.css +118 -0
signspeak/llm.py +159 -0
signspeak/pipeline.py +99 -0
signspeak/tts.py +64 -0

app.py CHANGED Viewed

@@ -1,430 +1,200 @@
-import os
-import json
-import time
-import tempfile
-import gradio as gr
-import soundfile as sf
-import torch
-from qwen_tts import Qwen3TTSModel
-from llama_cpp import Llama
-TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
-LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
-LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
-tts_model = None
-llm_model = None
-CUSTOM_CSS = """
-:root {
-  --bg: #050816;
-  --panel: rgba(255, 255, 255, 0.075);
-  --panel-border: rgba(255, 255, 255, 0.16);
-  --text: #f8fafc;
-  --muted: #94a3b8;
-  --accent: #8b5cf6;
-  --accent-2: #06b6d4;
-}
-.gradio-container {
-  background:
-    radial-gradient(circle at 20% 20%, rgba(139, 92, 246, 0.30), transparent 28%),
-    radial-gradient(circle at 80% 0%, rgba(6, 182, 212, 0.24), transparent 28%),
-    linear-gradient(135deg, #050816 0%, #0f172a 55%, #111827 100%) !important;
-  color: var(--text) !important;
-  font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
-}
-#hero {
-  padding: 28px;
-  border: 1px solid var(--panel-border);
-  border-radius: 28px;
-  background: linear-gradient(135deg, rgba(255,255,255,0.10), rgba(255,255,255,0.04));
-  box-shadow: 0 24px 80px rgba(0,0,0,0.35);
-  backdrop-filter: blur(18px);
-}
-#hero h1 {
-  font-size: 42px;
-  line-height: 1.05;
-  margin-bottom: 8px;
-  letter-spacing: -0.04em;
-}
-#hero p {
-  color: var(--muted);
-  font-size: 16px;
-}
-.badge-row {
-  display: flex;
-  flex-wrap: wrap;
-  gap: 10px;
-  margin-top: 16px;
-}
-.badge {
-  padding: 8px 12px;
-  border-radius: 999px;
-  background: rgba(139, 92, 246, 0.16);
-  border: 1px solid rgba(139, 92, 246, 0.34);
-  color: #ddd6fe;
-  font-weight: 700;
-  font-size: 13px;
-}
-.block, .form, .panel {
-  border-radius: 22px !important;
-}
-textarea, input, select {
-  background: rgba(15, 23, 42, 0.72) !important;
-  color: var(--text) !important;
-  border-color: rgba(255,255,255,0.14) !important;
-}
-button.primary, button {
-  border-radius: 999px !important;
-  font-weight: 800 !important;
-}
-#run_llm {
-  background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
-  color: white !important;
-  border: none !important;
-}
-#run_tts {
-  background: linear-gradient(135deg, #f97316, #ec4899) !important;
-  color: white !important;
-  border: none !important;
-}
-.footer-note {
-  color: var(--muted);
-  font-size: 13px;
-  text-align: center;
-}
-"""
-def get_tts_model():
-    global tts_model
-    if tts_model is not None:
-        return tts_model
-    if torch.cuda.is_available():
-        tts_model = Qwen3TTSModel.from_pretrained(
-            TTS_MODEL_ID,
-            device_map="cuda:0",
-            dtype=torch.bfloat16,
-        )
-    else:
-        tts_model = Qwen3TTSModel.from_pretrained(
-            TTS_MODEL_ID,
-            device_map="cpu",
-            dtype=torch.float32,
-        )
-    return tts_model
-def get_llm_model():
-    global llm_model
-    if llm_model is not None:
-        return llm_model
-    # llama-cpp-python downloads the GGUF from Hugging Face.
-    # Q4_K_M is a good first compromise for CPU Spaces.
-    llm_model = Llama.from_pretrained(
-        repo_id=LLM_REPO_ID,
-        filename=LLM_FILENAME,
-        n_ctx=1024,
-        n_threads=max(2, os.cpu_count() or 2),
-        n_gpu_layers=-1 if torch.cuda.is_available() else 0,
-        verbose=True,
-    )
-    return llm_model
-def safe_json_loads(text):
     try:
-        return json.loads(text)
-    except Exception:
-        return {
-            "raw_input": text,
-            "warning": "Input was not valid JSON, treated as raw text.",
-        }
-def extract_json_object(text):
-    """
-    Extract the first valid JSON object from a model response.
-    Handles:
-    - pure JSON
-    - ```json ... ```
-    - text before/after JSON
-    """
-    if not text:
-        raise ValueError("Empty model response")
-    cleaned = text.strip()
-    if cleaned.startswith("```"):
-        cleaned = cleaned.replace("```json", "", 1)
-        cleaned = cleaned.replace("```JSON", "", 1)
-        cleaned = cleaned.replace("```", "")
-        cleaned = cleaned.strip()
     try:
-        return json.loads(cleaned)
-    except Exception:
-        pass
-    start = cleaned.find("{")
-    end = cleaned.rfind("}")
-    if start == -1 or end == -1 or end <= start:
-        raise ValueError(f"No JSON object found in model response: {text}")
-    candidate = cleaned[start:end + 1]
-    return json.loads(candidate)
-def normalize_llm_output(parsed):
-    subtitle = str(parsed.get("subtitle", "")).strip()
-    voice_instruction = str(parsed.get("voice_instruction", "")).strip()
-    if not subtitle:
-        subtitle = "I want to say something."
-    if not voice_instruction:
-        voice_instruction = "Speak clearly and naturally."
-    forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
-    if any(fragment in subtitle for fragment in forbidden_fragments):
-        subtitle = "I am happy to see you."
-    return {
-        "subtitle": subtitle,
-        "voice_instruction": voice_instruction,
-    }
-def generate_subtitle_and_instruction(intent_json_text):
-    intent = safe_json_loads(intent_json_text)
-    system_prompt = (
-        "You are an assistant inside an ASL-to-speech accessibility app. "
-        "Convert detected ASL glosses and emotion metadata into speech output. "
-        "You must return raw JSON only. "
-        "Do not use markdown. "
-        "Do not wrap the response in ```json fences. "
-        "Return exactly this schema: "
-        '{"subtitle": "...", "voice_instruction": "..."}'
-    )
-    user_prompt = f"""
-Input intent data:
-{json.dumps(intent, ensure_ascii=False, indent=2)}
-Task:
-Generate a short natural subtitle and a TTS voice instruction.
-Rules:
-- Return raw JSON only.
-- Do not use markdown.
-- Do not include explanations.
-- Do not include code fences.
-- The subtitle must be only the sentence to speak.
-- The voice_instruction must describe tone, emotion, pace, and intensity.
-- Do not copy JSON keys into the subtitle.
-Expected output format:
-{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
-"""
-    llm = get_llm_model()
-    result = llm.create_chat_completion(
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ],
-        temperature=0.1,
-        max_tokens=96,
-    )
-    raw_content = result["choices"][0]["message"]["content"].strip()
     try:
-        parsed = extract_json_object(raw_content)
-        normalized = normalize_llm_output(parsed)
-    except Exception as error:
-        normalized = {
-            "subtitle": "I am happy to see you.",
-            "voice_instruction": "Speak warmly, joyfully, and clearly.",
-            "parser_warning": str(error),
-            "raw_model_output": raw_content,
-        }
-    return (
-        normalized["subtitle"],
-        normalized["voice_instruction"],
-        normalized,
-    )
-def generate_tts(text, language, speaker, instruction):
-    text = (text or "").strip()
-    instruction = (instruction or "").strip()
-    if not text:
-        raise gr.Error("Aucun subtitle à synthétiser.")
-    tts = get_tts_model()
-    wavs, sr = tts.generate_custom_voice(
-        text=text,
-        language=language,
-        speaker=speaker,
-        instruct=instruction,
     )
-    output_path = os.path.join(
-        tempfile.gettempdir(),
-        f"qwen_tts_{int(time.time() * 1000)}.wav",
-    )
-    sf.write(output_path, wavs[0], sr)
-    return output_path
-DEFAULT_INTENT = {
-    "detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
-    "detected_facial_expression": "happy",
-    "emotion_profile": {
-        "dominant": "joy",
-        "confidence": 0.83,
-    },
-    "communication_intent": "friendly_greeting",
-    "pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
-}
-with gr.Blocks(
-    title="SignSpeak Local",
-) as demo:
     gr.HTML(
         """
         <section id="hero">
           <h1>SignSpeak Local</h1>
           <p>
-            ASL video to expressive speech, built as a local-first accessibility pipeline.
-            Current milestone: llama.cpp intent generation + Qwen3-TTS voice synthesis.
           </p>
           <div class="badge-row">
             <span class="badge">llama.cpp</span>
-            <span class="badge">local-first</span>
-            <span class="badge">custom Gradio UI</span>
-            <span class="badge">expressive TTS</span>
           </div>
         </section>
         """
     )
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("## 1. Intent input")
-            intent_input = gr.Textbox(
-                label="Mock intent JSON",
-                value=json.dumps(DEFAULT_INTENT, ensure_ascii=False, indent=2),
-                lines=13,
-            )
-            run_llm_button = gr.Button(
-                "Generate subtitle with llama.cpp",
-                elem_id="run_llm",
-            )
-        with gr.Column(scale=1):
-            gr.Markdown("## 2. llama.cpp output")
-            subtitle_output = gr.Textbox(
-                label="Subtitle",
-                lines=3,
-            )
-            instruction_output = gr.Textbox(
-                label="Voice instruction",
-                lines=3,
-            )
-            llm_json_output = gr.JSON(
-                label="LLM structured output",
-            )
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("## 3. Voice synthesis")
-            language_input = gr.Dropdown(
-                label="Language",
-                choices=[
-                    "Auto",
-                    "Chinese",
-                    "English",
-                    "Japanese",
-                    "Korean",
-                    "German",
-                    "French",
-                    "Russian",
-                    "Portuguese",
-                    "Spanish",
-                    "Italian",
-                ],
-                value="English",
-            )
-            speaker_input = gr.Dropdown(
-                label="Speaker",
-                choices=[
-                    "Vivian",
-                    "Serena",
-                    "Uncle_Fu",
-                    "Dylan",
-                    "Eric",
-                    "Ryan",
-                    "Aiden",
-                    "Ono_Anna",
-                    "Sohee",
-                ],
-                value="Ryan",
-            )
-            run_tts_button = gr.Button(
-                "Generate expressive speech",
-                elem_id="run_tts",
-            )
-        with gr.Column(scale=1):
-            gr.Markdown("## 4. Result")
-            audio_output = gr.Audio(
-                label="Generated audio",
-                type="filepath",
-            )
     gr.HTML(
         """
@@ -434,18 +204,38 @@ with gr.Blocks(
         """
     )
     run_llm_button.click(
-        fn=generate_subtitle_and_instruction,
         inputs=[intent_input],
         outputs=[subtitle_output, instruction_output, llm_json_output],
     )
     run_tts_button.click(
-        fn=generate_tts,
         inputs=[
             subtitle_output,
-            language_input,
-            speaker_input,
             instruction_output,
         ],
         outputs=[audio_output],

+from __future__ import annotations
+from pathlib import Path
+import gradio as gr
+from signspeak.llm import generate_subtitle_and_instruction
+from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
+from signspeak.tts import generate_tts
+APP_DIR = Path(__file__).resolve().parent
+CUSTOM_CSS = (APP_DIR / "assets" / "styles.css").read_text(encoding="utf-8")
+def run_asl_brick(video_file: str | None) -> tuple[str, dict, str]:
+    """Run the ASL stage through ``run_asl_video`` for use as a Gradio callback.
+
+    The result of ``run_asl_video`` is returned unchanged. Any exception is
+    re-raised as ``gr.Error`` carrying the original exception type name and
+    message, chained to the original via ``from exc``.
+    """
     try:
+        return run_asl_video(video_file)
+    except Exception as exc:
+        raise gr.Error(f"ASL pipeline failed: {type(exc).__name__}: {exc}") from exc
+def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
+    """Run the LLM stage through ``generate_subtitle_and_instruction``.
+
+    The result is returned unchanged. Any exception is re-raised as
+    ``gr.Error`` with the original exception type name and message.
+    """
     try:
+        return generate_subtitle_and_instruction(intent_json_text)
+    except Exception as exc:
+        raise gr.Error(f"llama.cpp generation failed: {type(exc).__name__}: {exc}") from exc
+def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
+    """Run the TTS stage through ``generate_tts`` and return its result.
+
+    Any exception (including the ``ValueError`` raised for empty text) is
+    re-raised as ``gr.Error`` with the original exception type name and message.
+    """
     try:
+        return generate_tts(text, language, speaker, instruction)
+    except Exception as exc:
+        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
+def run_full_pipeline(
+    video_file: str | None,
+    language: str,
+    speaker: str,
+) -> tuple[str, dict, str, str, str, dict, str]:
+    """Chain the ASL, LLM, and TTS brick wrappers in that order.
+
+    Each stage feeds the next: the intent JSON from the ASL brick goes to the
+    LLM brick, and the resulting subtitle and instruction go to the TTS brick.
+    A failure in any stage propagates as the ``gr.Error`` raised by its wrapper,
+    so later stages are not run.
+
+    Returns:
+        ``(intent_json, asl_result, asl_summary, subtitle, instruction,
+        llm_result, audio_path)``.
+    """
+    intent_json, asl_result, asl_summary = run_asl_brick(video_file)
+    subtitle, instruction, llm_result = run_llm_brick(intent_json)
+    audio_path = run_tts_brick(subtitle, language, speaker, instruction)
+    return intent_json, asl_result, asl_summary, subtitle, instruction, llm_result, audio_path
+def build_video_input(label: str) -> gr.Video:
+    """Create a ``gr.Video`` input with the given label.
+
+    The component is configured with upload and webcam sources, the
+    ``"filepath"`` type, and the ``"mp4"`` format, so both tabs can share one
+    configuration.
+    """
+    return gr.Video(
+        label=label,
+        sources=["upload", "webcam"],
+        type="filepath",
+        format="mp4",
     )
+with gr.Blocks(title="SignSpeak Local") as demo:
     gr.HTML(
         """
         <section id="hero">
           <h1>SignSpeak Local</h1>
           <p>
+            ASL video to expressive speech, with independent ASL, llama.cpp,
+            and Qwen3-TTS bricks for controlled demo runs.
           </p>
           <div class="badge-row">
+            <span class="badge">ASL video</span>
+            <span class="badge">live camera</span>
             <span class="badge">llama.cpp</span>
+            <span class="badge">Qwen3-TTS</span>
           </div>
         </section>
         """
     )
+    with gr.Tabs():
+        with gr.Tab("Full pipeline"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### Input")
+                    full_video_input = build_video_input("Video or camera capture")
+                    full_language_input = gr.Dropdown(
+                        label="Language",
+                        choices=[
+                            "Auto",
+                            "Chinese",
+                            "English",
+                            "Japanese",
+                            "Korean",
+                            "German",
+                            "French",
+                            "Russian",
+                            "Portuguese",
+                            "Spanish",
+                            "Italian",
+                        ],
+                        value="English",
+                    )
+                    full_speaker_input = gr.Dropdown(
+                        label="Speaker",
+                        choices=[
+                            "Vivian",
+                            "Serena",
+                            "Uncle_Fu",
+                            "Dylan",
+                            "Eric",
+                            "Ryan",
+                            "Aiden",
+                            "Ono_Anna",
+                            "Sohee",
+                        ],
+                        value="Ryan",
+                    )
+                    run_full_button = gr.Button(
+                        "Run full pipeline",
+                        elem_id="run_full",
+                    )
+                with gr.Column(scale=1):
+                    gr.Markdown("### Output")
+                    full_summary_output = gr.Textbox(label="ASL summary", lines=4)
+                    full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
+                    full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
+                    full_audio_output = gr.Audio(label="Generated audio", type="filepath")
+            with gr.Row():
+                full_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
+                full_asl_json_output = gr.JSON(label="ASL structured output")
+                full_llm_json_output = gr.JSON(label="LLM structured output")
+        with gr.Tab("Brick tests"):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### ASL video")
+                    asl_video_input = build_video_input("Video or camera capture")
+                    run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
+                    asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
+                    asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
+                with gr.Column(scale=1):
+                    asl_json_output = gr.JSON(label="ASL structured output")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### llama.cpp")
+                    intent_input = gr.Code(
+                        label="Intent JSON",
+                        value=json_text(DEFAULT_INTENT),
+                        language="json",
+                        lines=14,
+                    )
+                    run_llm_button = gr.Button(
+                        "Generate subtitle",
+                        elem_id="run_llm",
+                    )
+                with gr.Column(scale=1):
+                    subtitle_output = gr.Textbox(label="Subtitle", lines=3)
+                    instruction_output = gr.Textbox(label="Voice instruction", lines=3)
+                    llm_json_output = gr.JSON(label="LLM structured output")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    gr.Markdown("### Qwen3-TTS")
+                    tts_language_input = gr.Dropdown(
+                        label="Language",
+                        choices=[
+                            "Auto",
+                            "Chinese",
+                            "English",
+                            "Japanese",
+                            "Korean",
+                            "German",
+                            "French",
+                            "Russian",
+                            "Portuguese",
+                            "Spanish",
+                            "Italian",
+                        ],
+                        value="English",
+                    )
+                    tts_speaker_input = gr.Dropdown(
+                        label="Speaker",
+                        choices=[
+                            "Vivian",
+                            "Serena",
+                            "Uncle_Fu",
+                            "Dylan",
+                            "Eric",
+                            "Ryan",
+                            "Aiden",
+                            "Ono_Anna",
+                            "Sohee",
+                        ],
+                        value="Ryan",
+                    )
+                    run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
+                with gr.Column(scale=1):
+                    audio_output = gr.Audio(label="Generated audio", type="filepath")
     gr.HTML(
         """
         """
     )
+    run_full_button.click(
+        fn=run_full_pipeline,
+        inputs=[full_video_input, full_language_input, full_speaker_input],
+        outputs=[
+            full_intent_output,
+            full_asl_json_output,
+            full_summary_output,
+            full_subtitle_output,
+            full_instruction_output,
+            full_llm_json_output,
+            full_audio_output,
+        ],
+    )
+    run_asl_button.click(
+        fn=run_asl_brick,
+        inputs=[asl_video_input],
+        outputs=[asl_intent_output, asl_json_output, asl_summary_output],
+    )
     run_llm_button.click(
+        fn=run_llm_brick,
         inputs=[intent_input],
         outputs=[subtitle_output, instruction_output, llm_json_output],
     )
     run_tts_button.click(
+        fn=run_tts_brick,
         inputs=[
             subtitle_output,
+            tts_language_input,
+            tts_speaker_input,
             instruction_output,
         ],
         outputs=[audio_output],

assets/styles.css ADDED Viewed

	@@ -0,0 +1,118 @@

+:root {
+  --bg: #080a12;
+  --panel: rgba(255, 255, 255, 0.08);
+  --panel-strong: rgba(255, 255, 255, 0.12);
+  --panel-border: rgba(255, 255, 255, 0.16);
+  --text: #f8fafc;
+  --muted: #a8b3c7;
+  --accent: #2dd4bf;
+  --accent-2: #818cf8;
+  --warm: #f59e0b;
+  --danger: #f43f5e;
+}
+.gradio-container {
+  background:
+    linear-gradient(135deg, #080a12 0%, #101827 52%, #111322 100%) !important;
+  color: var(--text) !important;
+  font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
+}
+#hero {
+  padding: 24px;
+  border: 1px solid var(--panel-border);
+  border-radius: 8px;
+  background: linear-gradient(135deg, rgba(45, 212, 191, 0.14), rgba(129, 140, 248, 0.10));
+  box-shadow: 0 18px 52px rgba(0, 0, 0, 0.28);
+}
+#hero h1 {
+  font-size: 38px;
+  line-height: 1.08;
+  margin-bottom: 8px;
+  letter-spacing: 0;
+}
+#hero p {
+  color: var(--muted);
+  font-size: 16px;
+  max-width: 760px;
+}
+.badge-row {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 8px;
+  margin-top: 14px;
+}
+.badge {
+  padding: 7px 10px;
+  border-radius: 8px;
+  background: rgba(255, 255, 255, 0.08);
+  border: 1px solid rgba(255, 255, 255, 0.16);
+  color: #dbeafe;
+  font-weight: 700;
+  font-size: 13px;
+}
+.stage-title {
+  margin: 8px 0 4px;
+  color: #e2e8f0;
+}
+.block,
+.form,
+.panel {
+  border-radius: 8px !important;
+}
+textarea,
+input,
+select {
+  background: rgba(15, 23, 42, 0.78) !important;
+  color: var(--text) !important;
+  border-color: rgba(255, 255, 255, 0.14) !important;
+}
+button.primary,
+button {
+  border-radius: 8px !important;
+  font-weight: 800 !important;
+  min-height: 44px !important;
+}
+#run_asl {
+  background: linear-gradient(135deg, var(--accent), #22c55e) !important;
+  color: #04111a !important;
+  border: none !important;
+}
+#run_llm {
+  background: linear-gradient(135deg, var(--accent-2), #3b82f6) !important;
+  color: white !important;
+  border: none !important;
+}
+#run_tts,
+#run_full {
+  background: linear-gradient(135deg, var(--warm), #ec4899) !important;
+  color: white !important;
+  border: none !important;
+}
+.footer-note {
+  color: var(--muted);
+  font-size: 13px;
+  text-align: center;
+}
+@media (max-width: 720px) {
+  #hero {
+    padding: 18px;
+  }
+  #hero h1 {
+    font-size: 30px;
+  }
+}

signspeak/llm.py ADDED Viewed

	@@ -0,0 +1,159 @@

+from __future__ import annotations
+import json
+import os
+from typing import Any
+LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
+LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
+_llm_model: Any | None = None
+def safe_json_loads(text: str) -> dict[str, Any]:
+    """Parse ``text`` as JSON, falling back to a wrapper dict on any failure.
+
+    On failure the returned dict holds the original text under ``"raw_input"``
+    plus a ``"warning"`` message, so the caller always receives a value.
+    """
+    # NOTE(review): valid JSON that is not an object (list, string, number) is
+    # returned as-is, which does not match the ``dict`` annotation - confirm
+    # whether callers rely on a dict.
+    try:
+        return json.loads(text)
+    except Exception:
+        return {
+            "raw_input": text,
+            "warning": "Input was not valid JSON, treated as raw text.",
+        }
+def extract_json_object(text: str) -> dict[str, Any]:
+    """
+    Extract a JSON value from a model response.
+
+    Steps, in order:
+    1. If the stripped text starts with a markdown fence, remove the first
+       ```json / ```JSON marker and every remaining ``` marker.
+    2. Try to parse the whole cleaned text as JSON.
+    3. Otherwise parse the span from the first "{" to the last "}".
+
+    Raises ValueError when the text is empty or no "{...}" span exists. A
+    json.JSONDecodeError (a ValueError subclass) propagates when the span is
+    not valid JSON.
+    """
+    if not text:
+        raise ValueError("Empty model response")
+    cleaned = text.strip()
+    # Fences are only stripped when the response starts with one.
+    if cleaned.startswith("```"):
+        cleaned = cleaned.replace("```json", "", 1)
+        cleaned = cleaned.replace("```JSON", "", 1)
+        cleaned = cleaned.replace("```", "")
+        cleaned = cleaned.strip()
+    # NOTE(review): this fast path returns whatever json.loads yields, which may
+    # be a non-dict value (e.g. a list) - confirm callers tolerate that.
+    try:
+        return json.loads(cleaned)
+    except Exception:
+        pass
+    # Fall back to the outermost brace-delimited span to skip surrounding prose.
+    start = cleaned.find("{")
+    end = cleaned.rfind("}")
+    if start == -1 or end == -1 or end <= start:
+        raise ValueError(f"No JSON object found in model response: {text}")
+    candidate = cleaned[start : end + 1]
+    return json.loads(candidate)
+def normalize_llm_output(parsed: dict[str, Any]) -> dict[str, str]:
+    """Reduce a parsed model response to clean ``subtitle`` and ``voice_instruction`` strings.
+
+    Both values are coerced with ``str`` and stripped. Empty values are replaced
+    by fixed defaults. A subtitle that still contains fence markers, quoted
+    schema keys, or braces is replaced by a fixed fallback sentence.
+    """
+    subtitle = str(parsed.get("subtitle", "")).strip()
+    voice_instruction = str(parsed.get("voice_instruction", "")).strip()
+    if not subtitle:
+        subtitle = "I want to say something."
+    if not voice_instruction:
+        voice_instruction = "Speak clearly and naturally."
+    # These fragments indicate JSON or markdown leaked into the spoken sentence.
+    forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
+    if any(fragment in subtitle for fragment in forbidden_fragments):
+        subtitle = "I am happy to see you."
+    return {
+        "subtitle": subtitle,
+        "voice_instruction": voice_instruction,
+    }
+def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str, dict[str, Any]]:
+    """Ask the LLM for a spoken subtitle and a TTS voice instruction.
+
+    The intent text is parsed with ``safe_json_loads``, embedded in a prompt,
+    and sent to the model returned by ``get_llm_model``. The reply is parsed
+    with ``extract_json_object`` and cleaned with ``normalize_llm_output``.
+    If parsing or normalization fails, fixed fallback values are used and the
+    error text and raw reply are added under ``"parser_warning"`` and
+    ``"raw_model_output"``.
+
+    Returns:
+        ``(subtitle, voice_instruction, normalized)`` where ``normalized`` is
+        the dict the first two values were taken from.
+    """
+    intent = safe_json_loads(intent_json_text)
+    system_prompt = (
+        "You are an assistant inside an ASL-to-speech accessibility app. "
+        "Convert detected ASL glosses and emotion metadata into speech output. "
+        "You must return raw JSON only. "
+        "Do not use markdown. "
+        "Do not wrap the response in ```json fences. "
+        "Return exactly this schema: "
+        '{"subtitle": "...", "voice_instruction": "..."}'
+    )
+    # Doubled braces in the f-string below emit literal "{" and "}".
+    user_prompt = f"""
+Input intent data:
+{json.dumps(intent, ensure_ascii=False, indent=2)}
+Task:
+Generate a short natural subtitle and a TTS voice instruction.
+Rules:
+- Return raw JSON only.
+- Do not use markdown.
+- Do not include explanations.
+- Do not include code fences.
+- The subtitle must be only the sentence to speak.
+- The voice_instruction must describe tone, emotion, pace, and intensity.
+- Do not copy JSON keys into the subtitle.
+Expected output format:
+{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
+"""
+    llm = get_llm_model()
+    result = llm.create_chat_completion(
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ],
+        temperature=0.1,
+        max_tokens=96,
+    )
+    raw_content = result["choices"][0]["message"]["content"].strip()
+    # Parsing problems never fail the request; they degrade to fixed fallback text.
+    try:
+        parsed = extract_json_object(raw_content)
+        normalized: dict[str, Any] = normalize_llm_output(parsed)
+    except Exception as error:
+        normalized = {
+            "subtitle": "I am happy to see you.",
+            "voice_instruction": "Speak warmly, joyfully, and clearly.",
+            "parser_warning": str(error),
+            "raw_model_output": raw_content,
+        }
+    return (
+        normalized["subtitle"],
+        normalized["voice_instruction"],
+        normalized,
+    )
+def get_llm_model() -> Any:
+    """Return the module-level llama.cpp model, loading it on first use.
+
+    ``torch`` and ``llama_cpp`` are imported inside the function, so importing
+    this module does not require them. The loaded model is stored in
+    ``_llm_model`` and reused by later calls.
+    """
+    global _llm_model
+    if _llm_model is not None:
+        return _llm_model
+    import torch
+    from llama_cpp import Llama
+    # n_gpu_layers is -1 when CUDA is available and 0 otherwise
+    # (presumably -1 means "offload all layers" - confirm in llama-cpp-python docs).
+    _llm_model = Llama.from_pretrained(
+        repo_id=LLM_REPO_ID,
+        filename=LLM_FILENAME,
+        n_ctx=1024,
+        n_threads=max(2, os.cpu_count() or 2),
+        n_gpu_layers=-1 if torch.cuda.is_available() else 0,
+        verbose=True,
+    )
+    return _llm_model

signspeak/pipeline.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from __future__ import annotations
+import json
+import tempfile
+from pathlib import Path
+from typing import Any
+import numpy as np
+from .asl import process_asl_video
+DEFAULT_INTENT = {
+    "detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
+    "detected_facial_expression": "happy",
+    "emotion_profile": {
+        "dominant": "joy",
+        "confidence": 0.83,
+    },
+    "communication_intent": "friendly_greeting",
+    "pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
+}
+DEFAULT_VIDEO_PATH = Path(__file__).resolve().parents[1] / "data" / "examples" / "videoplayback.mp4"
+def json_text(data: dict[str, Any]) -> str:
+    """Serialize ``data`` as JSON with a 2-space indent, keeping non-ASCII characters unescaped."""
+    return json.dumps(data, ensure_ascii=False, indent=2)
+def run_asl_video(video_file: str | None) -> tuple[str, dict[str, Any], str]:
+    """Process a video through the ASL stage.
+
+    The video path is chosen by ``resolve_video_path`` and passed to
+    ``process_asl_video``.
+
+    Returns:
+        ``(intent_json_text, result, summary)``: the ``"intent_input"`` entry of
+        the result serialized with ``json_text``, the full result, and the text
+        from ``summarize_asl_result``.
+    """
+    video_path = resolve_video_path(video_file)
+    result = process_asl_video(video_path)
+    # Assumes the result always contains "intent_input"; a missing key raises
+    # KeyError - TODO confirm against process_asl_video.
+    intent = result["intent_input"]
+    return json_text(intent), result, summarize_asl_result(result)
+def resolve_video_path(video_file: str | None) -> Path:
+    """Pick the video to process.
+
+    Order of preference: the caller's file if given, then ``DEFAULT_VIDEO_PATH``
+    if it exists on disk, then a generated synthetic clip.
+    """
+    if video_file:
+        return Path(video_file)
+    if DEFAULT_VIDEO_PATH.exists():
+        return DEFAULT_VIDEO_PATH
+    return create_synthetic_demo_video()
+def create_synthetic_demo_video() -> Path:
+    try:
+        import cv2
+    except Exception as exc:
+        raise RuntimeError("OpenCV is required to create the fallback demo video.") from exc
+    output_path = Path(tempfile.gettempdir()) / "signspeak_demo_input.mp4"
+    if output_path.exists():
+        return output_path
+    width, height = 320, 240
+    writer = cv2.VideoWriter(
+        str(output_path),
+        cv2.VideoWriter_fourcc(*"mp4v"),
+        12,
+        (width, height),
+    )
+    if not writer.isOpened():
+        raise RuntimeError(f"Could not create fallback demo video: {output_path}")
+    try:
+        for frame_idx in range(36):
+            frame = np.zeros((height, width, 3), dtype=np.uint8)
+            frame[:, :] = (12, 18, 30)
+            center_x = 80 + frame_idx * 4
+            cv2.circle(frame, (center_x, 96), 22, (45, 212, 191), -1)
+            cv2.circle(frame, (width - center_x, 144), 18, (129, 140, 248), -1)
+            cv2.putText(
+                frame,
+                "SignSpeak demo",
+                (36, 214),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.62,
+                (248, 250, 252),
+                2,
+                cv2.LINE_AA,
+            )
+            writer.write(frame)
+    finally:
+        writer.release()
+    return output_path
+def summarize_asl_result(result: dict[str, Any]) -> str:
+    """Build a four-line, human-readable summary of an ASL result.
+
+    Values are read from the ``"asl"`` and ``"emotion"`` sub-dicts. Missing
+    status, landmark, and emotion fields are shown as ``unknown``; a missing
+    top prediction is shown as ``None``.
+    """
+    asl = result.get("asl", {})
+    emotion = result.get("emotion", {})
+    return (
+        f"ASL status: {asl.get('status', 'unknown')}\n"
+        f"Top prediction: {asl.get('top_prediction')}\n"
+        f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
+        f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
+        # "or 0.0" also covers an intensity that is present but None or falsy.
+        f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"
+    )

signspeak/tts.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from __future__ import annotations
+import os
+import tempfile
+import time
+from typing import Any
+TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
+_tts_model: Any | None = None
+def generate_tts(text: str, language: str, speaker: str, instruction: str) -> str:
+    text = (text or "").strip()
+    instruction = (instruction or "").strip()
+    if not text:
+        raise ValueError("Aucun subtitle a synthetiser.")
+    tts = get_tts_model()
+    wavs, sr = tts.generate_custom_voice(
+        text=text,
+        language=language,
+        speaker=speaker,
+        instruct=instruction,
+    )
+    output_path = os.path.join(
+        tempfile.gettempdir(),
+        f"qwen_tts_{int(time.time() * 1000)}.wav",
+    )
+    import soundfile as sf
+    sf.write(output_path, wavs[0], sr)
+    return output_path
+def get_tts_model() -> Any:
+    """Return the module-level TTS model, loading it on first use.
+
+    ``torch`` and ``qwen_tts`` are imported inside the function, so importing
+    this module does not require them. The model is loaded on ``cuda:0`` in
+    bfloat16 when CUDA is available, otherwise on CPU in float32, then stored
+    in ``_tts_model`` and reused by later calls.
+    """
+    global _tts_model
+    if _tts_model is not None:
+        return _tts_model
+    import torch
+    from qwen_tts import Qwen3TTSModel
+    if torch.cuda.is_available():
+        _tts_model = Qwen3TTSModel.from_pretrained(
+            TTS_MODEL_ID,
+            device_map="cuda:0",
+            dtype=torch.bfloat16,
+        )
+    else:
+        _tts_model = Qwen3TTSModel.from_pretrained(
+            TTS_MODEL_ID,
+            device_map="cpu",
+            dtype=torch.float32,
+        )
+    return _tts_model