Spaces:
Build error
Build error
Commit ·
b0558d5
1
Parent(s): 57d04e5
Refactor app into pipeline bricks
Browse files- app.py +191 -401
- assets/styles.css +118 -0
- signspeak/llm.py +159 -0
- signspeak/pipeline.py +99 -0
- signspeak/tts.py +64 -0
app.py
CHANGED
|
@@ -1,430 +1,200 @@
|
|
| 1 |
-
import
|
| 2 |
-
import json
|
| 3 |
-
import time
|
| 4 |
-
import tempfile
|
| 5 |
-
|
| 6 |
-
import gradio as gr
|
| 7 |
-
import soundfile as sf
|
| 8 |
-
import torch
|
| 9 |
-
|
| 10 |
-
from qwen_tts import Qwen3TTSModel
|
| 11 |
-
from llama_cpp import Llama
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
|
| 15 |
-
|
| 16 |
-
LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
|
| 17 |
-
LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
|
| 18 |
-
|
| 19 |
-
tts_model = None
|
| 20 |
-
llm_model = None
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
CUSTOM_CSS = """
|
| 24 |
-
:root {
|
| 25 |
-
--bg: #050816;
|
| 26 |
-
--panel: rgba(255, 255, 255, 0.075);
|
| 27 |
-
--panel-border: rgba(255, 255, 255, 0.16);
|
| 28 |
-
--text: #f8fafc;
|
| 29 |
-
--muted: #94a3b8;
|
| 30 |
-
--accent: #8b5cf6;
|
| 31 |
-
--accent-2: #06b6d4;
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
.gradio-container {
|
| 35 |
-
background:
|
| 36 |
-
radial-gradient(circle at 20% 20%, rgba(139, 92, 246, 0.30), transparent 28%),
|
| 37 |
-
radial-gradient(circle at 80% 0%, rgba(6, 182, 212, 0.24), transparent 28%),
|
| 38 |
-
linear-gradient(135deg, #050816 0%, #0f172a 55%, #111827 100%) !important;
|
| 39 |
-
color: var(--text) !important;
|
| 40 |
-
font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
#hero {
|
| 44 |
-
padding: 28px;
|
| 45 |
-
border: 1px solid var(--panel-border);
|
| 46 |
-
border-radius: 28px;
|
| 47 |
-
background: linear-gradient(135deg, rgba(255,255,255,0.10), rgba(255,255,255,0.04));
|
| 48 |
-
box-shadow: 0 24px 80px rgba(0,0,0,0.35);
|
| 49 |
-
backdrop-filter: blur(18px);
|
| 50 |
-
}
|
| 51 |
-
|
| 52 |
-
#hero h1 {
|
| 53 |
-
font-size: 42px;
|
| 54 |
-
line-height: 1.05;
|
| 55 |
-
margin-bottom: 8px;
|
| 56 |
-
letter-spacing: -0.04em;
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
#hero p {
|
| 60 |
-
color: var(--muted);
|
| 61 |
-
font-size: 16px;
|
| 62 |
-
}
|
| 63 |
-
|
| 64 |
-
.badge-row {
|
| 65 |
-
display: flex;
|
| 66 |
-
flex-wrap: wrap;
|
| 67 |
-
gap: 10px;
|
| 68 |
-
margin-top: 16px;
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
.badge {
|
| 72 |
-
padding: 8px 12px;
|
| 73 |
-
border-radius: 999px;
|
| 74 |
-
background: rgba(139, 92, 246, 0.16);
|
| 75 |
-
border: 1px solid rgba(139, 92, 246, 0.34);
|
| 76 |
-
color: #ddd6fe;
|
| 77 |
-
font-weight: 700;
|
| 78 |
-
font-size: 13px;
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
.block, .form, .panel {
|
| 82 |
-
border-radius: 22px !important;
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
-
textarea, input, select {
|
| 86 |
-
background: rgba(15, 23, 42, 0.72) !important;
|
| 87 |
-
color: var(--text) !important;
|
| 88 |
-
border-color: rgba(255,255,255,0.14) !important;
|
| 89 |
-
}
|
| 90 |
-
|
| 91 |
-
button.primary, button {
|
| 92 |
-
border-radius: 999px !important;
|
| 93 |
-
font-weight: 800 !important;
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
#run_llm {
|
| 97 |
-
background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
|
| 98 |
-
color: white !important;
|
| 99 |
-
border: none !important;
|
| 100 |
-
}
|
| 101 |
-
|
| 102 |
-
#run_tts {
|
| 103 |
-
background: linear-gradient(135deg, #f97316, #ec4899) !important;
|
| 104 |
-
color: white !important;
|
| 105 |
-
border: none !important;
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
.footer-note {
|
| 109 |
-
color: var(--muted);
|
| 110 |
-
font-size: 13px;
|
| 111 |
-
text-align: center;
|
| 112 |
-
}
|
| 113 |
-
"""
|
| 114 |
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
global tts_model
|
| 118 |
-
|
| 119 |
-
if tts_model is not None:
|
| 120 |
-
return tts_model
|
| 121 |
-
|
| 122 |
-
if torch.cuda.is_available():
|
| 123 |
-
tts_model = Qwen3TTSModel.from_pretrained(
|
| 124 |
-
TTS_MODEL_ID,
|
| 125 |
-
device_map="cuda:0",
|
| 126 |
-
dtype=torch.bfloat16,
|
| 127 |
-
)
|
| 128 |
-
else:
|
| 129 |
-
tts_model = Qwen3TTSModel.from_pretrained(
|
| 130 |
-
TTS_MODEL_ID,
|
| 131 |
-
device_map="cpu",
|
| 132 |
-
dtype=torch.float32,
|
| 133 |
-
)
|
| 134 |
-
|
| 135 |
-
return tts_model
|
| 136 |
-
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
|
|
|
| 140 |
|
| 141 |
-
if llm_model is not None:
|
| 142 |
-
return llm_model
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
llm_model = Llama.from_pretrained(
|
| 147 |
-
repo_id=LLM_REPO_ID,
|
| 148 |
-
filename=LLM_FILENAME,
|
| 149 |
-
n_ctx=1024,
|
| 150 |
-
n_threads=max(2, os.cpu_count() or 2),
|
| 151 |
-
n_gpu_layers=-1 if torch.cuda.is_available() else 0,
|
| 152 |
-
verbose=True,
|
| 153 |
-
)
|
| 154 |
|
| 155 |
-
return llm_model
|
| 156 |
|
| 157 |
-
|
| 158 |
-
def safe_json_loads(text):
|
| 159 |
try:
|
| 160 |
-
return
|
| 161 |
-
except Exception:
|
| 162 |
-
|
| 163 |
-
"raw_input": text,
|
| 164 |
-
"warning": "Input was not valid JSON, treated as raw text.",
|
| 165 |
-
}
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
def extract_json_object(text):
|
| 169 |
-
"""
|
| 170 |
-
Extract the first valid JSON object from a model response.
|
| 171 |
-
|
| 172 |
-
Handles:
|
| 173 |
-
- pure JSON
|
| 174 |
-
- ```json ... ```
|
| 175 |
-
- text before/after JSON
|
| 176 |
-
"""
|
| 177 |
-
if not text:
|
| 178 |
-
raise ValueError("Empty model response")
|
| 179 |
-
|
| 180 |
-
cleaned = text.strip()
|
| 181 |
|
| 182 |
-
if cleaned.startswith("```"):
|
| 183 |
-
cleaned = cleaned.replace("```json", "", 1)
|
| 184 |
-
cleaned = cleaned.replace("```JSON", "", 1)
|
| 185 |
-
cleaned = cleaned.replace("```", "")
|
| 186 |
-
cleaned = cleaned.strip()
|
| 187 |
|
|
|
|
| 188 |
try:
|
| 189 |
-
return
|
| 190 |
-
except Exception:
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
start = cleaned.find("{")
|
| 194 |
-
end = cleaned.rfind("}")
|
| 195 |
-
|
| 196 |
-
if start == -1 or end == -1 or end <= start:
|
| 197 |
-
raise ValueError(f"No JSON object found in model response: {text}")
|
| 198 |
-
|
| 199 |
-
candidate = cleaned[start:end + 1]
|
| 200 |
-
return json.loads(candidate)
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
def normalize_llm_output(parsed):
|
| 204 |
-
subtitle = str(parsed.get("subtitle", "")).strip()
|
| 205 |
-
voice_instruction = str(parsed.get("voice_instruction", "")).strip()
|
| 206 |
-
|
| 207 |
-
if not subtitle:
|
| 208 |
-
subtitle = "I want to say something."
|
| 209 |
-
|
| 210 |
-
if not voice_instruction:
|
| 211 |
-
voice_instruction = "Speak clearly and naturally."
|
| 212 |
-
|
| 213 |
-
forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
|
| 214 |
-
if any(fragment in subtitle for fragment in forbidden_fragments):
|
| 215 |
-
subtitle = "I am happy to see you."
|
| 216 |
-
|
| 217 |
-
return {
|
| 218 |
-
"subtitle": subtitle,
|
| 219 |
-
"voice_instruction": voice_instruction,
|
| 220 |
-
}
|
| 221 |
-
|
| 222 |
|
| 223 |
-
def generate_subtitle_and_instruction(intent_json_text):
|
| 224 |
-
intent = safe_json_loads(intent_json_text)
|
| 225 |
-
|
| 226 |
-
system_prompt = (
|
| 227 |
-
"You are an assistant inside an ASL-to-speech accessibility app. "
|
| 228 |
-
"Convert detected ASL glosses and emotion metadata into speech output. "
|
| 229 |
-
"You must return raw JSON only. "
|
| 230 |
-
"Do not use markdown. "
|
| 231 |
-
"Do not wrap the response in ```json fences. "
|
| 232 |
-
"Return exactly this schema: "
|
| 233 |
-
'{"subtitle": "...", "voice_instruction": "..."}'
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
user_prompt = f"""
|
| 237 |
-
Input intent data:
|
| 238 |
-
{json.dumps(intent, ensure_ascii=False, indent=2)}
|
| 239 |
-
|
| 240 |
-
Task:
|
| 241 |
-
Generate a short natural subtitle and a TTS voice instruction.
|
| 242 |
-
|
| 243 |
-
Rules:
|
| 244 |
-
- Return raw JSON only.
|
| 245 |
-
- Do not use markdown.
|
| 246 |
-
- Do not include explanations.
|
| 247 |
-
- Do not include code fences.
|
| 248 |
-
- The subtitle must be only the sentence to speak.
|
| 249 |
-
- The voice_instruction must describe tone, emotion, pace, and intensity.
|
| 250 |
-
- Do not copy JSON keys into the subtitle.
|
| 251 |
-
|
| 252 |
-
Expected output format:
|
| 253 |
-
{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
|
| 254 |
-
"""
|
| 255 |
-
|
| 256 |
-
llm = get_llm_model()
|
| 257 |
-
|
| 258 |
-
result = llm.create_chat_completion(
|
| 259 |
-
messages=[
|
| 260 |
-
{"role": "system", "content": system_prompt},
|
| 261 |
-
{"role": "user", "content": user_prompt},
|
| 262 |
-
],
|
| 263 |
-
temperature=0.1,
|
| 264 |
-
max_tokens=96,
|
| 265 |
-
)
|
| 266 |
-
|
| 267 |
-
raw_content = result["choices"][0]["message"]["content"].strip()
|
| 268 |
|
|
|
|
| 269 |
try:
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
raise gr.Error("Aucun subtitle à synthétiser.")
|
| 293 |
-
|
| 294 |
-
tts = get_tts_model()
|
| 295 |
-
|
| 296 |
-
wavs, sr = tts.generate_custom_voice(
|
| 297 |
-
text=text,
|
| 298 |
-
language=language,
|
| 299 |
-
speaker=speaker,
|
| 300 |
-
instruct=instruction,
|
| 301 |
)
|
| 302 |
|
| 303 |
-
output_path = os.path.join(
|
| 304 |
-
tempfile.gettempdir(),
|
| 305 |
-
f"qwen_tts_{int(time.time() * 1000)}.wav",
|
| 306 |
-
)
|
| 307 |
-
|
| 308 |
-
sf.write(output_path, wavs[0], sr)
|
| 309 |
-
|
| 310 |
-
return output_path
|
| 311 |
|
| 312 |
-
|
| 313 |
-
DEFAULT_INTENT = {
|
| 314 |
-
"detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
|
| 315 |
-
"detected_facial_expression": "happy",
|
| 316 |
-
"emotion_profile": {
|
| 317 |
-
"dominant": "joy",
|
| 318 |
-
"confidence": 0.83,
|
| 319 |
-
},
|
| 320 |
-
"communication_intent": "friendly_greeting",
|
| 321 |
-
"pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
|
| 322 |
-
}
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
with gr.Blocks(
|
| 326 |
-
title="SignSpeak Local",
|
| 327 |
-
) as demo:
|
| 328 |
gr.HTML(
|
| 329 |
"""
|
| 330 |
<section id="hero">
|
| 331 |
<h1>SignSpeak Local</h1>
|
| 332 |
<p>
|
| 333 |
-
ASL video to expressive speech,
|
| 334 |
-
|
| 335 |
</p>
|
| 336 |
<div class="badge-row">
|
|
|
|
|
|
|
| 337 |
<span class="badge">llama.cpp</span>
|
| 338 |
-
<span class="badge">
|
| 339 |
-
<span class="badge">custom Gradio UI</span>
|
| 340 |
-
<span class="badge">expressive TTS</span>
|
| 341 |
</div>
|
| 342 |
</section>
|
| 343 |
"""
|
| 344 |
)
|
| 345 |
|
| 346 |
-
with gr.
|
| 347 |
-
with gr.
|
| 348 |
-
gr.
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
"
|
| 391 |
-
"
|
| 392 |
-
"
|
| 393 |
-
"
|
| 394 |
-
"
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
"
|
| 405 |
-
"
|
| 406 |
-
"
|
| 407 |
-
"
|
| 408 |
-
"
|
| 409 |
-
|
| 410 |
-
"
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
gr.HTML(
|
| 430 |
"""
|
|
@@ -434,18 +204,38 @@ with gr.Blocks(
|
|
| 434 |
"""
|
| 435 |
)
|
| 436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
run_llm_button.click(
|
| 438 |
-
fn=
|
| 439 |
inputs=[intent_input],
|
| 440 |
outputs=[subtitle_output, instruction_output, llm_json_output],
|
| 441 |
)
|
| 442 |
|
| 443 |
run_tts_button.click(
|
| 444 |
-
fn=
|
| 445 |
inputs=[
|
| 446 |
subtitle_output,
|
| 447 |
-
|
| 448 |
-
|
| 449 |
instruction_output,
|
| 450 |
],
|
| 451 |
outputs=[audio_output],
|
|
|
|
| 1 |
+
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from pathlib import Path
|
| 4 |
|
| 5 |
+
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
from signspeak.llm import generate_subtitle_and_instruction
|
| 8 |
+
from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
|
| 9 |
+
from signspeak.tts import generate_tts
|
| 10 |
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
APP_DIR = Path(__file__).resolve().parent
|
| 13 |
+
CUSTOM_CSS = (APP_DIR / "assets" / "styles.css").read_text(encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
|
|
|
| 15 |
|
| 16 |
+
def run_asl_brick(video_file: str | None) -> tuple[str, dict, str]:
|
|
|
|
| 17 |
try:
|
| 18 |
+
return run_asl_video(video_file)
|
| 19 |
+
except Exception as exc:
|
| 20 |
+
raise gr.Error(f"ASL pipeline failed: {type(exc).__name__}: {exc}") from exc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
|
| 24 |
try:
|
| 25 |
+
return generate_subtitle_and_instruction(intent_json_text)
|
| 26 |
+
except Exception as exc:
|
| 27 |
+
raise gr.Error(f"llama.cpp generation failed: {type(exc).__name__}: {exc}") from exc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
|
| 31 |
try:
|
| 32 |
+
return generate_tts(text, language, speaker, instruction)
|
| 33 |
+
except Exception as exc:
|
| 34 |
+
raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_full_pipeline(
|
| 38 |
+
video_file: str | None,
|
| 39 |
+
language: str,
|
| 40 |
+
speaker: str,
|
| 41 |
+
) -> tuple[str, dict, str, str, str, dict, str]:
|
| 42 |
+
intent_json, asl_result, asl_summary = run_asl_brick(video_file)
|
| 43 |
+
subtitle, instruction, llm_result = run_llm_brick(intent_json)
|
| 44 |
+
audio_path = run_tts_brick(subtitle, language, speaker, instruction)
|
| 45 |
+
return intent_json, asl_result, asl_summary, subtitle, instruction, llm_result, audio_path
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def build_video_input(label: str) -> gr.Video:
|
| 49 |
+
return gr.Video(
|
| 50 |
+
label=label,
|
| 51 |
+
sources=["upload", "webcam"],
|
| 52 |
+
type="filepath",
|
| 53 |
+
format="mp4",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
with gr.Blocks(title="SignSpeak Local") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
gr.HTML(
|
| 59 |
"""
|
| 60 |
<section id="hero">
|
| 61 |
<h1>SignSpeak Local</h1>
|
| 62 |
<p>
|
| 63 |
+
ASL video to expressive speech, with independent ASL, llama.cpp,
|
| 64 |
+
and Qwen3-TTS bricks for controlled demo runs.
|
| 65 |
</p>
|
| 66 |
<div class="badge-row">
|
| 67 |
+
<span class="badge">ASL video</span>
|
| 68 |
+
<span class="badge">live camera</span>
|
| 69 |
<span class="badge">llama.cpp</span>
|
| 70 |
+
<span class="badge">Qwen3-TTS</span>
|
|
|
|
|
|
|
| 71 |
</div>
|
| 72 |
</section>
|
| 73 |
"""
|
| 74 |
)
|
| 75 |
|
| 76 |
+
with gr.Tabs():
|
| 77 |
+
with gr.Tab("Full pipeline"):
|
| 78 |
+
with gr.Row():
|
| 79 |
+
with gr.Column(scale=1):
|
| 80 |
+
gr.Markdown("### Input")
|
| 81 |
+
full_video_input = build_video_input("Video or camera capture")
|
| 82 |
+
full_language_input = gr.Dropdown(
|
| 83 |
+
label="Language",
|
| 84 |
+
choices=[
|
| 85 |
+
"Auto",
|
| 86 |
+
"Chinese",
|
| 87 |
+
"English",
|
| 88 |
+
"Japanese",
|
| 89 |
+
"Korean",
|
| 90 |
+
"German",
|
| 91 |
+
"French",
|
| 92 |
+
"Russian",
|
| 93 |
+
"Portuguese",
|
| 94 |
+
"Spanish",
|
| 95 |
+
"Italian",
|
| 96 |
+
],
|
| 97 |
+
value="English",
|
| 98 |
+
)
|
| 99 |
+
full_speaker_input = gr.Dropdown(
|
| 100 |
+
label="Speaker",
|
| 101 |
+
choices=[
|
| 102 |
+
"Vivian",
|
| 103 |
+
"Serena",
|
| 104 |
+
"Uncle_Fu",
|
| 105 |
+
"Dylan",
|
| 106 |
+
"Eric",
|
| 107 |
+
"Ryan",
|
| 108 |
+
"Aiden",
|
| 109 |
+
"Ono_Anna",
|
| 110 |
+
"Sohee",
|
| 111 |
+
],
|
| 112 |
+
value="Ryan",
|
| 113 |
+
)
|
| 114 |
+
run_full_button = gr.Button(
|
| 115 |
+
"Run full pipeline",
|
| 116 |
+
elem_id="run_full",
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
with gr.Column(scale=1):
|
| 120 |
+
gr.Markdown("### Output")
|
| 121 |
+
full_summary_output = gr.Textbox(label="ASL summary", lines=4)
|
| 122 |
+
full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
|
| 123 |
+
full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
|
| 124 |
+
full_audio_output = gr.Audio(label="Generated audio", type="filepath")
|
| 125 |
+
|
| 126 |
+
with gr.Row():
|
| 127 |
+
full_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
|
| 128 |
+
full_asl_json_output = gr.JSON(label="ASL structured output")
|
| 129 |
+
full_llm_json_output = gr.JSON(label="LLM structured output")
|
| 130 |
+
|
| 131 |
+
with gr.Tab("Brick tests"):
|
| 132 |
+
with gr.Row():
|
| 133 |
+
with gr.Column(scale=1):
|
| 134 |
+
gr.Markdown("### ASL video")
|
| 135 |
+
asl_video_input = build_video_input("Video or camera capture")
|
| 136 |
+
run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
|
| 137 |
+
asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
|
| 138 |
+
asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
|
| 139 |
+
with gr.Column(scale=1):
|
| 140 |
+
asl_json_output = gr.JSON(label="ASL structured output")
|
| 141 |
+
|
| 142 |
+
with gr.Row():
|
| 143 |
+
with gr.Column(scale=1):
|
| 144 |
+
gr.Markdown("### llama.cpp")
|
| 145 |
+
intent_input = gr.Code(
|
| 146 |
+
label="Intent JSON",
|
| 147 |
+
value=json_text(DEFAULT_INTENT),
|
| 148 |
+
language="json",
|
| 149 |
+
lines=14,
|
| 150 |
+
)
|
| 151 |
+
run_llm_button = gr.Button(
|
| 152 |
+
"Generate subtitle",
|
| 153 |
+
elem_id="run_llm",
|
| 154 |
+
)
|
| 155 |
+
with gr.Column(scale=1):
|
| 156 |
+
subtitle_output = gr.Textbox(label="Subtitle", lines=3)
|
| 157 |
+
instruction_output = gr.Textbox(label="Voice instruction", lines=3)
|
| 158 |
+
llm_json_output = gr.JSON(label="LLM structured output")
|
| 159 |
+
|
| 160 |
+
with gr.Row():
|
| 161 |
+
with gr.Column(scale=1):
|
| 162 |
+
gr.Markdown("### Qwen3-TTS")
|
| 163 |
+
tts_language_input = gr.Dropdown(
|
| 164 |
+
label="Language",
|
| 165 |
+
choices=[
|
| 166 |
+
"Auto",
|
| 167 |
+
"Chinese",
|
| 168 |
+
"English",
|
| 169 |
+
"Japanese",
|
| 170 |
+
"Korean",
|
| 171 |
+
"German",
|
| 172 |
+
"French",
|
| 173 |
+
"Russian",
|
| 174 |
+
"Portuguese",
|
| 175 |
+
"Spanish",
|
| 176 |
+
"Italian",
|
| 177 |
+
],
|
| 178 |
+
value="English",
|
| 179 |
+
)
|
| 180 |
+
tts_speaker_input = gr.Dropdown(
|
| 181 |
+
label="Speaker",
|
| 182 |
+
choices=[
|
| 183 |
+
"Vivian",
|
| 184 |
+
"Serena",
|
| 185 |
+
"Uncle_Fu",
|
| 186 |
+
"Dylan",
|
| 187 |
+
"Eric",
|
| 188 |
+
"Ryan",
|
| 189 |
+
"Aiden",
|
| 190 |
+
"Ono_Anna",
|
| 191 |
+
"Sohee",
|
| 192 |
+
],
|
| 193 |
+
value="Ryan",
|
| 194 |
+
)
|
| 195 |
+
run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
|
| 196 |
+
with gr.Column(scale=1):
|
| 197 |
+
audio_output = gr.Audio(label="Generated audio", type="filepath")
|
| 198 |
|
| 199 |
gr.HTML(
|
| 200 |
"""
|
|
|
|
| 204 |
"""
|
| 205 |
)
|
| 206 |
|
| 207 |
+
run_full_button.click(
|
| 208 |
+
fn=run_full_pipeline,
|
| 209 |
+
inputs=[full_video_input, full_language_input, full_speaker_input],
|
| 210 |
+
outputs=[
|
| 211 |
+
full_intent_output,
|
| 212 |
+
full_asl_json_output,
|
| 213 |
+
full_summary_output,
|
| 214 |
+
full_subtitle_output,
|
| 215 |
+
full_instruction_output,
|
| 216 |
+
full_llm_json_output,
|
| 217 |
+
full_audio_output,
|
| 218 |
+
],
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
run_asl_button.click(
|
| 222 |
+
fn=run_asl_brick,
|
| 223 |
+
inputs=[asl_video_input],
|
| 224 |
+
outputs=[asl_intent_output, asl_json_output, asl_summary_output],
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
run_llm_button.click(
|
| 228 |
+
fn=run_llm_brick,
|
| 229 |
inputs=[intent_input],
|
| 230 |
outputs=[subtitle_output, instruction_output, llm_json_output],
|
| 231 |
)
|
| 232 |
|
| 233 |
run_tts_button.click(
|
| 234 |
+
fn=run_tts_brick,
|
| 235 |
inputs=[
|
| 236 |
subtitle_output,
|
| 237 |
+
tts_language_input,
|
| 238 |
+
tts_speaker_input,
|
| 239 |
instruction_output,
|
| 240 |
],
|
| 241 |
outputs=[audio_output],
|
assets/styles.css
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--bg: #080a12;
|
| 3 |
+
--panel: rgba(255, 255, 255, 0.08);
|
| 4 |
+
--panel-strong: rgba(255, 255, 255, 0.12);
|
| 5 |
+
--panel-border: rgba(255, 255, 255, 0.16);
|
| 6 |
+
--text: #f8fafc;
|
| 7 |
+
--muted: #a8b3c7;
|
| 8 |
+
--accent: #2dd4bf;
|
| 9 |
+
--accent-2: #818cf8;
|
| 10 |
+
--warm: #f59e0b;
|
| 11 |
+
--danger: #f43f5e;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
.gradio-container {
|
| 15 |
+
background:
|
| 16 |
+
linear-gradient(135deg, #080a12 0%, #101827 52%, #111322 100%) !important;
|
| 17 |
+
color: var(--text) !important;
|
| 18 |
+
font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
#hero {
|
| 22 |
+
padding: 24px;
|
| 23 |
+
border: 1px solid var(--panel-border);
|
| 24 |
+
border-radius: 8px;
|
| 25 |
+
background: linear-gradient(135deg, rgba(45, 212, 191, 0.14), rgba(129, 140, 248, 0.10));
|
| 26 |
+
box-shadow: 0 18px 52px rgba(0, 0, 0, 0.28);
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
#hero h1 {
|
| 30 |
+
font-size: 38px;
|
| 31 |
+
line-height: 1.08;
|
| 32 |
+
margin-bottom: 8px;
|
| 33 |
+
letter-spacing: 0;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
#hero p {
|
| 37 |
+
color: var(--muted);
|
| 38 |
+
font-size: 16px;
|
| 39 |
+
max-width: 760px;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.badge-row {
|
| 43 |
+
display: flex;
|
| 44 |
+
flex-wrap: wrap;
|
| 45 |
+
gap: 8px;
|
| 46 |
+
margin-top: 14px;
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
.badge {
|
| 50 |
+
padding: 7px 10px;
|
| 51 |
+
border-radius: 8px;
|
| 52 |
+
background: rgba(255, 255, 255, 0.08);
|
| 53 |
+
border: 1px solid rgba(255, 255, 255, 0.16);
|
| 54 |
+
color: #dbeafe;
|
| 55 |
+
font-weight: 700;
|
| 56 |
+
font-size: 13px;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.stage-title {
|
| 60 |
+
margin: 8px 0 4px;
|
| 61 |
+
color: #e2e8f0;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.block,
|
| 65 |
+
.form,
|
| 66 |
+
.panel {
|
| 67 |
+
border-radius: 8px !important;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
textarea,
|
| 71 |
+
input,
|
| 72 |
+
select {
|
| 73 |
+
background: rgba(15, 23, 42, 0.78) !important;
|
| 74 |
+
color: var(--text) !important;
|
| 75 |
+
border-color: rgba(255, 255, 255, 0.14) !important;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
button.primary,
|
| 79 |
+
button {
|
| 80 |
+
border-radius: 8px !important;
|
| 81 |
+
font-weight: 800 !important;
|
| 82 |
+
min-height: 44px !important;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
#run_asl {
|
| 86 |
+
background: linear-gradient(135deg, var(--accent), #22c55e) !important;
|
| 87 |
+
color: #04111a !important;
|
| 88 |
+
border: none !important;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
#run_llm {
|
| 92 |
+
background: linear-gradient(135deg, var(--accent-2), #3b82f6) !important;
|
| 93 |
+
color: white !important;
|
| 94 |
+
border: none !important;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
#run_tts,
|
| 98 |
+
#run_full {
|
| 99 |
+
background: linear-gradient(135deg, var(--warm), #ec4899) !important;
|
| 100 |
+
color: white !important;
|
| 101 |
+
border: none !important;
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.footer-note {
|
| 105 |
+
color: var(--muted);
|
| 106 |
+
font-size: 13px;
|
| 107 |
+
text-align: center;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
@media (max-width: 720px) {
|
| 111 |
+
#hero {
|
| 112 |
+
padding: 18px;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
#hero h1 {
|
| 116 |
+
font-size: 30px;
|
| 117 |
+
}
|
| 118 |
+
}
|
signspeak/llm.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
|
| 9 |
+
LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
|
| 10 |
+
|
| 11 |
+
_llm_model: Any | None = None
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def safe_json_loads(text: str) -> dict[str, Any]:
|
| 15 |
+
try:
|
| 16 |
+
return json.loads(text)
|
| 17 |
+
except Exception:
|
| 18 |
+
return {
|
| 19 |
+
"raw_input": text,
|
| 20 |
+
"warning": "Input was not valid JSON, treated as raw text.",
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def extract_json_object(text: str) -> dict[str, Any]:
|
| 25 |
+
"""
|
| 26 |
+
Extract the first valid JSON object from a model response.
|
| 27 |
+
|
| 28 |
+
Handles pure JSON, markdown fences, and text before or after JSON.
|
| 29 |
+
"""
|
| 30 |
+
if not text:
|
| 31 |
+
raise ValueError("Empty model response")
|
| 32 |
+
|
| 33 |
+
cleaned = text.strip()
|
| 34 |
+
|
| 35 |
+
if cleaned.startswith("```"):
|
| 36 |
+
cleaned = cleaned.replace("```json", "", 1)
|
| 37 |
+
cleaned = cleaned.replace("```JSON", "", 1)
|
| 38 |
+
cleaned = cleaned.replace("```", "")
|
| 39 |
+
cleaned = cleaned.strip()
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
return json.loads(cleaned)
|
| 43 |
+
except Exception:
|
| 44 |
+
pass
|
| 45 |
+
|
| 46 |
+
start = cleaned.find("{")
|
| 47 |
+
end = cleaned.rfind("}")
|
| 48 |
+
|
| 49 |
+
if start == -1 or end == -1 or end <= start:
|
| 50 |
+
raise ValueError(f"No JSON object found in model response: {text}")
|
| 51 |
+
|
| 52 |
+
candidate = cleaned[start : end + 1]
|
| 53 |
+
return json.loads(candidate)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def normalize_llm_output(parsed: dict[str, Any]) -> dict[str, str]:
|
| 57 |
+
subtitle = str(parsed.get("subtitle", "")).strip()
|
| 58 |
+
voice_instruction = str(parsed.get("voice_instruction", "")).strip()
|
| 59 |
+
|
| 60 |
+
if not subtitle:
|
| 61 |
+
subtitle = "I want to say something."
|
| 62 |
+
|
| 63 |
+
if not voice_instruction:
|
| 64 |
+
voice_instruction = "Speak clearly and naturally."
|
| 65 |
+
|
| 66 |
+
forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
|
| 67 |
+
if any(fragment in subtitle for fragment in forbidden_fragments):
|
| 68 |
+
subtitle = "I am happy to see you."
|
| 69 |
+
|
| 70 |
+
return {
|
| 71 |
+
"subtitle": subtitle,
|
| 72 |
+
"voice_instruction": voice_instruction,
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str, dict[str, Any]]:
|
| 77 |
+
intent = safe_json_loads(intent_json_text)
|
| 78 |
+
|
| 79 |
+
system_prompt = (
|
| 80 |
+
"You are an assistant inside an ASL-to-speech accessibility app. "
|
| 81 |
+
"Convert detected ASL glosses and emotion metadata into speech output. "
|
| 82 |
+
"You must return raw JSON only. "
|
| 83 |
+
"Do not use markdown. "
|
| 84 |
+
"Do not wrap the response in ```json fences. "
|
| 85 |
+
"Return exactly this schema: "
|
| 86 |
+
'{"subtitle": "...", "voice_instruction": "..."}'
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
user_prompt = f"""
|
| 90 |
+
Input intent data:
|
| 91 |
+
{json.dumps(intent, ensure_ascii=False, indent=2)}
|
| 92 |
+
|
| 93 |
+
Task:
|
| 94 |
+
Generate a short natural subtitle and a TTS voice instruction.
|
| 95 |
+
|
| 96 |
+
Rules:
|
| 97 |
+
- Return raw JSON only.
|
| 98 |
+
- Do not use markdown.
|
| 99 |
+
- Do not include explanations.
|
| 100 |
+
- Do not include code fences.
|
| 101 |
+
- The subtitle must be only the sentence to speak.
|
| 102 |
+
- The voice_instruction must describe tone, emotion, pace, and intensity.
|
| 103 |
+
- Do not copy JSON keys into the subtitle.
|
| 104 |
+
|
| 105 |
+
Expected output format:
|
| 106 |
+
{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
llm = get_llm_model()
|
| 110 |
+
|
| 111 |
+
result = llm.create_chat_completion(
|
| 112 |
+
messages=[
|
| 113 |
+
{"role": "system", "content": system_prompt},
|
| 114 |
+
{"role": "user", "content": user_prompt},
|
| 115 |
+
],
|
| 116 |
+
temperature=0.1,
|
| 117 |
+
max_tokens=96,
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
raw_content = result["choices"][0]["message"]["content"].strip()
|
| 121 |
+
|
| 122 |
+
try:
|
| 123 |
+
parsed = extract_json_object(raw_content)
|
| 124 |
+
normalized: dict[str, Any] = normalize_llm_output(parsed)
|
| 125 |
+
except Exception as error:
|
| 126 |
+
normalized = {
|
| 127 |
+
"subtitle": "I am happy to see you.",
|
| 128 |
+
"voice_instruction": "Speak warmly, joyfully, and clearly.",
|
| 129 |
+
"parser_warning": str(error),
|
| 130 |
+
"raw_model_output": raw_content,
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
return (
|
| 134 |
+
normalized["subtitle"],
|
| 135 |
+
normalized["voice_instruction"],
|
| 136 |
+
normalized,
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def get_llm_model() -> Any:
    """Return the shared llama.cpp model, loading it on first use.

    The instance is cached in the module-level ``_llm_model`` so the GGUF
    file named by ``LLM_REPO_ID`` / ``LLM_FILENAME`` is only loaded once.

    Returns:
        The cached ``llama_cpp.Llama`` instance.
    """
    global _llm_model

    if _llm_model is None:
        # Heavy dependencies are imported lazily, only when a model is
        # actually needed.
        import torch
        from llama_cpp import Llama

        worker_threads = max(2, os.cpu_count() or 2)
        # -1 is passed when CUDA is visible, 0 otherwise (CPU only).
        gpu_layers = -1 if torch.cuda.is_available() else 0

        _llm_model = Llama.from_pretrained(
            repo_id=LLM_REPO_ID,
            filename=LLM_FILENAME,
            n_ctx=1024,
            n_threads=worker_threads,
            n_gpu_layers=gpu_layers,
            verbose=True,
        )

    return _llm_model
|
| 159 |
+
|
signspeak/pipeline.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from .asl import process_asl_video
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Mock intent payload; its "pipeline_stage" value marks it as test data for
# the llama.cpp step.
DEFAULT_INTENT = {
    "detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
    "detected_facial_expression": "happy",
    "emotion_profile": {"dominant": "joy", "confidence": 0.83},
    "communication_intent": "friendly_greeting",
    "pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
}

# Bundled example clip, located two directory levels above this module.
DEFAULT_VIDEO_PATH = Path(__file__).resolve().parents[1].joinpath(
    "data", "examples", "videoplayback.mp4"
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def json_text(data: dict[str, Any]) -> str:
    """Serialize ``data`` as human-readable JSON.

    Output is indented by two spaces and non-ASCII characters are emitted
    verbatim rather than as ``\\uXXXX`` escapes.
    """
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    return rendered
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def run_asl_video(video_file: str | None) -> tuple[str, dict[str, Any], str]:
    """Run the ASL analysis on a video and package its outputs.

    Args:
        video_file: Path of the video to analyse; when empty or ``None`` the
            path chosen by ``resolve_video_path`` is used instead.

    Returns:
        A 3-tuple of (the ``"intent_input"`` entry rendered as JSON text,
        the full result returned by ``process_asl_video``, a short
        multi-line summary from ``summarize_asl_result``).

    Raises:
        KeyError: If the analysis result has no ``"intent_input"`` entry.
    """
    analysis = process_asl_video(resolve_video_path(video_file))
    intent_json = json_text(analysis["intent_input"])
    overview = summarize_asl_result(analysis)
    return intent_json, analysis, overview
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def resolve_video_path(video_file: str | None) -> Path:
|
| 39 |
+
if video_file:
|
| 40 |
+
return Path(video_file)
|
| 41 |
+
if DEFAULT_VIDEO_PATH.exists():
|
| 42 |
+
return DEFAULT_VIDEO_PATH
|
| 43 |
+
return create_synthetic_demo_video()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def create_synthetic_demo_video() -> Path:
    """Create, or reuse, a small synthetic clip used as fallback input.

    The clip has 36 frames of 320x240 written at 12 fps: two filled circles
    moving horizontally in opposite directions over a dark background, with
    a "SignSpeak demo" caption. It is stored under the system temp directory
    as ``signspeak_demo_input.mp4`` and reused by later calls.

    The frames are first written to a unique scratch file and only moved to
    the final name once writing succeeded, so an interrupted or failed run
    never leaves a truncated file that later calls would reuse.

    Returns:
        Path of the demo video.

    Raises:
        RuntimeError: If OpenCV cannot be imported or the video writer
            cannot be opened.
    """
    try:
        import cv2
    except Exception as exc:
        # Kept broad on purpose: any failure to import cv2 is reported the
        # same way.
        raise RuntimeError("OpenCV is required to create the fallback demo video.") from exc

    output_path = Path(tempfile.gettempdir()) / "signspeak_demo_input.mp4"
    # Reuse a previous clip only if it actually contains data; an empty
    # leftover file is regenerated.
    if output_path.is_file() and output_path.stat().st_size > 0:
        return output_path

    # Reserve a unique scratch name in the same directory (same filesystem,
    # so the final move is a plain rename). The ".mp4" suffix is kept on the
    # scratch file as well.
    with tempfile.NamedTemporaryFile(
        prefix="signspeak_demo_",
        suffix=".mp4",
        dir=str(output_path.parent),
        delete=False,
    ) as scratch_file:
        scratch_path = Path(scratch_file.name)

    width, height = 320, 240
    try:
        writer = cv2.VideoWriter(
            str(scratch_path),
            cv2.VideoWriter_fourcc(*"mp4v"),
            12,
            (width, height),
        )
        try:
            if not writer.isOpened():
                raise RuntimeError(f"Could not create fallback demo video: {output_path}")

            for frame_idx in range(36):
                frame = np.zeros((height, width, 3), dtype=np.uint8)
                frame[:, :] = (12, 18, 30)
                center_x = 80 + frame_idx * 4
                cv2.circle(frame, (center_x, 96), 22, (45, 212, 191), -1)
                cv2.circle(frame, (width - center_x, 144), 18, (129, 140, 248), -1)
                cv2.putText(
                    frame,
                    "SignSpeak demo",
                    (36, 214),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.62,
                    (248, 250, 252),
                    2,
                    cv2.LINE_AA,
                )
                writer.write(frame)
        finally:
            writer.release()

        # Publish the finished clip under its final name.
        scratch_path.replace(output_path)
    except BaseException:
        # Never leave a partial scratch file behind; the error is re-raised
        # unchanged.
        scratch_path.unlink(missing_ok=True)
        raise

    return output_path
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def summarize_asl_result(result: dict[str, Any]) -> str:
    """Build a short multi-line, human-readable summary of an ASL result.

    Args:
        result: Analysis result; the optional ``"asl"`` and ``"emotion"``
            sections are read. A section that is missing or ``None`` is
            treated as empty.

    Returns:
        Four lines: ASL status, top prediction, landmark status and
        detector, and dominant emotion with its intensity to two decimals.
        Missing values are shown as ``unknown`` (``None`` for the top
        prediction, ``0.00`` for the intensity).
    """
    # `or {}` also covers sections that are present but set to None, which
    # would otherwise fail on `.get` below.
    asl = result.get("asl") or {}
    emotion = result.get("emotion") or {}
    # A missing or None intensity is reported as 0.0.
    intensity = float(emotion.get("intensity", 0.0) or 0.0)

    return (
        f"ASL status: {asl.get('status', 'unknown')}\n"
        f"Top prediction: {asl.get('top_prediction')}\n"
        f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
        f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
        f"({intensity:.2f})"
    )
|
signspeak/tts.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
import time
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
|
| 10 |
+
|
| 11 |
+
_tts_model: Any | None = None
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def generate_tts(text: str, language: str, speaker: str, instruction: str) -> str:
    """Synthesize ``text`` to a WAV file and return the file path.

    Args:
        text: Sentence to speak; surrounding whitespace is stripped.
        language: Passed through to the model's ``language`` argument.
        speaker: Passed through to the model's ``speaker`` argument.
        instruction: Voice/style instruction, stripped and passed as
            ``instruct``; ``None`` is treated as an empty string.

    Returns:
        Path of a newly created ``qwen_tts_*.wav`` file in the system temp
        directory. The caller owns the file.

    Raises:
        ValueError: If ``text`` is empty after stripping.
    """
    text = (text or "").strip()
    instruction = (instruction or "").strip()

    if not text:
        raise ValueError("Aucun subtitle a synthetiser.")

    tts = get_tts_model()

    wavs, sr = tts.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruction,
    )

    import soundfile as sf

    # mkstemp guarantees a unique file, so two requests arriving in the same
    # millisecond can no longer overwrite each other's audio (the previous
    # name was derived from the clock only).
    fd, output_path = tempfile.mkstemp(prefix="qwen_tts_", suffix=".wav")
    os.close(fd)

    try:
        # Only the first returned waveform is written.
        sf.write(output_path, wavs[0], sr)
    except BaseException:
        # Do not leave an empty or partial file behind; re-raise unchanged.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise

    return output_path
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def get_tts_model() -> Any:
    """Return the shared TTS model, loading it on first use.

    The model identified by ``TTS_MODEL_ID`` is loaded onto ``cuda:0`` in
    bfloat16 when CUDA is available, otherwise onto the CPU in float32, and
    cached in the module-level ``_tts_model``.

    Returns:
        The cached ``Qwen3TTSModel`` instance.
    """
    global _tts_model

    if _tts_model is None:
        # Heavy dependencies are imported lazily, only when a model is
        # actually needed.
        import torch
        from qwen_tts import Qwen3TTSModel

        if torch.cuda.is_available():
            target_device, weight_dtype = "cuda:0", torch.bfloat16
        else:
            target_device, weight_dtype = "cpu", torch.float32

        _tts_model = Qwen3TTSModel.from_pretrained(
            TTS_MODEL_ID,
            device_map=target_device,
            dtype=weight_dtype,
        )

    return _tts_model
|
| 64 |
+
|