lilblueyes committed on
Commit
b0558d5
·
1 Parent(s): 57d04e5

Refactor app into pipeline bricks

Browse files
Files changed (5) hide show
  1. app.py +191 -401
  2. assets/styles.css +118 -0
  3. signspeak/llm.py +159 -0
  4. signspeak/pipeline.py +99 -0
  5. signspeak/tts.py +64 -0
app.py CHANGED
@@ -1,430 +1,200 @@
1
- import os
2
- import json
3
- import time
4
- import tempfile
5
-
6
- import gradio as gr
7
- import soundfile as sf
8
- import torch
9
-
10
- from qwen_tts import Qwen3TTSModel
11
- from llama_cpp import Llama
12
-
13
-
14
- TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
15
-
16
- LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
17
- LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
18
-
19
- tts_model = None
20
- llm_model = None
21
-
22
-
23
- CUSTOM_CSS = """
24
- :root {
25
- --bg: #050816;
26
- --panel: rgba(255, 255, 255, 0.075);
27
- --panel-border: rgba(255, 255, 255, 0.16);
28
- --text: #f8fafc;
29
- --muted: #94a3b8;
30
- --accent: #8b5cf6;
31
- --accent-2: #06b6d4;
32
- }
33
-
34
- .gradio-container {
35
- background:
36
- radial-gradient(circle at 20% 20%, rgba(139, 92, 246, 0.30), transparent 28%),
37
- radial-gradient(circle at 80% 0%, rgba(6, 182, 212, 0.24), transparent 28%),
38
- linear-gradient(135deg, #050816 0%, #0f172a 55%, #111827 100%) !important;
39
- color: var(--text) !important;
40
- font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
41
- }
42
-
43
- #hero {
44
- padding: 28px;
45
- border: 1px solid var(--panel-border);
46
- border-radius: 28px;
47
- background: linear-gradient(135deg, rgba(255,255,255,0.10), rgba(255,255,255,0.04));
48
- box-shadow: 0 24px 80px rgba(0,0,0,0.35);
49
- backdrop-filter: blur(18px);
50
- }
51
-
52
- #hero h1 {
53
- font-size: 42px;
54
- line-height: 1.05;
55
- margin-bottom: 8px;
56
- letter-spacing: -0.04em;
57
- }
58
-
59
- #hero p {
60
- color: var(--muted);
61
- font-size: 16px;
62
- }
63
-
64
- .badge-row {
65
- display: flex;
66
- flex-wrap: wrap;
67
- gap: 10px;
68
- margin-top: 16px;
69
- }
70
-
71
- .badge {
72
- padding: 8px 12px;
73
- border-radius: 999px;
74
- background: rgba(139, 92, 246, 0.16);
75
- border: 1px solid rgba(139, 92, 246, 0.34);
76
- color: #ddd6fe;
77
- font-weight: 700;
78
- font-size: 13px;
79
- }
80
-
81
- .block, .form, .panel {
82
- border-radius: 22px !important;
83
- }
84
-
85
- textarea, input, select {
86
- background: rgba(15, 23, 42, 0.72) !important;
87
- color: var(--text) !important;
88
- border-color: rgba(255,255,255,0.14) !important;
89
- }
90
-
91
- button.primary, button {
92
- border-radius: 999px !important;
93
- font-weight: 800 !important;
94
- }
95
-
96
- #run_llm {
97
- background: linear-gradient(135deg, var(--accent), var(--accent-2)) !important;
98
- color: white !important;
99
- border: none !important;
100
- }
101
-
102
- #run_tts {
103
- background: linear-gradient(135deg, #f97316, #ec4899) !important;
104
- color: white !important;
105
- border: none !important;
106
- }
107
-
108
- .footer-note {
109
- color: var(--muted);
110
- font-size: 13px;
111
- text-align: center;
112
- }
113
- """
114
 
 
115
 
116
- def get_tts_model():
117
- global tts_model
118
-
119
- if tts_model is not None:
120
- return tts_model
121
-
122
- if torch.cuda.is_available():
123
- tts_model = Qwen3TTSModel.from_pretrained(
124
- TTS_MODEL_ID,
125
- device_map="cuda:0",
126
- dtype=torch.bfloat16,
127
- )
128
- else:
129
- tts_model = Qwen3TTSModel.from_pretrained(
130
- TTS_MODEL_ID,
131
- device_map="cpu",
132
- dtype=torch.float32,
133
- )
134
-
135
- return tts_model
136
-
137
 
138
- def get_llm_model():
139
- global llm_model
 
140
 
141
- if llm_model is not None:
142
- return llm_model
143
 
144
- # llama-cpp-python downloads the GGUF from Hugging Face.
145
- # Q4_K_M is a good first compromise for CPU Spaces.
146
- llm_model = Llama.from_pretrained(
147
- repo_id=LLM_REPO_ID,
148
- filename=LLM_FILENAME,
149
- n_ctx=1024,
150
- n_threads=max(2, os.cpu_count() or 2),
151
- n_gpu_layers=-1 if torch.cuda.is_available() else 0,
152
- verbose=True,
153
- )
154
 
155
- return llm_model
156
 
157
-
158
- def safe_json_loads(text):
159
  try:
160
- return json.loads(text)
161
- except Exception:
162
- return {
163
- "raw_input": text,
164
- "warning": "Input was not valid JSON, treated as raw text.",
165
- }
166
-
167
-
168
- def extract_json_object(text):
169
- """
170
- Extract the first valid JSON object from a model response.
171
-
172
- Handles:
173
- - pure JSON
174
- - ```json ... ```
175
- - text before/after JSON
176
- """
177
- if not text:
178
- raise ValueError("Empty model response")
179
-
180
- cleaned = text.strip()
181
 
182
- if cleaned.startswith("```"):
183
- cleaned = cleaned.replace("```json", "", 1)
184
- cleaned = cleaned.replace("```JSON", "", 1)
185
- cleaned = cleaned.replace("```", "")
186
- cleaned = cleaned.strip()
187
 
 
188
  try:
189
- return json.loads(cleaned)
190
- except Exception:
191
- pass
192
-
193
- start = cleaned.find("{")
194
- end = cleaned.rfind("}")
195
-
196
- if start == -1 or end == -1 or end <= start:
197
- raise ValueError(f"No JSON object found in model response: {text}")
198
-
199
- candidate = cleaned[start:end + 1]
200
- return json.loads(candidate)
201
-
202
-
203
- def normalize_llm_output(parsed):
204
- subtitle = str(parsed.get("subtitle", "")).strip()
205
- voice_instruction = str(parsed.get("voice_instruction", "")).strip()
206
-
207
- if not subtitle:
208
- subtitle = "I want to say something."
209
-
210
- if not voice_instruction:
211
- voice_instruction = "Speak clearly and naturally."
212
-
213
- forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
214
- if any(fragment in subtitle for fragment in forbidden_fragments):
215
- subtitle = "I am happy to see you."
216
-
217
- return {
218
- "subtitle": subtitle,
219
- "voice_instruction": voice_instruction,
220
- }
221
-
222
 
223
- def generate_subtitle_and_instruction(intent_json_text):
224
- intent = safe_json_loads(intent_json_text)
225
-
226
- system_prompt = (
227
- "You are an assistant inside an ASL-to-speech accessibility app. "
228
- "Convert detected ASL glosses and emotion metadata into speech output. "
229
- "You must return raw JSON only. "
230
- "Do not use markdown. "
231
- "Do not wrap the response in ```json fences. "
232
- "Return exactly this schema: "
233
- '{"subtitle": "...", "voice_instruction": "..."}'
234
- )
235
-
236
- user_prompt = f"""
237
- Input intent data:
238
- {json.dumps(intent, ensure_ascii=False, indent=2)}
239
-
240
- Task:
241
- Generate a short natural subtitle and a TTS voice instruction.
242
-
243
- Rules:
244
- - Return raw JSON only.
245
- - Do not use markdown.
246
- - Do not include explanations.
247
- - Do not include code fences.
248
- - The subtitle must be only the sentence to speak.
249
- - The voice_instruction must describe tone, emotion, pace, and intensity.
250
- - Do not copy JSON keys into the subtitle.
251
-
252
- Expected output format:
253
- {{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
254
- """
255
-
256
- llm = get_llm_model()
257
-
258
- result = llm.create_chat_completion(
259
- messages=[
260
- {"role": "system", "content": system_prompt},
261
- {"role": "user", "content": user_prompt},
262
- ],
263
- temperature=0.1,
264
- max_tokens=96,
265
- )
266
-
267
- raw_content = result["choices"][0]["message"]["content"].strip()
268
 
 
269
  try:
270
- parsed = extract_json_object(raw_content)
271
- normalized = normalize_llm_output(parsed)
272
- except Exception as error:
273
- normalized = {
274
- "subtitle": "I am happy to see you.",
275
- "voice_instruction": "Speak warmly, joyfully, and clearly.",
276
- "parser_warning": str(error),
277
- "raw_model_output": raw_content,
278
- }
279
-
280
- return (
281
- normalized["subtitle"],
282
- normalized["voice_instruction"],
283
- normalized,
284
- )
285
-
286
-
287
- def generate_tts(text, language, speaker, instruction):
288
- text = (text or "").strip()
289
- instruction = (instruction or "").strip()
290
-
291
- if not text:
292
- raise gr.Error("Aucun subtitle à synthétiser.")
293
-
294
- tts = get_tts_model()
295
-
296
- wavs, sr = tts.generate_custom_voice(
297
- text=text,
298
- language=language,
299
- speaker=speaker,
300
- instruct=instruction,
301
  )
302
 
303
- output_path = os.path.join(
304
- tempfile.gettempdir(),
305
- f"qwen_tts_{int(time.time() * 1000)}.wav",
306
- )
307
-
308
- sf.write(output_path, wavs[0], sr)
309
-
310
- return output_path
311
 
312
-
313
- DEFAULT_INTENT = {
314
- "detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
315
- "detected_facial_expression": "happy",
316
- "emotion_profile": {
317
- "dominant": "joy",
318
- "confidence": 0.83,
319
- },
320
- "communication_intent": "friendly_greeting",
321
- "pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
322
- }
323
-
324
-
325
- with gr.Blocks(
326
- title="SignSpeak Local",
327
- ) as demo:
328
  gr.HTML(
329
  """
330
  <section id="hero">
331
  <h1>SignSpeak Local</h1>
332
  <p>
333
- ASL video to expressive speech, built as a local-first accessibility pipeline.
334
- Current milestone: llama.cpp intent generation + Qwen3-TTS voice synthesis.
335
  </p>
336
  <div class="badge-row">
 
 
337
  <span class="badge">llama.cpp</span>
338
- <span class="badge">local-first</span>
339
- <span class="badge">custom Gradio UI</span>
340
- <span class="badge">expressive TTS</span>
341
  </div>
342
  </section>
343
  """
344
  )
345
 
346
- with gr.Row():
347
- with gr.Column(scale=1):
348
- gr.Markdown("## 1. Intent input")
349
-
350
- intent_input = gr.Textbox(
351
- label="Mock intent JSON",
352
- value=json.dumps(DEFAULT_INTENT, ensure_ascii=False, indent=2),
353
- lines=13,
354
- )
355
-
356
- run_llm_button = gr.Button(
357
- "Generate subtitle with llama.cpp",
358
- elem_id="run_llm",
359
- )
360
-
361
- with gr.Column(scale=1):
362
- gr.Markdown("## 2. llama.cpp output")
363
-
364
- subtitle_output = gr.Textbox(
365
- label="Subtitle",
366
- lines=3,
367
- )
368
-
369
- instruction_output = gr.Textbox(
370
- label="Voice instruction",
371
- lines=3,
372
- )
373
-
374
- llm_json_output = gr.JSON(
375
- label="LLM structured output",
376
- )
377
-
378
- with gr.Row():
379
- with gr.Column(scale=1):
380
- gr.Markdown("## 3. Voice synthesis")
381
-
382
- language_input = gr.Dropdown(
383
- label="Language",
384
- choices=[
385
- "Auto",
386
- "Chinese",
387
- "English",
388
- "Japanese",
389
- "Korean",
390
- "German",
391
- "French",
392
- "Russian",
393
- "Portuguese",
394
- "Spanish",
395
- "Italian",
396
- ],
397
- value="English",
398
- )
399
-
400
- speaker_input = gr.Dropdown(
401
- label="Speaker",
402
- choices=[
403
- "Vivian",
404
- "Serena",
405
- "Uncle_Fu",
406
- "Dylan",
407
- "Eric",
408
- "Ryan",
409
- "Aiden",
410
- "Ono_Anna",
411
- "Sohee",
412
- ],
413
- value="Ryan",
414
- )
415
-
416
- run_tts_button = gr.Button(
417
- "Generate expressive speech",
418
- elem_id="run_tts",
419
- )
420
-
421
- with gr.Column(scale=1):
422
- gr.Markdown("## 4. Result")
423
-
424
- audio_output = gr.Audio(
425
- label="Generated audio",
426
- type="filepath",
427
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
  gr.HTML(
430
  """
@@ -434,18 +204,38 @@ with gr.Blocks(
434
  """
435
  )
436
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  run_llm_button.click(
438
- fn=generate_subtitle_and_instruction,
439
  inputs=[intent_input],
440
  outputs=[subtitle_output, instruction_output, llm_json_output],
441
  )
442
 
443
  run_tts_button.click(
444
- fn=generate_tts,
445
  inputs=[
446
  subtitle_output,
447
- language_input,
448
- speaker_input,
449
  instruction_output,
450
  ],
451
  outputs=[audio_output],
 
1
+ from __future__ import annotations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ from pathlib import Path
4
 
5
+ import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ from signspeak.llm import generate_subtitle_and_instruction
8
+ from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
9
+ from signspeak.tts import generate_tts
10
 
 
 
11
 
12
+ APP_DIR = Path(__file__).resolve().parent
13
+ CUSTOM_CSS = (APP_DIR / "assets" / "styles.css").read_text(encoding="utf-8")
 
 
 
 
 
 
 
 
14
 
 
15
 
16
def run_asl_brick(video_file: str | None) -> tuple[str, dict, str]:
    """Run the ASL video brick and surface any failure as a Gradio error.

    Delegates to ``run_asl_video`` and returns its result unchanged. Any
    exception is re-raised as ``gr.Error`` carrying the original exception
    type name and message, chained to the original exception.
    """
    try:
        outcome = run_asl_video(video_file)
    except Exception as exc:
        message = f"ASL pipeline failed: {type(exc).__name__}: {exc}"
        raise gr.Error(message) from exc
    return outcome
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
 
 
 
22
 
23
def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
    """Run the llama.cpp brick and surface any failure as a Gradio error.

    Delegates to ``generate_subtitle_and_instruction`` and returns its result
    unchanged. Any exception is re-raised as ``gr.Error`` carrying the
    original exception type name and message, chained to the original.
    """
    try:
        outcome = generate_subtitle_and_instruction(intent_json_text)
    except Exception as exc:
        message = f"llama.cpp generation failed: {type(exc).__name__}: {exc}"
        raise gr.Error(message) from exc
    return outcome
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
    """Run the Qwen3-TTS brick and surface any failure as a Gradio error.

    Delegates to ``generate_tts`` and returns its result unchanged. Any
    exception is re-raised as ``gr.Error`` carrying the original exception
    type name and message, chained to the original.
    """
    try:
        audio_file = generate_tts(text, language, speaker, instruction)
    except Exception as exc:
        message = f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}"
        raise gr.Error(message) from exc
    return audio_file
35
+
36
+
37
def run_full_pipeline(
    video_file: str | None,
    language: str,
    speaker: str,
) -> tuple[str, dict, str, str, str, dict, str]:
    """Chain the ASL, llama.cpp and TTS bricks for a single video.

    The intent JSON produced by the ASL brick feeds the LLM brick, and the
    LLM's subtitle and voice instruction feed the TTS brick.

    Returns:
        A 7-tuple in this order: the three ASL brick outputs, the three LLM
        brick outputs (subtitle, instruction, structured result), and the
        TTS brick output.
    """
    intent_text, asl_payload, asl_digest = run_asl_brick(video_file)
    spoken_text, voice_style, llm_payload = run_llm_brick(intent_text)
    wav_path = run_tts_brick(spoken_text, language, speaker, voice_style)
    return (
        intent_text,
        asl_payload,
        asl_digest,
        spoken_text,
        voice_style,
        llm_payload,
        wav_path,
    )
46
+
47
+
48
def build_video_input(label: str) -> gr.Video:
    """Create a video input component configured for upload or webcam capture.

    Args:
        label: Label passed to the Gradio component.
    """
    # NOTE(review): confirm that the installed Gradio version's gr.Video
    # accepts a `type` keyword; it is passed through exactly as before.
    video_options = {
        "label": label,
        "sources": ["upload", "webcam"],
        "type": "filepath",
        "format": "mp4",
    }
    return gr.Video(**video_options)
55
 
 
 
 
 
 
 
 
 
56
 
57
+ with gr.Blocks(title="SignSpeak Local") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  gr.HTML(
59
  """
60
  <section id="hero">
61
  <h1>SignSpeak Local</h1>
62
  <p>
63
+ ASL video to expressive speech, with independent ASL, llama.cpp,
64
+ and Qwen3-TTS bricks for controlled demo runs.
65
  </p>
66
  <div class="badge-row">
67
+ <span class="badge">ASL video</span>
68
+ <span class="badge">live camera</span>
69
  <span class="badge">llama.cpp</span>
70
+ <span class="badge">Qwen3-TTS</span>
 
 
71
  </div>
72
  </section>
73
  """
74
  )
75
 
76
+ with gr.Tabs():
77
+ with gr.Tab("Full pipeline"):
78
+ with gr.Row():
79
+ with gr.Column(scale=1):
80
+ gr.Markdown("### Input")
81
+ full_video_input = build_video_input("Video or camera capture")
82
+ full_language_input = gr.Dropdown(
83
+ label="Language",
84
+ choices=[
85
+ "Auto",
86
+ "Chinese",
87
+ "English",
88
+ "Japanese",
89
+ "Korean",
90
+ "German",
91
+ "French",
92
+ "Russian",
93
+ "Portuguese",
94
+ "Spanish",
95
+ "Italian",
96
+ ],
97
+ value="English",
98
+ )
99
+ full_speaker_input = gr.Dropdown(
100
+ label="Speaker",
101
+ choices=[
102
+ "Vivian",
103
+ "Serena",
104
+ "Uncle_Fu",
105
+ "Dylan",
106
+ "Eric",
107
+ "Ryan",
108
+ "Aiden",
109
+ "Ono_Anna",
110
+ "Sohee",
111
+ ],
112
+ value="Ryan",
113
+ )
114
+ run_full_button = gr.Button(
115
+ "Run full pipeline",
116
+ elem_id="run_full",
117
+ )
118
+
119
+ with gr.Column(scale=1):
120
+ gr.Markdown("### Output")
121
+ full_summary_output = gr.Textbox(label="ASL summary", lines=4)
122
+ full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
123
+ full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
124
+ full_audio_output = gr.Audio(label="Generated audio", type="filepath")
125
+
126
+ with gr.Row():
127
+ full_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
128
+ full_asl_json_output = gr.JSON(label="ASL structured output")
129
+ full_llm_json_output = gr.JSON(label="LLM structured output")
130
+
131
+ with gr.Tab("Brick tests"):
132
+ with gr.Row():
133
+ with gr.Column(scale=1):
134
+ gr.Markdown("### ASL video")
135
+ asl_video_input = build_video_input("Video or camera capture")
136
+ run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
137
+ asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
138
+ asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
139
+ with gr.Column(scale=1):
140
+ asl_json_output = gr.JSON(label="ASL structured output")
141
+
142
+ with gr.Row():
143
+ with gr.Column(scale=1):
144
+ gr.Markdown("### llama.cpp")
145
+ intent_input = gr.Code(
146
+ label="Intent JSON",
147
+ value=json_text(DEFAULT_INTENT),
148
+ language="json",
149
+ lines=14,
150
+ )
151
+ run_llm_button = gr.Button(
152
+ "Generate subtitle",
153
+ elem_id="run_llm",
154
+ )
155
+ with gr.Column(scale=1):
156
+ subtitle_output = gr.Textbox(label="Subtitle", lines=3)
157
+ instruction_output = gr.Textbox(label="Voice instruction", lines=3)
158
+ llm_json_output = gr.JSON(label="LLM structured output")
159
+
160
+ with gr.Row():
161
+ with gr.Column(scale=1):
162
+ gr.Markdown("### Qwen3-TTS")
163
+ tts_language_input = gr.Dropdown(
164
+ label="Language",
165
+ choices=[
166
+ "Auto",
167
+ "Chinese",
168
+ "English",
169
+ "Japanese",
170
+ "Korean",
171
+ "German",
172
+ "French",
173
+ "Russian",
174
+ "Portuguese",
175
+ "Spanish",
176
+ "Italian",
177
+ ],
178
+ value="English",
179
+ )
180
+ tts_speaker_input = gr.Dropdown(
181
+ label="Speaker",
182
+ choices=[
183
+ "Vivian",
184
+ "Serena",
185
+ "Uncle_Fu",
186
+ "Dylan",
187
+ "Eric",
188
+ "Ryan",
189
+ "Aiden",
190
+ "Ono_Anna",
191
+ "Sohee",
192
+ ],
193
+ value="Ryan",
194
+ )
195
+ run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
196
+ with gr.Column(scale=1):
197
+ audio_output = gr.Audio(label="Generated audio", type="filepath")
198
 
199
  gr.HTML(
200
  """
 
204
  """
205
  )
206
 
207
+ run_full_button.click(
208
+ fn=run_full_pipeline,
209
+ inputs=[full_video_input, full_language_input, full_speaker_input],
210
+ outputs=[
211
+ full_intent_output,
212
+ full_asl_json_output,
213
+ full_summary_output,
214
+ full_subtitle_output,
215
+ full_instruction_output,
216
+ full_llm_json_output,
217
+ full_audio_output,
218
+ ],
219
+ )
220
+
221
+ run_asl_button.click(
222
+ fn=run_asl_brick,
223
+ inputs=[asl_video_input],
224
+ outputs=[asl_intent_output, asl_json_output, asl_summary_output],
225
+ )
226
+
227
  run_llm_button.click(
228
+ fn=run_llm_brick,
229
  inputs=[intent_input],
230
  outputs=[subtitle_output, instruction_output, llm_json_output],
231
  )
232
 
233
  run_tts_button.click(
234
+ fn=run_tts_brick,
235
  inputs=[
236
  subtitle_output,
237
+ tts_language_input,
238
+ tts_speaker_input,
239
  instruction_output,
240
  ],
241
  outputs=[audio_output],
assets/styles.css ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --bg: #080a12;
3
+ --panel: rgba(255, 255, 255, 0.08);
4
+ --panel-strong: rgba(255, 255, 255, 0.12);
5
+ --panel-border: rgba(255, 255, 255, 0.16);
6
+ --text: #f8fafc;
7
+ --muted: #a8b3c7;
8
+ --accent: #2dd4bf;
9
+ --accent-2: #818cf8;
10
+ --warm: #f59e0b;
11
+ --danger: #f43f5e;
12
+ }
13
+
14
+ .gradio-container {
15
+ background:
16
+ linear-gradient(135deg, #080a12 0%, #101827 52%, #111322 100%) !important;
17
+ color: var(--text) !important;
18
+ font-family: Inter, ui-sans-serif, system-ui, sans-serif !important;
19
+ }
20
+
21
+ #hero {
22
+ padding: 24px;
23
+ border: 1px solid var(--panel-border);
24
+ border-radius: 8px;
25
+ background: linear-gradient(135deg, rgba(45, 212, 191, 0.14), rgba(129, 140, 248, 0.10));
26
+ box-shadow: 0 18px 52px rgba(0, 0, 0, 0.28);
27
+ }
28
+
29
+ #hero h1 {
30
+ font-size: 38px;
31
+ line-height: 1.08;
32
+ margin-bottom: 8px;
33
+ letter-spacing: 0;
34
+ }
35
+
36
+ #hero p {
37
+ color: var(--muted);
38
+ font-size: 16px;
39
+ max-width: 760px;
40
+ }
41
+
42
+ .badge-row {
43
+ display: flex;
44
+ flex-wrap: wrap;
45
+ gap: 8px;
46
+ margin-top: 14px;
47
+ }
48
+
49
+ .badge {
50
+ padding: 7px 10px;
51
+ border-radius: 8px;
52
+ background: rgba(255, 255, 255, 0.08);
53
+ border: 1px solid rgba(255, 255, 255, 0.16);
54
+ color: #dbeafe;
55
+ font-weight: 700;
56
+ font-size: 13px;
57
+ }
58
+
59
+ .stage-title {
60
+ margin: 8px 0 4px;
61
+ color: #e2e8f0;
62
+ }
63
+
64
+ .block,
65
+ .form,
66
+ .panel {
67
+ border-radius: 8px !important;
68
+ }
69
+
70
+ textarea,
71
+ input,
72
+ select {
73
+ background: rgba(15, 23, 42, 0.78) !important;
74
+ color: var(--text) !important;
75
+ border-color: rgba(255, 255, 255, 0.14) !important;
76
+ }
77
+
78
+ button.primary,
79
+ button {
80
+ border-radius: 8px !important;
81
+ font-weight: 800 !important;
82
+ min-height: 44px !important;
83
+ }
84
+
85
+ #run_asl {
86
+ background: linear-gradient(135deg, var(--accent), #22c55e) !important;
87
+ color: #04111a !important;
88
+ border: none !important;
89
+ }
90
+
91
+ #run_llm {
92
+ background: linear-gradient(135deg, var(--accent-2), #3b82f6) !important;
93
+ color: white !important;
94
+ border: none !important;
95
+ }
96
+
97
+ #run_tts,
98
+ #run_full {
99
+ background: linear-gradient(135deg, var(--warm), #ec4899) !important;
100
+ color: white !important;
101
+ border: none !important;
102
+ }
103
+
104
+ .footer-note {
105
+ color: var(--muted);
106
+ font-size: 13px;
107
+ text-align: center;
108
+ }
109
+
110
+ @media (max-width: 720px) {
111
+ #hero {
112
+ padding: 18px;
113
+ }
114
+
115
+ #hero h1 {
116
+ font-size: 30px;
117
+ }
118
+ }
signspeak/llm.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Any
6
+
7
+
8
+ LLM_REPO_ID = os.getenv("LLM_REPO_ID", "Qwen/Qwen2.5-0.5B-Instruct-GGUF")
9
+ LLM_FILENAME = os.getenv("LLM_FILENAME", "qwen2.5-0.5b-instruct-q4_k_m.gguf")
10
+
11
+ _llm_model: Any | None = None
12
+
13
+
14
def safe_json_loads(text: str) -> dict[str, Any]:
    """Parse *text* as JSON, always returning a dictionary.

    Args:
        text: Candidate JSON text.

    Returns:
        The parsed object when *text* is a JSON object. Otherwise a wrapper
        dictionary with a ``raw_input`` key (the unparsed text, or the parsed
        non-object value) and a ``warning`` key explaining the fallback.
    """
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError, RecursionError):
        # TypeError: not str/bytes; ValueError: malformed JSON;
        # RecursionError: pathologically nested input. All are treated as
        # "not JSON" so the caller can still forward the raw text.
        return {
            "raw_input": text,
            "warning": "Input was not valid JSON, treated as raw text.",
        }

    if isinstance(parsed, dict):
        return parsed

    # Valid JSON that is not an object (list, string, number, ...): wrap it
    # so the declared dict return type actually holds.
    return {
        "raw_input": parsed,
        "warning": "Input was valid JSON but not an object, wrapped as raw input.",
    }
22
+
23
+
24
def extract_json_object(text: str) -> dict[str, Any]:
    """Extract the first valid JSON object from a model response.

    Handles pure JSON, markdown fences, and text before or after the JSON,
    including trailing text that itself contains braces.

    Args:
        text: Raw model response.

    Returns:
        The first substring that decodes to a JSON object, as a dict.

    Raises:
        ValueError: If *text* is empty or contains no decodable JSON object.
    """
    if not text:
        raise ValueError("Empty model response")

    cleaned = text.strip()
    decoder = json.JSONDecoder()

    # Try decoding at every "{" in turn. raw_decode stops at the end of the
    # first complete value, so fences, prose, or extra objects after it do
    # not break parsing (a first-"{"-to-last-"}" slice would).
    position = cleaned.find("{")
    while position != -1:
        try:
            candidate, _ = decoder.raw_decode(cleaned, position)
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        position = cleaned.find("{", position + 1)

    raise ValueError(f"No JSON object found in model response: {text}")
54
+
55
+
56
def normalize_llm_output(parsed: dict[str, Any]) -> dict[str, str]:
    """Coerce a parsed LLM response into clean subtitle/instruction strings.

    Args:
        parsed: Dictionary decoded from the model response.

    Returns:
        A dict with exactly the keys ``subtitle`` and ``voice_instruction``.
        Missing, ``None`` or blank values are replaced by default sentences,
        and a subtitle that still contains JSON or markdown fragments is
        replaced by a fixed fallback sentence.
    """

    def _text(key: str) -> str:
        # A JSON null must count as "missing"; str(None) would otherwise
        # yield the literal text "None".
        value = parsed.get(key)
        return "" if value is None else str(value).strip()

    subtitle = _text("subtitle") or "I want to say something."
    voice_instruction = _text("voice_instruction") or "Speak clearly and naturally."

    # Reject subtitles that leaked JSON structure or code fences.
    forbidden_fragments = ("```", '"subtitle"', '"voice_instruction"', "{", "}")
    if any(fragment in subtitle for fragment in forbidden_fragments):
        subtitle = "I am happy to see you."

    return {
        "subtitle": subtitle,
        "voice_instruction": voice_instruction,
    }
74
+
75
+
76
def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str, dict[str, Any]]:
    """Ask the LLM for a subtitle and a TTS voice instruction for an intent.

    Args:
        intent_json_text: Intent description, ideally a JSON object; it is
            parsed with ``safe_json_loads`` before being put in the prompt.

    Returns:
        ``(subtitle, voice_instruction, details)``. ``details`` holds at
        least the two strings; when the model response cannot be parsed it
        holds fixed fallback strings plus ``parser_warning`` and
        ``raw_model_output``.
    """
    intent = safe_json_loads(intent_json_text)

    system_prompt = (
        "You are an assistant inside an ASL-to-speech accessibility app. "
        "Convert detected ASL glosses and emotion metadata into speech output. "
        "You must return raw JSON only. "
        "Do not use markdown. "
        "Do not wrap the response in ```json fences. "
        "Return exactly this schema: "
        '{"subtitle": "...", "voice_instruction": "..."}'
    )

    user_prompt = f"""
Input intent data:
{json.dumps(intent, ensure_ascii=False, indent=2)}

Task:
Generate a short natural subtitle and a TTS voice instruction.

Rules:
- Return raw JSON only.
- Do not use markdown.
- Do not include explanations.
- Do not include code fences.
- The subtitle must be only the sentence to speak.
- The voice_instruction must describe tone, emotion, pace, and intensity.
- Do not copy JSON keys into the subtitle.

Expected output format:
{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
"""

    llm = get_llm_model()

    result = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
        max_tokens=96,
    )

    # Guard against a None content field: calling .strip() on it would raise
    # here, outside the fallback handling below. An empty string instead
    # flows into the parser and ends in the normal fallback.
    raw_content = (result["choices"][0]["message"]["content"] or "").strip()

    try:
        parsed = extract_json_object(raw_content)
        normalized: dict[str, Any] = normalize_llm_output(parsed)
    except Exception as error:
        # Best effort: any parsing or normalization failure falls back to
        # fixed strings and records what went wrong.
        normalized = {
            "subtitle": "I am happy to see you.",
            "voice_instruction": "Speak warmly, joyfully, and clearly.",
            "parser_warning": str(error),
            "raw_model_output": raw_content,
        }

    return (
        normalized["subtitle"],
        normalized["voice_instruction"],
        normalized,
    )
138
+
139
+
140
def get_llm_model() -> Any:
    """Return the shared llama.cpp model, loading it on first use.

    The instance is stored in the module-level ``_llm_model`` so later calls
    reuse it. ``torch`` and ``llama_cpp`` are imported only when the model
    actually has to be loaded.
    """
    global _llm_model

    if _llm_model is None:
        import torch
        from llama_cpp import Llama

        thread_count = max(2, os.cpu_count() or 2)
        # -1 is passed as n_gpu_layers when CUDA is available, 0 otherwise.
        gpu_layers = -1 if torch.cuda.is_available() else 0

        _llm_model = Llama.from_pretrained(
            repo_id=LLM_REPO_ID,
            filename=LLM_FILENAME,
            n_ctx=1024,
            n_threads=thread_count,
            n_gpu_layers=gpu_layers,
            verbose=True,
        )

    return _llm_model
159
+
signspeak/pipeline.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+ from .asl import process_asl_video
11
+
12
+
13
+ DEFAULT_INTENT = {
14
+ "detected_glosses": ["I", "HAPPY", "SEE", "YOU"],
15
+ "detected_facial_expression": "happy",
16
+ "emotion_profile": {
17
+ "dominant": "joy",
18
+ "confidence": 0.83,
19
+ },
20
+ "communication_intent": "friendly_greeting",
21
+ "pipeline_stage": "mock_asl_intent_for_llama_cpp_test",
22
+ }
23
+
24
+ DEFAULT_VIDEO_PATH = Path(__file__).resolve().parents[1] / "data" / "examples" / "videoplayback.mp4"
25
+
26
+
27
def json_text(data: dict[str, Any]) -> str:
    """Serialize *data* as 2-space-indented JSON, keeping non-ASCII characters."""
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    return rendered
29
+
30
+
31
def run_asl_video(video_file: str | None) -> tuple[str, dict[str, Any], str]:
    """Process a video through the ASL stage.

    Args:
        video_file: Path to a video, or a falsy value to use the path chosen
            by ``resolve_video_path``.

    Returns:
        ``(intent_json, result, summary)``: the ``intent_input`` entry of the
        ASL result rendered as JSON text, the full ASL result, and the text
        produced by ``summarize_asl_result``.
    """
    asl_output = process_asl_video(resolve_video_path(video_file))
    intent_payload = asl_output["intent_input"]
    return json_text(intent_payload), asl_output, summarize_asl_result(asl_output)
36
+
37
+
38
def resolve_video_path(video_file: str | None) -> Path:
    """Pick the video to process.

    A truthy *video_file* wins. Otherwise ``DEFAULT_VIDEO_PATH`` is used if
    it exists, and a synthetic demo video is created as a last resort.
    """
    if not video_file:
        if DEFAULT_VIDEO_PATH.exists():
            return DEFAULT_VIDEO_PATH
        return create_synthetic_demo_video()
    return Path(video_file)
44
+
45
+
46
def create_synthetic_demo_video() -> Path:
    """Create (or reuse) a small synthetic MP4 used when no video is supplied.

    The clip has 36 frames of 320x240 at 12 fps, with two moving circles and
    a caption, and is written to ``signspeak_demo_input.mp4`` in the system
    temp directory.

    Returns:
        Path to the demo video.

    Raises:
        RuntimeError: If OpenCV cannot be imported or the video writer
            cannot be opened.
    """
    try:
        import cv2
    except Exception as exc:
        raise RuntimeError("OpenCV is required to create the fallback demo video.") from exc

    output_path = Path(tempfile.gettempdir()) / "signspeak_demo_input.mp4"
    # Reuse only a non-empty file; a zero-byte leftover must be regenerated
    # rather than handed to the ASL stage forever.
    if output_path.exists() and output_path.stat().st_size > 0:
        return output_path

    width, height = 320, 240
    writer = cv2.VideoWriter(
        str(output_path),
        cv2.VideoWriter_fourcc(*"mp4v"),
        12,
        (width, height),
    )
    if not writer.isOpened():
        raise RuntimeError(f"Could not create fallback demo video: {output_path}")

    completed = False
    try:
        for frame_idx in range(36):
            frame = np.zeros((height, width, 3), dtype=np.uint8)
            frame[:, :] = (12, 18, 30)
            # The two circles move horizontally in opposite directions.
            center_x = 80 + frame_idx * 4
            cv2.circle(frame, (center_x, 96), 22, (45, 212, 191), -1)
            cv2.circle(frame, (width - center_x, 144), 18, (129, 140, 248), -1)
            cv2.putText(
                frame,
                "SignSpeak demo",
                (36, 214),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.62,
                (248, 250, 252),
                2,
                cv2.LINE_AA,
            )
            writer.write(frame)
        completed = True
    finally:
        writer.release()
        if not completed:
            # Do not leave a partial file that a later call would reuse.
            output_path.unlink(missing_ok=True)

    return output_path
88
+
89
+
90
def summarize_asl_result(result: dict[str, Any]) -> str:
    """Render a four-line, human-readable summary of an ASL result.

    Args:
        result: ASL result; its ``asl`` and ``emotion`` entries are read.
            Missing or ``None`` entries and fields fall back to placeholder
            values.

    Returns:
        Newline-separated text with the ASL status, top prediction,
        landmark status/detector, and dominant emotion with its intensity
        to two decimals.
    """
    # `or {}` also covers keys that are present but None, which the plain
    # .get(key, {}) default does not.
    asl = result.get("asl") or {}
    emotion = result.get("emotion") or {}
    intensity = float(emotion.get("intensity") or 0.0)

    lines = [
        f"ASL status: {asl.get('status', 'unknown')}",
        f"Top prediction: {asl.get('top_prediction')}",
        f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}",
        f"Emotion: {emotion.get('dominant_emotion', 'unknown')} ({intensity:.2f})",
    ]
    return "\n".join(lines)
signspeak/tts.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import tempfile
5
+ import time
6
+ from typing import Any
7
+
8
+
9
+ TTS_MODEL_ID = os.getenv("TTS_MODEL_ID", "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice")
10
+
11
+ _tts_model: Any | None = None
12
+
13
+
14
def generate_tts(text: str, language: str, speaker: str, instruction: str) -> str:
    """Synthesize *text* to a WAV file and return the file path.

    Args:
        text: Sentence to speak; ``None`` or blank text is rejected.
        language: Forwarded as ``language`` to the TTS model.
        speaker: Forwarded as ``speaker`` to the TTS model.
        instruction: Forwarded as ``instruct``; ``None`` is treated as empty.

    Returns:
        Path of a newly created ``.wav`` file in the system temp directory.

    Raises:
        ValueError: If *text* is empty after stripping.
    """
    text = (text or "").strip()
    instruction = (instruction or "").strip()

    if not text:
        raise ValueError("Aucun subtitle a synthetiser.")

    tts = get_tts_model()

    wavs, sr = tts.generate_custom_voice(
        text=text,
        language=language,
        speaker=speaker,
        instruct=instruction,
    )

    # mkstemp guarantees a unique name; a millisecond timestamp can collide
    # when two requests finish together, letting one overwrite the other.
    handle, output_path = tempfile.mkstemp(prefix="qwen_tts_", suffix=".wav")
    os.close(handle)

    import soundfile as sf

    sf.write(output_path, wavs[0], sr)
    return output_path
39
+
40
+
41
def get_tts_model() -> Any:
    """Return the shared Qwen3-TTS model, loading it on first use.

    The instance is stored in the module-level ``_tts_model``. ``torch`` and
    ``qwen_tts`` are imported only when the model has to be loaded. With
    CUDA available the model is loaded on ``cuda:0`` in bfloat16, otherwise
    on CPU in float32.
    """
    global _tts_model

    if _tts_model is None:
        import torch
        from qwen_tts import Qwen3TTSModel

        on_gpu = torch.cuda.is_available()
        _tts_model = Qwen3TTSModel.from_pretrained(
            TTS_MODEL_ID,
            device_map="cuda:0" if on_gpu else "cpu",
            dtype=torch.bfloat16 if on_gpu else torch.float32,
        )

    return _tts_model
64
+