lilblueyes committed on
Commit
b34cf5c
·
1 Parent(s): c6c2ad9

Remove demo gloss prefill and harden no-detection flow

Browse files
README.md CHANGED
@@ -36,12 +36,9 @@ The demo screen is intentionally step-by-step:
36
  3 Generate speech -> Qwen3-TTS audio
37
  ```
38
 
39
- When the ASL classifier file is missing, the UI exposes a visible debug gloss override instead of
40
- pretending the model detected words. For an "I love you" demo clip, use:
41
-
42
- ```text
43
- I LOVE YOU
44
- ```
45
 
46
  ## Local checks
47
 
 
36
  3 Generate speech -> Qwen3-TTS audio
37
  ```
38
 
39
+ When the ASL classifier file is missing, the app reports `model_missing` and does not invent
40
+ ASL words. An empty-by-default manual gloss override exists only under advanced debug controls
41
+ for testing downstream LLM/TTS behavior.
 
 
 
42
 
43
  ## Local checks
44
 
app.py CHANGED
@@ -55,6 +55,8 @@ def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
55
 
56
  def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
57
  try:
 
 
58
  return generate_tts(text, language, speaker, instruction)
59
  except Exception as exc:
60
  raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
@@ -108,12 +110,13 @@ with gr.Blocks(title="SignSpeak Local") as demo:
108
  with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
109
  gr.HTML('<div class="section-kicker">01 Capture</div>')
110
  full_video_input = build_video_input("Video or camera capture")
111
- full_gloss_override_input = gr.Textbox(
112
- label="Debug gloss override",
113
- value="I LOVE YOU",
114
- lines=1,
115
- info="Used when the ASL classifier is missing or uncertain. Leave empty to use raw model output only.",
116
- )
 
117
  with gr.Row(elem_classes=["control-row"]):
118
  full_language_input = gr.Dropdown(
119
  label="Language",
 
55
 
56
  def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
57
  try:
58
+ if (text or "").strip() == "No ASL words were detected yet.":
59
+ raise ValueError("Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override.")
60
  return generate_tts(text, language, speaker, instruction)
61
  except Exception as exc:
62
  raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc
 
110
  with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
111
  gr.HTML('<div class="section-kicker">01 Capture</div>')
112
  full_video_input = build_video_input("Video or camera capture")
113
+ with gr.Accordion("Advanced debug controls", open=False):
114
+ full_gloss_override_input = gr.Textbox(
115
+ label="Manual gloss override",
116
+ value="",
117
+ lines=1,
118
+ info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.",
119
+ )
120
  with gr.Row(elem_classes=["control-row"]):
121
  full_language_input = gr.Dropdown(
122
  label="Language",
signspeak/llm.py CHANGED
@@ -65,7 +65,7 @@ def normalize_llm_output(parsed: dict[str, Any]) -> dict[str, str]:
65
 
66
  forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
67
  if any(fragment in subtitle for fragment in forbidden_fragments):
68
- subtitle = "I am happy to see you."
69
 
70
  return {
71
  "subtitle": subtitle,
 
65
 
66
  forbidden_fragments = ["```", '"subtitle"', '"voice_instruction"', "{", "}"]
67
  if any(fragment in subtitle for fragment in forbidden_fragments):
68
+ subtitle = "I want to say something."
69
 
70
  return {
71
  "subtitle": subtitle,
signspeak/pipeline.py CHANGED
@@ -67,13 +67,16 @@ def parse_gloss_override(gloss_override: str | None) -> list[str]:
67
 
68
  def resolve_video_path(video_file: Any | None) -> Path:
69
  if video_file:
 
 
70
  if isinstance(video_file, dict):
71
  video_file = video_file.get("path") or video_file.get("name") or video_file.get("video")
72
  elif hasattr(video_file, "path"):
73
  video_file = video_file.path
74
  elif hasattr(video_file, "name"):
75
  video_file = video_file.name
76
- return Path(video_file)
 
77
  if DEFAULT_VIDEO_PATH.exists():
78
  return DEFAULT_VIDEO_PATH
79
  return create_synthetic_demo_video()
 
67
 
68
  def resolve_video_path(video_file: Any | None) -> Path:
69
  if video_file:
70
+ if isinstance(video_file, (list, tuple)):
71
+ video_file = video_file[0] if video_file else None
72
  if isinstance(video_file, dict):
73
  video_file = video_file.get("path") or video_file.get("name") or video_file.get("video")
74
  elif hasattr(video_file, "path"):
75
  video_file = video_file.path
76
  elif hasattr(video_file, "name"):
77
  video_file = video_file.name
78
+ if video_file:
79
+ return Path(video_file)
80
  if DEFAULT_VIDEO_PATH.exists():
81
  return DEFAULT_VIDEO_PATH
82
  return create_synthetic_demo_video()
tests/test_asl_pipeline.py CHANGED
@@ -46,6 +46,15 @@ def test_resolve_video_path_accepts_gradio_dict_payload(tmp_path):
46
  assert resolved == video_path
47
 
48
 
 
 
 
 
 
 
 
 
 
49
  def test_parse_gloss_override_normalizes_words():
50
  assert parse_gloss_override("i love,you") == ["I", "LOVE", "YOU"]
51
 
 
46
  assert resolved == video_path
47
 
48
 
49
+ def test_resolve_video_path_accepts_gradio_tuple_payload(tmp_path):
50
+ video_path = tmp_path / "capture.mp4"
51
+ video_path.write_bytes(b"demo")
52
+
53
+ resolved = resolve_video_path((str(video_path),))
54
+
55
+ assert resolved == video_path
56
+
57
+
58
  def test_parse_gloss_override_normalizes_words():
59
  assert parse_gloss_override("i love,you") == ["I", "LOVE", "YOU"]
60
 
tests/test_llm_parsing.py CHANGED
@@ -37,7 +37,7 @@ def test_normalize_llm_output_blocks_json_subtitle():
37
  }
38
  )
39
 
40
- assert normalized["subtitle"] == "I am happy to see you."
41
  assert normalized["voice_instruction"] == "Speak warmly."
42
 
43
 
 
37
  }
38
  )
39
 
40
+ assert normalized["subtitle"] == "I want to say something."
41
  assert normalized["voice_instruction"] == "Speak warmly."
42
 
43