lilblueyes committed on
Commit
c2837be
·
1 Parent(s): 9fa7793

Harden ASL confidence and LLM consistency

Browse files
README.md CHANGED
@@ -81,6 +81,9 @@ https://github.com/jamesjbustos/sign-language-recognition
81
  ```
82
 
83
  Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
 
 
 
84
 
85
  ## GPU dependencies
86
 
 
81
  ```
82
 
83
  Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
84
+ This model recognizes the isolated signs listed in `sign_to_prediction_index_map.json`; it is not
85
+ a full sentence or fingerspelling recognizer. Predictions below `ASL_CONFIDENCE_THRESHOLD`
86
+ (default `0.70`) are reported as `low_confidence` and are not forwarded as detected glosses.
87
 
88
  ## GPU dependencies
89
 
app.py CHANGED
@@ -70,6 +70,38 @@ def build_video_input(label: str) -> gr.Video:
70
  )
71
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  with gr.Blocks(title="SignSpeak Local") as demo:
74
  gr.HTML(
75
  """
@@ -204,6 +236,21 @@ with gr.Blocks(title="SignSpeak Local") as demo:
204
  with gr.Column(scale=1):
205
  audio_output = gr.Audio(label="Generated audio", type="filepath")
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  gr.HTML(
208
  """
209
  <p class="footer-note">
@@ -258,6 +305,12 @@ with gr.Blocks(title="SignSpeak Local") as demo:
258
  outputs=[audio_output],
259
  )
260
 
 
 
 
 
 
 
261
 
262
  if __name__ == "__main__":
263
  demo.queue().launch(
 
70
  )
71
 
72
 
73
def render_live_frame_debug(frame):
    """Draw a static debug banner on one streamed camera frame.

    This is a per-frame streaming callback: it only annotates the image and
    never runs sign inference itself.

    Args:
        frame: Image array for the current camera frame, or ``None`` when no
            frame has been delivered yet.

    Returns:
        A ``(image, status)`` tuple. ``image`` is ``None`` while waiting for
        the first frame, an annotated copy of ``frame`` when OpenCV is
        available, or ``frame`` itself (untouched) when OpenCV cannot be
        imported. ``status`` is a human-readable message for the status box.
    """
    if frame is None:
        return None, "Waiting for camera frame."

    try:
        import cv2
    except ImportError:
        # This callback fires for every streamed frame; failing here would
        # raise on each one. Pass the frame through and explain why there is
        # no overlay instead.
        return frame, "Live frame received, but OpenCV is not installed; overlay disabled."

    # Draw on a copy so the caller's array is never modified.
    output = frame.copy()
    width = output.shape[1]

    # Filled dark strip across the top that serves as the banner background.
    cv2.rectangle(output, (0, 0), (width, 72), (8, 11, 16), -1)
    cv2.putText(
        output,
        "LIVE CAMERA DEBUG",
        (14, 28),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.62,
        (45, 212, 191),
        2,
        cv2.LINE_AA,
    )
    cv2.putText(
        output,
        "Use Analyze ASL for 30-frame TFLite gloss inference",
        (14, 58),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.46,
        (248, 250, 252),
        1,
        cv2.LINE_AA,
    )
    return output, "Live frame received. Sequence inference runs from the recorded/uploaded clip."
103
+
104
+
105
  with gr.Blocks(title="SignSpeak Local") as demo:
106
  gr.HTML(
107
  """
 
236
  with gr.Column(scale=1):
237
  audio_output = gr.Audio(label="Generated audio", type="filepath")
238
 
239
+ with gr.Tab("Live camera debug"):
240
+ with gr.Row(elem_classes=["demo-grid"]):
241
+ with gr.Column(scale=1, elem_classes=["panel-shell"]):
242
+ gr.HTML('<div class="section-kicker">Camera stream</div>')
243
+ live_camera_input = gr.Image(
244
+ label="Live camera frame",
245
+ sources=["webcam"],
246
+ streaming=True,
247
+ type="numpy",
248
+ )
249
+ with gr.Column(scale=1, elem_classes=["panel-shell"]):
250
+ gr.HTML('<div class="section-kicker">Live overlay</div>')
251
+ live_camera_output = gr.Image(label="Overlay preview", type="numpy")
252
+ live_camera_status = gr.Textbox(label="Live status", lines=3)
253
+
254
  gr.HTML(
255
  """
256
  <p class="footer-note">
 
305
  outputs=[audio_output],
306
  )
307
 
308
+ live_camera_input.stream(
309
+ fn=render_live_frame_debug,
310
+ inputs=[live_camera_input],
311
+ outputs=[live_camera_output, live_camera_status],
312
+ )
313
+
314
 
315
  if __name__ == "__main__":
316
  demo.queue().launch(
signspeak/asl/asl_detector.py CHANGED
@@ -1,6 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import json
 
4
  from pathlib import Path
5
  from typing import Any
6
 
@@ -18,6 +19,7 @@ class ASLDetector:
18
  self.model_path = self.model_dir / "model.tflite"
19
  self.train_csv_path = self.model_dir / "train.csv"
20
  self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
 
21
  self.labels = self._load_labels()
22
 
23
  def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
@@ -52,13 +54,16 @@ class ASLDetector:
52
  top_idx = int(np.argmax(probs))
53
  top_prediction = self._label_for_index(top_idx)
54
  confidence = float(probs[top_idx])
 
55
 
56
  base.update(
57
  {
58
- "status": "ok",
59
- "gloss_sequence": [top_prediction] if top_prediction else [],
60
  "top_prediction": top_prediction,
61
  "confidence": confidence,
 
 
62
  }
63
  )
64
  return base
@@ -169,6 +174,18 @@ class ASLDetector:
169
  return self.labels[index]
170
  return str(index)
171
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
173
  if values.size == 0:
174
  return np.asarray([0.0], dtype=np.float32)
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ import os
5
  from pathlib import Path
6
  from typing import Any
7
 
 
19
  self.model_path = self.model_dir / "model.tflite"
20
  self.train_csv_path = self.model_dir / "train.csv"
21
  self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
22
+ self.confidence_threshold = float(os.getenv("ASL_CONFIDENCE_THRESHOLD", "0.70"))
23
  self.labels = self._load_labels()
24
 
25
  def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
 
54
  top_idx = int(np.argmax(probs))
55
  top_prediction = self._label_for_index(top_idx)
56
  confidence = float(probs[top_idx])
57
+ accepted = confidence >= self.confidence_threshold
58
 
59
  base.update(
60
  {
61
+ "status": "ok" if accepted else "low_confidence",
62
+ "gloss_sequence": [top_prediction] if accepted and top_prediction else [],
63
  "top_prediction": top_prediction,
64
  "confidence": confidence,
65
+ "confidence_threshold": self.confidence_threshold,
66
+ "top_predictions": self._top_predictions(probs),
67
  }
68
  )
69
  return base
 
174
  return self.labels[index]
175
  return str(index)
176
 
177
def _top_predictions(self, probs: np.ndarray, limit: int = 5) -> list[dict[str, Any]]:
    """Return the highest-scoring labels in descending score order.

    Args:
        probs: Per-class scores. Any shape is accepted; the array is
            flattened so a batched ``(1, N)`` output is handled like ``(N,)``.
        limit: Maximum number of entries to return. Values ``<= 0`` yield an
            empty list.

    Returns:
        A list of ``{"label": ..., "confidence": ...}`` dicts, best first.
        Equal scores are ordered by ascending class index, so the first entry
        always names the same class that ``np.argmax`` selects.
    """
    flat = np.ravel(np.asarray(probs))
    if flat.size == 0 or limit <= 0:
        return []

    # A stable sort on the negated scores gives descending order while
    # keeping the lowest index first among ties. Reversing an ascending
    # argsort would put the highest tied index first and could disagree
    # with the argmax-based top prediction.
    order = np.argsort(-flat.astype(np.float64), kind="stable")[:limit]
    return [
        {
            "label": self._label_for_index(int(position)),
            "confidence": float(flat[position]),
        }
        for position in order
    ]
188
+
189
  def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
190
  if values.size == 0:
191
  return np.asarray([0.0], dtype=np.float32)
signspeak/debug_video.py CHANGED
@@ -36,7 +36,13 @@ def create_debug_overlay_video(video_path: str | Path, result: dict[str, Any]) -
36
  glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
37
  gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
38
  emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
39
- status_text = f"ASL {asl.get('status', 'unknown')} | EMOTION {emotion.get('status', 'unknown')}"
 
 
 
 
 
 
40
 
41
  try:
42
  while True:
@@ -88,4 +94,3 @@ def _load_cv2():
88
  return cv2
89
  except Exception as exc:
90
  raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc
91
-
 
36
  glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
37
  gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
38
  emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
39
+ top_prediction = asl.get("top_prediction") or "none"
40
+ confidence = float(asl.get("confidence", 0.0) or 0.0)
41
+ threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
42
+ status_text = (
43
+ f"ASL {asl.get('status', 'unknown')} | top {top_prediction} "
44
+ f"{confidence:.2f}/{threshold:.2f} | EMOTION {emotion.get('status', 'unknown')}"
45
+ )
46
 
47
  try:
48
  while True:
 
94
  return cv2
95
  except Exception as exc:
96
  raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc
 
signspeak/llm.py CHANGED
@@ -83,7 +83,13 @@ def deterministic_speech_from_intent(intent: dict[str, Any]) -> dict[str, str]:
83
  "LOVE YOU": "I love you.",
84
  "I HAPPY SEE YOU": "I am happy to see you.",
85
  "I SEE YOU": "I see you.",
 
 
 
 
 
86
  "THANK YOU": "Thank you.",
 
87
  "HELLO": "Hello.",
88
  "YES": "Yes.",
89
  "NO": "No.",
@@ -123,6 +129,13 @@ def enforce_intent_consistency(intent: dict[str, Any], normalized: dict[str, Any
123
  corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
124
  return corrected
125
 
 
 
 
 
 
 
 
126
  return normalized
127
 
128
 
@@ -144,10 +157,15 @@ def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str,
144
  '{"subtitle": "...", "voice_instruction": "..."}'
145
  )
146
 
 
 
147
  user_prompt = f"""
148
  Input intent data:
149
  {json.dumps(intent, ensure_ascii=False, indent=2)}
150
 
 
 
 
151
  Task:
152
  Generate a short natural subtitle and a TTS voice instruction.
153
 
@@ -159,9 +177,8 @@ Rules:
159
  - The subtitle must be only the sentence to speak.
160
  - The voice_instruction must describe tone, emotion, pace, and intensity.
161
  - Do not copy JSON keys into the subtitle.
162
-
163
- Expected output format:
164
- {{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
165
  """
166
 
167
  llm = get_llm_model()
 
83
  "LOVE YOU": "I love you.",
84
  "I HAPPY SEE YOU": "I am happy to see you.",
85
  "I SEE YOU": "I see you.",
86
+ "WHERE": "Where?",
87
+ "WHO": "Who?",
88
+ "WHY": "Why?",
89
+ "WHAT": "What?",
90
+ "HOW": "How?",
91
  "THANK YOU": "Thank you.",
92
+ "THANKYOU": "Thank you.",
93
  "HELLO": "Hello.",
94
  "YES": "Yes.",
95
  "NO": "No.",
 
129
  corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
130
  return corrected
131
 
132
+ meaningful_glosses = [gloss.lower() for gloss in glosses if len(gloss) > 1]
133
+ if meaningful_glosses and not any(gloss in subtitle for gloss in meaningful_glosses):
134
+ corrected = deterministic_speech_from_intent(intent)
135
+ corrected["consistency_warning"] = "LLM subtitle did not match detected glosses; deterministic correction applied."
136
+ corrected["raw_llm_subtitle"] = normalized.get("subtitle", "")
137
+ return corrected
138
+
139
  return normalized
140
 
141
 
 
157
  '{"subtitle": "...", "voice_instruction": "..."}'
158
  )
159
 
160
+ deterministic_reference = deterministic_speech_from_intent(intent)
161
+
162
  user_prompt = f"""
163
  Input intent data:
164
  {json.dumps(intent, ensure_ascii=False, indent=2)}
165
 
166
+ Reference conversion:
167
+ {json.dumps(deterministic_reference, ensure_ascii=False)}
168
+
169
  Task:
170
  Generate a short natural subtitle and a TTS voice instruction.
171
 
 
177
  - The subtitle must be only the sentence to speak.
178
  - The voice_instruction must describe tone, emotion, pace, and intensity.
179
  - Do not copy JSON keys into the subtitle.
180
+ - The subtitle must preserve the detected gloss meaning.
181
+ - If the reference conversion is already correct, return it unchanged.
 
182
  """
183
 
184
  llm = get_llm_model()
signspeak/pipeline.py CHANGED
@@ -131,11 +131,15 @@ def summarize_asl_result(result: dict[str, Any]) -> str:
131
  emotion = result.get("emotion", {})
132
  glosses = result.get("intent_input", {}).get("detected_glosses", [])
133
  gloss_line = " ".join(glosses) if glosses else "None"
 
 
 
134
  override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
135
  override_line = "\nOverride: manual glosses applied" if override else ""
136
  return (
137
  f"ASL status: {asl.get('status', 'unknown')}\n"
138
  f"Detected words: {gloss_line}\n"
 
139
  f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
140
  f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
141
  f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"
 
131
  emotion = result.get("emotion", {})
132
  glosses = result.get("intent_input", {}).get("detected_glosses", [])
133
  gloss_line = " ".join(glosses) if glosses else "None"
134
+ top_prediction = asl.get("top_prediction") or "None"
135
+ confidence = float(asl.get("confidence", 0.0) or 0.0)
136
+ threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
137
  override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
138
  override_line = "\nOverride: manual glosses applied" if override else ""
139
  return (
140
  f"ASL status: {asl.get('status', 'unknown')}\n"
141
  f"Detected words: {gloss_line}\n"
142
+ f"Top candidate: {top_prediction} ({confidence:.2f}, threshold {threshold:.2f})\n"
143
  f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
144
  f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
145
  f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"
tests/test_asl_detector.py CHANGED
@@ -2,6 +2,7 @@ import json
2
 
3
  import numpy as np
4
 
 
5
  from signspeak.asl.asl_detector import ASLDetector
6
 
7
 
@@ -40,3 +41,45 @@ def test_predict_uses_tflite_signature_runner(tmp_path):
40
 
41
  assert output.shape == (1, 3)
42
  assert float(output[0][1]) == np.float32(0.8)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import numpy as np
4
 
5
+ import signspeak.asl.asl_detector as asl_detector_module
6
  from signspeak.asl.asl_detector import ASLDetector
7
 
8
 
 
41
 
42
  assert output.shape == (1, 3)
43
  assert float(output[0][1]) == np.float32(0.8)
44
+
45
+
46
class FakeLandmarkResult:
    """Canned landmark-detection result used as a test double.

    Exposes the ``keypoints``, ``status``, ``detector`` and ``error``
    attributes with fixed values.
    """

    status = "ok"
    detector = "fake"
    error = None

    def __init__(self):
        # Allocate the array per instance: a class-level array would be one
        # shared mutable object, so an in-place change made while one test
        # runs would leak into every later result.
        self.keypoints = np.zeros((30, 543, 3), dtype=np.float32)
51
+
52
+
53
class FakeLandmarksDetector:
    """Test double that stands in for the real landmarks detector."""

    def __init__(self, missing_value=0.0):
        """Accept ``missing_value`` and discard it; the fake keeps no state."""

    def detect_sequence(self, frames):
        """Return a canned ``FakeLandmarkResult``, ignoring ``frames``."""
        return FakeLandmarkResult()

    def close(self):
        """Do nothing; there are no resources to release."""
        return None
62
+
63
+
64
def test_low_confidence_prediction_is_not_accepted(monkeypatch, tmp_path):
    """A top score of 0.667 against a 0.70 threshold must not produce a gloss."""
    asl_dir = tmp_path / "asl"
    asl_dir.mkdir()
    (asl_dir / "model.tflite").write_bytes(b"demo")
    sign_map_file = asl_dir / "sign_to_prediction_index_map.json"
    sign_map_file.write_text(json.dumps({"where": 0, "hello": 1}), encoding="utf-8")

    def fake_predict(self, interpreter, keypoints):
        # Fixed scores: "where" wins but stays under the threshold.
        return np.asarray([[0.667, 0.333]], dtype=np.float32)

    monkeypatch.setenv("ASL_CONFIDENCE_THRESHOLD", "0.70")
    monkeypatch.setattr(asl_detector_module, "LandmarksDetector", FakeLandmarksDetector)
    monkeypatch.setattr(ASLDetector, "_load_interpreter", lambda self: object())
    monkeypatch.setattr(ASLDetector, "_predict", fake_predict)

    blank_frames = [np.zeros((2, 2, 3), dtype=np.uint8)] * 30
    detector = ASLDetector(model_dir=asl_dir)
    result = detector.predict_from_frames(blank_frames)

    assert result["status"] == "low_confidence"
    assert result["top_prediction"] == "where"
    assert result["gloss_sequence"] == []
tests/test_llm_parsing.py CHANGED
@@ -73,3 +73,13 @@ def test_enforce_intent_consistency_corrects_wrong_love_subtitle():
73
 
74
  assert result["subtitle"] == "I love you."
75
  assert "consistency_warning" in result
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  assert result["subtitle"] == "I love you."
75
  assert "consistency_warning" in result
76
+
77
+
78
def test_enforce_intent_consistency_corrects_wrong_where_subtitle():
    """An unrelated LLM subtitle for a lone "where" gloss gets corrected."""
    intent = {"detected_glosses": ["where"]}
    llm_output = {
        "subtitle": "I am happy to see you.",
        "voice_instruction": "Speak warmly.",
    }

    result = enforce_intent_consistency(intent, llm_output)

    assert result["subtitle"] == "Where?"
    # The rejected LLM text is kept alongside the corrected subtitle.
    assert result["raw_llm_subtitle"] == "I am happy to see you."