Spaces:
Build error
Build error
Commit ·
c2837be
1
Parent(s): 9fa7793
Harden ASL confidence and LLM consistency
Browse files- README.md +3 -0
- app.py +53 -0
- signspeak/asl/asl_detector.py +19 -2
- signspeak/debug_video.py +7 -2
- signspeak/llm.py +20 -3
- signspeak/pipeline.py +4 -0
- tests/test_asl_detector.py +43 -0
- tests/test_llm_parsing.py +10 -0
README.md
CHANGED
|
@@ -81,6 +81,9 @@ https://github.com/jamesjbustos/sign-language-recognition
|
|
| 81 |
```
|
| 82 |
|
| 83 |
Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
## GPU dependencies
|
| 86 |
|
|
|
|
| 81 |
```
|
| 82 |
|
| 83 |
Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
|
| 84 |
+
This model recognizes the isolated signs listed in `sign_to_prediction_index_map.json`; it is not
|
| 85 |
+
a full-sentence or fingerspelling recognizer. Predictions below `ASL_CONFIDENCE_THRESHOLD`
|
| 86 |
+
(default `0.70`) are reported as `low_confidence` and are not forwarded as detected glosses.
|
| 87 |
|
| 88 |
## GPU dependencies
|
| 89 |
|
app.py
CHANGED
|
@@ -70,6 +70,38 @@ def build_video_input(label: str) -> gr.Video:
|
|
| 70 |
)
|
| 71 |
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
with gr.Blocks(title="SignSpeak Local") as demo:
|
| 74 |
gr.HTML(
|
| 75 |
"""
|
|
@@ -204,6 +236,21 @@ with gr.Blocks(title="SignSpeak Local") as demo:
|
|
| 204 |
with gr.Column(scale=1):
|
| 205 |
audio_output = gr.Audio(label="Generated audio", type="filepath")
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
gr.HTML(
|
| 208 |
"""
|
| 209 |
<p class="footer-note">
|
|
@@ -258,6 +305,12 @@ with gr.Blocks(title="SignSpeak Local") as demo:
|
|
| 258 |
outputs=[audio_output],
|
| 259 |
)
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
if __name__ == "__main__":
|
| 263 |
demo.queue().launch(
|
|
|
|
| 70 |
)
|
| 71 |
|
| 72 |
|
| 73 |
+
def render_live_frame_debug(frame):
    """Draw a static debug banner on a single live camera frame.

    Args:
        frame: Image array for the current camera frame, or ``None`` when no
            frame has been delivered yet.

    Returns:
        An ``(image, status)`` tuple. ``image`` is ``None`` while waiting for a
        frame, a banner-annotated copy of ``frame`` when OpenCV is available,
        or ``frame`` itself when OpenCV cannot be imported. ``status`` is a
        short human-readable message describing which case applied.
    """
    if frame is None:
        return None, "Waiting for camera frame."

    # Imported lazily so the module can be loaded without OpenCV installed.
    try:
        import cv2
    except ImportError:
        # This runs once per streamed frame; degrade to a pass-through instead
        # of raising the same import error on every frame.
        return frame, "Live frame received. OpenCV is unavailable, so the debug overlay is disabled."

    # Draw on a copy so the caller's frame is never modified.
    output = frame.copy()
    width = output.shape[1]

    # Filled dark strip across the top that the banner text sits on.
    cv2.rectangle(output, (0, 0), (width, 72), (8, 11, 16), -1)

    # (text, baseline origin, font scale, colour, thickness) for each banner line.
    banner_lines = (
        ("LIVE CAMERA DEBUG", (14, 28), 0.62, (45, 212, 191), 2),
        ("Use Analyze ASL for 30-frame TFLite gloss inference", (14, 58), 0.46, (248, 250, 252), 1),
    )
    for text, origin, scale, color, thickness in banner_lines:
        cv2.putText(
            output,
            text,
            origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            scale,
            color,
            thickness,
            cv2.LINE_AA,
        )

    return output, "Live frame received. Sequence inference runs from the recorded/uploaded clip."
|
| 103 |
+
|
| 104 |
+
|
| 105 |
with gr.Blocks(title="SignSpeak Local") as demo:
|
| 106 |
gr.HTML(
|
| 107 |
"""
|
|
|
|
| 236 |
with gr.Column(scale=1):
|
| 237 |
audio_output = gr.Audio(label="Generated audio", type="filepath")
|
| 238 |
|
| 239 |
+
with gr.Tab("Live camera debug"):
|
| 240 |
+
with gr.Row(elem_classes=["demo-grid"]):
|
| 241 |
+
with gr.Column(scale=1, elem_classes=["panel-shell"]):
|
| 242 |
+
gr.HTML('<div class="section-kicker">Camera stream</div>')
|
| 243 |
+
live_camera_input = gr.Image(
|
| 244 |
+
label="Live camera frame",
|
| 245 |
+
sources=["webcam"],
|
| 246 |
+
streaming=True,
|
| 247 |
+
type="numpy",
|
| 248 |
+
)
|
| 249 |
+
with gr.Column(scale=1, elem_classes=["panel-shell"]):
|
| 250 |
+
gr.HTML('<div class="section-kicker">Live overlay</div>')
|
| 251 |
+
live_camera_output = gr.Image(label="Overlay preview", type="numpy")
|
| 252 |
+
live_camera_status = gr.Textbox(label="Live status", lines=3)
|
| 253 |
+
|
| 254 |
gr.HTML(
|
| 255 |
"""
|
| 256 |
<p class="footer-note">
|
|
|
|
| 305 |
outputs=[audio_output],
|
| 306 |
)
|
| 307 |
|
| 308 |
+
live_camera_input.stream(
|
| 309 |
+
fn=render_live_frame_debug,
|
| 310 |
+
inputs=[live_camera_input],
|
| 311 |
+
outputs=[live_camera_output, live_camera_status],
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
|
| 315 |
if __name__ == "__main__":
|
| 316 |
demo.queue().launch(
|
signspeak/asl/asl_detector.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Any
|
| 6 |
|
|
@@ -18,6 +19,7 @@ class ASLDetector:
|
|
| 18 |
self.model_path = self.model_dir / "model.tflite"
|
| 19 |
self.train_csv_path = self.model_dir / "train.csv"
|
| 20 |
self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
|
|
|
|
| 21 |
self.labels = self._load_labels()
|
| 22 |
|
| 23 |
def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
|
|
@@ -52,13 +54,16 @@ class ASLDetector:
|
|
| 52 |
top_idx = int(np.argmax(probs))
|
| 53 |
top_prediction = self._label_for_index(top_idx)
|
| 54 |
confidence = float(probs[top_idx])
|
|
|
|
| 55 |
|
| 56 |
base.update(
|
| 57 |
{
|
| 58 |
-
"status": "ok",
|
| 59 |
-
"gloss_sequence": [top_prediction] if top_prediction else [],
|
| 60 |
"top_prediction": top_prediction,
|
| 61 |
"confidence": confidence,
|
|
|
|
|
|
|
| 62 |
}
|
| 63 |
)
|
| 64 |
return base
|
|
@@ -169,6 +174,18 @@ class ASLDetector:
|
|
| 169 |
return self.labels[index]
|
| 170 |
return str(index)
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
|
| 173 |
if values.size == 0:
|
| 174 |
return np.asarray([0.0], dtype=np.float32)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
import os
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import Any
|
| 7 |
|
|
|
|
| 19 |
self.model_path = self.model_dir / "model.tflite"
|
| 20 |
self.train_csv_path = self.model_dir / "train.csv"
|
| 21 |
self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
|
| 22 |
+
self.confidence_threshold = float(os.getenv("ASL_CONFIDENCE_THRESHOLD", "0.70"))
|
| 23 |
self.labels = self._load_labels()
|
| 24 |
|
| 25 |
def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
|
|
|
|
| 54 |
top_idx = int(np.argmax(probs))
|
| 55 |
top_prediction = self._label_for_index(top_idx)
|
| 56 |
confidence = float(probs[top_idx])
|
| 57 |
+
accepted = confidence >= self.confidence_threshold
|
| 58 |
|
| 59 |
base.update(
|
| 60 |
{
|
| 61 |
+
"status": "ok" if accepted else "low_confidence",
|
| 62 |
+
"gloss_sequence": [top_prediction] if accepted and top_prediction else [],
|
| 63 |
"top_prediction": top_prediction,
|
| 64 |
"confidence": confidence,
|
| 65 |
+
"confidence_threshold": self.confidence_threshold,
|
| 66 |
+
"top_predictions": self._top_predictions(probs),
|
| 67 |
}
|
| 68 |
)
|
| 69 |
return base
|
|
|
|
| 174 |
return self.labels[index]
|
| 175 |
return str(index)
|
| 176 |
|
| 177 |
+
def _top_predictions(self, probs: np.ndarray, limit: int = 5) -> list[dict[str, Any]]:
|
| 178 |
+
if probs.size == 0:
|
| 179 |
+
return []
|
| 180 |
+
top_indices = np.argsort(probs)[::-1][:limit]
|
| 181 |
+
return [
|
| 182 |
+
{
|
| 183 |
+
"label": self._label_for_index(int(index)),
|
| 184 |
+
"confidence": float(probs[int(index)]),
|
| 185 |
+
}
|
| 186 |
+
for index in top_indices
|
| 187 |
+
]
|
| 188 |
+
|
| 189 |
def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
|
| 190 |
if values.size == 0:
|
| 191 |
return np.asarray([0.0], dtype=np.float32)
|
signspeak/debug_video.py
CHANGED
|
@@ -36,7 +36,13 @@ def create_debug_overlay_video(video_path: str | Path, result: dict[str, Any]) -
|
|
| 36 |
glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
|
| 37 |
gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
|
| 38 |
emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
try:
|
| 42 |
while True:
|
|
@@ -88,4 +94,3 @@ def _load_cv2():
|
|
| 88 |
return cv2
|
| 89 |
except Exception as exc:
|
| 90 |
raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc
|
| 91 |
-
|
|
|
|
| 36 |
glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
|
| 37 |
gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
|
| 38 |
emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
|
| 39 |
+
top_prediction = asl.get("top_prediction") or "none"
|
| 40 |
+
confidence = float(asl.get("confidence", 0.0) or 0.0)
|
| 41 |
+
threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
|
| 42 |
+
status_text = (
|
| 43 |
+
f"ASL {asl.get('status', 'unknown')} | top {top_prediction} "
|
| 44 |
+
f"{confidence:.2f}/{threshold:.2f} | EMOTION {emotion.get('status', 'unknown')}"
|
| 45 |
+
)
|
| 46 |
|
| 47 |
try:
|
| 48 |
while True:
|
|
|
|
| 94 |
return cv2
|
| 95 |
except Exception as exc:
|
| 96 |
raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc
|
|
|
signspeak/llm.py
CHANGED
|
@@ -83,7 +83,13 @@ def deterministic_speech_from_intent(intent: dict[str, Any]) -> dict[str, str]:
|
|
| 83 |
"LOVE YOU": "I love you.",
|
| 84 |
"I HAPPY SEE YOU": "I am happy to see you.",
|
| 85 |
"I SEE YOU": "I see you.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
"THANK YOU": "Thank you.",
|
|
|
|
| 87 |
"HELLO": "Hello.",
|
| 88 |
"YES": "Yes.",
|
| 89 |
"NO": "No.",
|
|
@@ -123,6 +129,13 @@ def enforce_intent_consistency(intent: dict[str, Any], normalized: dict[str, Any
|
|
| 123 |
corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
|
| 124 |
return corrected
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
return normalized
|
| 127 |
|
| 128 |
|
|
@@ -144,10 +157,15 @@ def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str,
|
|
| 144 |
'{"subtitle": "...", "voice_instruction": "..."}'
|
| 145 |
)
|
| 146 |
|
|
|
|
|
|
|
| 147 |
user_prompt = f"""
|
| 148 |
Input intent data:
|
| 149 |
{json.dumps(intent, ensure_ascii=False, indent=2)}
|
| 150 |
|
|
|
|
|
|
|
|
|
|
| 151 |
Task:
|
| 152 |
Generate a short natural subtitle and a TTS voice instruction.
|
| 153 |
|
|
@@ -159,9 +177,8 @@ Rules:
|
|
| 159 |
- The subtitle must be only the sentence to speak.
|
| 160 |
- The voice_instruction must describe tone, emotion, pace, and intensity.
|
| 161 |
- Do not copy JSON keys into the subtitle.
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
|
| 165 |
"""
|
| 166 |
|
| 167 |
llm = get_llm_model()
|
|
|
|
| 83 |
"LOVE YOU": "I love you.",
|
| 84 |
"I HAPPY SEE YOU": "I am happy to see you.",
|
| 85 |
"I SEE YOU": "I see you.",
|
| 86 |
+
"WHERE": "Where?",
|
| 87 |
+
"WHO": "Who?",
|
| 88 |
+
"WHY": "Why?",
|
| 89 |
+
"WHAT": "What?",
|
| 90 |
+
"HOW": "How?",
|
| 91 |
"THANK YOU": "Thank you.",
|
| 92 |
+
"THANKYOU": "Thank you.",
|
| 93 |
"HELLO": "Hello.",
|
| 94 |
"YES": "Yes.",
|
| 95 |
"NO": "No.",
|
|
|
|
| 129 |
corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
|
| 130 |
return corrected
|
| 131 |
|
| 132 |
+
meaningful_glosses = [gloss.lower() for gloss in glosses if len(gloss) > 1]
|
| 133 |
+
if meaningful_glosses and not any(gloss in subtitle for gloss in meaningful_glosses):
|
| 134 |
+
corrected = deterministic_speech_from_intent(intent)
|
| 135 |
+
corrected["consistency_warning"] = "LLM subtitle did not match detected glosses; deterministic correction applied."
|
| 136 |
+
corrected["raw_llm_subtitle"] = normalized.get("subtitle", "")
|
| 137 |
+
return corrected
|
| 138 |
+
|
| 139 |
return normalized
|
| 140 |
|
| 141 |
|
|
|
|
| 157 |
'{"subtitle": "...", "voice_instruction": "..."}'
|
| 158 |
)
|
| 159 |
|
| 160 |
+
deterministic_reference = deterministic_speech_from_intent(intent)
|
| 161 |
+
|
| 162 |
user_prompt = f"""
|
| 163 |
Input intent data:
|
| 164 |
{json.dumps(intent, ensure_ascii=False, indent=2)}
|
| 165 |
|
| 166 |
+
Reference conversion:
|
| 167 |
+
{json.dumps(deterministic_reference, ensure_ascii=False)}
|
| 168 |
+
|
| 169 |
Task:
|
| 170 |
Generate a short natural subtitle and a TTS voice instruction.
|
| 171 |
|
|
|
|
| 177 |
- The subtitle must be only the sentence to speak.
|
| 178 |
- The voice_instruction must describe tone, emotion, pace, and intensity.
|
| 179 |
- Do not copy JSON keys into the subtitle.
|
| 180 |
+
- The subtitle must preserve the detected gloss meaning.
|
| 181 |
+
- If the reference conversion is already correct, return it unchanged.
|
|
|
|
| 182 |
"""
|
| 183 |
|
| 184 |
llm = get_llm_model()
|
signspeak/pipeline.py
CHANGED
|
@@ -131,11 +131,15 @@ def summarize_asl_result(result: dict[str, Any]) -> str:
|
|
| 131 |
emotion = result.get("emotion", {})
|
| 132 |
glosses = result.get("intent_input", {}).get("detected_glosses", [])
|
| 133 |
gloss_line = " ".join(glosses) if glosses else "None"
|
|
|
|
|
|
|
|
|
|
| 134 |
override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
|
| 135 |
override_line = "\nOverride: manual glosses applied" if override else ""
|
| 136 |
return (
|
| 137 |
f"ASL status: {asl.get('status', 'unknown')}\n"
|
| 138 |
f"Detected words: {gloss_line}\n"
|
|
|
|
| 139 |
f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
|
| 140 |
f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
|
| 141 |
f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"
|
|
|
|
| 131 |
emotion = result.get("emotion", {})
|
| 132 |
glosses = result.get("intent_input", {}).get("detected_glosses", [])
|
| 133 |
gloss_line = " ".join(glosses) if glosses else "None"
|
| 134 |
+
top_prediction = asl.get("top_prediction") or "None"
|
| 135 |
+
confidence = float(asl.get("confidence", 0.0) or 0.0)
|
| 136 |
+
threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
|
| 137 |
override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
|
| 138 |
override_line = "\nOverride: manual glosses applied" if override else ""
|
| 139 |
return (
|
| 140 |
f"ASL status: {asl.get('status', 'unknown')}\n"
|
| 141 |
f"Detected words: {gloss_line}\n"
|
| 142 |
+
f"Top candidate: {top_prediction} ({confidence:.2f}, threshold {threshold:.2f})\n"
|
| 143 |
f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
|
| 144 |
f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
|
| 145 |
f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"
|
tests/test_asl_detector.py
CHANGED
|
@@ -2,6 +2,7 @@ import json
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
|
|
|
|
| 5 |
from signspeak.asl.asl_detector import ASLDetector
|
| 6 |
|
| 7 |
|
|
@@ -40,3 +41,45 @@ def test_predict_uses_tflite_signature_runner(tmp_path):
|
|
| 40 |
|
| 41 |
assert output.shape == (1, 3)
|
| 42 |
assert float(output[0][1]) == np.float32(0.8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
| 4 |
|
| 5 |
+
import signspeak.asl.asl_detector as asl_detector_module
|
| 6 |
from signspeak.asl.asl_detector import ASLDetector
|
| 7 |
|
| 8 |
|
|
|
|
| 41 |
|
| 42 |
assert output.shape == (1, 3)
|
| 43 |
assert float(output[0][1]) == np.float32(0.8)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class FakeLandmarkResult:
    """Canned landmark result returned by the fake detector in these tests."""

    # No error is reported by the fake.
    error = None
    # Name reported for the detector that produced the result.
    detector = "fake"
    # Status value reported for the result.
    status = "ok"
    # All-zero float32 keypoint array of shape (30, 543, 3).
    keypoints = np.zeros(shape=(30, 543, 3), dtype=np.float32)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class FakeLandmarksDetector:
    """Test double that is patched in place of ``LandmarksDetector``."""

    def __init__(self, missing_value=0.0):
        """Accept and ignore ``missing_value``; the fake keeps no state."""

    def detect_sequence(self, frames):
        """Ignore ``frames`` and return a fresh ``FakeLandmarkResult``."""
        canned_result = FakeLandmarkResult()
        return canned_result

    def close(self):
        """Do nothing; the fake holds no resources."""
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_low_confidence_prediction_is_not_accepted(monkeypatch, tmp_path):
    """A top score below the configured threshold must not yield a gloss."""

    def fake_load_interpreter(self):
        # Any object works as the interpreter because _predict is patched too.
        return object()

    def fake_predict(self, interpreter, keypoints):
        # The top score (0.667) sits just under the 0.70 threshold set below.
        return np.asarray([[0.667, 0.333]], dtype=np.float32)

    # Minimal on-disk model directory: placeholder model bytes plus a label map.
    model_dir = tmp_path / "asl"
    model_dir.mkdir()
    model_file = model_dir / "model.tflite"
    model_file.write_bytes(b"demo")
    sign_map = {"where": 0, "hello": 1}
    sign_map_file = model_dir / "sign_to_prediction_index_map.json"
    sign_map_file.write_text(json.dumps(sign_map), encoding="utf-8")

    # Pin the threshold and replace landmark detection, model loading and inference.
    monkeypatch.setenv("ASL_CONFIDENCE_THRESHOLD", "0.70")
    monkeypatch.setattr(asl_detector_module, "LandmarksDetector", FakeLandmarksDetector)
    monkeypatch.setattr(ASLDetector, "_load_interpreter", fake_load_interpreter)
    monkeypatch.setattr(ASLDetector, "_predict", fake_predict)

    frames = [np.zeros((2, 2, 3), dtype=np.uint8)] * 30
    detector = ASLDetector(model_dir=model_dir)
    result = detector.predict_from_frames(frames)

    assert result["status"] == "low_confidence"
    assert result["top_prediction"] == "where"
    assert result["gloss_sequence"] == []
|
tests/test_llm_parsing.py
CHANGED
|
@@ -73,3 +73,13 @@ def test_enforce_intent_consistency_corrects_wrong_love_subtitle():
|
|
| 73 |
|
| 74 |
assert result["subtitle"] == "I love you."
|
| 75 |
assert "consistency_warning" in result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
assert result["subtitle"] == "I love you."
|
| 75 |
assert "consistency_warning" in result
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_enforce_intent_consistency_corrects_wrong_where_subtitle():
    """An LLM subtitle unrelated to the WHERE gloss is replaced and the original kept."""
    intent = {"detected_glosses": ["where"]}
    llm_output = {
        "subtitle": "I am happy to see you.",
        "voice_instruction": "Speak warmly.",
    }

    result = enforce_intent_consistency(intent, llm_output)

    # The corrected subtitle is used, and the rejected LLM text stays available.
    assert result["subtitle"] == "Where?"
    assert result["raw_llm_subtitle"] == "I am happy to see you."
|