Spaces:

build-small-hackathon
/

Sign2Voice

Build error

App Files Files Community

lilblueyes committed 24 days ago

Commit

c2837be

1 Parent(s): 9fa7793

Harden ASL confidence and LLM consistency

Browse files

Files changed (8) hide show

README.md +3 -0
app.py +53 -0
signspeak/asl/asl_detector.py +19 -2
signspeak/debug_video.py +7 -2
signspeak/llm.py +20 -3
signspeak/pipeline.py +4 -0
tests/test_asl_detector.py +43 -0
tests/test_llm_parsing.py +10 -0

README.md CHANGED Viewed

@@ -81,6 +81,9 @@ https://github.com/jamesjbustos/sign-language-recognition
 ```
 Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
 ## GPU dependencies

 ```
 Without these files, the ASL brick still samples frames and emits `model_missing` diagnostics.
+This model recognizes only the isolated signs listed in `sign_to_prediction_index_map.json`; it is not
+a full-sentence or fingerspelling recognizer. Predictions whose confidence is below `ASL_CONFIDENCE_THRESHOLD`
+(default `0.70`) are reported as `low_confidence` and are not forwarded as detected glosses.
 ## GPU dependencies

app.py CHANGED Viewed

@@ -70,6 +70,38 @@ def build_video_input(label: str) -> gr.Video:
     )
 with gr.Blocks(title="SignSpeak Local") as demo:
     gr.HTML(
         """
@@ -204,6 +236,21 @@ with gr.Blocks(title="SignSpeak Local") as demo:
                 with gr.Column(scale=1):
                     audio_output = gr.Audio(label="Generated audio", type="filepath")
     gr.HTML(
         """
         <p class="footer-note">
@@ -258,6 +305,12 @@ with gr.Blocks(title="SignSpeak Local") as demo:
         outputs=[audio_output],
     )
 if __name__ == "__main__":
     demo.queue().launch(

     )
+def render_live_frame_debug(frame):
+    if frame is None:
+        return None, "Waiting for camera frame."
+    import cv2
+    output = frame.copy()
+    height, width = output.shape[:2]
+    cv2.rectangle(output, (0, 0), (width, 72), (8, 11, 16), -1)
+    cv2.putText(
+        output,
+        "LIVE CAMERA DEBUG",
+        (14, 28),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.62,
+        (45, 212, 191),
+        2,
+        cv2.LINE_AA,
+    )
+    cv2.putText(
+        output,
+        "Use Analyze ASL for 30-frame TFLite gloss inference",
+        (14, 58),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        0.46,
+        (248, 250, 252),
+        1,
+        cv2.LINE_AA,
+    )
+    return output, "Live frame received. Sequence inference runs from the recorded/uploaded clip."
 with gr.Blocks(title="SignSpeak Local") as demo:
     gr.HTML(
         """
                 with gr.Column(scale=1):
                     audio_output = gr.Audio(label="Generated audio", type="filepath")
+        with gr.Tab("Live camera debug"):
+            with gr.Row(elem_classes=["demo-grid"]):
+                with gr.Column(scale=1, elem_classes=["panel-shell"]):
+                    gr.HTML('<div class="section-kicker">Camera stream</div>')
+                    live_camera_input = gr.Image(
+                        label="Live camera frame",
+                        sources=["webcam"],
+                        streaming=True,
+                        type="numpy",
+                    )
+                with gr.Column(scale=1, elem_classes=["panel-shell"]):
+                    gr.HTML('<div class="section-kicker">Live overlay</div>')
+                    live_camera_output = gr.Image(label="Overlay preview", type="numpy")
+                    live_camera_status = gr.Textbox(label="Live status", lines=3)
     gr.HTML(
         """
         <p class="footer-note">
         outputs=[audio_output],
     )
+    live_camera_input.stream(
+        fn=render_live_frame_debug,
+        inputs=[live_camera_input],
+        outputs=[live_camera_output, live_camera_status],
+    )
 if __name__ == "__main__":
     demo.queue().launch(

signspeak/asl/asl_detector.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import json
 from pathlib import Path
 from typing import Any
@@ -18,6 +19,7 @@ class ASLDetector:
         self.model_path = self.model_dir / "model.tflite"
         self.train_csv_path = self.model_dir / "train.csv"
         self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
         self.labels = self._load_labels()
     def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
@@ -52,13 +54,16 @@ class ASLDetector:
             top_idx = int(np.argmax(probs))
             top_prediction = self._label_for_index(top_idx)
             confidence = float(probs[top_idx])
             base.update(
                 {
-                    "status": "ok",
-                    "gloss_sequence": [top_prediction] if top_prediction else [],
                     "top_prediction": top_prediction,
                     "confidence": confidence,
                 }
             )
             return base
@@ -169,6 +174,18 @@ class ASLDetector:
             return self.labels[index]
         return str(index)
     def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
         if values.size == 0:
             return np.asarray([0.0], dtype=np.float32)

 from __future__ import annotations
 import json
+import os
 from pathlib import Path
 from typing import Any
         self.model_path = self.model_dir / "model.tflite"
         self.train_csv_path = self.model_dir / "train.csv"
         self.sign_map_path = self.model_dir / "sign_to_prediction_index_map.json"
+        self.confidence_threshold = float(os.getenv("ASL_CONFIDENCE_THRESHOLD", "0.70"))
         self.labels = self._load_labels()
     def predict_from_frames(self, frames: list[np.ndarray]) -> dict[str, Any]:
             top_idx = int(np.argmax(probs))
             top_prediction = self._label_for_index(top_idx)
             confidence = float(probs[top_idx])
+            accepted = confidence >= self.confidence_threshold
             base.update(
                 {
+                    "status": "ok" if accepted else "low_confidence",
+                    "gloss_sequence": [top_prediction] if accepted and top_prediction else [],
                     "top_prediction": top_prediction,
                     "confidence": confidence,
+                    "confidence_threshold": self.confidence_threshold,
+                    "top_predictions": self._top_predictions(probs),
                 }
             )
             return base
             return self.labels[index]
         return str(index)
+    def _top_predictions(self, probs: np.ndarray, limit: int = 5) -> list[dict[str, Any]]:
+        if probs.size == 0:
+            return []
+        top_indices = np.argsort(probs)[::-1][:limit]
+        return [
+            {
+                "label": self._label_for_index(int(index)),
+                "confidence": float(probs[int(index)]),
+            }
+            for index in top_indices
+        ]
     def _softmax_if_needed(self, values: np.ndarray) -> np.ndarray:
         if values.size == 0:
             return np.asarray([0.0], dtype=np.float32)

signspeak/debug_video.py CHANGED Viewed

@@ -36,7 +36,13 @@ def create_debug_overlay_video(video_path: str | Path, result: dict[str, Any]) -
     glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
     gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
     emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
-    status_text = f"ASL {asl.get('status', 'unknown')} | EMOTION {emotion.get('status', 'unknown')}"
     try:
         while True:
@@ -88,4 +94,3 @@ def _load_cv2():
         return cv2
     except Exception as exc:
         raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc

     glosses = intent.get("detected_glosses") or asl.get("gloss_sequence") or []
     gloss_text = " ".join(str(gloss) for gloss in glosses) if glosses else "NO ASL WORDS DETECTED"
     emotion_text = str(emotion.get("dominant_emotion", "unknown")).upper()
+    top_prediction = asl.get("top_prediction") or "none"
+    confidence = float(asl.get("confidence", 0.0) or 0.0)
+    threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
+    status_text = (
+        f"ASL {asl.get('status', 'unknown')} | top {top_prediction} "
+        f"{confidence:.2f}/{threshold:.2f} | EMOTION {emotion.get('status', 'unknown')}"
+    )
     try:
         while True:
         return cv2
     except Exception as exc:
         raise RuntimeError("OpenCV is required for debug overlay video generation.") from exc

signspeak/llm.py CHANGED Viewed

@@ -83,7 +83,13 @@ def deterministic_speech_from_intent(intent: dict[str, Any]) -> dict[str, str]:
         "LOVE YOU": "I love you.",
         "I HAPPY SEE YOU": "I am happy to see you.",
         "I SEE YOU": "I see you.",
         "THANK YOU": "Thank you.",
         "HELLO": "Hello.",
         "YES": "Yes.",
         "NO": "No.",
@@ -123,6 +129,13 @@ def enforce_intent_consistency(intent: dict[str, Any], normalized: dict[str, Any
         corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
         return corrected
     return normalized
@@ -144,10 +157,15 @@ def generate_subtitle_and_instruction(intent_json_text: str) -> tuple[str, str,
         '{"subtitle": "...", "voice_instruction": "..."}'
     )
     user_prompt = f"""
 Input intent data:
 {json.dumps(intent, ensure_ascii=False, indent=2)}
 Task:
 Generate a short natural subtitle and a TTS voice instruction.
@@ -159,9 +177,8 @@ Rules:
 - The subtitle must be only the sentence to speak.
 - The voice_instruction must describe tone, emotion, pace, and intensity.
 - Do not copy JSON keys into the subtitle.
-Expected output format:
-{{"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly, joyfully, and clearly."}}
 """
     llm = get_llm_model()

         "LOVE YOU": "I love you.",
         "I HAPPY SEE YOU": "I am happy to see you.",
         "I SEE YOU": "I see you.",
+        "WHERE": "Where?",
+        "WHO": "Who?",
+        "WHY": "Why?",
+        "WHAT": "What?",
+        "HOW": "How?",
         "THANK YOU": "Thank you.",
+        "THANKYOU": "Thank you.",
         "HELLO": "Hello.",
         "YES": "Yes.",
         "NO": "No.",
         corrected["consistency_warning"] = "LLM subtitle did not match I LOVE YOU glosses; deterministic correction applied."
         return corrected
+    meaningful_glosses = [gloss.lower() for gloss in glosses if len(gloss) > 1]
+    if meaningful_glosses and not any(gloss in subtitle for gloss in meaningful_glosses):
+        corrected = deterministic_speech_from_intent(intent)
+        corrected["consistency_warning"] = "LLM subtitle did not match detected glosses; deterministic correction applied."
+        corrected["raw_llm_subtitle"] = normalized.get("subtitle", "")
+        return corrected
     return normalized
         '{"subtitle": "...", "voice_instruction": "..."}'
     )
+    deterministic_reference = deterministic_speech_from_intent(intent)
     user_prompt = f"""
 Input intent data:
 {json.dumps(intent, ensure_ascii=False, indent=2)}
+Reference conversion:
+{json.dumps(deterministic_reference, ensure_ascii=False)}
 Task:
 Generate a short natural subtitle and a TTS voice instruction.
 - The subtitle must be only the sentence to speak.
 - The voice_instruction must describe tone, emotion, pace, and intensity.
 - Do not copy JSON keys into the subtitle.
+- The subtitle must preserve the detected gloss meaning.
+- If the reference conversion is already correct, return it unchanged.
 """
     llm = get_llm_model()

signspeak/pipeline.py CHANGED Viewed

@@ -131,11 +131,15 @@ def summarize_asl_result(result: dict[str, Any]) -> str:
     emotion = result.get("emotion", {})
     glosses = result.get("intent_input", {}).get("detected_glosses", [])
     gloss_line = " ".join(glosses) if glosses else "None"
     override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
     override_line = "\nOverride: manual glosses applied" if override else ""
     return (
         f"ASL status: {asl.get('status', 'unknown')}\n"
         f"Detected words: {gloss_line}\n"
         f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
         f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
         f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"

     emotion = result.get("emotion", {})
     glosses = result.get("intent_input", {}).get("detected_glosses", [])
     gloss_line = " ".join(glosses) if glosses else "None"
+    top_prediction = asl.get("top_prediction") or "None"
+    confidence = float(asl.get("confidence", 0.0) or 0.0)
+    threshold = float(asl.get("confidence_threshold", 0.0) or 0.0)
     override = result.get("intent_input", {}).get("diagnostics", {}).get("manual_gloss_override")
     override_line = "\nOverride: manual glosses applied" if override else ""
     return (
         f"ASL status: {asl.get('status', 'unknown')}\n"
         f"Detected words: {gloss_line}\n"
+        f"Top candidate: {top_prediction} ({confidence:.2f}, threshold {threshold:.2f})\n"
         f"Landmarks: {asl.get('landmarks_status', 'unknown')} via {asl.get('landmarks_detector', 'unknown')}\n"
         f"Emotion: {emotion.get('dominant_emotion', 'unknown')} "
         f"({float(emotion.get('intensity', 0.0) or 0.0):.2f})"

tests/test_asl_detector.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import numpy as np
 from signspeak.asl.asl_detector import ASLDetector
@@ -40,3 +41,45 @@ def test_predict_uses_tflite_signature_runner(tmp_path):
     assert output.shape == (1, 3)
     assert float(output[0][1]) == np.float32(0.8)

 import numpy as np
+import signspeak.asl.asl_detector as asl_detector_module
 from signspeak.asl.asl_detector import ASLDetector
     assert output.shape == (1, 3)
     assert float(output[0][1]) == np.float32(0.8)
+class FakeLandmarkResult:
+    keypoints = np.zeros((30, 543, 3), dtype=np.float32)
+    status = "ok"
+    detector = "fake"
+    error = None
+class FakeLandmarksDetector:
+    def __init__(self, missing_value=0.0):
+        pass
+    def detect_sequence(self, frames):
+        return FakeLandmarkResult()
+    def close(self):
+        pass
+def test_low_confidence_prediction_is_not_accepted(monkeypatch, tmp_path):
+    model_dir = tmp_path / "asl"
+    model_dir.mkdir()
+    (model_dir / "model.tflite").write_bytes(b"demo")
+    (model_dir / "sign_to_prediction_index_map.json").write_text(
+        json.dumps({"where": 0, "hello": 1}),
+        encoding="utf-8",
+    )
+    monkeypatch.setattr(asl_detector_module, "LandmarksDetector", FakeLandmarksDetector)
+    monkeypatch.setenv("ASL_CONFIDENCE_THRESHOLD", "0.70")
+    monkeypatch.setattr(ASLDetector, "_load_interpreter", lambda self: object())
+    monkeypatch.setattr(
+        ASLDetector,
+        "_predict",
+        lambda self, interpreter, keypoints: np.asarray([[0.667, 0.333]], dtype=np.float32),
+    )
+    result = ASLDetector(model_dir=model_dir).predict_from_frames([np.zeros((2, 2, 3), dtype=np.uint8)] * 30)
+    assert result["status"] == "low_confidence"
+    assert result["top_prediction"] == "where"
+    assert result["gloss_sequence"] == []

tests/test_llm_parsing.py CHANGED Viewed

@@ -73,3 +73,13 @@ def test_enforce_intent_consistency_corrects_wrong_love_subtitle():
     assert result["subtitle"] == "I love you."
     assert "consistency_warning" in result

     assert result["subtitle"] == "I love you."
     assert "consistency_warning" in result
+def test_enforce_intent_consistency_corrects_wrong_where_subtitle():
+    result = enforce_intent_consistency(
+        {"detected_glosses": ["where"]},
+        {"subtitle": "I am happy to see you.", "voice_instruction": "Speak warmly."},
+    )
+    assert result["subtitle"] == "Where?"
+    assert result["raw_llm_subtitle"] == "I am happy to see you."