ISR

Runtime error

App Files Files Community

Zhen Ye committed on Mar 23

Commit

4ff16d7

2 Parent(s): f144948 29c2d5f

Merge branch 'worktree-agent-afd6bcf7'

Browse files

Files changed (4) hide show

inspection/router.py +129 -1
jobs/storage.py +20 -0
models/isr/explainer.py +331 -0
requirements.txt +2 -0

inspection/router.py CHANGED Viewed

@@ -4,9 +4,10 @@ All endpoints are on-demand — they do not affect the main inference pipeline.
 Endpoints are mounted at /inspect in app.py.
 """
 import logging
 from pathlib import Path
-from typing import Optional
 from fastapi import APIRouter, HTTPException, Query
 from fastapi.responses import JSONResponse, Response
@@ -764,3 +765,130 @@ async def get_pointcloud(
         )
     return JSONResponse(result)

 Endpoints are mounted at /inspect in app.py.
 """
+import asyncio
 import logging
 from pathlib import Path
+from typing import Dict, Optional
 from fastapi import APIRouter, HTTPException, Query
 from fastapi.responses import JSONResponse, Response
         )
     return JSONResponse(result)
+# ── Explainability (Multi-LLM) ───────────────────────────────────
+# Per-(job_id, track_id) locks to prevent duplicate concurrent LLM calls
+_explain_locks: Dict[tuple, asyncio.Lock] = {}
+def _get_explain_lock(job_id: str, track_id: str) -> asyncio.Lock:
+    """Get or create an asyncio lock for a (job_id, track_id) pair."""
+    key = (job_id, track_id)
+    if key not in _explain_locks:
+        _explain_locks[key] = asyncio.Lock()
+    return _explain_locks[key]
+@router.get("/explain/{job_id}/{track_id}")
+async def explain_track(job_id: str, track_id: str):
+    """Generate a multi-LLM interpretability tree for a tracked object.
+    Calls GPT-4o (primary) to generate a hierarchical feature tree,
+    then Claude + Gemini (validators) in parallel to validate each feature.
+    Results are cached per (job_id, track_id); only successful results are cached.
+    Returns:
+        JSONResponse with the cached or freshly generated explanation dict,
+        which carries the requested ``track_id`` under the "track_id" key.
+    Raises:
+        HTTPException: 503 if OPENAI_API_KEY is unset; 404 if no detection of
+            the track has a bbox with positive area, or the input video is
+            missing on disk; 422 if the crop or frame cannot be encoded;
+            504 if the explainer exceeds 30 seconds; 502 if the explainer
+            raises ValueError.
+    """
+    # Project-local helpers are imported inside the handler rather than at module top.
+    from jobs.storage import get_explanation, set_explanation
+    from models.isr.utils import crop_and_encode, encode_frame
+    from inspection.frames import extract_frame
+    # Helper defined elsewhere in this module; presumably raises 404 for an
+    # unknown job_id — confirm against its definition.
+    job = _get_job_or_404(job_id)
+    # Fast path: serve a cached explanation without taking the lock
+    cached = get_explanation(job_id, track_id)
+    if cached:
+        return JSONResponse(cached)
+    # Acquire per-track lock to prevent duplicate LLM calls
+    lock = _get_explain_lock(job_id, track_id)
+    async with lock:
+        # Re-check cache after acquiring lock: a request that held the lock
+        # before us may have stored the result in the meantime.
+        cached = get_explanation(job_id, track_id)
+        if cached:
+            return JSONResponse(cached)
+        # Validate OpenAI key is available before doing any frame work
+        import os
+        if not os.environ.get("OPENAI_API_KEY"):
+            raise HTTPException(status_code=503, detail="OpenAI API key not configured")
+        storage = get_job_storage()
+        # Parse track_id into the value compared against each detection's "instance_id"
+        instance_id = _parse_track_id(track_id)
+        # Find the best frame for this track (largest bbox area)
+        best_frame_idx = None
+        # Starting at 0 with a strict ">" below means detections whose bbox
+        # area is zero or negative are never selected.
+        best_area = 0
+        best_track = None
+        # Reads the storage's private track table directly, under its lock.
+        with storage._lock:
+            frames = storage._tracks.get(job_id, {})
+            # Frames are visited from highest index to lowest; on equal areas
+            # the first one seen (the highest frame index) is kept.
+            for fidx in sorted(frames.keys(), reverse=True):
+                for det in frames[fidx]:
+                    tid = det.get("instance_id")
+                    tid_str = det.get("track_id")
+                    # Match on the parsed instance id or on the raw track_id string
+                    if (tid is not None and tid == instance_id) or tid_str == track_id:
+                        bbox = det.get("bbox")
+                        if bbox:
+                            # assumes bbox is [x1, y1, x2, y2] — TODO confirm against the tracker output
+                            area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+                            if area > best_area:
+                                best_area = area
+                                best_frame_idx = fidx
+                                # Shallow copy so the detection can be used after the lock is released
+                                best_track = dict(det)
+        if best_frame_idx is None or best_track is None:
+            raise HTTPException(status_code=404, detail=f"Track {track_id} not found in any frame.")
+        # Extract frame
+        input_path = job.input_video_path
+        if not input_path or not Path(input_path).exists():
+            raise HTTPException(status_code=404, detail="Input video not found on disk.")
+        # Frame extraction runs in a worker thread via asyncio.to_thread
+        frame = await asyncio.to_thread(extract_frame, input_path, best_frame_idx)
+        # Encode images (both helpers are called inline in this coroutine;
+        # a falsy return value is treated as an encoding failure)
+        crop_b64 = crop_and_encode(frame, best_track["bbox"], max_dim=512, quality=80)
+        if not crop_b64:
+            raise HTTPException(status_code=422, detail="Failed to crop track from frame.")
+        frame_b64 = encode_frame(frame, max_dim=1024, quality=70)
+        if not frame_b64:
+            raise HTTPException(status_code=422, detail="Failed to encode frame.")
+        # Get mission query (job.queries is List[str])
+        mission = ", ".join(job.queries) if job.queries else "general surveillance"
+        # Build metadata. The depth/angle lookups have no default, so those
+        # keys are always present and hold None when the detection lacks them.
+        metadata = {
+            "label": best_track.get("label", "unknown"),
+            "score": best_track.get("score", 0),
+            "speed_kph": best_track.get("speed_kph", 0),
+            "direction_clock": best_track.get("direction_clock", "unknown"),
+            "depth_rel": best_track.get("depth_rel"),
+            "depth_est_m": best_track.get("depth_est_m"),
+            "angle_deg": best_track.get("angle_deg"),
+            "bbox": best_track.get("bbox"),
+        }
+        # Run explainer (a new ISRExplainer is created for every uncached request)
+        from models.isr.explainer import ISRExplainer
+        explainer = ISRExplainer()
+        try:
+            # Bound the whole pipeline to 30 seconds
+            result = await asyncio.wait_for(
+                explainer.explain(crop_b64, frame_b64, metadata, mission),
+                timeout=30.0,
+            )
+        except asyncio.TimeoutError:
+            raise HTTPException(status_code=504, detail="Explanation timed out (30s)")
+        except ValueError as e:
+            # The exception text is passed through to the client as the error detail
+            raise HTTPException(status_code=502, detail=str(e))
+        # Add track_id to result
+        result["track_id"] = track_id
+        # Cache and return; reached only on success, so failures are retried on the next request
+        set_explanation(job_id, track_id, result)
+        return JSONResponse(result)

jobs/storage.py CHANGED Viewed

@@ -42,6 +42,7 @@ class JobStorage:
         self._latest_frames: Dict[str, any] = {}  # job_id -> np.ndarray
         self._mask_data: Dict[str, Dict[str, any]] = {}  # job_id -> {f"{frame_idx}:{track_id}" -> rle_dict}
         self._mission_verdicts: Dict[str, Dict[str, bool]] = {}  # job_id -> {track_id -> mission_relevant}
         self._lock = RLock()
     def create(self, job: JobInfo) -> None:
@@ -106,6 +107,18 @@ class JobStorage:
         with self._lock:
             return dict(self._mission_verdicts.get(job_id, {}))
     def get_all_masks_for_frame(self, job_id: str, frame_idx: int) -> dict:
         """Return {track_id: rle_dict} for all objects in a frame."""
         with self._lock:
@@ -136,6 +149,7 @@ class JobStorage:
             self._latest_frames.pop(job_id, None)
             self._mask_data.pop(job_id, None)
             self._mission_verdicts.pop(job_id, None)
         shutil.rmtree(get_job_directory(job_id), ignore_errors=True)
     def cleanup_expired(self, max_age: timedelta) -> None:
@@ -181,3 +195,9 @@ def get_mask_data(job_id: str, frame_idx: int, track_id: int) -> Optional[dict]:
 def get_all_masks_for_frame(job_id: str, frame_idx: int) -> dict:
     return get_job_storage().get_all_masks_for_frame(job_id, frame_idx)

         self._latest_frames: Dict[str, any] = {}  # job_id -> np.ndarray
         self._mask_data: Dict[str, Dict[str, any]] = {}  # job_id -> {f"{frame_idx}:{track_id}" -> rle_dict}
         self._mission_verdicts: Dict[str, Dict[str, bool]] = {}  # job_id -> {track_id -> mission_relevant}
+        self._explanations: Dict[str, Dict[str, dict]] = {}  # job_id -> {track_id -> explanation}
         self._lock = RLock()
     def create(self, job: JobInfo) -> None:
         with self._lock:
             return dict(self._mission_verdicts.get(job_id, {}))
+    def set_explanation(self, job_id: str, track_id: str, data: dict) -> None:
+        """Cache an explanation result for a track."""
+        with self._lock:
+            if job_id not in self._explanations:
+                self._explanations[job_id] = {}
+            self._explanations[job_id][track_id] = data
+    def get_explanation(self, job_id: str, track_id: str) -> Optional[dict]:
+        """Retrieve cached explanation for a track."""
+        with self._lock:
+            return self._explanations.get(job_id, {}).get(track_id)
     def get_all_masks_for_frame(self, job_id: str, frame_idx: int) -> dict:
         """Return {track_id: rle_dict} for all objects in a frame."""
         with self._lock:
             self._latest_frames.pop(job_id, None)
             self._mask_data.pop(job_id, None)
             self._mission_verdicts.pop(job_id, None)
+            self._explanations.pop(job_id, None)
         shutil.rmtree(get_job_directory(job_id), ignore_errors=True)
     def cleanup_expired(self, max_age: timedelta) -> None:
 def get_all_masks_for_frame(job_id: str, frame_idx: int) -> dict:
     return get_job_storage().get_all_masks_for_frame(job_id, frame_idx)
+def get_explanation(job_id: str, track_id: str) -> Optional[dict]:
+    return get_job_storage().get_explanation(job_id, track_id)
+def set_explanation(job_id: str, track_id: str, data: dict) -> None:
+    get_job_storage().set_explanation(job_id, track_id, data)

models/isr/explainer.py ADDED Viewed

	@@ -0,0 +1,331 @@

+"""Multi-LLM Explainability Pipeline.
+Orchestrates GPT-4o (primary analyzer) + Claude & Gemini (validators)
+to produce a hierarchical feature tree explaining why an object was
+classified as mission-relevant.
+"""
+import asyncio
+import json
+import logging
+import os
+from typing import Optional
+from models.isr.utils import crop_and_encode, encode_frame, parse_llm_json
+logger = logging.getLogger(__name__)
+# Category color map (synced with frontend).
+# Maps each feature-category name to a hex color string. The keys are the
+# same ten category names that _PRIMARY_SYSTEM_PROMPT lets the model pick from.
+# ISRExplainer._merge() looks categories up here and uses "#64748b" for any
+# name that is not listed.
+_CATEGORY_COLORS = {
+    "Structure": "#3b82f6",
+    "Function": "#06b6d4",
+    "Material": "#f59e0b",
+    "Color": "#ef4444",
+    "Size": "#10b981",
+    "Type": "#8b5cf6",
+    "Motion": "#ec4899",
+    "Context": "#64748b",
+    "Shape": "#f97316",
+    "Markings": "#a855f7",
+}
+# System prompt for the primary analysis call (ISRExplainer._call_gpt).
+# It fixes the JSON feature-tree schema; ISRExplainer._merge() reads the
+# "categories", "features" and "name" fields described here.
+_PRIMARY_SYSTEM_PROMPT = """You are an ISR (Intelligence, Surveillance, Reconnaissance) analyst explaining WHY a detected object matches or does not match a mission objective.
+You will receive:
+- A cropped image of the detected object
+- The full frame showing spatial context
+- Detection metadata (label, confidence, speed, depth, direction)
+- The mission objective
+Analyze the object and produce a HIERARCHICAL FEATURE TREE explaining the key visual and functional features that led to the classification.
+Return ONLY a JSON object (no markdown, no explanation) with this exact structure:
+{
+  "object": "<detected class label>",
+  "satisfies": true/false/null,
+  "confidence": 0.0-1.0,
+  "reasoning_summary": "<1-2 sentence summary>",
+  "categories": [
+    {
+      "name": "<category name>",
+      "features": [
+        {
+          "name": "<feature name>",
+          "value": true/false,
+          "reasoning": "<1 sentence explaining this observation>"
+        }
+      ]
+    }
+  ]
+}
+Rules:
+- Pick 3-6 categories most relevant to THIS SPECIFIC object from: Structure, Function, Material, Color, Size, Type, Motion, Context, Shape, Markings
+- Each category should have 1-4 features (total 5-20 features across all categories)
+- Features must be VISUAL OBSERVATIONS from the image, not assumptions
+- Be specific and expert-level (a program manager should find this insightful)
+- confidence reflects how certain you are about the overall assessment"""
+# System prompt used by both validator calls (ISRExplainer._call_claude and
+# ISRExplainer._call_gemini). The "CategoryName/FeatureName" key format it
+# requests is the same key that ISRExplainer._merge() builds for each feature.
+_VALIDATOR_SYSTEM_PROMPT = """You are an ISR analyst reviewing another analyst's feature assessment of a detected object.
+You will receive:
+- The same cropped image and full frame
+- Detection metadata
+- The primary analyst's hierarchical feature tree
+Your job: independently validate each feature by examining the images yourself.
+Return ONLY a JSON object (no markdown) with this structure:
+{
+  "agreement": true/false,
+  "confidence": 0.0-1.0,
+  "feature_validations": {
+    "CategoryName/FeatureName": {
+      "agree": true/false,
+      "note": "<brief observation>"
+    }
+  }
+}
+Rules:
+- Validate EVERY feature in the tree
+- Use the key format "CategoryName/FeatureName" exactly
+- Be honest — disagree when the image doesn't support the claim
+- Keep notes to 1 sentence"""
+class ISRExplainer:
+    """Orchestrates multi-LLM explanation pipeline for a single track."""
+    def __init__(self):
+        self._openai_client = None
+        self._anthropic_client = None
+    def _get_openai(self):
+        if self._openai_client is None:
+            import openai
+            key = os.environ.get("OPENAI_API_KEY")
+            if not key:
+                raise ValueError("OPENAI_API_KEY not set")
+            self._openai_client = openai.OpenAI(api_key=key)
+        return self._openai_client
+    def _get_anthropic(self):
+        if self._anthropic_client is None:
+            import anthropic
+            key = os.environ.get("ANTHROPIC_API_KEY")
+            if not key:
+                return None
+            self._anthropic_client = anthropic.Anthropic(api_key=key)
+        return self._anthropic_client
+    async def explain(
+        self,
+        crop_b64: str,
+        frame_b64: str,
+        metadata: dict,
+        mission: str,
+    ) -> dict:
+        """Run the full 3-LLM explanation pipeline.
+        Args:
+            crop_b64: Base64-encoded JPEG of the cropped ROI.
+            frame_b64: Base64-encoded JPEG of the full frame.
+            metadata: Detection metadata dict (label, score, speed_kph, etc.).
+            mission: Mission objective string.
+        Returns:
+            Merged explanation tree with consensus data.
+        """
+        # Step 1: GPT-4o primary analysis
+        primary_tree = await self._call_gpt(crop_b64, frame_b64, metadata, mission)
+        if primary_tree is None:
+            raise ValueError("Primary GPT-4o analysis failed")
+        # Step 2: Claude + Gemini validation in parallel
+        claude_result, gemini_result = await asyncio.gather(
+            self._call_claude(crop_b64, frame_b64, metadata, mission, primary_tree),
+            self._call_gemini(crop_b64, frame_b64, metadata, mission, primary_tree),
+            return_exceptions=True,
+        )
+        # Handle exceptions from validators
+        if isinstance(claude_result, Exception):
+            logger.warning("Claude validation failed: %s", claude_result)
+            claude_result = None
+        if isinstance(gemini_result, Exception):
+            logger.warning("Gemini validation failed: %s", gemini_result)
+            gemini_result = None
+        # Step 3: Merge into consensus tree
+        return self._merge(primary_tree, claude_result, gemini_result)
+    async def _call_gpt(self, crop_b64: str, frame_b64: str, metadata: dict, mission: str) -> Optional[dict]:
+        """Call GPT-4o to generate the primary feature tree."""
+        try:
+            client = self._get_openai()
+            user_text = self._build_metadata_text(metadata, mission)
+            response = await asyncio.to_thread(
+                client.chat.completions.create,
+                model="gpt-4o",
+                messages=[
+                    {"role": "system", "content": _PRIMARY_SYSTEM_PROMPT},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": user_text},
+                        {"type": "text", "text": "\n[Cropped object]:"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{crop_b64}", "detail": "high"}},
+                        {"type": "text", "text": "\n[Full frame context]:"},
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame_b64}", "detail": "low"}},
+                    ]},
+                ],
+                max_tokens=2048,
+                temperature=0.3,
+            )
+            raw = response.choices[0].message.content
+            return parse_llm_json(raw)
+        except Exception:
+            logger.exception("GPT-4o primary analysis failed")
+            return None
+    async def _call_claude(self, crop_b64: str, frame_b64: str, metadata: dict, mission: str, tree: dict) -> Optional[dict]:
+        """Call Claude to validate the primary tree."""
+        client = self._get_anthropic()
+        if client is None:
+            logger.info("Skipping Claude validation — ANTHROPIC_API_KEY not set")
+            return None
+        try:
+            user_text = self._build_metadata_text(metadata, mission)
+            user_text += f"\n\nPrimary analyst's feature tree:\n```json\n{json.dumps(tree, indent=2)}\n```"
+            response = await asyncio.to_thread(
+                client.messages.create,
+                model="claude-sonnet-4-20250514",
+                max_tokens=1024,
+                system=_VALIDATOR_SYSTEM_PROMPT,
+                messages=[{
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": user_text},
+                        {"type": "text", "text": "\n[Cropped object]:"},
+                        {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": crop_b64}},
+                        {"type": "text", "text": "\n[Full frame context]:"},
+                        {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": frame_b64}},
+                    ],
+                }],
+            )
+            raw = response.content[0].text
+            return parse_llm_json(raw)
+        except Exception:
+            logger.exception("Claude validation failed")
+            return None
+    async def _call_gemini(self, crop_b64: str, frame_b64: str, metadata: dict, mission: str, tree: dict) -> Optional[dict]:
+        """Call Gemini to validate the primary tree."""
+        api_key = os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            logger.info("Skipping Gemini validation — GEMINI_API_KEY not set")
+            return None
+        try:
+            import base64
+            import google.generativeai as genai
+            genai.configure(api_key=api_key)
+            model = genai.GenerativeModel("gemini-2.0-flash")
+            user_text = self._build_metadata_text(metadata, mission)
+            user_text += f"\n\nPrimary analyst's feature tree:\n```json\n{json.dumps(tree, indent=2)}\n```"
+            # Decode images for Gemini
+            crop_bytes = base64.b64decode(crop_b64)
+            frame_bytes = base64.b64decode(frame_b64)
+            response = await asyncio.to_thread(
+                model.generate_content,
+                [
+                    _VALIDATOR_SYSTEM_PROMPT + "\n\n" + user_text,
+                    {"mime_type": "image/jpeg", "data": crop_bytes},
+                    "\n[Full frame context]:",
+                    {"mime_type": "image/jpeg", "data": frame_bytes},
+                ],
+                generation_config=genai.GenerationConfig(
+                    max_output_tokens=1024,
+                    temperature=0.3,
+                ),
+            )
+            raw = response.text
+            return parse_llm_json(raw)
+        except Exception:
+            logger.exception("Gemini validation failed")
+            return None
+    def _build_metadata_text(self, metadata: dict, mission: str) -> str:
+        """Build the text portion describing the detection."""
+        lines = [
+            f'Mission: "{mission}"',
+            "",
+            "Detection metadata:",
+            f"- Label: {metadata.get('label', 'unknown')}",
+            f"- Confidence: {metadata.get('score', 0):.2f}",
+            f"- Speed: {metadata.get('speed_kph', 0):.1f} kph",
+            f"- Direction: {metadata.get('direction_clock', 'unknown')}",
+            f"- Depth (relative): {metadata.get('depth_rel', 'N/A')}",
+            f"- Depth (estimated): {metadata.get('depth_est_m', 'N/A')}m",
+            f"- Angle: {metadata.get('angle_deg', 'N/A')}°",
+        ]
+        bbox = metadata.get("bbox")
+        if bbox:
+            bw = bbox[2] - bbox[0]
+            bh = bbox[3] - bbox[1]
+            lines.append(f"- Bounding box size: {bw}x{bh} px")
+        return "\n".join(lines)
+    def _merge(self, tree: dict, claude: Optional[dict], gemini: Optional[dict]) -> dict:
+        """Merge primary tree with validator results into consensus output."""
+        validators_available = sum(1 for v in [claude, gemini] if v is not None)
+        total_features = 0
+        agreed = 0
+        for cat in tree.get("categories", []):
+            cat_name = cat.get("name", "")
+            cat["color"] = _CATEGORY_COLORS.get(cat_name, "#64748b")
+            for feat in cat.get("features", []):
+                total_features += 1
+                feat_key = f"{cat_name}/{feat['name']}"
+                validators = {}
+                feat_agreed = 0
+                if claude and "feature_validations" in claude:
+                    cv = claude["feature_validations"].get(feat_key)
+                    if cv:
+                        validators["claude"] = cv
+                        if cv.get("agree"):
+                            feat_agreed += 1
+                if gemini and "feature_validations" in gemini:
+                    gv = gemini["feature_validations"].get(feat_key)
+                    if gv:
+                        validators["gemini"] = gv
+                        if gv.get("agree"):
+                            feat_agreed += 1
+                feat["validators"] = validators
+                feat["consensus"] = feat_agreed
+                if validators_available > 0 and feat_agreed == validators_available:
+                    agreed += 1
+        tree["consensus_bar"] = {
+            "total_features": total_features,
+            "agreed": agreed,
+            "disagreed": total_features - agreed,
+            "validators_available": validators_available,
+        }
+        return tree

requirements.txt CHANGED Viewed

@@ -16,3 +16,5 @@ iopath>=0.1.10
 psutil
 dill
 openai>=1.0.0

 psutil
 dill
 openai>=1.0.0
+anthropic>=0.40.0
+google-generativeai>=0.8.0