Zhen Ye Claude Opus 4.6 (1M context) commited on
Commit
9574811
·
1 Parent(s): 164d8b0

refactor: remove GPT/mission/relevance system, keep CV-only base

Browse files

Strip all GPT threat assessment, mission parsing, relevance gating,
and enrichment logic to create a reusable detection base. All computer
vision functionality preserved: detectors, segmenters, depth estimation,
object tracking, multi-GPU pipeline, async jobs, and MJPEG streaming.

Deleted: utils/{gpt_reasoning,openai_client,threat_chat,relevance,
enrichment,mission_parser,schemas}.py
Removed: enable_gpt, mission_spec, first_frame_gpt_results params
from inference pipeline, jobs, and API endpoints.
Removed: /detect/analyze-frame, /reason/track, /chat/threat endpoints.
Removed: sentence-transformers, python-dotenv dependencies.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

app.py CHANGED
@@ -1,7 +1,4 @@
1
  import os
2
- from dotenv import load_dotenv
3
- load_dotenv()
4
-
5
  import logging
6
 
7
  # Fix: Set Hugging Face cache to writable location
@@ -39,8 +36,7 @@ import cv2
39
  import numpy as np
40
  from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
41
  from fastapi.middleware.cors import CORSMiddleware
42
- from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, StreamingResponse
43
- from fastapi.staticfiles import StaticFiles
44
  import uvicorn
45
 
46
  from inference import process_first_frame, run_inference, run_grounded_sam2_tracking
@@ -57,14 +53,6 @@ from jobs.storage import (
57
  get_job_storage,
58
  get_output_video_path,
59
  )
60
- from utils.gpt_reasoning import estimate_threat_gpt
61
- from utils.threat_chat import chat_about_threats
62
- from utils.relevance import evaluate_relevance
63
- from utils.enrichment import run_enrichment
64
- from utils.schemas import AssessmentStatus
65
- from models.segmenters.model_loader import get_segmenter_detector
66
- from utils.mission_parser import parse_mission_text, build_broad_queries, MissionParseError
67
-
68
  logging.basicConfig(level=logging.INFO)
69
 
70
  # Suppress noisy external libraries
@@ -72,77 +60,6 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
72
  logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
73
  logging.getLogger("transformers").setLevel(logging.WARNING)
74
 
75
- # GPT concurrency limiter — prevents thread exhaustion under load
76
- _GPT_SEMAPHORE = asyncio.Semaphore(int(os.environ.get("GPT_CONCURRENCY_LIMIT", "4")))
77
-
78
-
79
- async def _enrich_first_frame_gpt(
80
- job_id: str,
81
- frame: np.ndarray,
82
- detections: list,
83
- enable_gpt: bool,
84
- mission_spec,
85
- ) -> None:
86
- """Fire-and-forget GPT enrichment for first-frame track cards.
87
-
88
- Runs concurrently with the video pipeline so the user gets instant
89
- first-frame preview (UNASSESSED), then track cards update once GPT
90
- finishes (typically 2-5s later).
91
- """
92
- if not enable_gpt or not detections:
93
- return
94
- try:
95
- # Non-LLM_EXTRACTED relevance filter runs BEFORE run_enrichment (FAST_PATH case)
96
- if mission_spec and mission_spec.parse_mode != "LLM_EXTRACTED":
97
- for d in detections:
98
- decision = evaluate_relevance(d, mission_spec.relevance_criteria)
99
- d["mission_relevant"] = decision.relevant
100
- d["relevance_reason"] = decision.reason
101
- filtered = [d for d in detections if d.get("mission_relevant", True)]
102
- if not filtered:
103
- for det in detections:
104
- det["assessment_status"] = AssessmentStatus.ASSESSED
105
- get_job_storage().update(
106
- job_id,
107
- first_frame_detections=detections,
108
- )
109
- logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
110
- return
111
-
112
- gpt_results = await asyncio.to_thread(
113
- run_enrichment, 0, frame, detections, mission_spec,
114
- job_id=job_id,
115
- )
116
- logging.info("Background GPT enrichment complete for job %s", job_id)
117
-
118
- if not gpt_results:
119
- # All detections filtered as not relevant
120
- for det in detections:
121
- det["assessment_status"] = AssessmentStatus.ASSESSED
122
- get_job_storage().update(
123
- job_id,
124
- first_frame_detections=detections,
125
- )
126
- logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
127
- return
128
-
129
- # Tag any remaining detections without an assessment status
130
- for det in detections:
131
- if "assessment_status" not in det:
132
- det["assessment_status"] = AssessmentStatus.UNASSESSED
133
-
134
- # Update stored job so frontend polls pick up GPT data
135
- get_job_storage().update(
136
- job_id,
137
- first_frame_detections=detections,
138
- first_frame_gpt_results=gpt_results,
139
- )
140
- logging.info("Updated first_frame_detections with GPT results for job %s", job_id)
141
-
142
- except Exception:
143
- logging.exception("Background GPT enrichment failed for job %s", job_id)
144
-
145
-
146
  async def _periodic_cleanup() -> None:
147
  while True:
148
  await asyncio.sleep(600)
@@ -168,26 +85,6 @@ app.add_middleware(
168
  )
169
 
170
 
171
- from fastapi import Request
172
-
173
- @app.middleware("http")
174
- async def add_no_cache_header(request: Request, call_next):
175
- """Ensure frontend assets are not cached by the browser (important for HF Spaces updates)."""
176
- response = await call_next(request)
177
- # Apply to all static files and the root page
178
- if request.url.path.startswith("/laser") or request.url.path == "/":
179
- response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
180
- response.headers["Pragma"] = "no-cache"
181
- response.headers["Expires"] = "0"
182
- return response
183
-
184
- # Optional: serve the LaserPerception frontend from this backend.
185
- # The frontend files are now located in the 'frontend' directory.
186
- _FRONTEND_DIR = Path(__file__).with_name("frontend")
187
- if _FRONTEND_DIR.exists():
188
- # Mount the entire frontend directory at /laser (legacy path) or /frontend
189
- app.mount("/laser", StaticFiles(directory=_FRONTEND_DIR, html=True), name="laser")
190
-
191
  # Valid detection modes
192
  VALID_MODES = {"object_detection", "segmentation", "drone_detection"}
193
 
@@ -228,7 +125,11 @@ def _schedule_cleanup(background_tasks: BackgroundTasks, path: str) -> None:
228
  background_tasks.add_task(_cleanup)
229
 
230
 
231
- def _default_queries_for_mode(mode: str) -> list[str]:
 
 
 
 
232
  if mode == "segmentation":
233
  return ["object"]
234
  if mode == "drone_detection":
@@ -236,11 +137,17 @@ def _default_queries_for_mode(mode: str) -> list[str]:
236
  return ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
237
 
238
 
 
 
 
 
 
239
  @app.get("/", response_class=HTMLResponse)
240
  async def demo_page():
241
- """Redirect to LaserPerception app."""
242
- # The main entry point is now index.html in the mounted directory
243
- return RedirectResponse(url="/laser/index.html")
 
244
 
245
 
246
  @app.post("/detect")
@@ -252,10 +159,9 @@ async def detect_endpoint(
252
  detector: str = Form("yolo11"),
253
  segmenter: str = Form("GSAM2-L"),
254
  enable_depth: bool = Form(False),
255
- enable_gpt: bool = Form(True),
256
  ):
257
  """
258
- Main detection endpoint.
259
 
260
  Args:
261
  video: Video file to process
@@ -263,8 +169,7 @@ async def detect_endpoint(
263
  queries: Comma-separated object classes for object_detection mode
264
  detector: Model to use (yolo11, detr_resnet50, grounding_dino)
265
  segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
266
- enable_depth: Whether to run legacy depth estimation (default: False)
267
- drone_detection uses the dedicated drone_yolo model.
268
 
269
  Returns:
270
  - For object_detection: Processed video with bounding boxes
@@ -293,10 +198,7 @@ async def detect_endpoint(
293
  fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
294
  os.close(fd)
295
 
296
- # Parse queries
297
- query_list = [q.strip() for q in queries.split(",") if q.strip()]
298
- if not query_list:
299
- query_list = ["object"]
300
 
301
  try:
302
  output_path = run_grounded_sam2_tracking(
@@ -343,25 +245,11 @@ async def detect_endpoint(
343
  fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
344
  os.close(fd)
345
 
346
- # Parse queries with mission awareness
347
  detector_name = "drone_yolo" if mode == "drone_detection" else detector
348
- mission_spec = None
349
-
350
- if queries.strip():
351
- try:
352
- mission_spec = parse_mission_text(queries.strip(), detector_name, video_path=input_path)
353
- query_list = build_broad_queries(detector_name, mission_spec)
354
- except MissionParseError as e:
355
- raise HTTPException(status_code=422, detail=str(e))
356
- else:
357
- query_list = _default_queries_for_mode(mode)
358
 
359
- if mode == "drone_detection" and not query_list:
360
- query_list = ["drone"]
361
 
362
- # Run inference
363
  try:
364
-
365
  # Determine depth estimator
366
  active_depth = "depth" if enable_depth else None
367
 
@@ -372,7 +260,6 @@ async def detect_endpoint(
372
  detector_name=detector_name,
373
  depth_estimator_name=active_depth,
374
  depth_scale=25.0,
375
- enable_gpt=enable_gpt,
376
  )
377
  except ValueError as exc:
378
  logging.exception("Video processing failed.")
@@ -408,7 +295,6 @@ async def detect_async_endpoint(
408
  depth_estimator: str = Form("depth"),
409
  depth_scale: float = Form(25.0),
410
  enable_depth: bool = Form(False),
411
- enable_gpt: bool = Form(True),
412
  step: int = Form(7),
413
  ):
414
  _ttfs_t0 = time.perf_counter()
@@ -440,49 +326,13 @@ async def detect_async_endpoint(
440
 
441
  logging.info("[TTFS:%s] +%.1fs upload_saved", job_id, time.perf_counter() - _ttfs_t0)
442
 
443
- # --- Mission-Driven Query Parsing ---
444
- mission_spec = None
445
- mission_mode = "LEGACY"
446
-
447
  detector_name = detector
448
- mission_detector = detector # detector key used for mission query parsing
449
  if mode == "drone_detection":
450
  detector_name = "drone_yolo"
451
- mission_detector = "drone_yolo"
452
  elif mode == "segmentation":
453
- # Segmenter registry owns detector selection (GSAM2→GDINO, YSAM2→YOLO).
454
- # detector_name=None so the job doesn't forward it (avoids duplicate kwarg).
455
- try:
456
- mission_detector = get_segmenter_detector(segmenter)
457
- except ValueError as exc:
458
- raise HTTPException(status_code=400, detail=str(exc))
459
  detector_name = None
460
 
461
- if queries.strip():
462
- try:
463
- mission_spec = parse_mission_text(queries.strip(), mission_detector, video_path=str(input_path))
464
- query_list = build_broad_queries(mission_detector, mission_spec)
465
- mission_mode = "MISSION"
466
- logging.info(
467
- "Mission parsed: mode=%s classes=%s broad_queries=%s domain=%s(%s)",
468
- mission_mode, mission_spec.object_classes, query_list,
469
- mission_spec.domain, mission_spec.domain_source,
470
- )
471
- except MissionParseError as e:
472
- raise HTTPException(
473
- status_code=422,
474
- detail=str(e),
475
- )
476
- else:
477
- # LEGACY mode: no mission context, use defaults, disable GPT
478
- query_list = _default_queries_for_mode(mode)
479
- enable_gpt = False
480
- mission_mode = "LEGACY"
481
- logging.info(
482
- "LEGACY mode: no mission text, defaults=%s, GPT disabled", query_list
483
- )
484
-
485
- logging.info("[TTFS:%s] +%.1fs mission_parsed", job_id, time.perf_counter() - _ttfs_t0)
486
 
487
  available_depth_estimators = set(list_depth_estimators())
488
  if depth_estimator not in available_depth_estimators:
@@ -508,8 +358,6 @@ async def detect_async_endpoint(
508
  )
509
  cv2.imwrite(str(first_frame_path), processed_frame)
510
  logging.info("[TTFS:%s] +%.1fs process_first_frame done", job_id, time.perf_counter() - _ttfs_t0)
511
- # GPT and depth are now handled in the async pipeline (enrichment thread)
512
- first_frame_gpt_results = None
513
  except Exception:
514
  logging.exception("First-frame processing failed.")
515
  shutil.rmtree(job_dir, ignore_errors=True)
@@ -530,26 +378,12 @@ async def detect_async_endpoint(
530
  depth_scale=float(depth_scale),
531
  depth_output_path=str(depth_output_path),
532
  first_frame_depth_path=str(first_frame_depth_path),
533
- enable_gpt=enable_gpt,
534
- mission_spec=mission_spec,
535
- mission_mode=mission_mode,
536
- first_frame_gpt_results=first_frame_gpt_results,
537
  step=step,
538
  ttfs_t0=_ttfs_t0,
539
  )
540
  get_job_storage().create(job)
541
  asyncio.create_task(process_video_async(job_id))
542
 
543
- # Fire-and-forget: enrich first-frame detections with GPT in background.
544
- # Runs for ALL modes including segmentation — first-frame detections from
545
- # process_first_frame() already have stable track IDs (T01, T02, ...) and
546
- # valid bboxes, so there's no reason to defer. The GSAM2 writer's
547
- # enrichment thread will see the cached results via first_frame_gpt_results
548
- # in JobStorage and skip the duplicate call on frame 0.
549
- asyncio.create_task(_enrich_first_frame_gpt(
550
- job_id, processed_frame, detections, enable_gpt, mission_spec,
551
- ))
552
-
553
  response_data = {
554
  "job_id": job_id,
555
  "first_frame_url": f"/detect/first-frame/{job_id}",
@@ -560,21 +394,8 @@ async def detect_async_endpoint(
560
  "stream_url": f"/detect/stream/{job_id}",
561
  "status": job.status.value,
562
  "first_frame_detections": detections,
563
- "mission_mode": mission_mode,
564
  }
565
 
566
- if mission_spec:
567
- response_data["mission_spec"] = {
568
- "object_classes": mission_spec.object_classes,
569
- "mission_intent": mission_spec.mission_intent,
570
- "domain": mission_spec.domain,
571
- "domain_source": mission_spec.domain_source,
572
- "parse_confidence": mission_spec.parse_confidence,
573
- "parse_warnings": mission_spec.parse_warnings,
574
- "context_phrases": mission_spec.context_phrases,
575
- "stripped_modifiers": mission_spec.stripped_modifiers,
576
- }
577
-
578
  return response_data
579
 
580
 
@@ -618,59 +439,6 @@ async def get_frame_tracks(job_id: str, frame_idx: int):
618
  return data or []
619
 
620
 
621
- @app.post("/detect/analyze-frame")
622
- async def analyze_frame(
623
- image: UploadFile = File(...),
624
- detections: str = Form(...),
625
- job_id: str = Form(None),
626
- ):
627
- """Run GPT threat assessment on a single video frame."""
628
- import json as json_module
629
- from utils.gpt_reasoning import encode_frame_to_b64
630
-
631
- dets = json_module.loads(detections)
632
-
633
- # Look up mission_spec from stored job (if available)
634
- mission_spec = None
635
- if job_id:
636
- job = get_job_storage().get(job_id)
637
- if job:
638
- mission_spec = job.mission_spec
639
-
640
- # Decode uploaded image
641
- image_bytes = await image.read()
642
- nparr = np.frombuffer(image_bytes, np.uint8)
643
- frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
644
- if frame is None:
645
- raise HTTPException(status_code=400, detail="Invalid image")
646
-
647
- # Run GPT in thread pool (blocking OpenAI API call)
648
- frame_b64 = encode_frame_to_b64(frame)
649
- async with _GPT_SEMAPHORE:
650
- gpt_results = await asyncio.to_thread(
651
- estimate_threat_gpt,
652
- detections=dets,
653
- mission_spec=mission_spec,
654
- image_b64=frame_b64,
655
- )
656
-
657
- # Merge GPT results into detection records
658
- for d in dets:
659
- oid = d.get("track_id") or d.get("id")
660
- if oid and oid in gpt_results:
661
- payload = gpt_results[oid]
662
- d["gpt_raw"] = payload
663
- d["assessment_status"] = payload.get("assessment_status", "ASSESSED")
664
- d["threat_level_score"] = payload.get("threat_level_score", 0)
665
- d["threat_classification"] = payload.get("threat_classification", "Unknown")
666
- d["weapon_readiness"] = payload.get("weapon_readiness", "Unknown")
667
- d["gpt_description"] = payload.get("gpt_description")
668
- d["gpt_distance_m"] = payload.get("gpt_distance_m")
669
- d["gpt_direction"] = payload.get("gpt_direction")
670
-
671
- return dets
672
-
673
-
674
  @app.delete("/detect/job/{job_id}")
675
  async def cancel_job(job_id: str):
676
  """Cancel a running job."""
@@ -856,93 +624,6 @@ async def stream_video(job_id: str):
856
  )
857
 
858
 
859
- @app.post("/reason/track")
860
- async def reason_track(
861
- frame: UploadFile = File(...),
862
- tracks: str = Form(...) # JSON string of tracks: [{"id": "T01", "bbox": [x,y,w,h], "label": "car"}, ...]
863
- ):
864
- """
865
- Reason about specific tracks in a frame using GPT.
866
- Returns distance and description for each object ID.
867
- """
868
- import json
869
- try:
870
- input_path = _save_upload_to_tmp(frame)
871
- except Exception:
872
- raise HTTPException(status_code=500, detail="Failed to save uploaded frame")
873
-
874
- try:
875
- track_list = json.loads(tracks)
876
- except json.JSONDecodeError:
877
- _safe_delete(input_path)
878
- raise HTTPException(status_code=400, detail="Invalid tracks JSON")
879
-
880
- # Run GPT estimation
881
- # This is blocking, but that's expected for this endpoint structure.
882
- # For high concurrency, might want to offload to threadpool or async wrapper.
883
- try:
884
- async with _GPT_SEMAPHORE:
885
- results = await asyncio.to_thread(estimate_threat_gpt, input_path, track_list)
886
- logging.info(f"GPT Output for Video Track Update:\n{results}")
887
- except Exception as e:
888
- logging.exception("GPT reasoning failed")
889
- _safe_delete(input_path)
890
- raise HTTPException(status_code=500, detail=str(e))
891
-
892
- _safe_delete(input_path)
893
- return results
894
-
895
-
896
- @app.post("/chat/threat")
897
- async def chat_threat_endpoint(
898
- question: str = Form(...),
899
- detections: str = Form(...), # JSON string of current detections
900
- mission_context: str = Form(""), # Optional JSON string of mission spec
901
- ):
902
- """
903
- Chat about detected threats using GPT.
904
-
905
- Args:
906
- question: User's question about the current threat situation.
907
- detections: JSON string of detection list with threat analysis data.
908
- mission_context: Optional JSON string of mission specification.
909
-
910
- Returns:
911
- GPT response about the threats.
912
- """
913
- import json as json_module
914
-
915
- if not question.strip():
916
- raise HTTPException(status_code=400, detail="Question cannot be empty.")
917
-
918
- try:
919
- detection_list = json_module.loads(detections)
920
- except json_module.JSONDecodeError:
921
- raise HTTPException(status_code=400, detail="Invalid detections JSON.")
922
-
923
- if not isinstance(detection_list, list):
924
- raise HTTPException(status_code=400, detail="Detections must be a list.")
925
-
926
- # Parse optional mission context
927
- mission_spec_dict = None
928
- if mission_context.strip():
929
- try:
930
- mission_spec_dict = json_module.loads(mission_context)
931
- except json_module.JSONDecodeError:
932
- pass # Non-critical, proceed without mission context
933
-
934
- # Run chat in thread to avoid blocking (with concurrency limit)
935
- try:
936
- async with _GPT_SEMAPHORE:
937
- response = await asyncio.to_thread(
938
- chat_about_threats, question, detection_list, mission_spec_dict
939
- )
940
- return {"response": response}
941
- except Exception as e:
942
- logging.exception("Threat chat failed")
943
- raise HTTPException(status_code=500, detail=str(e))
944
-
945
-
946
  @app.post("/benchmark")
947
  async def benchmark_endpoint(
948
  video: UploadFile = File(...),
@@ -990,7 +671,6 @@ async def benchmark_endpoint(
990
  query_list,
991
  segmenter_name=segmenter,
992
  step=step,
993
- enable_gpt=False,
994
  _perf_metrics=metrics,
995
  _perf_lock=lock,
996
  num_maskmem=num_maskmem,
 
1
  import os
 
 
 
2
  import logging
3
 
4
  # Fix: Set Hugging Face cache to writable location
 
36
  import numpy as np
37
  from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
38
  from fastapi.middleware.cors import CORSMiddleware
39
+ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
 
40
  import uvicorn
41
 
42
  from inference import process_first_frame, run_inference, run_grounded_sam2_tracking
 
53
  get_job_storage,
54
  get_output_video_path,
55
  )
 
 
 
 
 
 
 
 
56
  logging.basicConfig(level=logging.INFO)
57
 
58
  # Suppress noisy external libraries
 
60
  logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
61
  logging.getLogger("transformers").setLevel(logging.WARNING)
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  async def _periodic_cleanup() -> None:
64
  while True:
65
  await asyncio.sleep(600)
 
85
  )
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  # Valid detection modes
89
  VALID_MODES = {"object_detection", "segmentation", "drone_detection"}
90
 
 
125
  background_tasks.add_task(_cleanup)
126
 
127
 
128
+ def _parse_queries(raw: str, mode: str) -> list[str]:
129
+ """Parse comma-separated query string, falling back to mode defaults."""
130
+ parsed = [q.strip() for q in raw.split(",") if q.strip()]
131
+ if parsed:
132
+ return parsed
133
  if mode == "segmentation":
134
  return ["object"]
135
  if mode == "drone_detection":
 
137
  return ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
138
 
139
 
140
+ # Cache index.html at module load
141
+ _INDEX_HTML_PATH = Path(__file__).with_name("index.html")
142
+ _INDEX_HTML = _INDEX_HTML_PATH.read_text() if _INDEX_HTML_PATH.exists() else None
143
+
144
+
145
  @app.get("/", response_class=HTMLResponse)
146
  async def demo_page():
147
+ """Serve minimal detection UI."""
148
+ if _INDEX_HTML:
149
+ return HTMLResponse(_INDEX_HTML)
150
+ return HTMLResponse("<h1>Detection Base</h1><p>index.html not found</p>")
151
 
152
 
153
  @app.post("/detect")
 
159
  detector: str = Form("yolo11"),
160
  segmenter: str = Form("GSAM2-L"),
161
  enable_depth: bool = Form(False),
 
162
  ):
163
  """
164
+ Main detection endpoint (synchronous).
165
 
166
  Args:
167
  video: Video file to process
 
169
  queries: Comma-separated object classes for object_detection mode
170
  detector: Model to use (yolo11, detr_resnet50, grounding_dino)
171
  segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
172
+ enable_depth: Whether to run depth estimation (default: False)
 
173
 
174
  Returns:
175
  - For object_detection: Processed video with bounding boxes
 
198
  fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
199
  os.close(fd)
200
 
201
+ query_list = _parse_queries(queries, mode)
 
 
 
202
 
203
  try:
204
  output_path = run_grounded_sam2_tracking(
 
245
  fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
246
  os.close(fd)
247
 
 
248
  detector_name = "drone_yolo" if mode == "drone_detection" else detector
 
 
 
 
 
 
 
 
 
 
249
 
250
+ query_list = _parse_queries(queries, mode)
 
251
 
 
252
  try:
 
253
  # Determine depth estimator
254
  active_depth = "depth" if enable_depth else None
255
 
 
260
  detector_name=detector_name,
261
  depth_estimator_name=active_depth,
262
  depth_scale=25.0,
 
263
  )
264
  except ValueError as exc:
265
  logging.exception("Video processing failed.")
 
295
  depth_estimator: str = Form("depth"),
296
  depth_scale: float = Form(25.0),
297
  enable_depth: bool = Form(False),
 
298
  step: int = Form(7),
299
  ):
300
  _ttfs_t0 = time.perf_counter()
 
326
 
327
  logging.info("[TTFS:%s] +%.1fs upload_saved", job_id, time.perf_counter() - _ttfs_t0)
328
 
 
 
 
 
329
  detector_name = detector
 
330
  if mode == "drone_detection":
331
  detector_name = "drone_yolo"
 
332
  elif mode == "segmentation":
 
 
 
 
 
 
333
  detector_name = None
334
 
335
+ query_list = _parse_queries(queries, mode)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
 
337
  available_depth_estimators = set(list_depth_estimators())
338
  if depth_estimator not in available_depth_estimators:
 
358
  )
359
  cv2.imwrite(str(first_frame_path), processed_frame)
360
  logging.info("[TTFS:%s] +%.1fs process_first_frame done", job_id, time.perf_counter() - _ttfs_t0)
 
 
361
  except Exception:
362
  logging.exception("First-frame processing failed.")
363
  shutil.rmtree(job_dir, ignore_errors=True)
 
378
  depth_scale=float(depth_scale),
379
  depth_output_path=str(depth_output_path),
380
  first_frame_depth_path=str(first_frame_depth_path),
 
 
 
 
381
  step=step,
382
  ttfs_t0=_ttfs_t0,
383
  )
384
  get_job_storage().create(job)
385
  asyncio.create_task(process_video_async(job_id))
386
 
 
 
 
 
 
 
 
 
 
 
387
  response_data = {
388
  "job_id": job_id,
389
  "first_frame_url": f"/detect/first-frame/{job_id}",
 
394
  "stream_url": f"/detect/stream/{job_id}",
395
  "status": job.status.value,
396
  "first_frame_detections": detections,
 
397
  }
398
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  return response_data
400
 
401
 
 
439
  return data or []
440
 
441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  @app.delete("/detect/job/{job_id}")
443
  async def cancel_job(job_id: str):
444
  """Cancel a running job."""
 
624
  )
625
 
626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  @app.post("/benchmark")
628
  async def benchmark_endpoint(
629
  video: UploadFile = File(...),
 
671
  query_list,
672
  segmenter_name=segmenter,
673
  step=step,
 
674
  _perf_metrics=metrics,
675
  _perf_lock=lock,
676
  num_maskmem=num_maskmem,
inference.py CHANGED
@@ -21,12 +21,8 @@ from models.model_loader import load_detector, load_detector_on_device
21
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
22
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
23
  from utils.video import StreamingVideoWriter
24
- from utils.relevance import evaluate_relevance
25
- from utils.enrichment import run_enrichment
26
- from utils.schemas import AssessmentStatus
27
  from jobs.storage import set_track_data
28
  import tempfile
29
- import json as json_module
30
 
31
 
32
  class AsyncVideoReader:
@@ -301,9 +297,7 @@ class SpeedEstimator:
301
 
302
  dist_px = np.sqrt((cx1-cx2)**2 + (cy1-cy2)**2)
303
 
304
- # Heuristic scale: Assume car is ~4m long? Or just arbitrary pixel scale
305
- # If we had GPT distance, we could calibrate.
306
- # For now, let's use a dummy scale: 50px = 1m (very rough)
307
  # Speed = (dist_px / 50) meters / (5 frames / 30 fps) seconds
308
  # = (dist_px / 50) / (0.166) m/s
309
  # = (dist_px * 0.12) m/s
@@ -403,7 +397,7 @@ def _attach_depth_metrics(
403
  depth_scale: float, # No longer used for distance calculation
404
  estimator_instance: Optional[Any] = None,
405
  ) -> None:
406
- """Attach relative depth values for visualization only. GPT handles distance estimation."""
407
  if not detections or (not depth_estimator_name and not estimator_instance):
408
  return
409
 
@@ -514,16 +508,7 @@ def infer_frame(
514
  except Exception:
515
  logging.exception("Depth estimation failed for frame")
516
 
517
- # Re-build display labels to include GPT distance if available
518
- display_labels = []
519
- for i, det in enumerate(detections):
520
- label = det["label"]
521
- if det.get("gpt_distance_m") is not None:
522
- # Add GPT distance to label, e.g. "car 12m"
523
- depth_str = f"{int(det['gpt_distance_m'])}m"
524
- label = f"{label} {depth_str}"
525
- logging.debug("Object '%s' at %s (bbox: %s)", label, depth_str, det['bbox'])
526
- display_labels.append(label)
527
 
528
  except Exception:
529
  logging.exception("Inference failed for queries %s", text_queries)
@@ -537,15 +522,8 @@ def infer_frame(
537
  ), detections
538
 
539
 
540
- def _build_display_label(det):
541
- """Build display label with GPT distance if available."""
542
- label = det["label"]
543
- if det.get("gpt_distance_m") is not None:
544
- label = f"{label} {int(det['gpt_distance_m'])}m"
545
- return label
546
-
547
  def _attach_depth_from_result(detections, depth_result, depth_scale):
548
- """Attach relative depth values for visualization only. GPT handles distance estimation."""
549
  depth_map = depth_result.depth_map
550
  if depth_map is None or depth_map.size == 0: return
551
 
@@ -644,11 +622,8 @@ def process_first_frame(
644
  ) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
645
  """Lightweight first-frame processing: detection + rendering only.
646
 
647
- GPT, depth, and LLM relevance are handled later in the async pipeline
648
- (writer enrichment thread), avoiding 2-8s synchronous startup delay.
649
-
650
  Returns:
651
- (processed_frame, detections) — all detections tagged UNASSESSED.
652
  """
653
  frame, _, _, _ = extract_first_frame(video_path)
654
  if mode == "segmentation":
@@ -665,7 +640,6 @@ def process_first_frame(
665
  "bbox": [int(c) for c in box],
666
  "score": float(seg_result.scores[idx]) if seg_result.scores is not None and idx < len(seg_result.scores) else 1.0,
667
  "track_id": f"T{idx + 1:02d}",
668
- "assessment_status": AssessmentStatus.UNASSESSED,
669
  })
670
  return processed, detections
671
 
@@ -673,10 +647,6 @@ def process_first_frame(
673
  frame, queries, detector_name=detector_name
674
  )
675
 
676
- # Tag all detections as unassessed — GPT runs later in enrichment thread
677
- for det in detections:
678
- det["assessment_status"] = AssessmentStatus.UNASSESSED
679
-
680
  return processed, detections
681
 
682
 
@@ -689,10 +659,7 @@ def run_inference(
689
  job_id: Optional[str] = None,
690
  depth_estimator_name: Optional[str] = None,
691
  depth_scale: float = 1.0,
692
- enable_gpt: bool = True,
693
  stream_queue: Optional[Queue] = None,
694
- mission_spec=None, # Optional[MissionSpecification]
695
- first_frame_gpt_results: Optional[Dict[str, Any]] = None,
696
  first_frame_detections: Optional[List[Dict[str, Any]]] = None,
697
  ) -> Tuple[str, List[List[Dict[str, Any]]]]:
698
 
@@ -769,8 +736,7 @@ def run_inference(
769
  # queue_in: (frame_idx, frame_data)
770
  # queue_out: (frame_idx, processed_frame, detections)
771
  queue_in = Queue(maxsize=16)
772
- # Tuning for A10: buffer at least 32 frames per GPU (batch size)
773
- # GPT Latency Buffer: GPT takes ~3s. At 30fps, that's 90 frames. We need to absorb this burst.
774
  queue_out_max = max(128, (len(detectors) if detectors else 1) * 32)
775
  queue_out = Queue(maxsize=queue_out_max)
776
 
@@ -948,32 +914,6 @@ def run_inference(
948
  # writer_finished = False
949
 
950
 
951
- # --- GPT Enrichment Thread (non-blocking) ---
952
- # Runs LLM relevance + GPT threat assessment off the writer's critical path.
953
- gpt_enrichment_queue = Queue(maxsize=4)
954
- _relevance_refined = Event()
955
-
956
- def enrichment_thread_fn(tracker_ref):
957
- """Dedicated thread for GPT/LLM calls. Receives work from writer, injects results via tracker."""
958
- while True:
959
- item = gpt_enrichment_queue.get()
960
- if item is None:
961
- break # Sentinel — shutdown
962
- frame_idx, frame_data, gpt_dets, ms = item
963
- try:
964
- gpt_res = run_enrichment(
965
- frame_idx, frame_data, gpt_dets, ms,
966
- first_frame_gpt_results=first_frame_gpt_results,
967
- job_id=job_id,
968
- relevance_refined_event=_relevance_refined,
969
- )
970
- if gpt_res:
971
- tracker_ref.inject_metadata(gpt_dets)
972
- logging.info("Enrichment: GPT results injected into tracker for frame %d", frame_idx)
973
-
974
- except Exception as e:
975
- logging.error("Enrichment thread failed for frame %d: %s", frame_idx, e)
976
-
977
  def writer_loop():
978
  nonlocal writer_finished
979
  next_idx = 0
@@ -982,11 +922,6 @@ def run_inference(
982
  # Initialize Tracker & Speed Estimator
983
  tracker = ByteTracker(frame_rate=fps)
984
  speed_est = SpeedEstimator(fps=fps)
985
- gpt_submitted = False # GPT enrichment submitted once for frame 0
986
-
987
- # Start enrichment thread
988
- enrich_thread = Thread(target=enrichment_thread_fn, args=(tracker,), daemon=True)
989
- enrich_thread.start()
990
 
991
  try:
992
  with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
@@ -1016,67 +951,11 @@ def run_inference(
1016
  next_idx, pre_track_count, len(dets))
1017
  speed_est.estimate(dets)
1018
 
1019
- # --- RELEVANCE GATE (deterministic, fast — stays in writer) ---
1020
- if mission_spec:
1021
- if (mission_spec.parse_mode == "LLM_EXTRACTED"
1022
- and not _relevance_refined.is_set()):
1023
- # LLM post-filter hasn't run yet — pass all through
1024
- for d in dets:
1025
- d["mission_relevant"] = True
1026
- d["relevance_reason"] = "pending_llm_postfilter"
1027
- gpt_dets = dets
1028
- else:
1029
- # Normal deterministic gate (with refined or FAST_PATH classes)
1030
- for d in dets:
1031
- decision = evaluate_relevance(d, mission_spec.relevance_criteria)
1032
- d["mission_relevant"] = decision.relevant
1033
- d["relevance_reason"] = decision.reason
1034
- if not decision.relevant:
1035
- logging.info(
1036
- json_module.dumps({
1037
- "event": "relevance_decision",
1038
- "track_id": d.get("track_id"),
1039
- "label": d.get("label"),
1040
- "relevant": False,
1041
- "reason": decision.reason,
1042
- "required_classes": mission_spec.relevance_criteria.required_classes,
1043
- "frame": next_idx,
1044
- })
1045
- )
1046
- gpt_dets = [d for d in dets if d.get("mission_relevant", True)]
1047
- else:
1048
- for d in dets:
1049
- d["mission_relevant"] = None
1050
- gpt_dets = dets
1051
-
1052
- # --- GPT ENRICHMENT (non-blocking, offloaded to enrichment thread) ---
1053
- if enable_gpt and gpt_dets and not gpt_submitted:
1054
- # Tag as pending — enrichment thread will update to ASSESSED later
1055
- for d in gpt_dets:
1056
- d["assessment_status"] = AssessmentStatus.PENDING_GPT
1057
- try:
1058
- gpt_enrichment_queue.put(
1059
- (next_idx, p_frame.copy(), gpt_dets, mission_spec),
1060
- timeout=1.0,
1061
- )
1062
- gpt_submitted = True
1063
- logging.info("Writer: offloaded GPT enrichment for frame %d", next_idx)
1064
- except Full:
1065
- logging.warning("GPT enrichment queue full, skipping frame 0 GPT")
1066
-
1067
- # Tag unassessed detections (INV-6)
1068
- for d in dets:
1069
- if "assessment_status" not in d:
1070
- d["assessment_status"] = AssessmentStatus.UNASSESSED
1071
-
1072
  # --- RENDER BOXES & OVERLAYS ---
1073
  if dets:
1074
  display_boxes = np.array([d['bbox'] for d in dets])
1075
  display_labels = []
1076
  for d in dets:
1077
- if d.get("mission_relevant") is False:
1078
- display_labels.append("")
1079
- continue
1080
  lbl = d.get('label', 'obj')
1081
  display_labels.append(lbl)
1082
 
@@ -1131,12 +1010,6 @@ def run_inference(
1131
  logging.exception("Writer loop failed")
1132
  finally:
1133
  logging.info("Writer loop finished. Wrote %d frames (target %d)", next_idx, total_frames)
1134
- # Shut down enrichment thread
1135
- try:
1136
- gpt_enrichment_queue.put(None, timeout=5.0)
1137
- enrich_thread.join(timeout=30)
1138
- except Exception:
1139
- logging.warning("Enrichment thread shutdown timed out")
1140
  writer_finished = True
1141
 
1142
  writer_thread = Thread(target=writer_loop, daemon=True)
@@ -1213,8 +1086,7 @@ def _gsam2_render_frame(
1213
  ) -> np.ndarray:
1214
  """Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
1215
 
1216
- When *masks_only* is True, skip box rendering so the writer thread can
1217
- draw boxes later with enriched (GPT) labels.
1218
  """
1219
  if frame_store is not None:
1220
  frame = frame_store.get_bgr(frame_idx).copy() # .copy() — render mutates
@@ -1274,9 +1146,6 @@ def run_grounded_sam2_tracking(
1274
  job_id: Optional[str] = None,
1275
  stream_queue: Optional[Queue] = None,
1276
  step: int = 20,
1277
- enable_gpt: bool = False,
1278
- mission_spec=None, # Optional[MissionSpecification]
1279
- first_frame_gpt_results: Optional[Dict[str, Any]] = None,
1280
  _perf_metrics: Optional[Dict[str, float]] = None,
1281
  _perf_lock=None,
1282
  num_maskmem: Optional[int] = None,
@@ -1376,7 +1245,6 @@ def run_grounded_sam2_tracking(
1376
  frm = _gsam2_render_frame(
1377
  frame_dir, frame_names, fidx, fobjs,
1378
  height, width,
1379
- masks_only=enable_gpt,
1380
  frame_store=frame_store,
1381
  )
1382
 
@@ -1387,7 +1255,7 @@ def run_grounded_sam2_tracking(
1387
  else:
1388
  _perf_metrics["render_total_ms"] += _r_ms
1389
 
1390
- payload = (fidx, frm, fobjs) if enable_gpt else (fidx, frm, {})
1391
  while True:
1392
  try:
1393
  render_out.put(payload, timeout=1.0)
@@ -1410,92 +1278,6 @@ def run_grounded_sam2_tracking(
1410
  for t in r_workers:
1411
  t.start()
1412
 
1413
- # --- ObjectInfo → detection dict adapter ---
1414
- def _objectinfo_to_dets(frame_objects_dict):
1415
- dets = []
1416
- for obj_id, info in frame_objects_dict.items():
1417
- dets.append({
1418
- "label": info.class_name,
1419
- "bbox": [info.x1, info.y1, info.x2, info.y2],
1420
- "score": 1.0,
1421
- "track_id": f"T{obj_id:02d}",
1422
- "instance_id": obj_id,
1423
- })
1424
- return dets
1425
-
1426
- # --- GPT enrichment thread (when enabled) ---
1427
- gpt_enrichment_queue: Queue = Queue(maxsize=4)
1428
- gpt_data_by_track: Dict[str, Dict] = {}
1429
- gpt_data_lock = RLock()
1430
- _relevance_refined = Event()
1431
-
1432
- def _gsam2_enrichment_thread_fn():
1433
- while True:
1434
- item = gpt_enrichment_queue.get()
1435
- if item is None:
1436
- break
1437
- frame_idx, frame_data, gpt_dets, ms = item
1438
- try:
1439
- gpt_res = run_enrichment(
1440
- frame_idx, frame_data, gpt_dets, ms,
1441
- first_frame_gpt_results=first_frame_gpt_results,
1442
- job_id=job_id,
1443
- relevance_refined_event=_relevance_refined,
1444
- )
1445
-
1446
- # GSAM2-specific: store results in per-track dict and persist to job storage
1447
- if gpt_res:
1448
- for d in gpt_dets:
1449
- tid = d.get("track_id")
1450
- if tid and tid in gpt_res:
1451
- merged = dict(gpt_res[tid])
1452
- merged["gpt_raw"] = gpt_res[tid]
1453
- merged["assessment_frame_index"] = frame_idx
1454
- merged["assessment_status"] = merged.get(
1455
- "assessment_status", AssessmentStatus.ASSESSED
1456
- )
1457
- with gpt_data_lock:
1458
- gpt_data_by_track[tid] = merged
1459
- logging.info("GSAM2 enrichment: GPT results stored for %d tracks", len(gpt_data_by_track))
1460
-
1461
- # Persist GPT-enriched detections to job storage so
1462
- # frontend polling (/detect/status) picks them up.
1463
- if job_id:
1464
- try:
1465
- from jobs.storage import get_job_storage as _gjs
1466
- _st = _gjs().get(job_id)
1467
- if _st and _st.first_frame_detections:
1468
- for det in _st.first_frame_detections:
1469
- tid = det.get("track_id")
1470
- with gpt_data_lock:
1471
- payload = gpt_data_by_track.get(tid)
1472
- if payload:
1473
- det.update(payload)
1474
- # Also sync relevance from gpt_dets
1475
- src = next((d for d in gpt_dets if d.get("track_id") == tid), None)
1476
- if src:
1477
- if "mission_relevant" in src:
1478
- det["mission_relevant"] = src["mission_relevant"]
1479
- if "relevance_reason" in src:
1480
- det["relevance_reason"] = src["relevance_reason"]
1481
- from jobs.storage import get_job_storage as _gjs2
1482
- _gjs2().update(
1483
- job_id,
1484
- first_frame_detections=_st.first_frame_detections,
1485
- first_frame_gpt_results=gpt_res,
1486
- )
1487
- logging.info(
1488
- "GSAM2 enrichment: updated first_frame_detections in job storage for %s",
1489
- job_id,
1490
- )
1491
- except Exception:
1492
- logging.exception(
1493
- "GSAM2 enrichment: failed to update job storage for %s", job_id
1494
- )
1495
-
1496
- except Exception as e:
1497
- logging.error("GSAM2 enrichment thread failed for frame %d: %s", frame_idx, e)
1498
-
1499
  # Shared streaming state (publisher ↔ writer)
1500
  _stream_deque: collections.deque = collections.deque(maxlen=200)
1501
  _stream_lock = RLock()
@@ -1508,15 +1290,6 @@ def run_grounded_sam2_tracking(
1508
  buf: Dict[int, Tuple] = {}
1509
 
1510
  # Per-track bbox history (replaces ByteTracker for GSAM2)
1511
- track_history: Dict[int, List] = {}
1512
- speed_est = SpeedEstimator(fps=fps) if enable_gpt else None
1513
- gpt_submitted = False
1514
-
1515
- # Start enrichment thread when GPT enabled
1516
- enrich_thread = None
1517
- if enable_gpt:
1518
- enrich_thread = Thread(target=_gsam2_enrichment_thread_fn, daemon=True)
1519
- enrich_thread.start()
1520
 
1521
  try:
1522
  with StreamingVideoWriter(
@@ -1538,100 +1311,6 @@ def run_grounded_sam2_tracking(
1538
 
1539
  frm, fobjs = buf.pop(next_idx)
1540
 
1541
- # --- GPT enrichment path ---
1542
- if enable_gpt and fobjs:
1543
- dets = _objectinfo_to_dets(fobjs)
1544
-
1545
- # Maintain per-track bbox history (30-frame window)
1546
- for det in dets:
1547
- iid = det["instance_id"]
1548
- track_history.setdefault(iid, []).append(det["bbox"])
1549
- if len(track_history[iid]) > 30:
1550
- track_history[iid].pop(0)
1551
- # Store an immutable per-frame snapshot.
1552
- det["history"] = list(track_history[iid])
1553
-
1554
- # Speed estimation
1555
- if speed_est:
1556
- speed_est.estimate(dets)
1557
-
1558
- # Relevance gate
1559
- if mission_spec:
1560
- if (mission_spec.parse_mode == "LLM_EXTRACTED"
1561
- and not _relevance_refined.is_set()):
1562
- for d in dets:
1563
- d["mission_relevant"] = True
1564
- d["relevance_reason"] = "pending_llm_postfilter"
1565
- gpt_dets = dets
1566
- else:
1567
- for d in dets:
1568
- decision = evaluate_relevance(d, mission_spec.relevance_criteria)
1569
- d["mission_relevant"] = decision.relevant
1570
- d["relevance_reason"] = decision.reason
1571
- gpt_dets = [d for d in dets if d.get("mission_relevant", True)]
1572
- else:
1573
- for d in dets:
1574
- d["mission_relevant"] = None
1575
- gpt_dets = dets
1576
-
1577
- # GPT enrichment (one-shot, first frame with detections)
1578
- if gpt_dets and not gpt_submitted:
1579
- for d in gpt_dets:
1580
- d["assessment_status"] = AssessmentStatus.PENDING_GPT
1581
- try:
1582
- gpt_enrichment_queue.put(
1583
- (
1584
- next_idx,
1585
- frm.copy(),
1586
- copy.deepcopy(gpt_dets),
1587
- mission_spec,
1588
- ),
1589
- timeout=1.0,
1590
- )
1591
- gpt_submitted = True
1592
- logging.info("GSAM2 writer: offloaded GPT enrichment for frame %d", next_idx)
1593
- except Full:
1594
- logging.warning("GSAM2 GPT enrichment queue full, skipping")
1595
-
1596
- # Merge persistent GPT data
1597
- for det in dets:
1598
- tid = det["track_id"]
1599
- with gpt_data_lock:
1600
- gpt_payload = gpt_data_by_track.get(tid)
1601
- if gpt_payload:
1602
- det.update(gpt_payload)
1603
- det["assessment_status"] = AssessmentStatus.ASSESSED
1604
- elif "assessment_status" not in det:
1605
- det["assessment_status"] = AssessmentStatus.UNASSESSED
1606
-
1607
- # Build enriched display labels
1608
- display_labels = []
1609
- for d in dets:
1610
- if d.get("mission_relevant") is False:
1611
- display_labels.append("")
1612
- continue
1613
- lbl = d.get("label", "obj")
1614
- if d.get("gpt_distance_m") is not None:
1615
- try:
1616
- lbl = f"{lbl} {int(float(d['gpt_distance_m']))}m"
1617
- except (TypeError, ValueError):
1618
- pass
1619
- display_labels.append(lbl)
1620
-
1621
- # Draw boxes on mask-rendered frame
1622
- if dets:
1623
- boxes = np.array([d["bbox"] for d in dets])
1624
- frm = draw_boxes(frm, boxes, label_names=display_labels)
1625
-
1626
- # Store tracks for frontend
1627
- if job_id:
1628
- set_track_data(job_id, next_idx, copy.deepcopy(dets))
1629
-
1630
- elif enable_gpt:
1631
- # No objects this frame — still store empty track data
1632
- if job_id:
1633
- set_track_data(job_id, next_idx, [])
1634
-
1635
  if _perf_metrics is not None:
1636
  _t_w = time.perf_counter()
1637
 
@@ -1668,13 +1347,6 @@ def run_grounded_sam2_tracking(
1668
  finally:
1669
  render_done = True
1670
  _stream_writer_done.set()
1671
- # Shut down enrichment thread
1672
- if enrich_thread:
1673
- try:
1674
- gpt_enrichment_queue.put(None, timeout=5.0)
1675
- enrich_thread.join(timeout=30)
1676
- except Exception:
1677
- logging.warning("GSAM2 enrichment thread shutdown timed out")
1678
 
1679
  def _stream_publisher_thread():
1680
  """Adaptive-rate publisher: reads from _stream_deque, publishes at measured pace."""
 
21
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
22
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
23
  from utils.video import StreamingVideoWriter
 
 
 
24
  from jobs.storage import set_track_data
25
  import tempfile
 
26
 
27
 
28
  class AsyncVideoReader:
 
297
 
298
  dist_px = np.sqrt((cx1-cx2)**2 + (cy1-cy2)**2)
299
 
300
+ # Heuristic scale: 50px = 1m (very rough)
 
 
301
  # Speed = (dist_px / 50) meters / (5 frames / 30 fps) seconds
302
  # = (dist_px / 50) / (0.166) m/s
303
  # = (dist_px * 0.12) m/s
 
397
  depth_scale: float, # No longer used for distance calculation
398
  estimator_instance: Optional[Any] = None,
399
  ) -> None:
400
+ """Attach relative depth values to detection dicts for visualization."""
401
  if not detections or (not depth_estimator_name and not estimator_instance):
402
  return
403
 
 
508
  except Exception:
509
  logging.exception("Depth estimation failed for frame")
510
 
511
+ display_labels = [det["label"] for det in detections]
 
 
 
 
 
 
 
 
 
512
 
513
  except Exception:
514
  logging.exception("Inference failed for queries %s", text_queries)
 
522
  ), detections
523
 
524
 
 
 
 
 
 
 
 
525
  def _attach_depth_from_result(detections, depth_result, depth_scale):
526
+ """Attach relative depth values to detection dicts for visualization."""
527
  depth_map = depth_result.depth_map
528
  if depth_map is None or depth_map.size == 0: return
529
 
 
622
  ) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
623
  """Lightweight first-frame processing: detection + rendering only.
624
 
 
 
 
625
  Returns:
626
+ (processed_frame, detections)
627
  """
628
  frame, _, _, _ = extract_first_frame(video_path)
629
  if mode == "segmentation":
 
640
  "bbox": [int(c) for c in box],
641
  "score": float(seg_result.scores[idx]) if seg_result.scores is not None and idx < len(seg_result.scores) else 1.0,
642
  "track_id": f"T{idx + 1:02d}",
 
643
  })
644
  return processed, detections
645
 
 
647
  frame, queries, detector_name=detector_name
648
  )
649
 
 
 
 
 
650
  return processed, detections
651
 
652
 
 
659
  job_id: Optional[str] = None,
660
  depth_estimator_name: Optional[str] = None,
661
  depth_scale: float = 1.0,
 
662
  stream_queue: Optional[Queue] = None,
 
 
663
  first_frame_detections: Optional[List[Dict[str, Any]]] = None,
664
  ) -> Tuple[str, List[List[Dict[str, Any]]]]:
665
 
 
736
  # queue_in: (frame_idx, frame_data)
737
  # queue_out: (frame_idx, processed_frame, detections)
738
  queue_in = Queue(maxsize=16)
739
+ # Buffer at least 32 frames per GPU for pipeline overlap
 
740
  queue_out_max = max(128, (len(detectors) if detectors else 1) * 32)
741
  queue_out = Queue(maxsize=queue_out_max)
742
 
 
914
  # writer_finished = False
915
 
916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
917
  def writer_loop():
918
  nonlocal writer_finished
919
  next_idx = 0
 
922
  # Initialize Tracker & Speed Estimator
923
  tracker = ByteTracker(frame_rate=fps)
924
  speed_est = SpeedEstimator(fps=fps)
 
 
 
 
 
925
 
926
  try:
927
  with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
 
951
  next_idx, pre_track_count, len(dets))
952
  speed_est.estimate(dets)
953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  # --- RENDER BOXES & OVERLAYS ---
955
  if dets:
956
  display_boxes = np.array([d['bbox'] for d in dets])
957
  display_labels = []
958
  for d in dets:
 
 
 
959
  lbl = d.get('label', 'obj')
960
  display_labels.append(lbl)
961
 
 
1010
  logging.exception("Writer loop failed")
1011
  finally:
1012
  logging.info("Writer loop finished. Wrote %d frames (target %d)", next_idx, total_frames)
 
 
 
 
 
 
1013
  writer_finished = True
1014
 
1015
  writer_thread = Thread(target=writer_loop, daemon=True)
 
1086
  ) -> np.ndarray:
1087
  """Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
1088
 
1089
+ When *masks_only* is True, skip box rendering.
 
1090
  """
1091
  if frame_store is not None:
1092
  frame = frame_store.get_bgr(frame_idx).copy() # .copy() — render mutates
 
1146
  job_id: Optional[str] = None,
1147
  stream_queue: Optional[Queue] = None,
1148
  step: int = 20,
 
 
 
1149
  _perf_metrics: Optional[Dict[str, float]] = None,
1150
  _perf_lock=None,
1151
  num_maskmem: Optional[int] = None,
 
1245
  frm = _gsam2_render_frame(
1246
  frame_dir, frame_names, fidx, fobjs,
1247
  height, width,
 
1248
  frame_store=frame_store,
1249
  )
1250
 
 
1255
  else:
1256
  _perf_metrics["render_total_ms"] += _r_ms
1257
 
1258
+ payload = (fidx, frm, {})
1259
  while True:
1260
  try:
1261
  render_out.put(payload, timeout=1.0)
 
1278
  for t in r_workers:
1279
  t.start()
1280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1281
  # Shared streaming state (publisher ↔ writer)
1282
  _stream_deque: collections.deque = collections.deque(maxlen=200)
1283
  _stream_lock = RLock()
 
1290
  buf: Dict[int, Tuple] = {}
1291
 
1292
  # Per-track bbox history (replaces ByteTracker for GSAM2)
 
 
 
 
 
 
 
 
 
1293
 
1294
  try:
1295
  with StreamingVideoWriter(
 
1311
 
1312
  frm, fobjs = buf.pop(next_idx)
1313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1314
  if _perf_metrics is not None:
1315
  _t_w = time.perf_counter()
1316
 
 
1347
  finally:
1348
  render_done = True
1349
  _stream_writer_done.set()
 
 
 
 
 
 
 
1350
 
1351
  def _stream_publisher_thread():
1352
  """Adaptive-rate publisher: reads from _stream_deque, publishes at measured pace."""
jobs/background.py CHANGED
@@ -35,9 +35,6 @@ async def process_video_async(job_id: str) -> None:
35
  job_id=job_id,
36
  stream_queue=stream_queue,
37
  step=job.step,
38
- enable_gpt=job.enable_gpt,
39
- mission_spec=job.mission_spec,
40
- first_frame_gpt_results=job.first_frame_gpt_results,
41
  num_maskmem=7,
42
  detector_name=job.detector_name,
43
  _ttfs_t0=job.ttfs_t0,
@@ -53,13 +50,10 @@ async def process_video_async(job_id: str) -> None:
53
  None,
54
  job.detector_name,
55
  job_id,
56
- job.depth_estimator_name, # Pass depth estimator to trigger unified loop
57
  job.depth_scale,
58
- job.enable_gpt,
59
  stream_queue,
60
- job.mission_spec, # Forward mission spec for relevance gating
61
- job.first_frame_gpt_results, # Avoid duplicate GPT call on frame 0
62
- job.first_frame_detections, # Reuse frame 0 detections (avoid re-detecting)
63
  )
64
  detection_path, detections_list = result_pkg
65
 
 
35
  job_id=job_id,
36
  stream_queue=stream_queue,
37
  step=job.step,
 
 
 
38
  num_maskmem=7,
39
  detector_name=job.detector_name,
40
  _ttfs_t0=job.ttfs_t0,
 
50
  None,
51
  job.detector_name,
52
  job_id,
53
+ job.depth_estimator_name,
54
  job.depth_scale,
 
55
  stream_queue,
56
+ job.first_frame_detections,
 
 
57
  )
58
  detection_path, detections_list = result_pkg
59
 
jobs/models.py CHANGED
@@ -33,10 +33,5 @@ class JobInfo:
33
  first_frame_depth_path: Optional[str] = None
34
  partial_success: bool = False # True if one component failed but job completed
35
  depth_error: Optional[str] = None # Error message if depth failed
36
- enable_gpt: bool = True # Whether to use GPT for distance estimation
37
- # Mission specification (None = LEGACY mode)
38
- mission_spec: Optional[Any] = None # utils.schemas.MissionSpecification
39
- mission_mode: str = "LEGACY" # "MISSION" or "LEGACY"
40
- first_frame_gpt_results: Optional[Dict[str, Any]] = None # Cached GPT results from process_first_frame
41
  step: int = 7 # Segmentation keyframe step (matches num_maskmem)
42
  ttfs_t0: Optional[float] = None # TTFS anchor: time.perf_counter() at endpoint entry
 
33
  first_frame_depth_path: Optional[str] = None
34
  partial_success: bool = False # True if one component failed but job completed
35
  depth_error: Optional[str] = None # Error message if depth failed
 
 
 
 
 
36
  step: int = 7 # Segmentation keyframe step (matches num_maskmem)
37
  ttfs_t0: Optional[float] = None # TTFS anchor: time.perf_counter() at endpoint entry
requirements.txt CHANGED
@@ -7,9 +7,7 @@ python-multipart
7
  pillow
8
  huggingface-hub
9
  ultralytics
10
- python-dotenv
11
  einops
12
- sentence-transformers
13
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
14
  hydra-core>=1.3.2
15
  iopath>=0.1.10
 
7
  pillow
8
  huggingface-hub
9
  ultralytics
 
10
  einops
 
11
  SAM-2 @ git+https://github.com/facebookresearch/sam2.git
12
  hydra-core>=1.3.2
13
  iopath>=0.1.10
utils/enrichment.py DELETED
@@ -1,122 +0,0 @@
1
- """
2
- Shared enrichment workflow — single implementation of the 5-step GPT enrichment
3
- pipeline used by inference.py (detection + GSAM2) and app.py (first-frame).
4
-
5
- Consolidates duplicated logic from:
6
- - inference.py enrichment_thread_fn
7
- - inference.py _gsam2_enrichment_thread_fn
8
- - app.py _enrich_first_frame_gpt
9
- """
10
-
11
- import logging
12
- from threading import Event
13
- from typing import Any, Dict, List, Optional
14
-
15
- from utils.gpt_reasoning import estimate_threat_gpt, encode_frame_to_b64
16
- from utils.relevance import evaluate_relevance, evaluate_relevance_llm
17
- from utils.schemas import AssessmentStatus
18
-
19
- logger = logging.getLogger(__name__)
20
-
21
-
22
- def run_enrichment(
23
- frame_idx: int,
24
- frame_data,
25
- detections: List[Dict[str, Any]],
26
- mission_spec,
27
- *,
28
- first_frame_gpt_results: Optional[Dict] = None,
29
- job_id: Optional[str] = None,
30
- relevance_refined_event: Optional[Event] = None,
31
- ) -> Optional[Dict[str, Any]]:
32
- """Run the shared enrichment workflow (LLM post-filter + GPT threat assessment).
33
-
34
- Steps:
35
- 1. LLM post-filter via evaluate_relevance_llm() (if LLM_EXTRACTED mode)
36
- 2. Signal relevance_refined_event (if provided)
37
- 3. Check cached GPT results (parameter or JobStorage fallback)
38
- 4. Call estimate_threat_gpt() if no cache
39
- 5. Merge results into detections by track_id
40
-
41
- Args:
42
- frame_idx: Index of the frame being enriched.
43
- frame_data: OpenCV BGR frame (numpy array).
44
- detections: Mutable list of detection dicts to enrich in-place.
45
- mission_spec: Optional MissionSpecification.
46
- first_frame_gpt_results: Pre-computed GPT results (cache hit).
47
- job_id: Job identifier for JobStorage fallback cache lookup.
48
- relevance_refined_event: threading.Event to signal when LLM post-filter completes.
49
-
50
- Returns:
51
- GPT results dict (object_id -> assessment), or None if all detections
52
- were filtered out.
53
- """
54
- gpt_dets = detections
55
-
56
- # --- Step 1: LLM post-filter (LLM_EXTRACTED mode) ---
57
- if mission_spec and mission_spec.parse_mode == "LLM_EXTRACTED":
58
- unique_labels = list({
59
- d.get("label", "").lower()
60
- for d in gpt_dets if d.get("label")
61
- })
62
- relevant_labels = evaluate_relevance_llm(
63
- unique_labels, mission_spec.operator_text
64
- )
65
- mission_spec.relevance_criteria.required_classes = list(relevant_labels)
66
-
67
- # --- Step 2: Signal writer loop ---
68
- if relevance_refined_event is not None:
69
- relevance_refined_event.set()
70
-
71
- logger.info(
72
- "Enrichment: LLM post-filter applied on frame %d: relevant=%s",
73
- frame_idx, relevant_labels,
74
- )
75
- # Re-filter with refined classes
76
- for d in gpt_dets:
77
- decision = evaluate_relevance(d, mission_spec.relevance_criteria)
78
- d["mission_relevant"] = decision.relevant
79
- gpt_dets = [d for d in gpt_dets if d.get("mission_relevant", True)]
80
- elif relevance_refined_event is not None:
81
- # Non-LLM mode: signal immediately so writer doesn't block
82
- relevance_refined_event.set()
83
-
84
- if not gpt_dets:
85
- return None
86
-
87
- # --- Step 3: Check cached GPT results ---
88
- cached_gpt = first_frame_gpt_results
89
- if not cached_gpt and job_id:
90
- try:
91
- from jobs.storage import get_job_storage as _gjs
92
- _job = _gjs().get(job_id)
93
- if _job and _job.first_frame_gpt_results:
94
- cached_gpt = _job.first_frame_gpt_results
95
- except Exception:
96
- pass
97
-
98
- # --- Step 4: Call GPT if no cache ---
99
- if cached_gpt:
100
- logger.info("Enrichment: re-using cached GPT results for frame %d", frame_idx)
101
- gpt_res = cached_gpt
102
- else:
103
- logger.info("Enrichment: running GPT estimation for frame %d...", frame_idx)
104
- frame_b64 = encode_frame_to_b64(frame_data)
105
- gpt_res = estimate_threat_gpt(
106
- detections=gpt_dets, mission_spec=mission_spec,
107
- image_b64=frame_b64,
108
- )
109
-
110
- # --- Step 5: Merge results into detections by track_id ---
111
- for d in gpt_dets:
112
- oid = d.get("track_id")
113
- if oid and oid in gpt_res:
114
- gpt_payload = gpt_res[oid]
115
- d.update(gpt_payload)
116
- d["gpt_raw"] = gpt_payload
117
- d["assessment_frame_index"] = frame_idx
118
- d["assessment_status"] = gpt_payload.get(
119
- "assessment_status", AssessmentStatus.ASSESSED
120
- )
121
-
122
- return gpt_res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/gpt_reasoning.py DELETED
@@ -1,374 +0,0 @@
1
- import re
2
- import json
3
- import base64
4
- import logging
5
- from typing import List, Dict, Any, Optional
6
- from utils.schemas import AssessmentStatus
7
- from utils.openai_client import chat_completion, extract_content, get_api_key
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- def encode_image(image_path: str) -> str:
12
- with open(image_path, "rb") as image_file:
13
- return base64.b64encode(image_file.read()).decode('utf-8')
14
-
15
-
16
- def encode_frame_to_b64(frame, quality=None) -> str:
17
- """Encode an OpenCV BGR frame to a base64 JPEG string in memory (no disk I/O).
18
-
19
- Args:
20
- frame: OpenCV BGR numpy array.
21
- quality: Optional JPEG quality (1-100). Uses OpenCV default if None.
22
- """
23
- import cv2
24
- params = [int(cv2.IMWRITE_JPEG_QUALITY), quality] if quality is not None else None
25
- success, buf = cv2.imencode('.jpg', frame, params) if params else cv2.imencode('.jpg', frame)
26
- if not success:
27
- raise ValueError("Failed to encode frame to JPEG")
28
- return base64.b64encode(buf.tobytes()).decode('utf-8')
29
-
30
-
31
- _DOMAIN_ROLES = {
32
- "NAVAL": "Naval Intelligence Officer and Maritime Threat Analyst",
33
- "GROUND": "Ground Surveillance Intelligence Officer",
34
- "AERIAL": "Air Surveillance Intelligence Officer",
35
- "URBAN": "Urban Surveillance Intelligence Officer",
36
- "GENERIC": "Tactical Surveillance Analyst",
37
- }
38
-
39
- _HUMAN_LABEL_HINTS = frozenset({
40
- "person", "people", "human", "pedestrian",
41
- "man", "woman", "boy", "girl", "child",
42
- "civilian", "soldier", "infantry", "troop", "trooper",
43
- })
44
-
45
-
46
- def _is_human_label(label: str) -> bool:
47
- label_l = (label or "").lower().strip()
48
- if not label_l:
49
- return False
50
- parts = [p for p in re.split(r"[^a-z0-9]+", label_l) if p]
51
- return any(part in _HUMAN_LABEL_HINTS for part in parts)
52
-
53
-
54
- def _build_status_fallback(
55
- object_ids: List[str],
56
- status: str,
57
- reason: str,
58
- ) -> Dict[str, Dict[str, Any]]:
59
- return {
60
- obj_id: {
61
- "assessment_status": status,
62
- "gpt_reason": reason,
63
- }
64
- for obj_id in object_ids
65
- }
66
-
67
- _UNIVERSAL_SCHEMA = (
68
- "RESPONSE SCHEMA (JSON):\n"
69
- "{\n"
70
- " \"objects\": {\n"
71
- " \"T01\": {\n"
72
- " \"object_type\": \"string (broad category, e.g. Warship, APC, Sedan, Person)\",\n"
73
- " \"size\": \"string (e.g. Large, Medium, Small, ~50m length)\",\n"
74
- " \"visible_weapons\": [\"string\"],\n"
75
- " \"weapon_readiness\": \"string (e.g. Stowed/PEACE, Trained/Aiming, Firing/HOSTILE, Unknown)\",\n"
76
- " \"motion_status\": \"string (e.g. Stationary, Moving Slow, Moving Fast, Hovering)\",\n"
77
- " \"range_estimate\": \"string (e.g. ~500m, ~2NM, ~1km)\",\n"
78
- " \"bearing\": \"string (e.g. 12 o'clock, NNE, 045°)\",\n"
79
- " \"threat_level\": int (1-10, 1=Benign, 10=Imminent Attack),\n"
80
- " \"threat_classification\": \"Friendly\" | \"Neutral\" | \"Suspect\" | \"Hostile\",\n"
81
- " \"tactical_intent\": \"string (e.g. Transit, Patrol, Attack Profile)\",\n"
82
- " \"dynamic_features\": [\n"
83
- " {\"key\": \"string (domain-specific observation name)\", \"value\": \"string\"}\n"
84
- " ] // up to 5 extra observations relevant to the domain\n"
85
- " }\n"
86
- " }\n"
87
- "}\n"
88
- )
89
-
90
-
91
- def _parse_range_to_meters(range_text: str) -> Optional[float]:
92
- """Convert a free-text range string to meters.
93
-
94
- Supports patterns like '~500m', '~2NM', '~1.5km', '500 meters', '2 nautical miles'.
95
- Returns None if the string cannot be parsed.
96
- """
97
- if not range_text or range_text == "Unknown":
98
- return None
99
- text = range_text.strip().lstrip("~").strip()
100
- # Try NM / nautical miles
101
- m = re.match(r"([0-9]*\.?[0-9]+)\s*(NM|nm|nautical\s*miles?)", text)
102
- if m:
103
- return float(m.group(1)) * 1852.0
104
- # Try km / kilometers
105
- m = re.match(r"([0-9]*\.?[0-9]+)\s*(km|kilometers?|kilometres?)", text, re.IGNORECASE)
106
- if m:
107
- return float(m.group(1)) * 1000.0
108
- # Try meters (default)
109
- m = re.match(r"([0-9]*\.?[0-9]+)\s*(m|meters?|metres?)?$", text, re.IGNORECASE)
110
- if m:
111
- return float(m.group(1))
112
- return None
113
-
114
-
115
- def _build_domain_system_prompt(domain: str, mission_spec=None) -> str:
116
- """Build a universal system prompt with domain-appropriate role."""
117
-
118
- # Mission context block (injected regardless of domain)
119
- mission_context = ""
120
- if mission_spec:
121
- mission_context = (
122
- "\n\nMISSION CONTEXT:\n"
123
- f"- Operator Intent: {mission_spec.mission_intent}\n"
124
- f"- Domain: {mission_spec.domain}\n"
125
- f"- Target Classes: {', '.join(mission_spec.object_classes)}\n"
126
- )
127
- if mission_spec.context_phrases:
128
- mission_context += f"- Situational Context: {'; '.join(mission_spec.context_phrases)}\n"
129
- if mission_spec.stripped_modifiers:
130
- mission_context += f"- Operator Modifiers (stripped): {', '.join(mission_spec.stripped_modifiers)}\n"
131
- mission_context += (
132
- "\nUse the mission context to inform your analysis. "
133
- "Focus assessment on the target classes and domain specified."
134
- )
135
-
136
- role = _DOMAIN_ROLES.get(domain, _DOMAIN_ROLES["GENERIC"])
137
-
138
- return (
139
- f"You are an elite {role}. "
140
- "Your task is to analyze optical surveillance imagery and provide a detailed tactical assessment for every detected object. "
141
- f"You must output a STRICT JSON object that matches the following schema for every object ID provided:\n\n"
142
- f"{_UNIVERSAL_SCHEMA}\n"
143
- "RULES:\n"
144
- "- Use dynamic_features for domain-specific observations (e.g., wake_description, deck_activity, sensor_profile, camouflage, license_plate).\n"
145
- "- Provide up to 5 dynamic_features per object. Choose the most tactically relevant observations.\n"
146
- "- range_estimate should be a human-readable string with units (e.g., '~500m', '~2NM').\n"
147
- "- Visible trained weapons are IMMINENT threat (Score 9-10).\n"
148
- "- Ignore artifacts, focus on the objects."
149
- + mission_context
150
- )
151
-
152
-
153
- def estimate_threat_gpt(
154
- image_path: Optional[str] = None,
155
- detections: Optional[List[Dict[str, Any]]] = None,
156
- mission_spec=None, # Optional[MissionSpecification]
157
- image_b64: Optional[str] = None,
158
- ) -> Dict[str, Any]:
159
- """
160
- Perform Threat Assessment on detected objects using GPT-4o.
161
-
162
- Args:
163
- image_path: Path to the image file (mutually exclusive with image_b64).
164
- detections: List of detection dicts (bbox, label, etc.).
165
- mission_spec: Optional MissionSpecification for domain-aware assessment.
166
- image_b64: Pre-encoded base64 JPEG string (avoids disk round-trip).
167
-
168
- Returns:
169
- Dict mapping object ID (e.g., T01) to threat assessment dict.
170
- """
171
- if detections is None:
172
- detections = []
173
-
174
- if not get_api_key():
175
- logger.error("OPENAI_API_KEY not set. Skipping GPT threat assessment.")
176
- return {}
177
-
178
- # 1. Prepare detections summary for prompt.
179
- # Human/person classes are explicitly skipped to avoid refusal paths.
180
- prompt_items = []
181
- skipped_human_ids: List[str] = []
182
- for i, det in enumerate(detections):
183
- obj_id = str(det.get("track_id") or det.get("id") or f"T{str(i+1).zfill(2)}")
184
- bbox = det.get("bbox", [])
185
- label = str(det.get("label", "object"))
186
- if _is_human_label(label):
187
- skipped_human_ids.append(obj_id)
188
- continue
189
- prompt_items.append({"obj_id": obj_id, "label": label, "bbox": bbox})
190
-
191
- det_text = "\n".join(
192
- [
193
- f"- ID: {it['obj_id']}, Classification Hint: {it['label']}, BBox: {it['bbox']}"
194
- for it in prompt_items
195
- ]
196
- )
197
-
198
- if not det_text:
199
- if skipped_human_ids:
200
- logger.warning(
201
- "Skipping GPT threat assessment for %d human/person detections due policy constraints.",
202
- len(skipped_human_ids),
203
- )
204
- return _build_status_fallback(
205
- skipped_human_ids,
206
- AssessmentStatus.SKIPPED_POLICY,
207
- "Human/person analysis skipped due policy constraints.",
208
- )
209
- return {}
210
-
211
- # 2. Encode image (prefer pre-encoded b64 to avoid disk I/O)
212
- if image_b64:
213
- base64_image = image_b64
214
- elif image_path:
215
- try:
216
- base64_image = encode_image(image_path)
217
- except Exception as e:
218
- logger.error(f"Failed to encode image for GPT: {e}")
219
- return {}
220
- else:
221
- logger.error("estimate_threat_gpt: no image_path or image_b64 provided")
222
- return {}
223
-
224
- # 3. Domain-aware prompt selection (INV-7)
225
- domain = "GENERIC" # default — universal schema works for all domains
226
- if mission_spec:
227
- domain = mission_spec.domain
228
- if mission_spec.domain_source == "INFERRED":
229
- logger.info("GPT assessment using inferred domain=%s (domain_inferred=True)", domain)
230
-
231
- system_prompt = _build_domain_system_prompt(domain, mission_spec)
232
-
233
- domain_label = domain.lower() if domain != "NAVAL" else "naval"
234
- user_prompt = (
235
- f"Analyze this {domain_label} surveillance image. The following objects have been detected:\n"
236
- f"{det_text}\n\n"
237
- f"Provide a detailed Threat Assessment for each object based on its visual signatures."
238
- )
239
-
240
- # 4. Call API
241
- payload = {
242
- "model": "gpt-4o", # Use 4o for better vision analysis
243
- "messages": [
244
- {
245
- "role": "system",
246
- "content": system_prompt
247
- },
248
- {
249
- "role": "user",
250
- "content": [
251
- {
252
- "type": "text",
253
- "text": user_prompt
254
- },
255
- {
256
- "type": "image_url",
257
- "image_url": {
258
- "url": f"data:image/jpeg;base64,{base64_image}",
259
- "detail": "low"
260
- }
261
- }
262
- ]
263
- }
264
- ],
265
- "max_tokens": 1500,
266
- "temperature": 0.2, # Low temp for factual consistency
267
- "response_format": { "type": "json_object" }
268
- }
269
-
270
- try:
271
- resp_data = chat_completion(payload)
272
- content, refusal = extract_content(resp_data)
273
- if not content:
274
- if refusal:
275
- logger.warning("GPT refused threat assessment: %s", refusal)
276
- else:
277
- logger.warning(
278
- "GPT returned empty content. response_id=%s finish_reason=%s",
279
- resp_data.get("id"),
280
- resp_data.get("choices", [{}])[0].get("finish_reason"),
281
- )
282
- fallback = _build_status_fallback(
283
- [it["obj_id"] for it in prompt_items],
284
- AssessmentStatus.REFUSED,
285
- refusal or "GPT returned empty content.",
286
- )
287
- fallback.update(
288
- _build_status_fallback(
289
- skipped_human_ids,
290
- AssessmentStatus.SKIPPED_POLICY,
291
- "Human/person analysis skipped due policy constraints.",
292
- )
293
- )
294
- return fallback
295
-
296
- result_json = json.loads(content)
297
-
298
- objects = result_json.get("objects", {})
299
- if not isinstance(objects, dict):
300
- logger.warning(
301
- "GPT response 'objects' field is not a dict (got %s); using fallback.",
302
- type(objects).__name__,
303
- )
304
- objects = {}
305
-
306
- # Ensure every requested object receives an explicit assessment state.
307
- for it in prompt_items:
308
- oid = it["obj_id"]
309
- if oid not in objects:
310
- objects[oid] = {
311
- "assessment_status": AssessmentStatus.NO_RESPONSE,
312
- "gpt_reason": "No structured assessment returned for object.",
313
- }
314
- for oid in skipped_human_ids:
315
- objects.setdefault(
316
- oid,
317
- {
318
- "assessment_status": AssessmentStatus.SKIPPED_POLICY,
319
- "gpt_reason": "Human/person analysis skipped due policy constraints.",
320
- },
321
- )
322
-
323
- # Polyfill legacy fields for frontend compatibility
324
- for obj_id, data in objects.items():
325
- if not isinstance(data, dict):
326
- data = {
327
- "assessment_status": AssessmentStatus.NO_RESPONSE,
328
- "gpt_reason": "Malformed object payload from GPT.",
329
- }
330
- objects[obj_id] = data
331
-
332
- # 1. Distance: parse free-text range_estimate to meters
333
- range_m = _parse_range_to_meters(data.get("range_estimate", ""))
334
- if range_m is not None:
335
- data["distance_m"] = range_m
336
- data["gpt_distance_m"] = range_m
337
-
338
- # 2. Direction (legacy alias)
339
- bearing = data.get("bearing", "")
340
- if bearing and bearing != "Unknown":
341
- data["direction"] = bearing
342
- data["gpt_direction"] = bearing
343
-
344
- # 3. Description (summary of new fields)
345
- obj_type = data.get("object_type", "Unknown")
346
- threat = data.get("threat_classification", "Unknown")
347
- score = data.get("threat_level", 0)
348
-
349
- desc_parts = [obj_type]
350
- desc_parts.append(f"[{threat.upper()} Lvl:{score}]")
351
-
352
- data["description"] = " ".join(desc_parts)
353
- data["gpt_description"] = data["description"]
354
-
355
- # 4. Legacy threat_level_score alias
356
- data["threat_level_score"] = data.get("threat_level", 0)
357
-
358
- return objects
359
-
360
- except Exception as e:
361
- logger.error("GPT API call failed: %s", e, exc_info=True)
362
- fallback = _build_status_fallback(
363
- [it["obj_id"] for it in prompt_items],
364
- AssessmentStatus.ERROR,
365
- f"GPT API call failed: {e.__class__.__name__}",
366
- )
367
- fallback.update(
368
- _build_status_fallback(
369
- skipped_human_ids,
370
- AssessmentStatus.SKIPPED_POLICY,
371
- "Human/person analysis skipped due policy constraints.",
372
- )
373
- )
374
- return fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/mission_parser.py DELETED
@@ -1,481 +0,0 @@
1
- """
2
- Mission text parser — converts raw operator text into a validated MissionSpecification.
3
-
4
- Single public function: parse_mission_text(raw_text, detector_key) -> MissionSpecification
5
-
6
- Internal flow:
7
- 1. Fast-path regex check -> skip LLM if comma-separated labels
8
- 2. LLM extraction call (GPT-4o, temperature 0.0)
9
- 3. Deterministic validation pipeline
10
- 4. COCO vocabulary mapping for COCO-only detectors
11
- 5. Build RelevanceCriteria deterministically from mapped classes
12
- 6. Return validated MissionSpecification or raise MissionParseError
13
- """
14
-
15
- import json
16
- import logging
17
- import re
18
- from typing import List, Optional
19
-
20
- from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
21
-
22
- from coco_classes import COCO_CLASSES, canonicalize_coco_name, coco_class_catalog
23
- from utils.schemas import MissionSpecification, RelevanceCriteria
24
-
25
- logger = logging.getLogger(__name__)
26
-
27
- # Detectors that only support COCO class vocabulary
28
- _COCO_ONLY_DETECTORS = frozenset({"yolo11", "detr_resnet50"})
29
-
30
-
31
- class MissionParseError(ValueError):
32
- """Raised when mission text cannot be parsed into a valid MissionSpecification."""
33
- def __init__(self, message: str, warnings: Optional[List[str]] = None):
34
- self.warnings = warnings or []
35
- super().__init__(message)
36
-
37
-
38
- def _is_comma_separated_labels(text: str) -> bool:
39
- """Fast-path: detect simple comma-separated class labels (no LLM needed)."""
40
- # Match: word tokens separated by commas, each token <= 3 words
41
- pattern = r"^[\w\s]+(,\s*[\w\s]+)*$"
42
- if not re.match(pattern, text.strip()):
43
- return False
44
- tokens = [t.strip() for t in text.split(",") if t.strip()]
45
- return all(len(t.split()) <= 3 for t in tokens)
46
-
47
-
48
- def _is_coco_only(detector_key: str) -> bool:
49
- return detector_key in _COCO_ONLY_DETECTORS
50
-
51
-
52
- def _map_coco_classes(
53
- object_classes: List[str], detector_key: str
54
- ) -> tuple[List[str], List[str], List[str]]:
55
- """Map object classes to COCO vocabulary for COCO-only detectors.
56
-
57
- Returns:
58
- (mapped_classes, unmappable_classes, warnings)
59
- """
60
- if not _is_coco_only(detector_key):
61
- return object_classes, [], []
62
-
63
- mapped = []
64
- unmappable = []
65
- warnings = []
66
- seen = set()
67
-
68
- for cls in object_classes:
69
- canonical = canonicalize_coco_name(cls)
70
- if canonical is not None:
71
- if canonical not in seen:
72
- mapped.append(canonical)
73
- seen.add(canonical)
74
- if canonical.lower() != cls.lower():
75
- warnings.append(
76
- f"'{cls}' mapped to COCO class '{canonical}'."
77
- )
78
- else:
79
- unmappable.append(cls)
80
- warnings.append(
81
- f"'{cls}' is not in COCO vocabulary. Will not be detected by {detector_key}."
82
- )
83
-
84
- return mapped, unmappable, warnings
85
-
86
-
87
- def _build_fast_path_spec(
88
- raw_text: str, object_classes: List[str], detector_key: str
89
- ) -> MissionSpecification:
90
- """Build MissionSpecification for simple comma-separated input (no LLM call)."""
91
- mapped, unmappable, warnings = _map_coco_classes(object_classes, detector_key)
92
-
93
- if _is_coco_only(detector_key) and not mapped:
94
- raise MissionParseError(
95
- f"None of the requested objects ({', '.join(object_classes)}) match the "
96
- f"{detector_key} vocabulary. This detector supports: "
97
- f"{coco_class_catalog()}. "
98
- f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
99
- warnings=warnings,
100
- )
101
-
102
- final_classes = mapped if _is_coco_only(detector_key) else object_classes
103
-
104
- return MissionSpecification(
105
- object_classes=final_classes,
106
- mission_intent="DETECT",
107
- domain="GENERIC",
108
- domain_source="INFERRED",
109
- relevance_criteria=RelevanceCriteria(
110
- required_classes=final_classes,
111
- min_confidence=0.0,
112
- ),
113
- context_phrases=[],
114
- stripped_modifiers=[],
115
- operator_text=raw_text,
116
- parse_mode="FAST_PATH",
117
- parse_confidence="HIGH",
118
- parse_warnings=warnings,
119
- )
120
-
121
-
122
- # --- LLM Extraction ---
123
-
124
- _SYSTEM_PROMPT = (
125
- "You are a mission text parser for an object detection system. Your ONLY job is to extract "
126
- "structured fields from operator mission text. You do NOT assess threats. You do NOT reason "
127
- "about tactics. You extract and classify.\n\n"
128
- "OUTPUT SCHEMA (strict JSON):\n"
129
- "{\n"
130
- ' "object_classes": ["string"],\n'
131
- ' "mission_intent": "ENUM",\n'
132
- ' "domain": "ENUM",\n'
133
- ' "context_phrases": ["string"],\n'
134
- ' "stripped_modifiers": ["string"],\n'
135
- ' "parse_confidence": "ENUM",\n'
136
- ' "parse_warnings": ["string"]\n'
137
- "}\n\n"
138
- "EXTRACTION RULES:\n\n"
139
- "1. OBJECT_CLASSES — What to extract:\n"
140
- " - Extract nouns and noun phrases that refer to PHYSICAL, VISUALLY DETECTABLE objects.\n"
141
- " - Keep visual descriptors that narrow the category: 'small boat', 'military vehicle', 'cargo ship'.\n"
142
- " - Use singular form: 'vessels' -> 'vessel', 'people' -> 'person'.\n"
143
- " - If the input is already comma-separated class labels (e.g., 'person, car, boat'),\n"
144
- " use them directly without modification.\n\n"
145
- "2. OBJECT_CLASSES — What to strip:\n"
146
- " - Remove threat/intent adjectives: 'hostile', 'suspicious', 'friendly', 'dangerous', 'enemy'.\n"
147
- " -> Move these to stripped_modifiers.\n"
148
- " - Remove action verbs: 'approaching', 'fleeing', 'attacking'.\n"
149
- " -> Move the full phrase to context_phrases.\n"
150
- " - Remove spatial/temporal phrases: 'from the east', 'near the harbor', 'at night'.\n"
151
- " -> Move to context_phrases.\n"
152
- " - Do NOT extract abstract concepts: 'threat', 'danger', 'hazard', 'risk' are not objects.\n\n"
153
- "3. MISSION_INTENT — Infer from verbs:\n"
154
- " - 'detect', 'find', 'locate', 'spot', 'search for' -> DETECT\n"
155
- " - 'classify', 'identify', 'determine type of' -> CLASSIFY\n"
156
- " - 'track', 'follow', 'monitor movement of' -> TRACK\n"
157
- " - 'assess threat', 'evaluate danger', 'threat assessment' -> ASSESS_THREAT\n"
158
- " - 'monitor', 'watch', 'observe', 'surveil' -> MONITOR\n"
159
- " - If no verb present (bare class list), default to DETECT.\n\n"
160
- "4. DOMAIN — Infer from contextual clues:\n"
161
- " - Maritime vocabulary (vessel, ship, boat, harbor, naval, maritime, wake, sea) -> NAVAL\n"
162
- " - Ground vocabulary (vehicle, convoy, checkpoint, road, building, infantry) -> GROUND\n"
163
- " - Aerial vocabulary (aircraft, drone, UAV, airspace, altitude, flight) -> AERIAL\n"
164
- " - Urban vocabulary (pedestrian, intersection, storefront, crowd, building) -> URBAN\n"
165
- " - If no domain clues present -> GENERIC\n\n"
166
- "5. PARSE_CONFIDENCE:\n"
167
- " - HIGH: Clear object classes extracted, domain identifiable.\n"
168
- " - MEDIUM: Some ambiguity but reasonable extraction possible. Include warnings.\n"
169
- " - LOW: Cannot extract meaningful object classes. Input is too abstract,\n"
170
- " contradictory, or contains no visual object references.\n"
171
- " Examples of LOW: 'keep us safe', 'do your job', 'analyze everything'.\n\n"
172
- "FORBIDDEN:\n"
173
- "- Do NOT infer object classes not implied by the text. If the text says 'boats',\n"
174
- " do not add 'person' or 'vehicle' unless mentioned.\n"
175
- "- Do NOT add threat scores, engagement rules, or tactical recommendations.\n"
176
- "- Do NOT interpret what 'threat' or 'danger' means in terms of specific objects.\n"
177
- " If the operator writes 'detect threats', set parse_confidence to LOW and warn:\n"
178
- " \"'threats' is not a visual object class. Specify what objects to detect.\""
179
- )
180
-
181
- _VISION_GROUNDING_ADDENDUM = (
182
- "\n\nVISION GROUNDING (when an image is provided):\n"
183
- "You may receive the first frame of the operator's video feed as an image.\n"
184
- "Use it to REFINE your object_classes extraction:\n\n"
185
- "1. If the operator uses a general term (e.g., 'vessels', 'vehicles'),\n"
186
- " inspect the image and add MORE SPECIFIC subcategories visible in the scene.\n"
187
- " Example: operator says 'detect vessels', image shows a speedboat and a cargo ship\n"
188
- " -> object_classes: ['vessel', 'speedboat', 'cargo ship']\n\n"
189
- "2. If the operator mentions objects NOT visible in the first frame,\n"
190
- " still include them (later frames may contain them), but add a\n"
191
- " parse_warning noting they were not visible in the first frame.\n\n"
192
- "3. Use the image to CONFIRM or REFINE the domain. If the text is ambiguous\n"
193
- " but the image clearly shows open water, set domain to NAVAL.\n\n"
194
- "4. Do NOT hallucinate objects. Only add specific subcategories if clearly\n"
195
- " identifiable. When uncertain, keep the general term.\n\n"
196
- "5. The same OUTPUT SCHEMA and all EXTRACTION RULES still apply.\n"
197
- " The image is supplementary context, not a replacement for the text.\n"
198
- )
199
-
200
-
201
- def _extract_and_encode_first_frame(video_path: Optional[str]) -> Optional[str]:
202
- """Extract the first frame from a video and return it as a base64-encoded JPEG.
203
-
204
- Never raises — returns None on any failure so the caller can fall back
205
- to text-only parsing.
206
- """
207
- if not video_path:
208
- return None
209
- try:
210
- from inference import extract_first_frame
211
- from utils.gpt_reasoning import encode_frame_to_b64
212
-
213
- frame, _fps, _w, _h = extract_first_frame(video_path)
214
- return encode_frame_to_b64(frame, quality=85)
215
- except Exception:
216
- logger.warning("Failed to extract/encode first frame for vision grounding", exc_info=True)
217
- return None
218
-
219
-
220
- def _call_extraction_llm(raw_text: str, detector_key: str, first_frame_b64: Optional[str] = None) -> dict:
221
- """Call GPT-4o to extract structured mission fields from natural language."""
222
- if not get_api_key():
223
- raise MissionParseError(
224
- "OPENAI_API_KEY not set. Cannot parse natural language mission text. "
225
- "Use comma-separated class labels instead (e.g., 'person, car, boat')."
226
- )
227
-
228
- detector_type = "COCO_ONLY" if _is_coco_only(detector_key) else "OPEN_VOCAB"
229
-
230
- user_prompt_text = (
231
- f'OPERATOR MISSION TEXT:\n"{raw_text}"\n\n'
232
- f"DETECTOR TYPE: {detector_type}\n\n"
233
- "Extract the structured mission specification from the above text."
234
- )
235
-
236
- # Build system prompt (append vision addendum when image is available)
237
- system_content = _SYSTEM_PROMPT
238
- if first_frame_b64:
239
- system_content = _SYSTEM_PROMPT + _VISION_GROUNDING_ADDENDUM
240
-
241
- # Build user message: mixed content array when image is available, plain string otherwise
242
- if first_frame_b64:
243
- user_message = {
244
- "role": "user",
245
- "content": [
246
- {"type": "text", "text": user_prompt_text},
247
- {
248
- "type": "image_url",
249
- "image_url": {
250
- "url": f"data:image/jpeg;base64,{first_frame_b64}",
251
- "detail": "low",
252
- },
253
- },
254
- ],
255
- }
256
- else:
257
- user_message = {"role": "user", "content": user_prompt_text}
258
-
259
- max_tokens = 700 if first_frame_b64 else 500
260
- timeout_s = 45 if first_frame_b64 else 30
261
-
262
- payload = {
263
- "model": "gpt-4o",
264
- "temperature": 0.0,
265
- "max_tokens": max_tokens,
266
- "response_format": {"type": "json_object"},
267
- "messages": [
268
- {"role": "system", "content": system_content},
269
- user_message,
270
- ],
271
- }
272
-
273
- try:
274
- resp_data = chat_completion(payload, timeout=timeout_s)
275
- content, _refusal = extract_content(resp_data)
276
- if not content:
277
- raise MissionParseError("GPT returned empty content during mission parsing.")
278
-
279
- return json.loads(content)
280
-
281
- except OpenAIAPIError as e:
282
- raise MissionParseError(f"Mission parsing API call failed: {e}")
283
- except json.JSONDecodeError:
284
- raise MissionParseError(
285
- "GPT returned invalid JSON. Please rephrase your mission."
286
- )
287
-
288
-
289
- def _validate_and_build(
290
- llm_output: dict, raw_text: str, detector_key: str
291
- ) -> MissionSpecification:
292
- """Deterministic validation pipeline (Section 7.3 decision tree)."""
293
-
294
- # Step 2: Extract fields with defaults
295
- object_classes = llm_output.get("object_classes", [])
296
- mission_intent = llm_output.get("mission_intent", "DETECT")
297
- domain = llm_output.get("domain", "GENERIC")
298
- context_phrases = llm_output.get("context_phrases", [])
299
- stripped_modifiers = llm_output.get("stripped_modifiers", [])
300
- parse_confidence = llm_output.get("parse_confidence", "LOW")
301
- parse_warnings = llm_output.get("parse_warnings", [])
302
-
303
- # Validate enum values
304
- valid_intents = {"DETECT", "CLASSIFY", "TRACK", "ASSESS_THREAT", "MONITOR"}
305
- if mission_intent not in valid_intents:
306
- mission_intent = "DETECT"
307
- parse_warnings.append(f"Invalid mission_intent '{llm_output.get('mission_intent')}', defaulted to DETECT.")
308
-
309
- valid_domains = {"NAVAL", "GROUND", "AERIAL", "URBAN", "GENERIC"}
310
- if domain not in valid_domains:
311
- domain = "GENERIC"
312
- parse_warnings.append(f"Invalid domain '{llm_output.get('domain')}', defaulted to GENERIC.")
313
-
314
- valid_confidence = {"HIGH", "MEDIUM", "LOW"}
315
- if parse_confidence not in valid_confidence:
316
- parse_confidence = "LOW"
317
-
318
- # Step 3: Parse confidence check
319
- if parse_confidence == "LOW":
320
- warnings_str = "; ".join(parse_warnings) if parse_warnings else "No details"
321
- raise MissionParseError(
322
- f"Could not extract object classes from mission text. "
323
- f"Warnings: {warnings_str}. "
324
- f"Please specify concrete objects to detect (e.g., 'vessel, small boat').",
325
- warnings=parse_warnings,
326
- )
327
-
328
- # Validate object_classes is non-empty
329
- if not object_classes:
330
- raise MissionParseError(
331
- "Mission text produced no detectable object classes. "
332
- "Please specify concrete objects to detect.",
333
- warnings=parse_warnings,
334
- )
335
-
336
- # Filter out empty strings
337
- object_classes = [c.strip() for c in object_classes if c and c.strip()]
338
- if not object_classes:
339
- raise MissionParseError(
340
- "All extracted object classes were empty after cleanup.",
341
- warnings=parse_warnings,
342
- )
343
-
344
- # Step 4: COCO vocabulary mapping
345
- mapped, unmappable, coco_warnings = _map_coco_classes(object_classes, detector_key)
346
- parse_warnings.extend(coco_warnings)
347
-
348
- if _is_coco_only(detector_key):
349
- if not mapped:
350
- raise MissionParseError(
351
- f"None of the requested objects ({', '.join(object_classes)}) match the "
352
- f"{detector_key} vocabulary. "
353
- f"This detector supports: {coco_class_catalog()}. "
354
- f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
355
- warnings=parse_warnings,
356
- )
357
- final_classes = mapped
358
- else:
359
- final_classes = object_classes
360
-
361
- # Step 5: Build RelevanceCriteria deterministically
362
- relevance_criteria = RelevanceCriteria(
363
- required_classes=final_classes,
364
- min_confidence=0.0,
365
- )
366
-
367
- # Step 6: Construct MissionSpecification
368
- return MissionSpecification(
369
- object_classes=final_classes,
370
- mission_intent=mission_intent,
371
- domain=domain,
372
- domain_source="INFERRED",
373
- relevance_criteria=relevance_criteria,
374
- # INVARIANT INV-13: context_phrases are forwarded to LLM reasoning layers
375
- # (GPT threat assessment, threat chat) as situational context ONLY.
376
- # They must NEVER be used in evaluate_relevance(), prioritization,
377
- # or any deterministic filtering/sorting logic.
378
- context_phrases=context_phrases,
379
- stripped_modifiers=stripped_modifiers,
380
- operator_text=raw_text,
381
- parse_mode="LLM_EXTRACTED",
382
- parse_confidence=parse_confidence,
383
- parse_warnings=parse_warnings,
384
- )
385
-
386
-
387
- _DOMAIN_BROAD_CATEGORIES: dict[str, List[str]] = {
388
- "NAVAL": ["vessel", "ship", "boat", "buoy", "person"],
389
- "AERIAL": ["aircraft", "helicopter", "drone", "airplane"],
390
- "GROUND": ["vehicle", "car", "truck", "person", "building"],
391
- "URBAN": ["person", "vehicle", "car", "bicycle"],
392
- "GENERIC": ["object"],
393
- }
394
-
395
-
396
- def build_broad_queries(
397
- detector_key: str, mission_spec: MissionSpecification
398
- ) -> List[str]:
399
- """Build broad detector queries for LLM post-filter mode.
400
-
401
- For FAST_PATH: return object_classes directly (unchanged behavior).
402
- For COCO detectors (LLM_EXTRACTED): return ALL 80 COCO classes.
403
- For open-vocab detectors (LLM_EXTRACTED): return LLM-extracted classes
404
- PLUS broad domain categories to maximize recall.
405
- """
406
- if mission_spec.parse_mode == "FAST_PATH":
407
- return mission_spec.object_classes
408
-
409
- # LLM_EXTRACTED path: detect broadly
410
- if _is_coco_only(detector_key):
411
- # COCO detectors ignore queries anyway (DETR detects all 80;
412
- # YOLO11 falls back to all if no matches). Send everything.
413
- return list(COCO_CLASSES)
414
-
415
- # Open-vocab detector (e.g. Grounding DINO):
416
- # Combine LLM-extracted classes with domain-specific broad categories
417
- broad = list(mission_spec.object_classes)
418
- domain_extras = _DOMAIN_BROAD_CATEGORIES.get(
419
- mission_spec.domain, _DOMAIN_BROAD_CATEGORIES["GENERIC"]
420
- )
421
- seen = {c.lower() for c in broad}
422
- for cat in domain_extras:
423
- if cat.lower() not in seen:
424
- broad.append(cat)
425
- seen.add(cat.lower())
426
-
427
- logger.info("Broad queries for %s: %s", detector_key, broad)
428
- return broad
429
-
430
-
431
- def parse_mission_text(
432
- raw_text: str,
433
- detector_key: str,
434
- video_path: Optional[str] = None,
435
- ) -> MissionSpecification:
436
- """Parse raw mission text into a validated MissionSpecification.
437
-
438
- Args:
439
- raw_text: Verbatim mission text from the operator.
440
- detector_key: Detector model key (determines COCO vocabulary constraints).
441
- video_path: Optional path to input video; first frame used for vision grounding.
442
-
443
- Returns:
444
- Validated MissionSpecification.
445
-
446
- Raises:
447
- MissionParseError: If mission text cannot produce a valid specification.
448
- """
449
- if not raw_text or not raw_text.strip():
450
- raise MissionParseError(
451
- "Mission text is empty. Specify objects to detect or use the default queries."
452
- )
453
-
454
- raw_text = raw_text.strip()
455
-
456
- # Fast-path: simple comma-separated labels -> skip LLM
457
- if _is_comma_separated_labels(raw_text):
458
- object_classes = [t.strip() for t in raw_text.split(",") if t.strip()]
459
- logger.info(
460
- "Mission fast-path: comma-separated labels %s", object_classes
461
- )
462
- return _build_fast_path_spec(raw_text, object_classes, detector_key)
463
-
464
- # LLM path: natural language mission text
465
- logger.info("Mission LLM-path: extracting from natural language")
466
- first_frame_b64 = _extract_and_encode_first_frame(video_path)
467
- if first_frame_b64:
468
- logger.info("Vision grounding: first frame encoded for LLM call")
469
- llm_output = _call_extraction_llm(raw_text, detector_key, first_frame_b64=first_frame_b64)
470
- logger.info("Mission LLM extraction result: %s", llm_output)
471
-
472
- mission_spec = _validate_and_build(llm_output, raw_text, detector_key)
473
- logger.info(
474
- "Mission parsed: classes=%s intent=%s domain=%s(%s) confidence=%s",
475
- mission_spec.object_classes,
476
- mission_spec.mission_intent,
477
- mission_spec.domain,
478
- mission_spec.domain_source,
479
- mission_spec.parse_confidence,
480
- )
481
- return mission_spec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/openai_client.py DELETED
@@ -1,80 +0,0 @@
1
- """
2
- Shared OpenAI HTTP client — single implementation of the chat-completions call.
3
-
4
- Replaces duplicated urllib boilerplate in gpt_reasoning, relevance,
5
- mission_parser, and threat_chat.
6
- """
7
-
8
- import json
9
- import logging
10
- import os
11
- import urllib.request
12
- import urllib.error
13
- from typing import Dict, Optional, Tuple
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
- _API_URL = "https://api.openai.com/v1/chat/completions"
18
-
19
-
20
- class OpenAIAPIError(Exception):
21
- """Raised when the OpenAI API call fails (HTTP or network error)."""
22
-
23
- def __init__(self, message: str, status_code: Optional[int] = None):
24
- self.status_code = status_code
25
- super().__init__(message)
26
-
27
-
28
- def get_api_key() -> Optional[str]:
29
- """Return the OpenAI API key from the environment, or None."""
30
- return os.environ.get("OPENAI_API_KEY")
31
-
32
-
33
- def chat_completion(payload: Dict, *, timeout: int = 30) -> Dict:
34
- """Send a chat-completion request and return the parsed JSON response.
35
-
36
- Args:
37
- payload: Full request body (model, messages, etc.).
38
- timeout: HTTP timeout in seconds.
39
-
40
- Returns:
41
- Parsed response dict.
42
-
43
- Raises:
44
- OpenAIAPIError: On HTTP or network failure.
45
- """
46
- api_key = get_api_key()
47
- if not api_key:
48
- raise OpenAIAPIError("OPENAI_API_KEY not set")
49
-
50
- headers = {
51
- "Content-Type": "application/json",
52
- "Authorization": f"Bearer {api_key}",
53
- }
54
-
55
- try:
56
- req = urllib.request.Request(
57
- _API_URL,
58
- data=json.dumps(payload).encode("utf-8"),
59
- headers=headers,
60
- method="POST",
61
- )
62
- with urllib.request.urlopen(req, timeout=timeout) as response:
63
- return json.loads(response.read().decode("utf-8"))
64
- except urllib.error.HTTPError as e:
65
- raise OpenAIAPIError(
66
- f"HTTP {e.code}: {e.reason}", status_code=e.code
67
- ) from e
68
- except urllib.error.URLError as e:
69
- raise OpenAIAPIError(f"URL error: {e.reason}") from e
70
-
71
-
72
- def extract_content(resp_data: Dict) -> Tuple[Optional[str], Optional[str]]:
73
- """Safely extract content and refusal from a chat-completion response.
74
-
75
- Returns:
76
- (content, refusal) — either may be None.
77
- """
78
- choice = resp_data.get("choices", [{}])[0]
79
- message = choice.get("message", {})
80
- return message.get("content"), message.get("refusal")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/relevance.py DELETED
@@ -1,141 +0,0 @@
1
- """
2
- Object relevance evaluation — deterministic gate between detection and GPT assessment.
3
-
4
- Public functions:
5
- evaluate_relevance(detection, criteria) -> RelevanceDecision (deterministic)
6
- evaluate_relevance_llm(detected_labels, mission_text) -> set[str] (LLM post-filter)
7
-
8
- INVARIANT INV-13 enforcement: evaluate_relevance() accepts RelevanceCriteria, NOT
9
- MissionSpecification. It cannot see context_phrases, stripped_modifiers, or any
10
- LLM-derived field. This is structural, not by convention.
11
- """
12
-
13
- import json
14
- import logging
15
- from typing import Any, Dict, List, NamedTuple, Set
16
-
17
- from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
18
-
19
- from coco_classes import canonicalize_coco_name
20
- from utils.schemas import RelevanceCriteria
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
- class RelevanceDecision(NamedTuple):
26
- relevant: bool
27
- reason: str # "ok" | "label_not_in_required_classes" | "below_confidence"
28
-
29
-
30
- def evaluate_relevance(
31
- detection: Dict[str, Any],
32
- criteria: RelevanceCriteria,
33
- ) -> RelevanceDecision:
34
- """Evaluate whether a detection is relevant to the mission.
35
-
36
- Pure deterministic predicate — no LLM involvement.
37
-
38
- Args:
39
- detection: Detection dict with at least 'label' and 'score' keys.
40
- criteria: RelevanceCriteria with required_classes and min_confidence.
41
-
42
- Returns:
43
- RelevanceDecision(relevant=bool, reason=str).
44
- """
45
- label = (detection.get("label") or "").lower().strip()
46
- confidence = detection.get("score", 0.0)
47
-
48
- if not label:
49
- return RelevanceDecision(False, "label_not_in_required_classes")
50
-
51
- # Build lowercase set of required classes for comparison
52
- required_lower = {c.lower() for c in criteria.required_classes}
53
-
54
- # Direct match
55
- if label in required_lower:
56
- if confidence < criteria.min_confidence:
57
- return RelevanceDecision(False, "below_confidence")
58
- return RelevanceDecision(True, "ok")
59
-
60
- # Synonym match via COCO canonicalization
61
- canonical = canonicalize_coco_name(label)
62
- if canonical and canonical.lower() in required_lower:
63
- if confidence < criteria.min_confidence:
64
- return RelevanceDecision(False, "below_confidence")
65
- return RelevanceDecision(True, "ok")
66
-
67
- # Check if any required class canonicalizes to the same COCO class as the label
68
- if canonical:
69
- for req in criteria.required_classes:
70
- req_canonical = canonicalize_coco_name(req)
71
- if req_canonical and req_canonical.lower() == canonical.lower():
72
- if confidence < criteria.min_confidence:
73
- return RelevanceDecision(False, "below_confidence")
74
- return RelevanceDecision(True, "ok")
75
-
76
- return RelevanceDecision(False, "label_not_in_required_classes")
77
-
78
-
79
- def evaluate_relevance_llm(
80
- detected_labels: List[str],
81
- mission_text: str,
82
- ) -> Set[str]:
83
- """Ask GPT which detected labels are relevant to the mission.
84
-
85
- Called ONCE on frame 0 with the unique labels found by the detector.
86
- Returns a set of relevant label strings (lowercased).
87
-
88
- On API failure, falls back to accepting all labels (fail-open, logged).
89
- """
90
- if not detected_labels:
91
- return set()
92
-
93
- if not get_api_key():
94
- logger.warning(
95
- "OPENAI_API_KEY not set — LLM relevance filter falling back to accept-all"
96
- )
97
- return set(detected_labels)
98
-
99
- prompt = (
100
- f"Given this mission: \"{mission_text}\"\n\n"
101
- f"Which of these detected object classes are relevant to the mission?\n"
102
- f"{json.dumps(detected_labels)}\n\n"
103
- "Return JSON: {\"relevant_labels\": [...]}\n"
104
- "Only include labels from the provided list that are relevant to "
105
- "accomplishing the mission. Be inclusive — if in doubt, include it."
106
- )
107
-
108
- payload = {
109
- "model": "gpt-4o-mini",
110
- "temperature": 0.0,
111
- "max_tokens": 200,
112
- "response_format": {"type": "json_object"},
113
- "messages": [
114
- {"role": "system", "content": "You are a mission relevance filter. Return only JSON."},
115
- {"role": "user", "content": prompt},
116
- ],
117
- }
118
-
119
- try:
120
- resp_data = chat_completion(payload)
121
- content, _refusal = extract_content(resp_data)
122
- if not content:
123
- logger.warning("GPT returned empty content for relevance filter — accept-all")
124
- return set(detected_labels)
125
-
126
- result = json.loads(content)
127
- relevant = result.get("relevant_labels", detected_labels)
128
- relevant_set = {label.lower() for label in relevant}
129
-
130
- logger.info(
131
- "LLM relevance filter: mission=%r detected=%s relevant=%s",
132
- mission_text, detected_labels, relevant_set,
133
- )
134
- return relevant_set
135
-
136
- except OpenAIAPIError as e:
137
- logger.warning("LLM relevance API call failed: %s — accept-all fallback", e)
138
- return set(detected_labels)
139
- except (json.JSONDecodeError, KeyError, TypeError) as e:
140
- logger.warning("LLM relevance response parse failed: %s — accept-all fallback", e)
141
- return set(detected_labels)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/schemas.py DELETED
@@ -1,115 +0,0 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Literal
3
-
4
- # --- Mission-Driven Abstractions ---
5
-
6
-
7
- class RelevanceCriteria(BaseModel):
8
- """Deterministic boolean predicate for filtering detections against a mission.
9
-
10
- This is the ONLY input to evaluate_relevance(). It intentionally excludes
11
- context_phrases, stripped_modifiers, and all LLM-derived context so that
12
- relevance filtering remains purely deterministic (INV-13).
13
- """
14
- required_classes: List[str] = Field(
15
- ..., min_length=1,
16
- description="Object categories that satisfy the mission. "
17
- "Detections whose label is not in this list are excluded."
18
- )
19
- min_confidence: float = Field(
20
- default=0.0, ge=0.0, le=1.0,
21
- description="Minimum detector confidence to consider a detection relevant."
22
- )
23
-
24
-
25
- class MissionSpecification(BaseModel):
26
- """Structured representation of operator intent.
27
-
28
- Created once from raw mission text at the API boundary (app.py).
29
- Forwarded to: detector (object_classes), GPT (full spec), chat (full spec),
30
- relevance gate (relevance_criteria only — INV-13).
31
-
32
- INVARIANT INV-13: context_phrases are forwarded to LLM reasoning layers
33
- (GPT threat assessment, threat chat) as situational context ONLY.
34
- They must NEVER be used in evaluate_relevance(), prioritization,
35
- or any deterministic filtering/sorting logic.
36
- """
37
-
38
- # --- Extracted by LLM or fast-path ---
39
- object_classes: List[str] = Field(
40
- ..., min_length=1,
41
- description="Concrete, visually detectable object categories to detect. "
42
- "These become detector queries. Must be nouns, not adjectives or verbs."
43
- )
44
- mission_intent: Literal[
45
- "DETECT", "CLASSIFY", "TRACK", "ASSESS_THREAT", "MONITOR"
46
- ] = Field(
47
- ...,
48
- description="Operator purpose. DETECT=find objects, CLASSIFY=identify type, "
49
- "TRACK=follow over time, ASSESS_THREAT=evaluate danger, MONITOR=passive watch."
50
- )
51
- domain: Literal[
52
- "NAVAL", "GROUND", "AERIAL", "URBAN", "GENERIC"
53
- ] = Field(
54
- ...,
55
- description="Operational domain. Selects the GPT assessment schema and system prompt."
56
- )
57
- domain_source: Literal["INFERRED", "OPERATOR_SET"] = Field(
58
- default="INFERRED",
59
- description="Whether domain was LLM-inferred or explicitly set by operator."
60
- )
61
-
62
- # --- Deterministic (derived from object_classes) ---
63
- relevance_criteria: RelevanceCriteria = Field(
64
- ...,
65
- description="Boolean predicate for filtering detections. "
66
- "Built deterministically from object_classes after extraction."
67
- )
68
-
69
- # --- Context preservation ---
70
- context_phrases: List[str] = Field(
71
- default_factory=list,
72
- description="Non-class contextual phrases from mission text. "
73
- "E.g., 'approaching from the east', 'near the harbor'. "
74
- "Forwarded to GPT as situational context, NOT used for detection."
75
- )
76
- stripped_modifiers: List[str] = Field(
77
- default_factory=list,
78
- description="Adjectives/modifiers removed during extraction. "
79
- "E.g., 'hostile', 'suspicious', 'friendly'. Logged for audit."
80
- )
81
- operator_text: str = Field(
82
- ...,
83
- description="Original unmodified mission text from the operator. Preserved for audit."
84
- )
85
-
86
- # --- Parse mode ---
87
- parse_mode: Literal["FAST_PATH", "LLM_EXTRACTED"] = Field(
88
- default="FAST_PATH",
89
- description="How this spec was created. FAST_PATH = comma-separated labels, "
90
- "LLM_EXTRACTED = natural language parsed by GPT."
91
- )
92
-
93
- # --- LLM self-assessment ---
94
- parse_confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
95
- ...,
96
- description="Confidence in the extraction. "
97
- "LOW = could not reliably extract classes -> triggers rejection."
98
- )
99
- parse_warnings: List[str] = Field(
100
- default_factory=list,
101
- description="Specific issues encountered during extraction. "
102
- "E.g., 'term \"threat\" is not a visual class, stripped'."
103
- )
104
-
105
-
106
- class AssessmentStatus:
107
- """Canonical string constants for detection assessment lifecycle."""
108
- ASSESSED = "ASSESSED"
109
- UNASSESSED = "UNASSESSED"
110
- PENDING_GPT = "PENDING_GPT"
111
- SKIPPED_POLICY = "SKIPPED_POLICY"
112
- REFUSED = "REFUSED"
113
- ERROR = "ERROR"
114
- NO_RESPONSE = "NO_RESPONSE"
115
- STALE = "STALE"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/threat_chat.py DELETED
@@ -1,154 +0,0 @@
1
- """
2
- Threat Chat Module - GPT-powered Q&A about detected threats.
3
- """
4
-
5
- import logging
6
- from typing import List, Dict, Any
7
-
8
- from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
9
- from utils.gpt_reasoning import _DOMAIN_ROLES
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
- def chat_about_threats(
15
- question: str,
16
- detections: List[Dict[str, Any]],
17
- mission_spec_dict: Dict[str, Any] = None,
18
- ) -> str:
19
- """
20
- Answer user questions about detected threats using GPT.
21
-
22
- Args:
23
- question: User's question about the current threat situation.
24
- detections: List of detection dicts with gpt_raw threat analysis.
25
- mission_spec_dict: Optional dict of mission specification fields.
26
-
27
- Returns:
28
- GPT's response as a string.
29
- """
30
- if not get_api_key():
31
- logger.warning("OPENAI_API_KEY not set. Cannot process threat chat.")
32
- return "Error: OpenAI API key not configured."
33
-
34
- if not detections:
35
- return "No threats detected yet. Run detection first to analyze the scene."
36
-
37
- # Build threat context from detections
38
- threat_context = _build_threat_context(detections)
39
-
40
- # Domain-aware role selection
41
- domain = "GENERIC"
42
- if mission_spec_dict:
43
- domain = mission_spec_dict.get("domain", "GENERIC")
44
- role_label = _DOMAIN_ROLES.get(domain, _DOMAIN_ROLES["GENERIC"])
45
-
46
- # Build mission context block (INV-8: mission context forwarded to LLM calls)
47
- mission_block = ""
48
- if mission_spec_dict:
49
- mission_block = "\nMISSION CONTEXT:\n"
50
- if mission_spec_dict.get("mission_intent"):
51
- mission_block += f"- Intent: {mission_spec_dict['mission_intent']}\n"
52
- if mission_spec_dict.get("domain"):
53
- mission_block += f"- Domain: {mission_spec_dict['domain']}\n"
54
- if mission_spec_dict.get("object_classes"):
55
- mission_block += f"- Target Classes: {', '.join(mission_spec_dict['object_classes'])}\n"
56
- if mission_spec_dict.get("context_phrases"):
57
- mission_block += f"- Situation: {'; '.join(mission_spec_dict['context_phrases'])}\n"
58
- mission_block += "\n"
59
-
60
- system_prompt = (
61
- f"You are a {role_label} providing real-time threat analysis support. "
62
- "You have access to the current threat assessment data from optical surveillance. "
63
- "Answer questions concisely and tactically. Use military terminology where appropriate. "
64
- "If asked about engagement recommendations, always note that final decisions rest with the commanding officer.\n\n"
65
- f"{mission_block}"
66
- "CURRENT THREAT PICTURE:\n"
67
- f"{threat_context}\n\n"
68
- "Respond to the operator's question based on this threat data."
69
- )
70
-
71
- payload = {
72
- "model": "gpt-4o",
73
- "messages": [
74
- {"role": "system", "content": system_prompt},
75
- {"role": "user", "content": question}
76
- ],
77
- "max_tokens": 500,
78
- "temperature": 0.3,
79
- }
80
-
81
- try:
82
- resp_data = chat_completion(payload)
83
- content, _refusal = extract_content(resp_data)
84
- return content.strip() if content else "No response generated."
85
-
86
- except OpenAIAPIError as e:
87
- logger.error("OpenAI API error: %s", e)
88
- return f"API Error: {e}"
89
- except Exception as e:
90
- logger.error("Threat chat failed: %s", e)
91
- return f"Error processing question: {str(e)}"
92
-
93
-
94
- def _build_threat_context(detections: List[Dict[str, Any]]) -> str:
95
- """Build a text summary of all detected threats for GPT context."""
96
- lines = []
97
-
98
- for det in detections:
99
- obj_id = det.get("id", "Unknown")
100
- label = det.get("label", "object")
101
-
102
- # Extract GPT raw data if available
103
- gpt_raw = det.get("gpt_raw") or det.get("features") or {}
104
-
105
- # Universal schema fields (with fallbacks to legacy names)
106
- obj_type = gpt_raw.get("object_type") or gpt_raw.get("vessel_category", label)
107
- size = gpt_raw.get("size", "")
108
- threat_score = (
109
- det.get("threat_level_score")
110
- or gpt_raw.get("threat_level")
111
- or gpt_raw.get("threat_level_score", "?")
112
- )
113
- threat_class = (
114
- det.get("threat_classification")
115
- or gpt_raw.get("threat_classification", "Unknown")
116
- )
117
- weapons = gpt_raw.get("visible_weapons", [])
118
- weapon_ready = gpt_raw.get("weapon_readiness") or det.get("weapon_readiness", "Unknown")
119
- motion = gpt_raw.get("motion_status", "Unknown")
120
- range_est = gpt_raw.get("range_estimate") or gpt_raw.get("range_estimation_nm", "")
121
- bearing = gpt_raw.get("bearing") or gpt_raw.get("bearing_clock") or det.get("gpt_direction", "")
122
- intent = gpt_raw.get("tactical_intent", "")
123
- dynamic_features = gpt_raw.get("dynamic_features", [])
124
-
125
- # Build entry
126
- entry = f"[{obj_id}] {obj_type}"
127
- if size and size != "Unknown":
128
- entry += f" ({size})"
129
- entry += f"\n - Threat: {threat_class} (Score: {threat_score}/10)"
130
-
131
- if range_est:
132
- entry += f"\n - Range: {range_est}"
133
- if bearing and bearing != "Unknown":
134
- entry += f", Bearing: {bearing}"
135
- if motion and motion != "Unknown":
136
- entry += f"\n - Motion: {motion}"
137
- if weapons:
138
- entry += f"\n - Weapons: {', '.join(weapons) if isinstance(weapons, list) else weapons}"
139
- if weapon_ready and weapon_ready != "Unknown":
140
- entry += f" ({weapon_ready})"
141
- if intent:
142
- entry += f"\n - Assessed Intent: {intent}"
143
-
144
- # Append dynamic features
145
- for feat in dynamic_features:
146
- if isinstance(feat, dict):
147
- key = feat.get("key", "")
148
- value = feat.get("value", "")
149
- if key and value:
150
- entry += f"\n - {key}: {value}"
151
-
152
- lines.append(entry)
153
-
154
- return "\n\n".join(lines) if lines else "No threat data available."