Spaces:
Paused
Paused
Zhen Ye Claude Opus 4.6 (1M context) commited on
Commit ·
9574811
1
Parent(s): 164d8b0
refactor: remove GPT/mission/relevance system, keep CV-only base
Browse filesStrip all GPT threat assessment, mission parsing, relevance gating,
and enrichment logic to create a reusable detection base. All computer
vision functionality preserved: detectors, segmenters, depth estimation,
object tracking, multi-GPU pipeline, async jobs, and MJPEG streaming.
Deleted: utils/{gpt_reasoning,openai_client,threat_chat,relevance,
enrichment,mission_parser,schemas}.py
Removed: enable_gpt, mission_spec, first_frame_gpt_results params
from inference pipeline, jobs, and API endpoints.
Removed: /detect/analyze-frame, /reason/track, /chat/threat endpoints.
Removed: sentence-transformers, python-dotenv dependencies.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- app.py +20 -340
- inference.py +8 -336
- jobs/background.py +2 -8
- jobs/models.py +0 -5
- requirements.txt +0 -2
- utils/enrichment.py +0 -122
- utils/gpt_reasoning.py +0 -374
- utils/mission_parser.py +0 -481
- utils/openai_client.py +0 -80
- utils/relevance.py +0 -141
- utils/schemas.py +0 -115
- utils/threat_chat.py +0 -154
app.py
CHANGED
|
@@ -1,7 +1,4 @@
|
|
| 1 |
import os
|
| 2 |
-
from dotenv import load_dotenv
|
| 3 |
-
load_dotenv()
|
| 4 |
-
|
| 5 |
import logging
|
| 6 |
|
| 7 |
# Fix: Set Hugging Face cache to writable location
|
|
@@ -39,8 +36,7 @@ import cv2
|
|
| 39 |
import numpy as np
|
| 40 |
from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
|
| 41 |
from fastapi.middleware.cors import CORSMiddleware
|
| 42 |
-
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse,
|
| 43 |
-
from fastapi.staticfiles import StaticFiles
|
| 44 |
import uvicorn
|
| 45 |
|
| 46 |
from inference import process_first_frame, run_inference, run_grounded_sam2_tracking
|
|
@@ -57,14 +53,6 @@ from jobs.storage import (
|
|
| 57 |
get_job_storage,
|
| 58 |
get_output_video_path,
|
| 59 |
)
|
| 60 |
-
from utils.gpt_reasoning import estimate_threat_gpt
|
| 61 |
-
from utils.threat_chat import chat_about_threats
|
| 62 |
-
from utils.relevance import evaluate_relevance
|
| 63 |
-
from utils.enrichment import run_enrichment
|
| 64 |
-
from utils.schemas import AssessmentStatus
|
| 65 |
-
from models.segmenters.model_loader import get_segmenter_detector
|
| 66 |
-
from utils.mission_parser import parse_mission_text, build_broad_queries, MissionParseError
|
| 67 |
-
|
| 68 |
logging.basicConfig(level=logging.INFO)
|
| 69 |
|
| 70 |
# Suppress noisy external libraries
|
|
@@ -72,77 +60,6 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
| 72 |
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
|
| 73 |
logging.getLogger("transformers").setLevel(logging.WARNING)
|
| 74 |
|
| 75 |
-
# GPT concurrency limiter — prevents thread exhaustion under load
|
| 76 |
-
_GPT_SEMAPHORE = asyncio.Semaphore(int(os.environ.get("GPT_CONCURRENCY_LIMIT", "4")))
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
async def _enrich_first_frame_gpt(
|
| 80 |
-
job_id: str,
|
| 81 |
-
frame: np.ndarray,
|
| 82 |
-
detections: list,
|
| 83 |
-
enable_gpt: bool,
|
| 84 |
-
mission_spec,
|
| 85 |
-
) -> None:
|
| 86 |
-
"""Fire-and-forget GPT enrichment for first-frame track cards.
|
| 87 |
-
|
| 88 |
-
Runs concurrently with the video pipeline so the user gets instant
|
| 89 |
-
first-frame preview (UNASSESSED), then track cards update once GPT
|
| 90 |
-
finishes (typically 2-5s later).
|
| 91 |
-
"""
|
| 92 |
-
if not enable_gpt or not detections:
|
| 93 |
-
return
|
| 94 |
-
try:
|
| 95 |
-
# Non-LLM_EXTRACTED relevance filter runs BEFORE run_enrichment (FAST_PATH case)
|
| 96 |
-
if mission_spec and mission_spec.parse_mode != "LLM_EXTRACTED":
|
| 97 |
-
for d in detections:
|
| 98 |
-
decision = evaluate_relevance(d, mission_spec.relevance_criteria)
|
| 99 |
-
d["mission_relevant"] = decision.relevant
|
| 100 |
-
d["relevance_reason"] = decision.reason
|
| 101 |
-
filtered = [d for d in detections if d.get("mission_relevant", True)]
|
| 102 |
-
if not filtered:
|
| 103 |
-
for det in detections:
|
| 104 |
-
det["assessment_status"] = AssessmentStatus.ASSESSED
|
| 105 |
-
get_job_storage().update(
|
| 106 |
-
job_id,
|
| 107 |
-
first_frame_detections=detections,
|
| 108 |
-
)
|
| 109 |
-
logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
|
| 110 |
-
return
|
| 111 |
-
|
| 112 |
-
gpt_results = await asyncio.to_thread(
|
| 113 |
-
run_enrichment, 0, frame, detections, mission_spec,
|
| 114 |
-
job_id=job_id,
|
| 115 |
-
)
|
| 116 |
-
logging.info("Background GPT enrichment complete for job %s", job_id)
|
| 117 |
-
|
| 118 |
-
if not gpt_results:
|
| 119 |
-
# All detections filtered as not relevant
|
| 120 |
-
for det in detections:
|
| 121 |
-
det["assessment_status"] = AssessmentStatus.ASSESSED
|
| 122 |
-
get_job_storage().update(
|
| 123 |
-
job_id,
|
| 124 |
-
first_frame_detections=detections,
|
| 125 |
-
)
|
| 126 |
-
logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
|
| 127 |
-
return
|
| 128 |
-
|
| 129 |
-
# Tag any remaining detections without an assessment status
|
| 130 |
-
for det in detections:
|
| 131 |
-
if "assessment_status" not in det:
|
| 132 |
-
det["assessment_status"] = AssessmentStatus.UNASSESSED
|
| 133 |
-
|
| 134 |
-
# Update stored job so frontend polls pick up GPT data
|
| 135 |
-
get_job_storage().update(
|
| 136 |
-
job_id,
|
| 137 |
-
first_frame_detections=detections,
|
| 138 |
-
first_frame_gpt_results=gpt_results,
|
| 139 |
-
)
|
| 140 |
-
logging.info("Updated first_frame_detections with GPT results for job %s", job_id)
|
| 141 |
-
|
| 142 |
-
except Exception:
|
| 143 |
-
logging.exception("Background GPT enrichment failed for job %s", job_id)
|
| 144 |
-
|
| 145 |
-
|
| 146 |
async def _periodic_cleanup() -> None:
|
| 147 |
while True:
|
| 148 |
await asyncio.sleep(600)
|
|
@@ -168,26 +85,6 @@ app.add_middleware(
|
|
| 168 |
)
|
| 169 |
|
| 170 |
|
| 171 |
-
from fastapi import Request
|
| 172 |
-
|
| 173 |
-
@app.middleware("http")
|
| 174 |
-
async def add_no_cache_header(request: Request, call_next):
|
| 175 |
-
"""Ensure frontend assets are not cached by the browser (important for HF Spaces updates)."""
|
| 176 |
-
response = await call_next(request)
|
| 177 |
-
# Apply to all static files and the root page
|
| 178 |
-
if request.url.path.startswith("/laser") or request.url.path == "/":
|
| 179 |
-
response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
|
| 180 |
-
response.headers["Pragma"] = "no-cache"
|
| 181 |
-
response.headers["Expires"] = "0"
|
| 182 |
-
return response
|
| 183 |
-
|
| 184 |
-
# Optional: serve the LaserPerception frontend from this backend.
|
| 185 |
-
# The frontend files are now located in the 'frontend' directory.
|
| 186 |
-
_FRONTEND_DIR = Path(__file__).with_name("frontend")
|
| 187 |
-
if _FRONTEND_DIR.exists():
|
| 188 |
-
# Mount the entire frontend directory at /laser (legacy path) or /frontend
|
| 189 |
-
app.mount("/laser", StaticFiles(directory=_FRONTEND_DIR, html=True), name="laser")
|
| 190 |
-
|
| 191 |
# Valid detection modes
|
| 192 |
VALID_MODES = {"object_detection", "segmentation", "drone_detection"}
|
| 193 |
|
|
@@ -228,7 +125,11 @@ def _schedule_cleanup(background_tasks: BackgroundTasks, path: str) -> None:
|
|
| 228 |
background_tasks.add_task(_cleanup)
|
| 229 |
|
| 230 |
|
| 231 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
if mode == "segmentation":
|
| 233 |
return ["object"]
|
| 234 |
if mode == "drone_detection":
|
|
@@ -236,11 +137,17 @@ def _default_queries_for_mode(mode: str) -> list[str]:
|
|
| 236 |
return ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
|
| 237 |
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
@app.get("/", response_class=HTMLResponse)
|
| 240 |
async def demo_page():
|
| 241 |
-
"""
|
| 242 |
-
|
| 243 |
-
|
|
|
|
| 244 |
|
| 245 |
|
| 246 |
@app.post("/detect")
|
|
@@ -252,10 +159,9 @@ async def detect_endpoint(
|
|
| 252 |
detector: str = Form("yolo11"),
|
| 253 |
segmenter: str = Form("GSAM2-L"),
|
| 254 |
enable_depth: bool = Form(False),
|
| 255 |
-
enable_gpt: bool = Form(True),
|
| 256 |
):
|
| 257 |
"""
|
| 258 |
-
Main detection endpoint.
|
| 259 |
|
| 260 |
Args:
|
| 261 |
video: Video file to process
|
|
@@ -263,8 +169,7 @@ async def detect_endpoint(
|
|
| 263 |
queries: Comma-separated object classes for object_detection mode
|
| 264 |
detector: Model to use (yolo11, detr_resnet50, grounding_dino)
|
| 265 |
segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
|
| 266 |
-
enable_depth: Whether to run
|
| 267 |
-
drone_detection uses the dedicated drone_yolo model.
|
| 268 |
|
| 269 |
Returns:
|
| 270 |
- For object_detection: Processed video with bounding boxes
|
|
@@ -293,10 +198,7 @@ async def detect_endpoint(
|
|
| 293 |
fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
|
| 294 |
os.close(fd)
|
| 295 |
|
| 296 |
-
|
| 297 |
-
query_list = [q.strip() for q in queries.split(",") if q.strip()]
|
| 298 |
-
if not query_list:
|
| 299 |
-
query_list = ["object"]
|
| 300 |
|
| 301 |
try:
|
| 302 |
output_path = run_grounded_sam2_tracking(
|
|
@@ -343,25 +245,11 @@ async def detect_endpoint(
|
|
| 343 |
fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
|
| 344 |
os.close(fd)
|
| 345 |
|
| 346 |
-
# Parse queries with mission awareness
|
| 347 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
| 348 |
-
mission_spec = None
|
| 349 |
-
|
| 350 |
-
if queries.strip():
|
| 351 |
-
try:
|
| 352 |
-
mission_spec = parse_mission_text(queries.strip(), detector_name, video_path=input_path)
|
| 353 |
-
query_list = build_broad_queries(detector_name, mission_spec)
|
| 354 |
-
except MissionParseError as e:
|
| 355 |
-
raise HTTPException(status_code=422, detail=str(e))
|
| 356 |
-
else:
|
| 357 |
-
query_list = _default_queries_for_mode(mode)
|
| 358 |
|
| 359 |
-
|
| 360 |
-
query_list = ["drone"]
|
| 361 |
|
| 362 |
-
# Run inference
|
| 363 |
try:
|
| 364 |
-
|
| 365 |
# Determine depth estimator
|
| 366 |
active_depth = "depth" if enable_depth else None
|
| 367 |
|
|
@@ -372,7 +260,6 @@ async def detect_endpoint(
|
|
| 372 |
detector_name=detector_name,
|
| 373 |
depth_estimator_name=active_depth,
|
| 374 |
depth_scale=25.0,
|
| 375 |
-
enable_gpt=enable_gpt,
|
| 376 |
)
|
| 377 |
except ValueError as exc:
|
| 378 |
logging.exception("Video processing failed.")
|
|
@@ -408,7 +295,6 @@ async def detect_async_endpoint(
|
|
| 408 |
depth_estimator: str = Form("depth"),
|
| 409 |
depth_scale: float = Form(25.0),
|
| 410 |
enable_depth: bool = Form(False),
|
| 411 |
-
enable_gpt: bool = Form(True),
|
| 412 |
step: int = Form(7),
|
| 413 |
):
|
| 414 |
_ttfs_t0 = time.perf_counter()
|
|
@@ -440,49 +326,13 @@ async def detect_async_endpoint(
|
|
| 440 |
|
| 441 |
logging.info("[TTFS:%s] +%.1fs upload_saved", job_id, time.perf_counter() - _ttfs_t0)
|
| 442 |
|
| 443 |
-
# --- Mission-Driven Query Parsing ---
|
| 444 |
-
mission_spec = None
|
| 445 |
-
mission_mode = "LEGACY"
|
| 446 |
-
|
| 447 |
detector_name = detector
|
| 448 |
-
mission_detector = detector # detector key used for mission query parsing
|
| 449 |
if mode == "drone_detection":
|
| 450 |
detector_name = "drone_yolo"
|
| 451 |
-
mission_detector = "drone_yolo"
|
| 452 |
elif mode == "segmentation":
|
| 453 |
-
# Segmenter registry owns detector selection (GSAM2→GDINO, YSAM2→YOLO).
|
| 454 |
-
# detector_name=None so the job doesn't forward it (avoids duplicate kwarg).
|
| 455 |
-
try:
|
| 456 |
-
mission_detector = get_segmenter_detector(segmenter)
|
| 457 |
-
except ValueError as exc:
|
| 458 |
-
raise HTTPException(status_code=400, detail=str(exc))
|
| 459 |
detector_name = None
|
| 460 |
|
| 461 |
-
|
| 462 |
-
try:
|
| 463 |
-
mission_spec = parse_mission_text(queries.strip(), mission_detector, video_path=str(input_path))
|
| 464 |
-
query_list = build_broad_queries(mission_detector, mission_spec)
|
| 465 |
-
mission_mode = "MISSION"
|
| 466 |
-
logging.info(
|
| 467 |
-
"Mission parsed: mode=%s classes=%s broad_queries=%s domain=%s(%s)",
|
| 468 |
-
mission_mode, mission_spec.object_classes, query_list,
|
| 469 |
-
mission_spec.domain, mission_spec.domain_source,
|
| 470 |
-
)
|
| 471 |
-
except MissionParseError as e:
|
| 472 |
-
raise HTTPException(
|
| 473 |
-
status_code=422,
|
| 474 |
-
detail=str(e),
|
| 475 |
-
)
|
| 476 |
-
else:
|
| 477 |
-
# LEGACY mode: no mission context, use defaults, disable GPT
|
| 478 |
-
query_list = _default_queries_for_mode(mode)
|
| 479 |
-
enable_gpt = False
|
| 480 |
-
mission_mode = "LEGACY"
|
| 481 |
-
logging.info(
|
| 482 |
-
"LEGACY mode: no mission text, defaults=%s, GPT disabled", query_list
|
| 483 |
-
)
|
| 484 |
-
|
| 485 |
-
logging.info("[TTFS:%s] +%.1fs mission_parsed", job_id, time.perf_counter() - _ttfs_t0)
|
| 486 |
|
| 487 |
available_depth_estimators = set(list_depth_estimators())
|
| 488 |
if depth_estimator not in available_depth_estimators:
|
|
@@ -508,8 +358,6 @@ async def detect_async_endpoint(
|
|
| 508 |
)
|
| 509 |
cv2.imwrite(str(first_frame_path), processed_frame)
|
| 510 |
logging.info("[TTFS:%s] +%.1fs process_first_frame done", job_id, time.perf_counter() - _ttfs_t0)
|
| 511 |
-
# GPT and depth are now handled in the async pipeline (enrichment thread)
|
| 512 |
-
first_frame_gpt_results = None
|
| 513 |
except Exception:
|
| 514 |
logging.exception("First-frame processing failed.")
|
| 515 |
shutil.rmtree(job_dir, ignore_errors=True)
|
|
@@ -530,26 +378,12 @@ async def detect_async_endpoint(
|
|
| 530 |
depth_scale=float(depth_scale),
|
| 531 |
depth_output_path=str(depth_output_path),
|
| 532 |
first_frame_depth_path=str(first_frame_depth_path),
|
| 533 |
-
enable_gpt=enable_gpt,
|
| 534 |
-
mission_spec=mission_spec,
|
| 535 |
-
mission_mode=mission_mode,
|
| 536 |
-
first_frame_gpt_results=first_frame_gpt_results,
|
| 537 |
step=step,
|
| 538 |
ttfs_t0=_ttfs_t0,
|
| 539 |
)
|
| 540 |
get_job_storage().create(job)
|
| 541 |
asyncio.create_task(process_video_async(job_id))
|
| 542 |
|
| 543 |
-
# Fire-and-forget: enrich first-frame detections with GPT in background.
|
| 544 |
-
# Runs for ALL modes including segmentation — first-frame detections from
|
| 545 |
-
# process_first_frame() already have stable track IDs (T01, T02, ...) and
|
| 546 |
-
# valid bboxes, so there's no reason to defer. The GSAM2 writer's
|
| 547 |
-
# enrichment thread will see the cached results via first_frame_gpt_results
|
| 548 |
-
# in JobStorage and skip the duplicate call on frame 0.
|
| 549 |
-
asyncio.create_task(_enrich_first_frame_gpt(
|
| 550 |
-
job_id, processed_frame, detections, enable_gpt, mission_spec,
|
| 551 |
-
))
|
| 552 |
-
|
| 553 |
response_data = {
|
| 554 |
"job_id": job_id,
|
| 555 |
"first_frame_url": f"/detect/first-frame/{job_id}",
|
|
@@ -560,21 +394,8 @@ async def detect_async_endpoint(
|
|
| 560 |
"stream_url": f"/detect/stream/{job_id}",
|
| 561 |
"status": job.status.value,
|
| 562 |
"first_frame_detections": detections,
|
| 563 |
-
"mission_mode": mission_mode,
|
| 564 |
}
|
| 565 |
|
| 566 |
-
if mission_spec:
|
| 567 |
-
response_data["mission_spec"] = {
|
| 568 |
-
"object_classes": mission_spec.object_classes,
|
| 569 |
-
"mission_intent": mission_spec.mission_intent,
|
| 570 |
-
"domain": mission_spec.domain,
|
| 571 |
-
"domain_source": mission_spec.domain_source,
|
| 572 |
-
"parse_confidence": mission_spec.parse_confidence,
|
| 573 |
-
"parse_warnings": mission_spec.parse_warnings,
|
| 574 |
-
"context_phrases": mission_spec.context_phrases,
|
| 575 |
-
"stripped_modifiers": mission_spec.stripped_modifiers,
|
| 576 |
-
}
|
| 577 |
-
|
| 578 |
return response_data
|
| 579 |
|
| 580 |
|
|
@@ -618,59 +439,6 @@ async def get_frame_tracks(job_id: str, frame_idx: int):
|
|
| 618 |
return data or []
|
| 619 |
|
| 620 |
|
| 621 |
-
@app.post("/detect/analyze-frame")
|
| 622 |
-
async def analyze_frame(
|
| 623 |
-
image: UploadFile = File(...),
|
| 624 |
-
detections: str = Form(...),
|
| 625 |
-
job_id: str = Form(None),
|
| 626 |
-
):
|
| 627 |
-
"""Run GPT threat assessment on a single video frame."""
|
| 628 |
-
import json as json_module
|
| 629 |
-
from utils.gpt_reasoning import encode_frame_to_b64
|
| 630 |
-
|
| 631 |
-
dets = json_module.loads(detections)
|
| 632 |
-
|
| 633 |
-
# Look up mission_spec from stored job (if available)
|
| 634 |
-
mission_spec = None
|
| 635 |
-
if job_id:
|
| 636 |
-
job = get_job_storage().get(job_id)
|
| 637 |
-
if job:
|
| 638 |
-
mission_spec = job.mission_spec
|
| 639 |
-
|
| 640 |
-
# Decode uploaded image
|
| 641 |
-
image_bytes = await image.read()
|
| 642 |
-
nparr = np.frombuffer(image_bytes, np.uint8)
|
| 643 |
-
frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
|
| 644 |
-
if frame is None:
|
| 645 |
-
raise HTTPException(status_code=400, detail="Invalid image")
|
| 646 |
-
|
| 647 |
-
# Run GPT in thread pool (blocking OpenAI API call)
|
| 648 |
-
frame_b64 = encode_frame_to_b64(frame)
|
| 649 |
-
async with _GPT_SEMAPHORE:
|
| 650 |
-
gpt_results = await asyncio.to_thread(
|
| 651 |
-
estimate_threat_gpt,
|
| 652 |
-
detections=dets,
|
| 653 |
-
mission_spec=mission_spec,
|
| 654 |
-
image_b64=frame_b64,
|
| 655 |
-
)
|
| 656 |
-
|
| 657 |
-
# Merge GPT results into detection records
|
| 658 |
-
for d in dets:
|
| 659 |
-
oid = d.get("track_id") or d.get("id")
|
| 660 |
-
if oid and oid in gpt_results:
|
| 661 |
-
payload = gpt_results[oid]
|
| 662 |
-
d["gpt_raw"] = payload
|
| 663 |
-
d["assessment_status"] = payload.get("assessment_status", "ASSESSED")
|
| 664 |
-
d["threat_level_score"] = payload.get("threat_level_score", 0)
|
| 665 |
-
d["threat_classification"] = payload.get("threat_classification", "Unknown")
|
| 666 |
-
d["weapon_readiness"] = payload.get("weapon_readiness", "Unknown")
|
| 667 |
-
d["gpt_description"] = payload.get("gpt_description")
|
| 668 |
-
d["gpt_distance_m"] = payload.get("gpt_distance_m")
|
| 669 |
-
d["gpt_direction"] = payload.get("gpt_direction")
|
| 670 |
-
|
| 671 |
-
return dets
|
| 672 |
-
|
| 673 |
-
|
| 674 |
@app.delete("/detect/job/{job_id}")
|
| 675 |
async def cancel_job(job_id: str):
|
| 676 |
"""Cancel a running job."""
|
|
@@ -856,93 +624,6 @@ async def stream_video(job_id: str):
|
|
| 856 |
)
|
| 857 |
|
| 858 |
|
| 859 |
-
@app.post("/reason/track")
|
| 860 |
-
async def reason_track(
|
| 861 |
-
frame: UploadFile = File(...),
|
| 862 |
-
tracks: str = Form(...) # JSON string of tracks: [{"id": "T01", "bbox": [x,y,w,h], "label": "car"}, ...]
|
| 863 |
-
):
|
| 864 |
-
"""
|
| 865 |
-
Reason about specific tracks in a frame using GPT.
|
| 866 |
-
Returns distance and description for each object ID.
|
| 867 |
-
"""
|
| 868 |
-
import json
|
| 869 |
-
try:
|
| 870 |
-
input_path = _save_upload_to_tmp(frame)
|
| 871 |
-
except Exception:
|
| 872 |
-
raise HTTPException(status_code=500, detail="Failed to save uploaded frame")
|
| 873 |
-
|
| 874 |
-
try:
|
| 875 |
-
track_list = json.loads(tracks)
|
| 876 |
-
except json.JSONDecodeError:
|
| 877 |
-
_safe_delete(input_path)
|
| 878 |
-
raise HTTPException(status_code=400, detail="Invalid tracks JSON")
|
| 879 |
-
|
| 880 |
-
# Run GPT estimation
|
| 881 |
-
# This is blocking, but that's expected for this endpoint structure.
|
| 882 |
-
# For high concurrency, might want to offload to threadpool or async wrapper.
|
| 883 |
-
try:
|
| 884 |
-
async with _GPT_SEMAPHORE:
|
| 885 |
-
results = await asyncio.to_thread(estimate_threat_gpt, input_path, track_list)
|
| 886 |
-
logging.info(f"GPT Output for Video Track Update:\n{results}")
|
| 887 |
-
except Exception as e:
|
| 888 |
-
logging.exception("GPT reasoning failed")
|
| 889 |
-
_safe_delete(input_path)
|
| 890 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 891 |
-
|
| 892 |
-
_safe_delete(input_path)
|
| 893 |
-
return results
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
@app.post("/chat/threat")
|
| 897 |
-
async def chat_threat_endpoint(
|
| 898 |
-
question: str = Form(...),
|
| 899 |
-
detections: str = Form(...), # JSON string of current detections
|
| 900 |
-
mission_context: str = Form(""), # Optional JSON string of mission spec
|
| 901 |
-
):
|
| 902 |
-
"""
|
| 903 |
-
Chat about detected threats using GPT.
|
| 904 |
-
|
| 905 |
-
Args:
|
| 906 |
-
question: User's question about the current threat situation.
|
| 907 |
-
detections: JSON string of detection list with threat analysis data.
|
| 908 |
-
mission_context: Optional JSON string of mission specification.
|
| 909 |
-
|
| 910 |
-
Returns:
|
| 911 |
-
GPT response about the threats.
|
| 912 |
-
"""
|
| 913 |
-
import json as json_module
|
| 914 |
-
|
| 915 |
-
if not question.strip():
|
| 916 |
-
raise HTTPException(status_code=400, detail="Question cannot be empty.")
|
| 917 |
-
|
| 918 |
-
try:
|
| 919 |
-
detection_list = json_module.loads(detections)
|
| 920 |
-
except json_module.JSONDecodeError:
|
| 921 |
-
raise HTTPException(status_code=400, detail="Invalid detections JSON.")
|
| 922 |
-
|
| 923 |
-
if not isinstance(detection_list, list):
|
| 924 |
-
raise HTTPException(status_code=400, detail="Detections must be a list.")
|
| 925 |
-
|
| 926 |
-
# Parse optional mission context
|
| 927 |
-
mission_spec_dict = None
|
| 928 |
-
if mission_context.strip():
|
| 929 |
-
try:
|
| 930 |
-
mission_spec_dict = json_module.loads(mission_context)
|
| 931 |
-
except json_module.JSONDecodeError:
|
| 932 |
-
pass # Non-critical, proceed without mission context
|
| 933 |
-
|
| 934 |
-
# Run chat in thread to avoid blocking (with concurrency limit)
|
| 935 |
-
try:
|
| 936 |
-
async with _GPT_SEMAPHORE:
|
| 937 |
-
response = await asyncio.to_thread(
|
| 938 |
-
chat_about_threats, question, detection_list, mission_spec_dict
|
| 939 |
-
)
|
| 940 |
-
return {"response": response}
|
| 941 |
-
except Exception as e:
|
| 942 |
-
logging.exception("Threat chat failed")
|
| 943 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 944 |
-
|
| 945 |
-
|
| 946 |
@app.post("/benchmark")
|
| 947 |
async def benchmark_endpoint(
|
| 948 |
video: UploadFile = File(...),
|
|
@@ -990,7 +671,6 @@ async def benchmark_endpoint(
|
|
| 990 |
query_list,
|
| 991 |
segmenter_name=segmenter,
|
| 992 |
step=step,
|
| 993 |
-
enable_gpt=False,
|
| 994 |
_perf_metrics=metrics,
|
| 995 |
_perf_lock=lock,
|
| 996 |
num_maskmem=num_maskmem,
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
|
| 4 |
# Fix: Set Hugging Face cache to writable location
|
|
|
|
| 36 |
import numpy as np
|
| 37 |
from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
|
| 38 |
from fastapi.middleware.cors import CORSMiddleware
|
| 39 |
+
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
|
|
|
|
| 40 |
import uvicorn
|
| 41 |
|
| 42 |
from inference import process_first_frame, run_inference, run_grounded_sam2_tracking
|
|
|
|
| 53 |
get_job_storage,
|
| 54 |
get_output_video_path,
|
| 55 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
logging.basicConfig(level=logging.INFO)
|
| 57 |
|
| 58 |
# Suppress noisy external libraries
|
|
|
|
| 60 |
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
|
| 61 |
logging.getLogger("transformers").setLevel(logging.WARNING)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
async def _periodic_cleanup() -> None:
|
| 64 |
while True:
|
| 65 |
await asyncio.sleep(600)
|
|
|
|
| 85 |
)
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# Valid detection modes
|
| 89 |
VALID_MODES = {"object_detection", "segmentation", "drone_detection"}
|
| 90 |
|
|
|
|
| 125 |
background_tasks.add_task(_cleanup)
|
| 126 |
|
| 127 |
|
| 128 |
+
def _parse_queries(raw: str, mode: str) -> list[str]:
|
| 129 |
+
"""Parse comma-separated query string, falling back to mode defaults."""
|
| 130 |
+
parsed = [q.strip() for q in raw.split(",") if q.strip()]
|
| 131 |
+
if parsed:
|
| 132 |
+
return parsed
|
| 133 |
if mode == "segmentation":
|
| 134 |
return ["object"]
|
| 135 |
if mode == "drone_detection":
|
|
|
|
| 137 |
return ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
|
| 138 |
|
| 139 |
|
| 140 |
+
# Cache index.html at module load
|
| 141 |
+
_INDEX_HTML_PATH = Path(__file__).with_name("index.html")
|
| 142 |
+
_INDEX_HTML = _INDEX_HTML_PATH.read_text() if _INDEX_HTML_PATH.exists() else None
|
| 143 |
+
|
| 144 |
+
|
| 145 |
@app.get("/", response_class=HTMLResponse)
|
| 146 |
async def demo_page():
|
| 147 |
+
"""Serve minimal detection UI."""
|
| 148 |
+
if _INDEX_HTML:
|
| 149 |
+
return HTMLResponse(_INDEX_HTML)
|
| 150 |
+
return HTMLResponse("<h1>Detection Base</h1><p>index.html not found</p>")
|
| 151 |
|
| 152 |
|
| 153 |
@app.post("/detect")
|
|
|
|
| 159 |
detector: str = Form("yolo11"),
|
| 160 |
segmenter: str = Form("GSAM2-L"),
|
| 161 |
enable_depth: bool = Form(False),
|
|
|
|
| 162 |
):
|
| 163 |
"""
|
| 164 |
+
Main detection endpoint (synchronous).
|
| 165 |
|
| 166 |
Args:
|
| 167 |
video: Video file to process
|
|
|
|
| 169 |
queries: Comma-separated object classes for object_detection mode
|
| 170 |
detector: Model to use (yolo11, detr_resnet50, grounding_dino)
|
| 171 |
segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
|
| 172 |
+
enable_depth: Whether to run depth estimation (default: False)
|
|
|
|
| 173 |
|
| 174 |
Returns:
|
| 175 |
- For object_detection: Processed video with bounding boxes
|
|
|
|
| 198 |
fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
|
| 199 |
os.close(fd)
|
| 200 |
|
| 201 |
+
query_list = _parse_queries(queries, mode)
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
try:
|
| 204 |
output_path = run_grounded_sam2_tracking(
|
|
|
|
| 245 |
fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
|
| 246 |
os.close(fd)
|
| 247 |
|
|
|
|
| 248 |
detector_name = "drone_yolo" if mode == "drone_detection" else detector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
+
query_list = _parse_queries(queries, mode)
|
|
|
|
| 251 |
|
|
|
|
| 252 |
try:
|
|
|
|
| 253 |
# Determine depth estimator
|
| 254 |
active_depth = "depth" if enable_depth else None
|
| 255 |
|
|
|
|
| 260 |
detector_name=detector_name,
|
| 261 |
depth_estimator_name=active_depth,
|
| 262 |
depth_scale=25.0,
|
|
|
|
| 263 |
)
|
| 264 |
except ValueError as exc:
|
| 265 |
logging.exception("Video processing failed.")
|
|
|
|
| 295 |
depth_estimator: str = Form("depth"),
|
| 296 |
depth_scale: float = Form(25.0),
|
| 297 |
enable_depth: bool = Form(False),
|
|
|
|
| 298 |
step: int = Form(7),
|
| 299 |
):
|
| 300 |
_ttfs_t0 = time.perf_counter()
|
|
|
|
| 326 |
|
| 327 |
logging.info("[TTFS:%s] +%.1fs upload_saved", job_id, time.perf_counter() - _ttfs_t0)
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
detector_name = detector
|
|
|
|
| 330 |
if mode == "drone_detection":
|
| 331 |
detector_name = "drone_yolo"
|
|
|
|
| 332 |
elif mode == "segmentation":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
detector_name = None
|
| 334 |
|
| 335 |
+
query_list = _parse_queries(queries, mode)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
available_depth_estimators = set(list_depth_estimators())
|
| 338 |
if depth_estimator not in available_depth_estimators:
|
|
|
|
| 358 |
)
|
| 359 |
cv2.imwrite(str(first_frame_path), processed_frame)
|
| 360 |
logging.info("[TTFS:%s] +%.1fs process_first_frame done", job_id, time.perf_counter() - _ttfs_t0)
|
|
|
|
|
|
|
| 361 |
except Exception:
|
| 362 |
logging.exception("First-frame processing failed.")
|
| 363 |
shutil.rmtree(job_dir, ignore_errors=True)
|
|
|
|
| 378 |
depth_scale=float(depth_scale),
|
| 379 |
depth_output_path=str(depth_output_path),
|
| 380 |
first_frame_depth_path=str(first_frame_depth_path),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
step=step,
|
| 382 |
ttfs_t0=_ttfs_t0,
|
| 383 |
)
|
| 384 |
get_job_storage().create(job)
|
| 385 |
asyncio.create_task(process_video_async(job_id))
|
| 386 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
response_data = {
|
| 388 |
"job_id": job_id,
|
| 389 |
"first_frame_url": f"/detect/first-frame/{job_id}",
|
|
|
|
| 394 |
"stream_url": f"/detect/stream/{job_id}",
|
| 395 |
"status": job.status.value,
|
| 396 |
"first_frame_detections": detections,
|
|
|
|
| 397 |
}
|
| 398 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
return response_data
|
| 400 |
|
| 401 |
|
|
|
|
| 439 |
return data or []
|
| 440 |
|
| 441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
@app.delete("/detect/job/{job_id}")
|
| 443 |
async def cancel_job(job_id: str):
|
| 444 |
"""Cancel a running job."""
|
|
|
|
| 624 |
)
|
| 625 |
|
| 626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 627 |
@app.post("/benchmark")
|
| 628 |
async def benchmark_endpoint(
|
| 629 |
video: UploadFile = File(...),
|
|
|
|
| 671 |
query_list,
|
| 672 |
segmenter_name=segmenter,
|
| 673 |
step=step,
|
|
|
|
| 674 |
_perf_metrics=metrics,
|
| 675 |
_perf_lock=lock,
|
| 676 |
num_maskmem=num_maskmem,
|
inference.py
CHANGED
|
@@ -21,12 +21,8 @@ from models.model_loader import load_detector, load_detector_on_device
|
|
| 21 |
from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
|
| 22 |
from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
|
| 23 |
from utils.video import StreamingVideoWriter
|
| 24 |
-
from utils.relevance import evaluate_relevance
|
| 25 |
-
from utils.enrichment import run_enrichment
|
| 26 |
-
from utils.schemas import AssessmentStatus
|
| 27 |
from jobs.storage import set_track_data
|
| 28 |
import tempfile
|
| 29 |
-
import json as json_module
|
| 30 |
|
| 31 |
|
| 32 |
class AsyncVideoReader:
|
|
@@ -301,9 +297,7 @@ class SpeedEstimator:
|
|
| 301 |
|
| 302 |
dist_px = np.sqrt((cx1-cx2)**2 + (cy1-cy2)**2)
|
| 303 |
|
| 304 |
-
# Heuristic scale:
|
| 305 |
-
# If we had GPT distance, we could calibrate.
|
| 306 |
-
# For now, let's use a dummy scale: 50px = 1m (very rough)
|
| 307 |
# Speed = (dist_px / 50) meters / (5 frames / 30 fps) seconds
|
| 308 |
# = (dist_px / 50) / (0.166) m/s
|
| 309 |
# = (dist_px * 0.12) m/s
|
|
@@ -403,7 +397,7 @@ def _attach_depth_metrics(
|
|
| 403 |
depth_scale: float, # No longer used for distance calculation
|
| 404 |
estimator_instance: Optional[Any] = None,
|
| 405 |
) -> None:
|
| 406 |
-
"""Attach relative depth values
|
| 407 |
if not detections or (not depth_estimator_name and not estimator_instance):
|
| 408 |
return
|
| 409 |
|
|
@@ -514,16 +508,7 @@ def infer_frame(
|
|
| 514 |
except Exception:
|
| 515 |
logging.exception("Depth estimation failed for frame")
|
| 516 |
|
| 517 |
-
|
| 518 |
-
display_labels = []
|
| 519 |
-
for i, det in enumerate(detections):
|
| 520 |
-
label = det["label"]
|
| 521 |
-
if det.get("gpt_distance_m") is not None:
|
| 522 |
-
# Add GPT distance to label, e.g. "car 12m"
|
| 523 |
-
depth_str = f"{int(det['gpt_distance_m'])}m"
|
| 524 |
-
label = f"{label} {depth_str}"
|
| 525 |
-
logging.debug("Object '%s' at %s (bbox: %s)", label, depth_str, det['bbox'])
|
| 526 |
-
display_labels.append(label)
|
| 527 |
|
| 528 |
except Exception:
|
| 529 |
logging.exception("Inference failed for queries %s", text_queries)
|
|
@@ -537,15 +522,8 @@ def infer_frame(
|
|
| 537 |
), detections
|
| 538 |
|
| 539 |
|
| 540 |
-
def _build_display_label(det):
|
| 541 |
-
"""Build display label with GPT distance if available."""
|
| 542 |
-
label = det["label"]
|
| 543 |
-
if det.get("gpt_distance_m") is not None:
|
| 544 |
-
label = f"{label} {int(det['gpt_distance_m'])}m"
|
| 545 |
-
return label
|
| 546 |
-
|
| 547 |
def _attach_depth_from_result(detections, depth_result, depth_scale):
|
| 548 |
-
"""Attach relative depth values
|
| 549 |
depth_map = depth_result.depth_map
|
| 550 |
if depth_map is None or depth_map.size == 0: return
|
| 551 |
|
|
@@ -644,11 +622,8 @@ def process_first_frame(
|
|
| 644 |
) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
|
| 645 |
"""Lightweight first-frame processing: detection + rendering only.
|
| 646 |
|
| 647 |
-
GPT, depth, and LLM relevance are handled later in the async pipeline
|
| 648 |
-
(writer enrichment thread), avoiding 2-8s synchronous startup delay.
|
| 649 |
-
|
| 650 |
Returns:
|
| 651 |
-
(processed_frame, detections)
|
| 652 |
"""
|
| 653 |
frame, _, _, _ = extract_first_frame(video_path)
|
| 654 |
if mode == "segmentation":
|
|
@@ -665,7 +640,6 @@ def process_first_frame(
|
|
| 665 |
"bbox": [int(c) for c in box],
|
| 666 |
"score": float(seg_result.scores[idx]) if seg_result.scores is not None and idx < len(seg_result.scores) else 1.0,
|
| 667 |
"track_id": f"T{idx + 1:02d}",
|
| 668 |
-
"assessment_status": AssessmentStatus.UNASSESSED,
|
| 669 |
})
|
| 670 |
return processed, detections
|
| 671 |
|
|
@@ -673,10 +647,6 @@ def process_first_frame(
|
|
| 673 |
frame, queries, detector_name=detector_name
|
| 674 |
)
|
| 675 |
|
| 676 |
-
# Tag all detections as unassessed — GPT runs later in enrichment thread
|
| 677 |
-
for det in detections:
|
| 678 |
-
det["assessment_status"] = AssessmentStatus.UNASSESSED
|
| 679 |
-
|
| 680 |
return processed, detections
|
| 681 |
|
| 682 |
|
|
@@ -689,10 +659,7 @@ def run_inference(
|
|
| 689 |
job_id: Optional[str] = None,
|
| 690 |
depth_estimator_name: Optional[str] = None,
|
| 691 |
depth_scale: float = 1.0,
|
| 692 |
-
enable_gpt: bool = True,
|
| 693 |
stream_queue: Optional[Queue] = None,
|
| 694 |
-
mission_spec=None, # Optional[MissionSpecification]
|
| 695 |
-
first_frame_gpt_results: Optional[Dict[str, Any]] = None,
|
| 696 |
first_frame_detections: Optional[List[Dict[str, Any]]] = None,
|
| 697 |
) -> Tuple[str, List[List[Dict[str, Any]]]]:
|
| 698 |
|
|
@@ -769,8 +736,7 @@ def run_inference(
|
|
| 769 |
# queue_in: (frame_idx, frame_data)
|
| 770 |
# queue_out: (frame_idx, processed_frame, detections)
|
| 771 |
queue_in = Queue(maxsize=16)
|
| 772 |
-
#
|
| 773 |
-
# GPT Latency Buffer: GPT takes ~3s. At 30fps, that's 90 frames. We need to absorb this burst.
|
| 774 |
queue_out_max = max(128, (len(detectors) if detectors else 1) * 32)
|
| 775 |
queue_out = Queue(maxsize=queue_out_max)
|
| 776 |
|
|
@@ -948,32 +914,6 @@ def run_inference(
|
|
| 948 |
# writer_finished = False
|
| 949 |
|
| 950 |
|
| 951 |
-
# --- GPT Enrichment Thread (non-blocking) ---
|
| 952 |
-
# Runs LLM relevance + GPT threat assessment off the writer's critical path.
|
| 953 |
-
gpt_enrichment_queue = Queue(maxsize=4)
|
| 954 |
-
_relevance_refined = Event()
|
| 955 |
-
|
| 956 |
-
def enrichment_thread_fn(tracker_ref):
|
| 957 |
-
"""Dedicated thread for GPT/LLM calls. Receives work from writer, injects results via tracker."""
|
| 958 |
-
while True:
|
| 959 |
-
item = gpt_enrichment_queue.get()
|
| 960 |
-
if item is None:
|
| 961 |
-
break # Sentinel — shutdown
|
| 962 |
-
frame_idx, frame_data, gpt_dets, ms = item
|
| 963 |
-
try:
|
| 964 |
-
gpt_res = run_enrichment(
|
| 965 |
-
frame_idx, frame_data, gpt_dets, ms,
|
| 966 |
-
first_frame_gpt_results=first_frame_gpt_results,
|
| 967 |
-
job_id=job_id,
|
| 968 |
-
relevance_refined_event=_relevance_refined,
|
| 969 |
-
)
|
| 970 |
-
if gpt_res:
|
| 971 |
-
tracker_ref.inject_metadata(gpt_dets)
|
| 972 |
-
logging.info("Enrichment: GPT results injected into tracker for frame %d", frame_idx)
|
| 973 |
-
|
| 974 |
-
except Exception as e:
|
| 975 |
-
logging.error("Enrichment thread failed for frame %d: %s", frame_idx, e)
|
| 976 |
-
|
| 977 |
def writer_loop():
|
| 978 |
nonlocal writer_finished
|
| 979 |
next_idx = 0
|
|
@@ -982,11 +922,6 @@ def run_inference(
|
|
| 982 |
# Initialize Tracker & Speed Estimator
|
| 983 |
tracker = ByteTracker(frame_rate=fps)
|
| 984 |
speed_est = SpeedEstimator(fps=fps)
|
| 985 |
-
gpt_submitted = False # GPT enrichment submitted once for frame 0
|
| 986 |
-
|
| 987 |
-
# Start enrichment thread
|
| 988 |
-
enrich_thread = Thread(target=enrichment_thread_fn, args=(tracker,), daemon=True)
|
| 989 |
-
enrich_thread.start()
|
| 990 |
|
| 991 |
try:
|
| 992 |
with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
|
|
@@ -1016,67 +951,11 @@ def run_inference(
|
|
| 1016 |
next_idx, pre_track_count, len(dets))
|
| 1017 |
speed_est.estimate(dets)
|
| 1018 |
|
| 1019 |
-
# --- RELEVANCE GATE (deterministic, fast — stays in writer) ---
|
| 1020 |
-
if mission_spec:
|
| 1021 |
-
if (mission_spec.parse_mode == "LLM_EXTRACTED"
|
| 1022 |
-
and not _relevance_refined.is_set()):
|
| 1023 |
-
# LLM post-filter hasn't run yet — pass all through
|
| 1024 |
-
for d in dets:
|
| 1025 |
-
d["mission_relevant"] = True
|
| 1026 |
-
d["relevance_reason"] = "pending_llm_postfilter"
|
| 1027 |
-
gpt_dets = dets
|
| 1028 |
-
else:
|
| 1029 |
-
# Normal deterministic gate (with refined or FAST_PATH classes)
|
| 1030 |
-
for d in dets:
|
| 1031 |
-
decision = evaluate_relevance(d, mission_spec.relevance_criteria)
|
| 1032 |
-
d["mission_relevant"] = decision.relevant
|
| 1033 |
-
d["relevance_reason"] = decision.reason
|
| 1034 |
-
if not decision.relevant:
|
| 1035 |
-
logging.info(
|
| 1036 |
-
json_module.dumps({
|
| 1037 |
-
"event": "relevance_decision",
|
| 1038 |
-
"track_id": d.get("track_id"),
|
| 1039 |
-
"label": d.get("label"),
|
| 1040 |
-
"relevant": False,
|
| 1041 |
-
"reason": decision.reason,
|
| 1042 |
-
"required_classes": mission_spec.relevance_criteria.required_classes,
|
| 1043 |
-
"frame": next_idx,
|
| 1044 |
-
})
|
| 1045 |
-
)
|
| 1046 |
-
gpt_dets = [d for d in dets if d.get("mission_relevant", True)]
|
| 1047 |
-
else:
|
| 1048 |
-
for d in dets:
|
| 1049 |
-
d["mission_relevant"] = None
|
| 1050 |
-
gpt_dets = dets
|
| 1051 |
-
|
| 1052 |
-
# --- GPT ENRICHMENT (non-blocking, offloaded to enrichment thread) ---
|
| 1053 |
-
if enable_gpt and gpt_dets and not gpt_submitted:
|
| 1054 |
-
# Tag as pending — enrichment thread will update to ASSESSED later
|
| 1055 |
-
for d in gpt_dets:
|
| 1056 |
-
d["assessment_status"] = AssessmentStatus.PENDING_GPT
|
| 1057 |
-
try:
|
| 1058 |
-
gpt_enrichment_queue.put(
|
| 1059 |
-
(next_idx, p_frame.copy(), gpt_dets, mission_spec),
|
| 1060 |
-
timeout=1.0,
|
| 1061 |
-
)
|
| 1062 |
-
gpt_submitted = True
|
| 1063 |
-
logging.info("Writer: offloaded GPT enrichment for frame %d", next_idx)
|
| 1064 |
-
except Full:
|
| 1065 |
-
logging.warning("GPT enrichment queue full, skipping frame 0 GPT")
|
| 1066 |
-
|
| 1067 |
-
# Tag unassessed detections (INV-6)
|
| 1068 |
-
for d in dets:
|
| 1069 |
-
if "assessment_status" not in d:
|
| 1070 |
-
d["assessment_status"] = AssessmentStatus.UNASSESSED
|
| 1071 |
-
|
| 1072 |
# --- RENDER BOXES & OVERLAYS ---
|
| 1073 |
if dets:
|
| 1074 |
display_boxes = np.array([d['bbox'] for d in dets])
|
| 1075 |
display_labels = []
|
| 1076 |
for d in dets:
|
| 1077 |
-
if d.get("mission_relevant") is False:
|
| 1078 |
-
display_labels.append("")
|
| 1079 |
-
continue
|
| 1080 |
lbl = d.get('label', 'obj')
|
| 1081 |
display_labels.append(lbl)
|
| 1082 |
|
|
@@ -1131,12 +1010,6 @@ def run_inference(
|
|
| 1131 |
logging.exception("Writer loop failed")
|
| 1132 |
finally:
|
| 1133 |
logging.info("Writer loop finished. Wrote %d frames (target %d)", next_idx, total_frames)
|
| 1134 |
-
# Shut down enrichment thread
|
| 1135 |
-
try:
|
| 1136 |
-
gpt_enrichment_queue.put(None, timeout=5.0)
|
| 1137 |
-
enrich_thread.join(timeout=30)
|
| 1138 |
-
except Exception:
|
| 1139 |
-
logging.warning("Enrichment thread shutdown timed out")
|
| 1140 |
writer_finished = True
|
| 1141 |
|
| 1142 |
writer_thread = Thread(target=writer_loop, daemon=True)
|
|
@@ -1213,8 +1086,7 @@ def _gsam2_render_frame(
|
|
| 1213 |
) -> np.ndarray:
|
| 1214 |
"""Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
|
| 1215 |
|
| 1216 |
-
When *masks_only* is True, skip box rendering
|
| 1217 |
-
draw boxes later with enriched (GPT) labels.
|
| 1218 |
"""
|
| 1219 |
if frame_store is not None:
|
| 1220 |
frame = frame_store.get_bgr(frame_idx).copy() # .copy() — render mutates
|
|
@@ -1274,9 +1146,6 @@ def run_grounded_sam2_tracking(
|
|
| 1274 |
job_id: Optional[str] = None,
|
| 1275 |
stream_queue: Optional[Queue] = None,
|
| 1276 |
step: int = 20,
|
| 1277 |
-
enable_gpt: bool = False,
|
| 1278 |
-
mission_spec=None, # Optional[MissionSpecification]
|
| 1279 |
-
first_frame_gpt_results: Optional[Dict[str, Any]] = None,
|
| 1280 |
_perf_metrics: Optional[Dict[str, float]] = None,
|
| 1281 |
_perf_lock=None,
|
| 1282 |
num_maskmem: Optional[int] = None,
|
|
@@ -1376,7 +1245,6 @@ def run_grounded_sam2_tracking(
|
|
| 1376 |
frm = _gsam2_render_frame(
|
| 1377 |
frame_dir, frame_names, fidx, fobjs,
|
| 1378 |
height, width,
|
| 1379 |
-
masks_only=enable_gpt,
|
| 1380 |
frame_store=frame_store,
|
| 1381 |
)
|
| 1382 |
|
|
@@ -1387,7 +1255,7 @@ def run_grounded_sam2_tracking(
|
|
| 1387 |
else:
|
| 1388 |
_perf_metrics["render_total_ms"] += _r_ms
|
| 1389 |
|
| 1390 |
-
payload = (fidx, frm,
|
| 1391 |
while True:
|
| 1392 |
try:
|
| 1393 |
render_out.put(payload, timeout=1.0)
|
|
@@ -1410,92 +1278,6 @@ def run_grounded_sam2_tracking(
|
|
| 1410 |
for t in r_workers:
|
| 1411 |
t.start()
|
| 1412 |
|
| 1413 |
-
# --- ObjectInfo → detection dict adapter ---
|
| 1414 |
-
def _objectinfo_to_dets(frame_objects_dict):
|
| 1415 |
-
dets = []
|
| 1416 |
-
for obj_id, info in frame_objects_dict.items():
|
| 1417 |
-
dets.append({
|
| 1418 |
-
"label": info.class_name,
|
| 1419 |
-
"bbox": [info.x1, info.y1, info.x2, info.y2],
|
| 1420 |
-
"score": 1.0,
|
| 1421 |
-
"track_id": f"T{obj_id:02d}",
|
| 1422 |
-
"instance_id": obj_id,
|
| 1423 |
-
})
|
| 1424 |
-
return dets
|
| 1425 |
-
|
| 1426 |
-
# --- GPT enrichment thread (when enabled) ---
|
| 1427 |
-
gpt_enrichment_queue: Queue = Queue(maxsize=4)
|
| 1428 |
-
gpt_data_by_track: Dict[str, Dict] = {}
|
| 1429 |
-
gpt_data_lock = RLock()
|
| 1430 |
-
_relevance_refined = Event()
|
| 1431 |
-
|
| 1432 |
-
def _gsam2_enrichment_thread_fn():
|
| 1433 |
-
while True:
|
| 1434 |
-
item = gpt_enrichment_queue.get()
|
| 1435 |
-
if item is None:
|
| 1436 |
-
break
|
| 1437 |
-
frame_idx, frame_data, gpt_dets, ms = item
|
| 1438 |
-
try:
|
| 1439 |
-
gpt_res = run_enrichment(
|
| 1440 |
-
frame_idx, frame_data, gpt_dets, ms,
|
| 1441 |
-
first_frame_gpt_results=first_frame_gpt_results,
|
| 1442 |
-
job_id=job_id,
|
| 1443 |
-
relevance_refined_event=_relevance_refined,
|
| 1444 |
-
)
|
| 1445 |
-
|
| 1446 |
-
# GSAM2-specific: store results in per-track dict and persist to job storage
|
| 1447 |
-
if gpt_res:
|
| 1448 |
-
for d in gpt_dets:
|
| 1449 |
-
tid = d.get("track_id")
|
| 1450 |
-
if tid and tid in gpt_res:
|
| 1451 |
-
merged = dict(gpt_res[tid])
|
| 1452 |
-
merged["gpt_raw"] = gpt_res[tid]
|
| 1453 |
-
merged["assessment_frame_index"] = frame_idx
|
| 1454 |
-
merged["assessment_status"] = merged.get(
|
| 1455 |
-
"assessment_status", AssessmentStatus.ASSESSED
|
| 1456 |
-
)
|
| 1457 |
-
with gpt_data_lock:
|
| 1458 |
-
gpt_data_by_track[tid] = merged
|
| 1459 |
-
logging.info("GSAM2 enrichment: GPT results stored for %d tracks", len(gpt_data_by_track))
|
| 1460 |
-
|
| 1461 |
-
# Persist GPT-enriched detections to job storage so
|
| 1462 |
-
# frontend polling (/detect/status) picks them up.
|
| 1463 |
-
if job_id:
|
| 1464 |
-
try:
|
| 1465 |
-
from jobs.storage import get_job_storage as _gjs
|
| 1466 |
-
_st = _gjs().get(job_id)
|
| 1467 |
-
if _st and _st.first_frame_detections:
|
| 1468 |
-
for det in _st.first_frame_detections:
|
| 1469 |
-
tid = det.get("track_id")
|
| 1470 |
-
with gpt_data_lock:
|
| 1471 |
-
payload = gpt_data_by_track.get(tid)
|
| 1472 |
-
if payload:
|
| 1473 |
-
det.update(payload)
|
| 1474 |
-
# Also sync relevance from gpt_dets
|
| 1475 |
-
src = next((d for d in gpt_dets if d.get("track_id") == tid), None)
|
| 1476 |
-
if src:
|
| 1477 |
-
if "mission_relevant" in src:
|
| 1478 |
-
det["mission_relevant"] = src["mission_relevant"]
|
| 1479 |
-
if "relevance_reason" in src:
|
| 1480 |
-
det["relevance_reason"] = src["relevance_reason"]
|
| 1481 |
-
from jobs.storage import get_job_storage as _gjs2
|
| 1482 |
-
_gjs2().update(
|
| 1483 |
-
job_id,
|
| 1484 |
-
first_frame_detections=_st.first_frame_detections,
|
| 1485 |
-
first_frame_gpt_results=gpt_res,
|
| 1486 |
-
)
|
| 1487 |
-
logging.info(
|
| 1488 |
-
"GSAM2 enrichment: updated first_frame_detections in job storage for %s",
|
| 1489 |
-
job_id,
|
| 1490 |
-
)
|
| 1491 |
-
except Exception:
|
| 1492 |
-
logging.exception(
|
| 1493 |
-
"GSAM2 enrichment: failed to update job storage for %s", job_id
|
| 1494 |
-
)
|
| 1495 |
-
|
| 1496 |
-
except Exception as e:
|
| 1497 |
-
logging.error("GSAM2 enrichment thread failed for frame %d: %s", frame_idx, e)
|
| 1498 |
-
|
| 1499 |
# Shared streaming state (publisher ↔ writer)
|
| 1500 |
_stream_deque: collections.deque = collections.deque(maxlen=200)
|
| 1501 |
_stream_lock = RLock()
|
|
@@ -1508,15 +1290,6 @@ def run_grounded_sam2_tracking(
|
|
| 1508 |
buf: Dict[int, Tuple] = {}
|
| 1509 |
|
| 1510 |
# Per-track bbox history (replaces ByteTracker for GSAM2)
|
| 1511 |
-
track_history: Dict[int, List] = {}
|
| 1512 |
-
speed_est = SpeedEstimator(fps=fps) if enable_gpt else None
|
| 1513 |
-
gpt_submitted = False
|
| 1514 |
-
|
| 1515 |
-
# Start enrichment thread when GPT enabled
|
| 1516 |
-
enrich_thread = None
|
| 1517 |
-
if enable_gpt:
|
| 1518 |
-
enrich_thread = Thread(target=_gsam2_enrichment_thread_fn, daemon=True)
|
| 1519 |
-
enrich_thread.start()
|
| 1520 |
|
| 1521 |
try:
|
| 1522 |
with StreamingVideoWriter(
|
|
@@ -1538,100 +1311,6 @@ def run_grounded_sam2_tracking(
|
|
| 1538 |
|
| 1539 |
frm, fobjs = buf.pop(next_idx)
|
| 1540 |
|
| 1541 |
-
# --- GPT enrichment path ---
|
| 1542 |
-
if enable_gpt and fobjs:
|
| 1543 |
-
dets = _objectinfo_to_dets(fobjs)
|
| 1544 |
-
|
| 1545 |
-
# Maintain per-track bbox history (30-frame window)
|
| 1546 |
-
for det in dets:
|
| 1547 |
-
iid = det["instance_id"]
|
| 1548 |
-
track_history.setdefault(iid, []).append(det["bbox"])
|
| 1549 |
-
if len(track_history[iid]) > 30:
|
| 1550 |
-
track_history[iid].pop(0)
|
| 1551 |
-
# Store an immutable per-frame snapshot.
|
| 1552 |
-
det["history"] = list(track_history[iid])
|
| 1553 |
-
|
| 1554 |
-
# Speed estimation
|
| 1555 |
-
if speed_est:
|
| 1556 |
-
speed_est.estimate(dets)
|
| 1557 |
-
|
| 1558 |
-
# Relevance gate
|
| 1559 |
-
if mission_spec:
|
| 1560 |
-
if (mission_spec.parse_mode == "LLM_EXTRACTED"
|
| 1561 |
-
and not _relevance_refined.is_set()):
|
| 1562 |
-
for d in dets:
|
| 1563 |
-
d["mission_relevant"] = True
|
| 1564 |
-
d["relevance_reason"] = "pending_llm_postfilter"
|
| 1565 |
-
gpt_dets = dets
|
| 1566 |
-
else:
|
| 1567 |
-
for d in dets:
|
| 1568 |
-
decision = evaluate_relevance(d, mission_spec.relevance_criteria)
|
| 1569 |
-
d["mission_relevant"] = decision.relevant
|
| 1570 |
-
d["relevance_reason"] = decision.reason
|
| 1571 |
-
gpt_dets = [d for d in dets if d.get("mission_relevant", True)]
|
| 1572 |
-
else:
|
| 1573 |
-
for d in dets:
|
| 1574 |
-
d["mission_relevant"] = None
|
| 1575 |
-
gpt_dets = dets
|
| 1576 |
-
|
| 1577 |
-
# GPT enrichment (one-shot, first frame with detections)
|
| 1578 |
-
if gpt_dets and not gpt_submitted:
|
| 1579 |
-
for d in gpt_dets:
|
| 1580 |
-
d["assessment_status"] = AssessmentStatus.PENDING_GPT
|
| 1581 |
-
try:
|
| 1582 |
-
gpt_enrichment_queue.put(
|
| 1583 |
-
(
|
| 1584 |
-
next_idx,
|
| 1585 |
-
frm.copy(),
|
| 1586 |
-
copy.deepcopy(gpt_dets),
|
| 1587 |
-
mission_spec,
|
| 1588 |
-
),
|
| 1589 |
-
timeout=1.0,
|
| 1590 |
-
)
|
| 1591 |
-
gpt_submitted = True
|
| 1592 |
-
logging.info("GSAM2 writer: offloaded GPT enrichment for frame %d", next_idx)
|
| 1593 |
-
except Full:
|
| 1594 |
-
logging.warning("GSAM2 GPT enrichment queue full, skipping")
|
| 1595 |
-
|
| 1596 |
-
# Merge persistent GPT data
|
| 1597 |
-
for det in dets:
|
| 1598 |
-
tid = det["track_id"]
|
| 1599 |
-
with gpt_data_lock:
|
| 1600 |
-
gpt_payload = gpt_data_by_track.get(tid)
|
| 1601 |
-
if gpt_payload:
|
| 1602 |
-
det.update(gpt_payload)
|
| 1603 |
-
det["assessment_status"] = AssessmentStatus.ASSESSED
|
| 1604 |
-
elif "assessment_status" not in det:
|
| 1605 |
-
det["assessment_status"] = AssessmentStatus.UNASSESSED
|
| 1606 |
-
|
| 1607 |
-
# Build enriched display labels
|
| 1608 |
-
display_labels = []
|
| 1609 |
-
for d in dets:
|
| 1610 |
-
if d.get("mission_relevant") is False:
|
| 1611 |
-
display_labels.append("")
|
| 1612 |
-
continue
|
| 1613 |
-
lbl = d.get("label", "obj")
|
| 1614 |
-
if d.get("gpt_distance_m") is not None:
|
| 1615 |
-
try:
|
| 1616 |
-
lbl = f"{lbl} {int(float(d['gpt_distance_m']))}m"
|
| 1617 |
-
except (TypeError, ValueError):
|
| 1618 |
-
pass
|
| 1619 |
-
display_labels.append(lbl)
|
| 1620 |
-
|
| 1621 |
-
# Draw boxes on mask-rendered frame
|
| 1622 |
-
if dets:
|
| 1623 |
-
boxes = np.array([d["bbox"] for d in dets])
|
| 1624 |
-
frm = draw_boxes(frm, boxes, label_names=display_labels)
|
| 1625 |
-
|
| 1626 |
-
# Store tracks for frontend
|
| 1627 |
-
if job_id:
|
| 1628 |
-
set_track_data(job_id, next_idx, copy.deepcopy(dets))
|
| 1629 |
-
|
| 1630 |
-
elif enable_gpt:
|
| 1631 |
-
# No objects this frame — still store empty track data
|
| 1632 |
-
if job_id:
|
| 1633 |
-
set_track_data(job_id, next_idx, [])
|
| 1634 |
-
|
| 1635 |
if _perf_metrics is not None:
|
| 1636 |
_t_w = time.perf_counter()
|
| 1637 |
|
|
@@ -1668,13 +1347,6 @@ def run_grounded_sam2_tracking(
|
|
| 1668 |
finally:
|
| 1669 |
render_done = True
|
| 1670 |
_stream_writer_done.set()
|
| 1671 |
-
# Shut down enrichment thread
|
| 1672 |
-
if enrich_thread:
|
| 1673 |
-
try:
|
| 1674 |
-
gpt_enrichment_queue.put(None, timeout=5.0)
|
| 1675 |
-
enrich_thread.join(timeout=30)
|
| 1676 |
-
except Exception:
|
| 1677 |
-
logging.warning("GSAM2 enrichment thread shutdown timed out")
|
| 1678 |
|
| 1679 |
def _stream_publisher_thread():
|
| 1680 |
"""Adaptive-rate publisher: reads from _stream_deque, publishes at measured pace."""
|
|
|
|
| 21 |
from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
|
| 22 |
from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
|
| 23 |
from utils.video import StreamingVideoWriter
|
|
|
|
|
|
|
|
|
|
| 24 |
from jobs.storage import set_track_data
|
| 25 |
import tempfile
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
class AsyncVideoReader:
|
|
|
|
| 297 |
|
| 298 |
dist_px = np.sqrt((cx1-cx2)**2 + (cy1-cy2)**2)
|
| 299 |
|
| 300 |
+
# Heuristic scale: 50px = 1m (very rough)
|
|
|
|
|
|
|
| 301 |
# Speed = (dist_px / 50) meters / (5 frames / 30 fps) seconds
|
| 302 |
# = (dist_px / 50) / (0.166) m/s
|
| 303 |
# = (dist_px * 0.12) m/s
|
|
|
|
| 397 |
depth_scale: float, # No longer used for distance calculation
|
| 398 |
estimator_instance: Optional[Any] = None,
|
| 399 |
) -> None:
|
| 400 |
+
"""Attach relative depth values to detection dicts for visualization."""
|
| 401 |
if not detections or (not depth_estimator_name and not estimator_instance):
|
| 402 |
return
|
| 403 |
|
|
|
|
| 508 |
except Exception:
|
| 509 |
logging.exception("Depth estimation failed for frame")
|
| 510 |
|
| 511 |
+
display_labels = [det["label"] for det in detections]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
|
| 513 |
except Exception:
|
| 514 |
logging.exception("Inference failed for queries %s", text_queries)
|
|
|
|
| 522 |
), detections
|
| 523 |
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
def _attach_depth_from_result(detections, depth_result, depth_scale):
|
| 526 |
+
"""Attach relative depth values to detection dicts for visualization."""
|
| 527 |
depth_map = depth_result.depth_map
|
| 528 |
if depth_map is None or depth_map.size == 0: return
|
| 529 |
|
|
|
|
| 622 |
) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
|
| 623 |
"""Lightweight first-frame processing: detection + rendering only.
|
| 624 |
|
|
|
|
|
|
|
|
|
|
| 625 |
Returns:
|
| 626 |
+
(processed_frame, detections)
|
| 627 |
"""
|
| 628 |
frame, _, _, _ = extract_first_frame(video_path)
|
| 629 |
if mode == "segmentation":
|
|
|
|
| 640 |
"bbox": [int(c) for c in box],
|
| 641 |
"score": float(seg_result.scores[idx]) if seg_result.scores is not None and idx < len(seg_result.scores) else 1.0,
|
| 642 |
"track_id": f"T{idx + 1:02d}",
|
|
|
|
| 643 |
})
|
| 644 |
return processed, detections
|
| 645 |
|
|
|
|
| 647 |
frame, queries, detector_name=detector_name
|
| 648 |
)
|
| 649 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
return processed, detections
|
| 651 |
|
| 652 |
|
|
|
|
| 659 |
job_id: Optional[str] = None,
|
| 660 |
depth_estimator_name: Optional[str] = None,
|
| 661 |
depth_scale: float = 1.0,
|
|
|
|
| 662 |
stream_queue: Optional[Queue] = None,
|
|
|
|
|
|
|
| 663 |
first_frame_detections: Optional[List[Dict[str, Any]]] = None,
|
| 664 |
) -> Tuple[str, List[List[Dict[str, Any]]]]:
|
| 665 |
|
|
|
|
| 736 |
# queue_in: (frame_idx, frame_data)
|
| 737 |
# queue_out: (frame_idx, processed_frame, detections)
|
| 738 |
queue_in = Queue(maxsize=16)
|
| 739 |
+
# Buffer at least 32 frames per GPU for pipeline overlap
|
|
|
|
| 740 |
queue_out_max = max(128, (len(detectors) if detectors else 1) * 32)
|
| 741 |
queue_out = Queue(maxsize=queue_out_max)
|
| 742 |
|
|
|
|
| 914 |
# writer_finished = False
|
| 915 |
|
| 916 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 917 |
def writer_loop():
|
| 918 |
nonlocal writer_finished
|
| 919 |
next_idx = 0
|
|
|
|
| 922 |
# Initialize Tracker & Speed Estimator
|
| 923 |
tracker = ByteTracker(frame_rate=fps)
|
| 924 |
speed_est = SpeedEstimator(fps=fps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
|
| 926 |
try:
|
| 927 |
with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
|
|
|
|
| 951 |
next_idx, pre_track_count, len(dets))
|
| 952 |
speed_est.estimate(dets)
|
| 953 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 954 |
# --- RENDER BOXES & OVERLAYS ---
|
| 955 |
if dets:
|
| 956 |
display_boxes = np.array([d['bbox'] for d in dets])
|
| 957 |
display_labels = []
|
| 958 |
for d in dets:
|
|
|
|
|
|
|
|
|
|
| 959 |
lbl = d.get('label', 'obj')
|
| 960 |
display_labels.append(lbl)
|
| 961 |
|
|
|
|
| 1010 |
logging.exception("Writer loop failed")
|
| 1011 |
finally:
|
| 1012 |
logging.info("Writer loop finished. Wrote %d frames (target %d)", next_idx, total_frames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1013 |
writer_finished = True
|
| 1014 |
|
| 1015 |
writer_thread = Thread(target=writer_loop, daemon=True)
|
|
|
|
| 1086 |
) -> np.ndarray:
|
| 1087 |
"""Render a single GSAM2 tracking frame (masks + boxes). CPU-only.
|
| 1088 |
|
| 1089 |
+
When *masks_only* is True, skip box rendering.
|
|
|
|
| 1090 |
"""
|
| 1091 |
if frame_store is not None:
|
| 1092 |
frame = frame_store.get_bgr(frame_idx).copy() # .copy() — render mutates
|
|
|
|
| 1146 |
job_id: Optional[str] = None,
|
| 1147 |
stream_queue: Optional[Queue] = None,
|
| 1148 |
step: int = 20,
|
|
|
|
|
|
|
|
|
|
| 1149 |
_perf_metrics: Optional[Dict[str, float]] = None,
|
| 1150 |
_perf_lock=None,
|
| 1151 |
num_maskmem: Optional[int] = None,
|
|
|
|
| 1245 |
frm = _gsam2_render_frame(
|
| 1246 |
frame_dir, frame_names, fidx, fobjs,
|
| 1247 |
height, width,
|
|
|
|
| 1248 |
frame_store=frame_store,
|
| 1249 |
)
|
| 1250 |
|
|
|
|
| 1255 |
else:
|
| 1256 |
_perf_metrics["render_total_ms"] += _r_ms
|
| 1257 |
|
| 1258 |
+
payload = (fidx, frm, {})
|
| 1259 |
while True:
|
| 1260 |
try:
|
| 1261 |
render_out.put(payload, timeout=1.0)
|
|
|
|
| 1278 |
for t in r_workers:
|
| 1279 |
t.start()
|
| 1280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1281 |
# Shared streaming state (publisher ↔ writer)
|
| 1282 |
_stream_deque: collections.deque = collections.deque(maxlen=200)
|
| 1283 |
_stream_lock = RLock()
|
|
|
|
| 1290 |
buf: Dict[int, Tuple] = {}
|
| 1291 |
|
| 1292 |
# Per-track bbox history (replaces ByteTracker for GSAM2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1293 |
|
| 1294 |
try:
|
| 1295 |
with StreamingVideoWriter(
|
|
|
|
| 1311 |
|
| 1312 |
frm, fobjs = buf.pop(next_idx)
|
| 1313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1314 |
if _perf_metrics is not None:
|
| 1315 |
_t_w = time.perf_counter()
|
| 1316 |
|
|
|
|
| 1347 |
finally:
|
| 1348 |
render_done = True
|
| 1349 |
_stream_writer_done.set()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1350 |
|
| 1351 |
def _stream_publisher_thread():
|
| 1352 |
"""Adaptive-rate publisher: reads from _stream_deque, publishes at measured pace."""
|
jobs/background.py
CHANGED
|
@@ -35,9 +35,6 @@ async def process_video_async(job_id: str) -> None:
|
|
| 35 |
job_id=job_id,
|
| 36 |
stream_queue=stream_queue,
|
| 37 |
step=job.step,
|
| 38 |
-
enable_gpt=job.enable_gpt,
|
| 39 |
-
mission_spec=job.mission_spec,
|
| 40 |
-
first_frame_gpt_results=job.first_frame_gpt_results,
|
| 41 |
num_maskmem=7,
|
| 42 |
detector_name=job.detector_name,
|
| 43 |
_ttfs_t0=job.ttfs_t0,
|
|
@@ -53,13 +50,10 @@ async def process_video_async(job_id: str) -> None:
|
|
| 53 |
None,
|
| 54 |
job.detector_name,
|
| 55 |
job_id,
|
| 56 |
-
job.depth_estimator_name,
|
| 57 |
job.depth_scale,
|
| 58 |
-
job.enable_gpt,
|
| 59 |
stream_queue,
|
| 60 |
-
job.
|
| 61 |
-
job.first_frame_gpt_results, # Avoid duplicate GPT call on frame 0
|
| 62 |
-
job.first_frame_detections, # Reuse frame 0 detections (avoid re-detecting)
|
| 63 |
)
|
| 64 |
detection_path, detections_list = result_pkg
|
| 65 |
|
|
|
|
| 35 |
job_id=job_id,
|
| 36 |
stream_queue=stream_queue,
|
| 37 |
step=job.step,
|
|
|
|
|
|
|
|
|
|
| 38 |
num_maskmem=7,
|
| 39 |
detector_name=job.detector_name,
|
| 40 |
_ttfs_t0=job.ttfs_t0,
|
|
|
|
| 50 |
None,
|
| 51 |
job.detector_name,
|
| 52 |
job_id,
|
| 53 |
+
job.depth_estimator_name,
|
| 54 |
job.depth_scale,
|
|
|
|
| 55 |
stream_queue,
|
| 56 |
+
job.first_frame_detections,
|
|
|
|
|
|
|
| 57 |
)
|
| 58 |
detection_path, detections_list = result_pkg
|
| 59 |
|
jobs/models.py
CHANGED
|
@@ -33,10 +33,5 @@ class JobInfo:
|
|
| 33 |
first_frame_depth_path: Optional[str] = None
|
| 34 |
partial_success: bool = False # True if one component failed but job completed
|
| 35 |
depth_error: Optional[str] = None # Error message if depth failed
|
| 36 |
-
enable_gpt: bool = True # Whether to use GPT for distance estimation
|
| 37 |
-
# Mission specification (None = LEGACY mode)
|
| 38 |
-
mission_spec: Optional[Any] = None # utils.schemas.MissionSpecification
|
| 39 |
-
mission_mode: str = "LEGACY" # "MISSION" or "LEGACY"
|
| 40 |
-
first_frame_gpt_results: Optional[Dict[str, Any]] = None # Cached GPT results from process_first_frame
|
| 41 |
step: int = 7 # Segmentation keyframe step (matches num_maskmem)
|
| 42 |
ttfs_t0: Optional[float] = None # TTFS anchor: time.perf_counter() at endpoint entry
|
|
|
|
| 33 |
first_frame_depth_path: Optional[str] = None
|
| 34 |
partial_success: bool = False # True if one component failed but job completed
|
| 35 |
depth_error: Optional[str] = None # Error message if depth failed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
step: int = 7 # Segmentation keyframe step (matches num_maskmem)
|
| 37 |
ttfs_t0: Optional[float] = None # TTFS anchor: time.perf_counter() at endpoint entry
|
requirements.txt
CHANGED
|
@@ -7,9 +7,7 @@ python-multipart
|
|
| 7 |
pillow
|
| 8 |
huggingface-hub
|
| 9 |
ultralytics
|
| 10 |
-
python-dotenv
|
| 11 |
einops
|
| 12 |
-
sentence-transformers
|
| 13 |
SAM-2 @ git+https://github.com/facebookresearch/sam2.git
|
| 14 |
hydra-core>=1.3.2
|
| 15 |
iopath>=0.1.10
|
|
|
|
| 7 |
pillow
|
| 8 |
huggingface-hub
|
| 9 |
ultralytics
|
|
|
|
| 10 |
einops
|
|
|
|
| 11 |
SAM-2 @ git+https://github.com/facebookresearch/sam2.git
|
| 12 |
hydra-core>=1.3.2
|
| 13 |
iopath>=0.1.10
|
utils/enrichment.py
DELETED
|
@@ -1,122 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Shared enrichment workflow — single implementation of the 5-step GPT enrichment
|
| 3 |
-
pipeline used by inference.py (detection + GSAM2) and app.py (first-frame).
|
| 4 |
-
|
| 5 |
-
Consolidates duplicated logic from:
|
| 6 |
-
- inference.py enrichment_thread_fn
|
| 7 |
-
- inference.py _gsam2_enrichment_thread_fn
|
| 8 |
-
- app.py _enrich_first_frame_gpt
|
| 9 |
-
"""
|
| 10 |
-
|
| 11 |
-
import logging
|
| 12 |
-
from threading import Event
|
| 13 |
-
from typing import Any, Dict, List, Optional
|
| 14 |
-
|
| 15 |
-
from utils.gpt_reasoning import estimate_threat_gpt, encode_frame_to_b64
|
| 16 |
-
from utils.relevance import evaluate_relevance, evaluate_relevance_llm
|
| 17 |
-
from utils.schemas import AssessmentStatus
|
| 18 |
-
|
| 19 |
-
logger = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
def run_enrichment(
|
| 23 |
-
frame_idx: int,
|
| 24 |
-
frame_data,
|
| 25 |
-
detections: List[Dict[str, Any]],
|
| 26 |
-
mission_spec,
|
| 27 |
-
*,
|
| 28 |
-
first_frame_gpt_results: Optional[Dict] = None,
|
| 29 |
-
job_id: Optional[str] = None,
|
| 30 |
-
relevance_refined_event: Optional[Event] = None,
|
| 31 |
-
) -> Optional[Dict[str, Any]]:
|
| 32 |
-
"""Run the shared enrichment workflow (LLM post-filter + GPT threat assessment).
|
| 33 |
-
|
| 34 |
-
Steps:
|
| 35 |
-
1. LLM post-filter via evaluate_relevance_llm() (if LLM_EXTRACTED mode)
|
| 36 |
-
2. Signal relevance_refined_event (if provided)
|
| 37 |
-
3. Check cached GPT results (parameter or JobStorage fallback)
|
| 38 |
-
4. Call estimate_threat_gpt() if no cache
|
| 39 |
-
5. Merge results into detections by track_id
|
| 40 |
-
|
| 41 |
-
Args:
|
| 42 |
-
frame_idx: Index of the frame being enriched.
|
| 43 |
-
frame_data: OpenCV BGR frame (numpy array).
|
| 44 |
-
detections: Mutable list of detection dicts to enrich in-place.
|
| 45 |
-
mission_spec: Optional MissionSpecification.
|
| 46 |
-
first_frame_gpt_results: Pre-computed GPT results (cache hit).
|
| 47 |
-
job_id: Job identifier for JobStorage fallback cache lookup.
|
| 48 |
-
relevance_refined_event: threading.Event to signal when LLM post-filter completes.
|
| 49 |
-
|
| 50 |
-
Returns:
|
| 51 |
-
GPT results dict (object_id -> assessment), or None if all detections
|
| 52 |
-
were filtered out.
|
| 53 |
-
"""
|
| 54 |
-
gpt_dets = detections
|
| 55 |
-
|
| 56 |
-
# --- Step 1: LLM post-filter (LLM_EXTRACTED mode) ---
|
| 57 |
-
if mission_spec and mission_spec.parse_mode == "LLM_EXTRACTED":
|
| 58 |
-
unique_labels = list({
|
| 59 |
-
d.get("label", "").lower()
|
| 60 |
-
for d in gpt_dets if d.get("label")
|
| 61 |
-
})
|
| 62 |
-
relevant_labels = evaluate_relevance_llm(
|
| 63 |
-
unique_labels, mission_spec.operator_text
|
| 64 |
-
)
|
| 65 |
-
mission_spec.relevance_criteria.required_classes = list(relevant_labels)
|
| 66 |
-
|
| 67 |
-
# --- Step 2: Signal writer loop ---
|
| 68 |
-
if relevance_refined_event is not None:
|
| 69 |
-
relevance_refined_event.set()
|
| 70 |
-
|
| 71 |
-
logger.info(
|
| 72 |
-
"Enrichment: LLM post-filter applied on frame %d: relevant=%s",
|
| 73 |
-
frame_idx, relevant_labels,
|
| 74 |
-
)
|
| 75 |
-
# Re-filter with refined classes
|
| 76 |
-
for d in gpt_dets:
|
| 77 |
-
decision = evaluate_relevance(d, mission_spec.relevance_criteria)
|
| 78 |
-
d["mission_relevant"] = decision.relevant
|
| 79 |
-
gpt_dets = [d for d in gpt_dets if d.get("mission_relevant", True)]
|
| 80 |
-
elif relevance_refined_event is not None:
|
| 81 |
-
# Non-LLM mode: signal immediately so writer doesn't block
|
| 82 |
-
relevance_refined_event.set()
|
| 83 |
-
|
| 84 |
-
if not gpt_dets:
|
| 85 |
-
return None
|
| 86 |
-
|
| 87 |
-
# --- Step 3: Check cached GPT results ---
|
| 88 |
-
cached_gpt = first_frame_gpt_results
|
| 89 |
-
if not cached_gpt and job_id:
|
| 90 |
-
try:
|
| 91 |
-
from jobs.storage import get_job_storage as _gjs
|
| 92 |
-
_job = _gjs().get(job_id)
|
| 93 |
-
if _job and _job.first_frame_gpt_results:
|
| 94 |
-
cached_gpt = _job.first_frame_gpt_results
|
| 95 |
-
except Exception:
|
| 96 |
-
pass
|
| 97 |
-
|
| 98 |
-
# --- Step 4: Call GPT if no cache ---
|
| 99 |
-
if cached_gpt:
|
| 100 |
-
logger.info("Enrichment: re-using cached GPT results for frame %d", frame_idx)
|
| 101 |
-
gpt_res = cached_gpt
|
| 102 |
-
else:
|
| 103 |
-
logger.info("Enrichment: running GPT estimation for frame %d...", frame_idx)
|
| 104 |
-
frame_b64 = encode_frame_to_b64(frame_data)
|
| 105 |
-
gpt_res = estimate_threat_gpt(
|
| 106 |
-
detections=gpt_dets, mission_spec=mission_spec,
|
| 107 |
-
image_b64=frame_b64,
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
# --- Step 5: Merge results into detections by track_id ---
|
| 111 |
-
for d in gpt_dets:
|
| 112 |
-
oid = d.get("track_id")
|
| 113 |
-
if oid and oid in gpt_res:
|
| 114 |
-
gpt_payload = gpt_res[oid]
|
| 115 |
-
d.update(gpt_payload)
|
| 116 |
-
d["gpt_raw"] = gpt_payload
|
| 117 |
-
d["assessment_frame_index"] = frame_idx
|
| 118 |
-
d["assessment_status"] = gpt_payload.get(
|
| 119 |
-
"assessment_status", AssessmentStatus.ASSESSED
|
| 120 |
-
)
|
| 121 |
-
|
| 122 |
-
return gpt_res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/gpt_reasoning.py
DELETED
|
@@ -1,374 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import json
|
| 3 |
-
import base64
|
| 4 |
-
import logging
|
| 5 |
-
from typing import List, Dict, Any, Optional
|
| 6 |
-
from utils.schemas import AssessmentStatus
|
| 7 |
-
from utils.openai_client import chat_completion, extract_content, get_api_key
|
| 8 |
-
|
| 9 |
-
logger = logging.getLogger(__name__)
|
| 10 |
-
|
| 11 |
-
def encode_image(image_path: str) -> str:
|
| 12 |
-
with open(image_path, "rb") as image_file:
|
| 13 |
-
return base64.b64encode(image_file.read()).decode('utf-8')
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def encode_frame_to_b64(frame, quality=None) -> str:
|
| 17 |
-
"""Encode an OpenCV BGR frame to a base64 JPEG string in memory (no disk I/O).
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
frame: OpenCV BGR numpy array.
|
| 21 |
-
quality: Optional JPEG quality (1-100). Uses OpenCV default if None.
|
| 22 |
-
"""
|
| 23 |
-
import cv2
|
| 24 |
-
params = [int(cv2.IMWRITE_JPEG_QUALITY), quality] if quality is not None else None
|
| 25 |
-
success, buf = cv2.imencode('.jpg', frame, params) if params else cv2.imencode('.jpg', frame)
|
| 26 |
-
if not success:
|
| 27 |
-
raise ValueError("Failed to encode frame to JPEG")
|
| 28 |
-
return base64.b64encode(buf.tobytes()).decode('utf-8')
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
_DOMAIN_ROLES = {
|
| 32 |
-
"NAVAL": "Naval Intelligence Officer and Maritime Threat Analyst",
|
| 33 |
-
"GROUND": "Ground Surveillance Intelligence Officer",
|
| 34 |
-
"AERIAL": "Air Surveillance Intelligence Officer",
|
| 35 |
-
"URBAN": "Urban Surveillance Intelligence Officer",
|
| 36 |
-
"GENERIC": "Tactical Surveillance Analyst",
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
_HUMAN_LABEL_HINTS = frozenset({
|
| 40 |
-
"person", "people", "human", "pedestrian",
|
| 41 |
-
"man", "woman", "boy", "girl", "child",
|
| 42 |
-
"civilian", "soldier", "infantry", "troop", "trooper",
|
| 43 |
-
})
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def _is_human_label(label: str) -> bool:
|
| 47 |
-
label_l = (label or "").lower().strip()
|
| 48 |
-
if not label_l:
|
| 49 |
-
return False
|
| 50 |
-
parts = [p for p in re.split(r"[^a-z0-9]+", label_l) if p]
|
| 51 |
-
return any(part in _HUMAN_LABEL_HINTS for part in parts)
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def _build_status_fallback(
|
| 55 |
-
object_ids: List[str],
|
| 56 |
-
status: str,
|
| 57 |
-
reason: str,
|
| 58 |
-
) -> Dict[str, Dict[str, Any]]:
|
| 59 |
-
return {
|
| 60 |
-
obj_id: {
|
| 61 |
-
"assessment_status": status,
|
| 62 |
-
"gpt_reason": reason,
|
| 63 |
-
}
|
| 64 |
-
for obj_id in object_ids
|
| 65 |
-
}
|
| 66 |
-
|
| 67 |
-
_UNIVERSAL_SCHEMA = (
|
| 68 |
-
"RESPONSE SCHEMA (JSON):\n"
|
| 69 |
-
"{\n"
|
| 70 |
-
" \"objects\": {\n"
|
| 71 |
-
" \"T01\": {\n"
|
| 72 |
-
" \"object_type\": \"string (broad category, e.g. Warship, APC, Sedan, Person)\",\n"
|
| 73 |
-
" \"size\": \"string (e.g. Large, Medium, Small, ~50m length)\",\n"
|
| 74 |
-
" \"visible_weapons\": [\"string\"],\n"
|
| 75 |
-
" \"weapon_readiness\": \"string (e.g. Stowed/PEACE, Trained/Aiming, Firing/HOSTILE, Unknown)\",\n"
|
| 76 |
-
" \"motion_status\": \"string (e.g. Stationary, Moving Slow, Moving Fast, Hovering)\",\n"
|
| 77 |
-
" \"range_estimate\": \"string (e.g. ~500m, ~2NM, ~1km)\",\n"
|
| 78 |
-
" \"bearing\": \"string (e.g. 12 o'clock, NNE, 045°)\",\n"
|
| 79 |
-
" \"threat_level\": int (1-10, 1=Benign, 10=Imminent Attack),\n"
|
| 80 |
-
" \"threat_classification\": \"Friendly\" | \"Neutral\" | \"Suspect\" | \"Hostile\",\n"
|
| 81 |
-
" \"tactical_intent\": \"string (e.g. Transit, Patrol, Attack Profile)\",\n"
|
| 82 |
-
" \"dynamic_features\": [\n"
|
| 83 |
-
" {\"key\": \"string (domain-specific observation name)\", \"value\": \"string\"}\n"
|
| 84 |
-
" ] // up to 5 extra observations relevant to the domain\n"
|
| 85 |
-
" }\n"
|
| 86 |
-
" }\n"
|
| 87 |
-
"}\n"
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
def _parse_range_to_meters(range_text: str) -> Optional[float]:
|
| 92 |
-
"""Convert a free-text range string to meters.
|
| 93 |
-
|
| 94 |
-
Supports patterns like '~500m', '~2NM', '~1.5km', '500 meters', '2 nautical miles'.
|
| 95 |
-
Returns None if the string cannot be parsed.
|
| 96 |
-
"""
|
| 97 |
-
if not range_text or range_text == "Unknown":
|
| 98 |
-
return None
|
| 99 |
-
text = range_text.strip().lstrip("~").strip()
|
| 100 |
-
# Try NM / nautical miles
|
| 101 |
-
m = re.match(r"([0-9]*\.?[0-9]+)\s*(NM|nm|nautical\s*miles?)", text)
|
| 102 |
-
if m:
|
| 103 |
-
return float(m.group(1)) * 1852.0
|
| 104 |
-
# Try km / kilometers
|
| 105 |
-
m = re.match(r"([0-9]*\.?[0-9]+)\s*(km|kilometers?|kilometres?)", text, re.IGNORECASE)
|
| 106 |
-
if m:
|
| 107 |
-
return float(m.group(1)) * 1000.0
|
| 108 |
-
# Try meters (default)
|
| 109 |
-
m = re.match(r"([0-9]*\.?[0-9]+)\s*(m|meters?|metres?)?$", text, re.IGNORECASE)
|
| 110 |
-
if m:
|
| 111 |
-
return float(m.group(1))
|
| 112 |
-
return None
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
def _build_domain_system_prompt(domain: str, mission_spec=None) -> str:
|
| 116 |
-
"""Build a universal system prompt with domain-appropriate role."""
|
| 117 |
-
|
| 118 |
-
# Mission context block (injected regardless of domain)
|
| 119 |
-
mission_context = ""
|
| 120 |
-
if mission_spec:
|
| 121 |
-
mission_context = (
|
| 122 |
-
"\n\nMISSION CONTEXT:\n"
|
| 123 |
-
f"- Operator Intent: {mission_spec.mission_intent}\n"
|
| 124 |
-
f"- Domain: {mission_spec.domain}\n"
|
| 125 |
-
f"- Target Classes: {', '.join(mission_spec.object_classes)}\n"
|
| 126 |
-
)
|
| 127 |
-
if mission_spec.context_phrases:
|
| 128 |
-
mission_context += f"- Situational Context: {'; '.join(mission_spec.context_phrases)}\n"
|
| 129 |
-
if mission_spec.stripped_modifiers:
|
| 130 |
-
mission_context += f"- Operator Modifiers (stripped): {', '.join(mission_spec.stripped_modifiers)}\n"
|
| 131 |
-
mission_context += (
|
| 132 |
-
"\nUse the mission context to inform your analysis. "
|
| 133 |
-
"Focus assessment on the target classes and domain specified."
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
role = _DOMAIN_ROLES.get(domain, _DOMAIN_ROLES["GENERIC"])
|
| 137 |
-
|
| 138 |
-
return (
|
| 139 |
-
f"You are an elite {role}. "
|
| 140 |
-
"Your task is to analyze optical surveillance imagery and provide a detailed tactical assessment for every detected object. "
|
| 141 |
-
f"You must output a STRICT JSON object that matches the following schema for every object ID provided:\n\n"
|
| 142 |
-
f"{_UNIVERSAL_SCHEMA}\n"
|
| 143 |
-
"RULES:\n"
|
| 144 |
-
"- Use dynamic_features for domain-specific observations (e.g., wake_description, deck_activity, sensor_profile, camouflage, license_plate).\n"
|
| 145 |
-
"- Provide up to 5 dynamic_features per object. Choose the most tactically relevant observations.\n"
|
| 146 |
-
"- range_estimate should be a human-readable string with units (e.g., '~500m', '~2NM').\n"
|
| 147 |
-
"- Visible trained weapons are IMMINENT threat (Score 9-10).\n"
|
| 148 |
-
"- Ignore artifacts, focus on the objects."
|
| 149 |
-
+ mission_context
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
def estimate_threat_gpt(
|
| 154 |
-
image_path: Optional[str] = None,
|
| 155 |
-
detections: Optional[List[Dict[str, Any]]] = None,
|
| 156 |
-
mission_spec=None, # Optional[MissionSpecification]
|
| 157 |
-
image_b64: Optional[str] = None,
|
| 158 |
-
) -> Dict[str, Any]:
|
| 159 |
-
"""
|
| 160 |
-
Perform Threat Assessment on detected objects using GPT-4o.
|
| 161 |
-
|
| 162 |
-
Args:
|
| 163 |
-
image_path: Path to the image file (mutually exclusive with image_b64).
|
| 164 |
-
detections: List of detection dicts (bbox, label, etc.).
|
| 165 |
-
mission_spec: Optional MissionSpecification for domain-aware assessment.
|
| 166 |
-
image_b64: Pre-encoded base64 JPEG string (avoids disk round-trip).
|
| 167 |
-
|
| 168 |
-
Returns:
|
| 169 |
-
Dict mapping object ID (e.g., T01) to threat assessment dict.
|
| 170 |
-
"""
|
| 171 |
-
if detections is None:
|
| 172 |
-
detections = []
|
| 173 |
-
|
| 174 |
-
if not get_api_key():
|
| 175 |
-
logger.error("OPENAI_API_KEY not set. Skipping GPT threat assessment.")
|
| 176 |
-
return {}
|
| 177 |
-
|
| 178 |
-
# 1. Prepare detections summary for prompt.
|
| 179 |
-
# Human/person classes are explicitly skipped to avoid refusal paths.
|
| 180 |
-
prompt_items = []
|
| 181 |
-
skipped_human_ids: List[str] = []
|
| 182 |
-
for i, det in enumerate(detections):
|
| 183 |
-
obj_id = str(det.get("track_id") or det.get("id") or f"T{str(i+1).zfill(2)}")
|
| 184 |
-
bbox = det.get("bbox", [])
|
| 185 |
-
label = str(det.get("label", "object"))
|
| 186 |
-
if _is_human_label(label):
|
| 187 |
-
skipped_human_ids.append(obj_id)
|
| 188 |
-
continue
|
| 189 |
-
prompt_items.append({"obj_id": obj_id, "label": label, "bbox": bbox})
|
| 190 |
-
|
| 191 |
-
det_text = "\n".join(
|
| 192 |
-
[
|
| 193 |
-
f"- ID: {it['obj_id']}, Classification Hint: {it['label']}, BBox: {it['bbox']}"
|
| 194 |
-
for it in prompt_items
|
| 195 |
-
]
|
| 196 |
-
)
|
| 197 |
-
|
| 198 |
-
if not det_text:
|
| 199 |
-
if skipped_human_ids:
|
| 200 |
-
logger.warning(
|
| 201 |
-
"Skipping GPT threat assessment for %d human/person detections due policy constraints.",
|
| 202 |
-
len(skipped_human_ids),
|
| 203 |
-
)
|
| 204 |
-
return _build_status_fallback(
|
| 205 |
-
skipped_human_ids,
|
| 206 |
-
AssessmentStatus.SKIPPED_POLICY,
|
| 207 |
-
"Human/person analysis skipped due policy constraints.",
|
| 208 |
-
)
|
| 209 |
-
return {}
|
| 210 |
-
|
| 211 |
-
# 2. Encode image (prefer pre-encoded b64 to avoid disk I/O)
|
| 212 |
-
if image_b64:
|
| 213 |
-
base64_image = image_b64
|
| 214 |
-
elif image_path:
|
| 215 |
-
try:
|
| 216 |
-
base64_image = encode_image(image_path)
|
| 217 |
-
except Exception as e:
|
| 218 |
-
logger.error(f"Failed to encode image for GPT: {e}")
|
| 219 |
-
return {}
|
| 220 |
-
else:
|
| 221 |
-
logger.error("estimate_threat_gpt: no image_path or image_b64 provided")
|
| 222 |
-
return {}
|
| 223 |
-
|
| 224 |
-
# 3. Domain-aware prompt selection (INV-7)
|
| 225 |
-
domain = "GENERIC" # default — universal schema works for all domains
|
| 226 |
-
if mission_spec:
|
| 227 |
-
domain = mission_spec.domain
|
| 228 |
-
if mission_spec.domain_source == "INFERRED":
|
| 229 |
-
logger.info("GPT assessment using inferred domain=%s (domain_inferred=True)", domain)
|
| 230 |
-
|
| 231 |
-
system_prompt = _build_domain_system_prompt(domain, mission_spec)
|
| 232 |
-
|
| 233 |
-
domain_label = domain.lower() if domain != "NAVAL" else "naval"
|
| 234 |
-
user_prompt = (
|
| 235 |
-
f"Analyze this {domain_label} surveillance image. The following objects have been detected:\n"
|
| 236 |
-
f"{det_text}\n\n"
|
| 237 |
-
f"Provide a detailed Threat Assessment for each object based on its visual signatures."
|
| 238 |
-
)
|
| 239 |
-
|
| 240 |
-
# 4. Call API
|
| 241 |
-
payload = {
|
| 242 |
-
"model": "gpt-4o", # Use 4o for better vision analysis
|
| 243 |
-
"messages": [
|
| 244 |
-
{
|
| 245 |
-
"role": "system",
|
| 246 |
-
"content": system_prompt
|
| 247 |
-
},
|
| 248 |
-
{
|
| 249 |
-
"role": "user",
|
| 250 |
-
"content": [
|
| 251 |
-
{
|
| 252 |
-
"type": "text",
|
| 253 |
-
"text": user_prompt
|
| 254 |
-
},
|
| 255 |
-
{
|
| 256 |
-
"type": "image_url",
|
| 257 |
-
"image_url": {
|
| 258 |
-
"url": f"data:image/jpeg;base64,{base64_image}",
|
| 259 |
-
"detail": "low"
|
| 260 |
-
}
|
| 261 |
-
}
|
| 262 |
-
]
|
| 263 |
-
}
|
| 264 |
-
],
|
| 265 |
-
"max_tokens": 1500,
|
| 266 |
-
"temperature": 0.2, # Low temp for factual consistency
|
| 267 |
-
"response_format": { "type": "json_object" }
|
| 268 |
-
}
|
| 269 |
-
|
| 270 |
-
try:
|
| 271 |
-
resp_data = chat_completion(payload)
|
| 272 |
-
content, refusal = extract_content(resp_data)
|
| 273 |
-
if not content:
|
| 274 |
-
if refusal:
|
| 275 |
-
logger.warning("GPT refused threat assessment: %s", refusal)
|
| 276 |
-
else:
|
| 277 |
-
logger.warning(
|
| 278 |
-
"GPT returned empty content. response_id=%s finish_reason=%s",
|
| 279 |
-
resp_data.get("id"),
|
| 280 |
-
resp_data.get("choices", [{}])[0].get("finish_reason"),
|
| 281 |
-
)
|
| 282 |
-
fallback = _build_status_fallback(
|
| 283 |
-
[it["obj_id"] for it in prompt_items],
|
| 284 |
-
AssessmentStatus.REFUSED,
|
| 285 |
-
refusal or "GPT returned empty content.",
|
| 286 |
-
)
|
| 287 |
-
fallback.update(
|
| 288 |
-
_build_status_fallback(
|
| 289 |
-
skipped_human_ids,
|
| 290 |
-
AssessmentStatus.SKIPPED_POLICY,
|
| 291 |
-
"Human/person analysis skipped due policy constraints.",
|
| 292 |
-
)
|
| 293 |
-
)
|
| 294 |
-
return fallback
|
| 295 |
-
|
| 296 |
-
result_json = json.loads(content)
|
| 297 |
-
|
| 298 |
-
objects = result_json.get("objects", {})
|
| 299 |
-
if not isinstance(objects, dict):
|
| 300 |
-
logger.warning(
|
| 301 |
-
"GPT response 'objects' field is not a dict (got %s); using fallback.",
|
| 302 |
-
type(objects).__name__,
|
| 303 |
-
)
|
| 304 |
-
objects = {}
|
| 305 |
-
|
| 306 |
-
# Ensure every requested object receives an explicit assessment state.
|
| 307 |
-
for it in prompt_items:
|
| 308 |
-
oid = it["obj_id"]
|
| 309 |
-
if oid not in objects:
|
| 310 |
-
objects[oid] = {
|
| 311 |
-
"assessment_status": AssessmentStatus.NO_RESPONSE,
|
| 312 |
-
"gpt_reason": "No structured assessment returned for object.",
|
| 313 |
-
}
|
| 314 |
-
for oid in skipped_human_ids:
|
| 315 |
-
objects.setdefault(
|
| 316 |
-
oid,
|
| 317 |
-
{
|
| 318 |
-
"assessment_status": AssessmentStatus.SKIPPED_POLICY,
|
| 319 |
-
"gpt_reason": "Human/person analysis skipped due policy constraints.",
|
| 320 |
-
},
|
| 321 |
-
)
|
| 322 |
-
|
| 323 |
-
# Polyfill legacy fields for frontend compatibility
|
| 324 |
-
for obj_id, data in objects.items():
|
| 325 |
-
if not isinstance(data, dict):
|
| 326 |
-
data = {
|
| 327 |
-
"assessment_status": AssessmentStatus.NO_RESPONSE,
|
| 328 |
-
"gpt_reason": "Malformed object payload from GPT.",
|
| 329 |
-
}
|
| 330 |
-
objects[obj_id] = data
|
| 331 |
-
|
| 332 |
-
# 1. Distance: parse free-text range_estimate to meters
|
| 333 |
-
range_m = _parse_range_to_meters(data.get("range_estimate", ""))
|
| 334 |
-
if range_m is not None:
|
| 335 |
-
data["distance_m"] = range_m
|
| 336 |
-
data["gpt_distance_m"] = range_m
|
| 337 |
-
|
| 338 |
-
# 2. Direction (legacy alias)
|
| 339 |
-
bearing = data.get("bearing", "")
|
| 340 |
-
if bearing and bearing != "Unknown":
|
| 341 |
-
data["direction"] = bearing
|
| 342 |
-
data["gpt_direction"] = bearing
|
| 343 |
-
|
| 344 |
-
# 3. Description (summary of new fields)
|
| 345 |
-
obj_type = data.get("object_type", "Unknown")
|
| 346 |
-
threat = data.get("threat_classification", "Unknown")
|
| 347 |
-
score = data.get("threat_level", 0)
|
| 348 |
-
|
| 349 |
-
desc_parts = [obj_type]
|
| 350 |
-
desc_parts.append(f"[{threat.upper()} Lvl:{score}]")
|
| 351 |
-
|
| 352 |
-
data["description"] = " ".join(desc_parts)
|
| 353 |
-
data["gpt_description"] = data["description"]
|
| 354 |
-
|
| 355 |
-
# 4. Legacy threat_level_score alias
|
| 356 |
-
data["threat_level_score"] = data.get("threat_level", 0)
|
| 357 |
-
|
| 358 |
-
return objects
|
| 359 |
-
|
| 360 |
-
except Exception as e:
|
| 361 |
-
logger.error("GPT API call failed: %s", e, exc_info=True)
|
| 362 |
-
fallback = _build_status_fallback(
|
| 363 |
-
[it["obj_id"] for it in prompt_items],
|
| 364 |
-
AssessmentStatus.ERROR,
|
| 365 |
-
f"GPT API call failed: {e.__class__.__name__}",
|
| 366 |
-
)
|
| 367 |
-
fallback.update(
|
| 368 |
-
_build_status_fallback(
|
| 369 |
-
skipped_human_ids,
|
| 370 |
-
AssessmentStatus.SKIPPED_POLICY,
|
| 371 |
-
"Human/person analysis skipped due policy constraints.",
|
| 372 |
-
)
|
| 373 |
-
)
|
| 374 |
-
return fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/mission_parser.py
DELETED
|
@@ -1,481 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Mission text parser — converts raw operator text into a validated MissionSpecification.
|
| 3 |
-
|
| 4 |
-
Single public function: parse_mission_text(raw_text, detector_key) -> MissionSpecification
|
| 5 |
-
|
| 6 |
-
Internal flow:
|
| 7 |
-
1. Fast-path regex check -> skip LLM if comma-separated labels
|
| 8 |
-
2. LLM extraction call (GPT-4o, temperature 0.0)
|
| 9 |
-
3. Deterministic validation pipeline
|
| 10 |
-
4. COCO vocabulary mapping for COCO-only detectors
|
| 11 |
-
5. Build RelevanceCriteria deterministically from mapped classes
|
| 12 |
-
6. Return validated MissionSpecification or raise MissionParseError
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import json
|
| 16 |
-
import logging
|
| 17 |
-
import re
|
| 18 |
-
from typing import List, Optional
|
| 19 |
-
|
| 20 |
-
from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
|
| 21 |
-
|
| 22 |
-
from coco_classes import COCO_CLASSES, canonicalize_coco_name, coco_class_catalog
|
| 23 |
-
from utils.schemas import MissionSpecification, RelevanceCriteria
|
| 24 |
-
|
| 25 |
-
logger = logging.getLogger(__name__)
|
| 26 |
-
|
| 27 |
-
# Detectors that only support COCO class vocabulary
|
| 28 |
-
_COCO_ONLY_DETECTORS = frozenset({"yolo11", "detr_resnet50"})
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
class MissionParseError(ValueError):
|
| 32 |
-
"""Raised when mission text cannot be parsed into a valid MissionSpecification."""
|
| 33 |
-
def __init__(self, message: str, warnings: Optional[List[str]] = None):
|
| 34 |
-
self.warnings = warnings or []
|
| 35 |
-
super().__init__(message)
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
def _is_comma_separated_labels(text: str) -> bool:
|
| 39 |
-
"""Fast-path: detect simple comma-separated class labels (no LLM needed)."""
|
| 40 |
-
# Match: word tokens separated by commas, each token <= 3 words
|
| 41 |
-
pattern = r"^[\w\s]+(,\s*[\w\s]+)*$"
|
| 42 |
-
if not re.match(pattern, text.strip()):
|
| 43 |
-
return False
|
| 44 |
-
tokens = [t.strip() for t in text.split(",") if t.strip()]
|
| 45 |
-
return all(len(t.split()) <= 3 for t in tokens)
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
def _is_coco_only(detector_key: str) -> bool:
|
| 49 |
-
return detector_key in _COCO_ONLY_DETECTORS
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def _map_coco_classes(
|
| 53 |
-
object_classes: List[str], detector_key: str
|
| 54 |
-
) -> tuple[List[str], List[str], List[str]]:
|
| 55 |
-
"""Map object classes to COCO vocabulary for COCO-only detectors.
|
| 56 |
-
|
| 57 |
-
Returns:
|
| 58 |
-
(mapped_classes, unmappable_classes, warnings)
|
| 59 |
-
"""
|
| 60 |
-
if not _is_coco_only(detector_key):
|
| 61 |
-
return object_classes, [], []
|
| 62 |
-
|
| 63 |
-
mapped = []
|
| 64 |
-
unmappable = []
|
| 65 |
-
warnings = []
|
| 66 |
-
seen = set()
|
| 67 |
-
|
| 68 |
-
for cls in object_classes:
|
| 69 |
-
canonical = canonicalize_coco_name(cls)
|
| 70 |
-
if canonical is not None:
|
| 71 |
-
if canonical not in seen:
|
| 72 |
-
mapped.append(canonical)
|
| 73 |
-
seen.add(canonical)
|
| 74 |
-
if canonical.lower() != cls.lower():
|
| 75 |
-
warnings.append(
|
| 76 |
-
f"'{cls}' mapped to COCO class '{canonical}'."
|
| 77 |
-
)
|
| 78 |
-
else:
|
| 79 |
-
unmappable.append(cls)
|
| 80 |
-
warnings.append(
|
| 81 |
-
f"'{cls}' is not in COCO vocabulary. Will not be detected by {detector_key}."
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
return mapped, unmappable, warnings
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
def _build_fast_path_spec(
|
| 88 |
-
raw_text: str, object_classes: List[str], detector_key: str
|
| 89 |
-
) -> MissionSpecification:
|
| 90 |
-
"""Build MissionSpecification for simple comma-separated input (no LLM call)."""
|
| 91 |
-
mapped, unmappable, warnings = _map_coco_classes(object_classes, detector_key)
|
| 92 |
-
|
| 93 |
-
if _is_coco_only(detector_key) and not mapped:
|
| 94 |
-
raise MissionParseError(
|
| 95 |
-
f"None of the requested objects ({', '.join(object_classes)}) match the "
|
| 96 |
-
f"{detector_key} vocabulary. This detector supports: "
|
| 97 |
-
f"{coco_class_catalog()}. "
|
| 98 |
-
f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
|
| 99 |
-
warnings=warnings,
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
final_classes = mapped if _is_coco_only(detector_key) else object_classes
|
| 103 |
-
|
| 104 |
-
return MissionSpecification(
|
| 105 |
-
object_classes=final_classes,
|
| 106 |
-
mission_intent="DETECT",
|
| 107 |
-
domain="GENERIC",
|
| 108 |
-
domain_source="INFERRED",
|
| 109 |
-
relevance_criteria=RelevanceCriteria(
|
| 110 |
-
required_classes=final_classes,
|
| 111 |
-
min_confidence=0.0,
|
| 112 |
-
),
|
| 113 |
-
context_phrases=[],
|
| 114 |
-
stripped_modifiers=[],
|
| 115 |
-
operator_text=raw_text,
|
| 116 |
-
parse_mode="FAST_PATH",
|
| 117 |
-
parse_confidence="HIGH",
|
| 118 |
-
parse_warnings=warnings,
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
# --- LLM Extraction ---
|
| 123 |
-
|
| 124 |
-
_SYSTEM_PROMPT = (
|
| 125 |
-
"You are a mission text parser for an object detection system. Your ONLY job is to extract "
|
| 126 |
-
"structured fields from operator mission text. You do NOT assess threats. You do NOT reason "
|
| 127 |
-
"about tactics. You extract and classify.\n\n"
|
| 128 |
-
"OUTPUT SCHEMA (strict JSON):\n"
|
| 129 |
-
"{\n"
|
| 130 |
-
' "object_classes": ["string"],\n'
|
| 131 |
-
' "mission_intent": "ENUM",\n'
|
| 132 |
-
' "domain": "ENUM",\n'
|
| 133 |
-
' "context_phrases": ["string"],\n'
|
| 134 |
-
' "stripped_modifiers": ["string"],\n'
|
| 135 |
-
' "parse_confidence": "ENUM",\n'
|
| 136 |
-
' "parse_warnings": ["string"]\n'
|
| 137 |
-
"}\n\n"
|
| 138 |
-
"EXTRACTION RULES:\n\n"
|
| 139 |
-
"1. OBJECT_CLASSES — What to extract:\n"
|
| 140 |
-
" - Extract nouns and noun phrases that refer to PHYSICAL, VISUALLY DETECTABLE objects.\n"
|
| 141 |
-
" - Keep visual descriptors that narrow the category: 'small boat', 'military vehicle', 'cargo ship'.\n"
|
| 142 |
-
" - Use singular form: 'vessels' -> 'vessel', 'people' -> 'person'.\n"
|
| 143 |
-
" - If the input is already comma-separated class labels (e.g., 'person, car, boat'),\n"
|
| 144 |
-
" use them directly without modification.\n\n"
|
| 145 |
-
"2. OBJECT_CLASSES — What to strip:\n"
|
| 146 |
-
" - Remove threat/intent adjectives: 'hostile', 'suspicious', 'friendly', 'dangerous', 'enemy'.\n"
|
| 147 |
-
" -> Move these to stripped_modifiers.\n"
|
| 148 |
-
" - Remove action verbs: 'approaching', 'fleeing', 'attacking'.\n"
|
| 149 |
-
" -> Move the full phrase to context_phrases.\n"
|
| 150 |
-
" - Remove spatial/temporal phrases: 'from the east', 'near the harbor', 'at night'.\n"
|
| 151 |
-
" -> Move to context_phrases.\n"
|
| 152 |
-
" - Do NOT extract abstract concepts: 'threat', 'danger', 'hazard', 'risk' are not objects.\n\n"
|
| 153 |
-
"3. MISSION_INTENT — Infer from verbs:\n"
|
| 154 |
-
" - 'detect', 'find', 'locate', 'spot', 'search for' -> DETECT\n"
|
| 155 |
-
" - 'classify', 'identify', 'determine type of' -> CLASSIFY\n"
|
| 156 |
-
" - 'track', 'follow', 'monitor movement of' -> TRACK\n"
|
| 157 |
-
" - 'assess threat', 'evaluate danger', 'threat assessment' -> ASSESS_THREAT\n"
|
| 158 |
-
" - 'monitor', 'watch', 'observe', 'surveil' -> MONITOR\n"
|
| 159 |
-
" - If no verb present (bare class list), default to DETECT.\n\n"
|
| 160 |
-
"4. DOMAIN — Infer from contextual clues:\n"
|
| 161 |
-
" - Maritime vocabulary (vessel, ship, boat, harbor, naval, maritime, wake, sea) -> NAVAL\n"
|
| 162 |
-
" - Ground vocabulary (vehicle, convoy, checkpoint, road, building, infantry) -> GROUND\n"
|
| 163 |
-
" - Aerial vocabulary (aircraft, drone, UAV, airspace, altitude, flight) -> AERIAL\n"
|
| 164 |
-
" - Urban vocabulary (pedestrian, intersection, storefront, crowd, building) -> URBAN\n"
|
| 165 |
-
" - If no domain clues present -> GENERIC\n\n"
|
| 166 |
-
"5. PARSE_CONFIDENCE:\n"
|
| 167 |
-
" - HIGH: Clear object classes extracted, domain identifiable.\n"
|
| 168 |
-
" - MEDIUM: Some ambiguity but reasonable extraction possible. Include warnings.\n"
|
| 169 |
-
" - LOW: Cannot extract meaningful object classes. Input is too abstract,\n"
|
| 170 |
-
" contradictory, or contains no visual object references.\n"
|
| 171 |
-
" Examples of LOW: 'keep us safe', 'do your job', 'analyze everything'.\n\n"
|
| 172 |
-
"FORBIDDEN:\n"
|
| 173 |
-
"- Do NOT infer object classes not implied by the text. If the text says 'boats',\n"
|
| 174 |
-
" do not add 'person' or 'vehicle' unless mentioned.\n"
|
| 175 |
-
"- Do NOT add threat scores, engagement rules, or tactical recommendations.\n"
|
| 176 |
-
"- Do NOT interpret what 'threat' or 'danger' means in terms of specific objects.\n"
|
| 177 |
-
" If the operator writes 'detect threats', set parse_confidence to LOW and warn:\n"
|
| 178 |
-
" \"'threats' is not a visual object class. Specify what objects to detect.\""
|
| 179 |
-
)
|
| 180 |
-
|
| 181 |
-
_VISION_GROUNDING_ADDENDUM = (
|
| 182 |
-
"\n\nVISION GROUNDING (when an image is provided):\n"
|
| 183 |
-
"You may receive the first frame of the operator's video feed as an image.\n"
|
| 184 |
-
"Use it to REFINE your object_classes extraction:\n\n"
|
| 185 |
-
"1. If the operator uses a general term (e.g., 'vessels', 'vehicles'),\n"
|
| 186 |
-
" inspect the image and add MORE SPECIFIC subcategories visible in the scene.\n"
|
| 187 |
-
" Example: operator says 'detect vessels', image shows a speedboat and a cargo ship\n"
|
| 188 |
-
" -> object_classes: ['vessel', 'speedboat', 'cargo ship']\n\n"
|
| 189 |
-
"2. If the operator mentions objects NOT visible in the first frame,\n"
|
| 190 |
-
" still include them (later frames may contain them), but add a\n"
|
| 191 |
-
" parse_warning noting they were not visible in the first frame.\n\n"
|
| 192 |
-
"3. Use the image to CONFIRM or REFINE the domain. If the text is ambiguous\n"
|
| 193 |
-
" but the image clearly shows open water, set domain to NAVAL.\n\n"
|
| 194 |
-
"4. Do NOT hallucinate objects. Only add specific subcategories if clearly\n"
|
| 195 |
-
" identifiable. When uncertain, keep the general term.\n\n"
|
| 196 |
-
"5. The same OUTPUT SCHEMA and all EXTRACTION RULES still apply.\n"
|
| 197 |
-
" The image is supplementary context, not a replacement for the text.\n"
|
| 198 |
-
)
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
def _extract_and_encode_first_frame(video_path: Optional[str]) -> Optional[str]:
|
| 202 |
-
"""Extract the first frame from a video and return it as a base64-encoded JPEG.
|
| 203 |
-
|
| 204 |
-
Never raises — returns None on any failure so the caller can fall back
|
| 205 |
-
to text-only parsing.
|
| 206 |
-
"""
|
| 207 |
-
if not video_path:
|
| 208 |
-
return None
|
| 209 |
-
try:
|
| 210 |
-
from inference import extract_first_frame
|
| 211 |
-
from utils.gpt_reasoning import encode_frame_to_b64
|
| 212 |
-
|
| 213 |
-
frame, _fps, _w, _h = extract_first_frame(video_path)
|
| 214 |
-
return encode_frame_to_b64(frame, quality=85)
|
| 215 |
-
except Exception:
|
| 216 |
-
logger.warning("Failed to extract/encode first frame for vision grounding", exc_info=True)
|
| 217 |
-
return None
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
def _call_extraction_llm(raw_text: str, detector_key: str, first_frame_b64: Optional[str] = None) -> dict:
|
| 221 |
-
"""Call GPT-4o to extract structured mission fields from natural language."""
|
| 222 |
-
if not get_api_key():
|
| 223 |
-
raise MissionParseError(
|
| 224 |
-
"OPENAI_API_KEY not set. Cannot parse natural language mission text. "
|
| 225 |
-
"Use comma-separated class labels instead (e.g., 'person, car, boat')."
|
| 226 |
-
)
|
| 227 |
-
|
| 228 |
-
detector_type = "COCO_ONLY" if _is_coco_only(detector_key) else "OPEN_VOCAB"
|
| 229 |
-
|
| 230 |
-
user_prompt_text = (
|
| 231 |
-
f'OPERATOR MISSION TEXT:\n"{raw_text}"\n\n'
|
| 232 |
-
f"DETECTOR TYPE: {detector_type}\n\n"
|
| 233 |
-
"Extract the structured mission specification from the above text."
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
# Build system prompt (append vision addendum when image is available)
|
| 237 |
-
system_content = _SYSTEM_PROMPT
|
| 238 |
-
if first_frame_b64:
|
| 239 |
-
system_content = _SYSTEM_PROMPT + _VISION_GROUNDING_ADDENDUM
|
| 240 |
-
|
| 241 |
-
# Build user message: mixed content array when image is available, plain string otherwise
|
| 242 |
-
if first_frame_b64:
|
| 243 |
-
user_message = {
|
| 244 |
-
"role": "user",
|
| 245 |
-
"content": [
|
| 246 |
-
{"type": "text", "text": user_prompt_text},
|
| 247 |
-
{
|
| 248 |
-
"type": "image_url",
|
| 249 |
-
"image_url": {
|
| 250 |
-
"url": f"data:image/jpeg;base64,{first_frame_b64}",
|
| 251 |
-
"detail": "low",
|
| 252 |
-
},
|
| 253 |
-
},
|
| 254 |
-
],
|
| 255 |
-
}
|
| 256 |
-
else:
|
| 257 |
-
user_message = {"role": "user", "content": user_prompt_text}
|
| 258 |
-
|
| 259 |
-
max_tokens = 700 if first_frame_b64 else 500
|
| 260 |
-
timeout_s = 45 if first_frame_b64 else 30
|
| 261 |
-
|
| 262 |
-
payload = {
|
| 263 |
-
"model": "gpt-4o",
|
| 264 |
-
"temperature": 0.0,
|
| 265 |
-
"max_tokens": max_tokens,
|
| 266 |
-
"response_format": {"type": "json_object"},
|
| 267 |
-
"messages": [
|
| 268 |
-
{"role": "system", "content": system_content},
|
| 269 |
-
user_message,
|
| 270 |
-
],
|
| 271 |
-
}
|
| 272 |
-
|
| 273 |
-
try:
|
| 274 |
-
resp_data = chat_completion(payload, timeout=timeout_s)
|
| 275 |
-
content, _refusal = extract_content(resp_data)
|
| 276 |
-
if not content:
|
| 277 |
-
raise MissionParseError("GPT returned empty content during mission parsing.")
|
| 278 |
-
|
| 279 |
-
return json.loads(content)
|
| 280 |
-
|
| 281 |
-
except OpenAIAPIError as e:
|
| 282 |
-
raise MissionParseError(f"Mission parsing API call failed: {e}")
|
| 283 |
-
except json.JSONDecodeError:
|
| 284 |
-
raise MissionParseError(
|
| 285 |
-
"GPT returned invalid JSON. Please rephrase your mission."
|
| 286 |
-
)
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
def _validate_and_build(
|
| 290 |
-
llm_output: dict, raw_text: str, detector_key: str
|
| 291 |
-
) -> MissionSpecification:
|
| 292 |
-
"""Deterministic validation pipeline (Section 7.3 decision tree)."""
|
| 293 |
-
|
| 294 |
-
# Step 2: Extract fields with defaults
|
| 295 |
-
object_classes = llm_output.get("object_classes", [])
|
| 296 |
-
mission_intent = llm_output.get("mission_intent", "DETECT")
|
| 297 |
-
domain = llm_output.get("domain", "GENERIC")
|
| 298 |
-
context_phrases = llm_output.get("context_phrases", [])
|
| 299 |
-
stripped_modifiers = llm_output.get("stripped_modifiers", [])
|
| 300 |
-
parse_confidence = llm_output.get("parse_confidence", "LOW")
|
| 301 |
-
parse_warnings = llm_output.get("parse_warnings", [])
|
| 302 |
-
|
| 303 |
-
# Validate enum values
|
| 304 |
-
valid_intents = {"DETECT", "CLASSIFY", "TRACK", "ASSESS_THREAT", "MONITOR"}
|
| 305 |
-
if mission_intent not in valid_intents:
|
| 306 |
-
mission_intent = "DETECT"
|
| 307 |
-
parse_warnings.append(f"Invalid mission_intent '{llm_output.get('mission_intent')}', defaulted to DETECT.")
|
| 308 |
-
|
| 309 |
-
valid_domains = {"NAVAL", "GROUND", "AERIAL", "URBAN", "GENERIC"}
|
| 310 |
-
if domain not in valid_domains:
|
| 311 |
-
domain = "GENERIC"
|
| 312 |
-
parse_warnings.append(f"Invalid domain '{llm_output.get('domain')}', defaulted to GENERIC.")
|
| 313 |
-
|
| 314 |
-
valid_confidence = {"HIGH", "MEDIUM", "LOW"}
|
| 315 |
-
if parse_confidence not in valid_confidence:
|
| 316 |
-
parse_confidence = "LOW"
|
| 317 |
-
|
| 318 |
-
# Step 3: Parse confidence check
|
| 319 |
-
if parse_confidence == "LOW":
|
| 320 |
-
warnings_str = "; ".join(parse_warnings) if parse_warnings else "No details"
|
| 321 |
-
raise MissionParseError(
|
| 322 |
-
f"Could not extract object classes from mission text. "
|
| 323 |
-
f"Warnings: {warnings_str}. "
|
| 324 |
-
f"Please specify concrete objects to detect (e.g., 'vessel, small boat').",
|
| 325 |
-
warnings=parse_warnings,
|
| 326 |
-
)
|
| 327 |
-
|
| 328 |
-
# Validate object_classes is non-empty
|
| 329 |
-
if not object_classes:
|
| 330 |
-
raise MissionParseError(
|
| 331 |
-
"Mission text produced no detectable object classes. "
|
| 332 |
-
"Please specify concrete objects to detect.",
|
| 333 |
-
warnings=parse_warnings,
|
| 334 |
-
)
|
| 335 |
-
|
| 336 |
-
# Filter out empty strings
|
| 337 |
-
object_classes = [c.strip() for c in object_classes if c and c.strip()]
|
| 338 |
-
if not object_classes:
|
| 339 |
-
raise MissionParseError(
|
| 340 |
-
"All extracted object classes were empty after cleanup.",
|
| 341 |
-
warnings=parse_warnings,
|
| 342 |
-
)
|
| 343 |
-
|
| 344 |
-
# Step 4: COCO vocabulary mapping
|
| 345 |
-
mapped, unmappable, coco_warnings = _map_coco_classes(object_classes, detector_key)
|
| 346 |
-
parse_warnings.extend(coco_warnings)
|
| 347 |
-
|
| 348 |
-
if _is_coco_only(detector_key):
|
| 349 |
-
if not mapped:
|
| 350 |
-
raise MissionParseError(
|
| 351 |
-
f"None of the requested objects ({', '.join(object_classes)}) match the "
|
| 352 |
-
f"{detector_key} vocabulary. "
|
| 353 |
-
f"This detector supports: {coco_class_catalog()}. "
|
| 354 |
-
f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
|
| 355 |
-
warnings=parse_warnings,
|
| 356 |
-
)
|
| 357 |
-
final_classes = mapped
|
| 358 |
-
else:
|
| 359 |
-
final_classes = object_classes
|
| 360 |
-
|
| 361 |
-
# Step 5: Build RelevanceCriteria deterministically
|
| 362 |
-
relevance_criteria = RelevanceCriteria(
|
| 363 |
-
required_classes=final_classes,
|
| 364 |
-
min_confidence=0.0,
|
| 365 |
-
)
|
| 366 |
-
|
| 367 |
-
# Step 6: Construct MissionSpecification
|
| 368 |
-
return MissionSpecification(
|
| 369 |
-
object_classes=final_classes,
|
| 370 |
-
mission_intent=mission_intent,
|
| 371 |
-
domain=domain,
|
| 372 |
-
domain_source="INFERRED",
|
| 373 |
-
relevance_criteria=relevance_criteria,
|
| 374 |
-
# INVARIANT INV-13: context_phrases are forwarded to LLM reasoning layers
|
| 375 |
-
# (GPT threat assessment, threat chat) as situational context ONLY.
|
| 376 |
-
# They must NEVER be used in evaluate_relevance(), prioritization,
|
| 377 |
-
# or any deterministic filtering/sorting logic.
|
| 378 |
-
context_phrases=context_phrases,
|
| 379 |
-
stripped_modifiers=stripped_modifiers,
|
| 380 |
-
operator_text=raw_text,
|
| 381 |
-
parse_mode="LLM_EXTRACTED",
|
| 382 |
-
parse_confidence=parse_confidence,
|
| 383 |
-
parse_warnings=parse_warnings,
|
| 384 |
-
)
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
_DOMAIN_BROAD_CATEGORIES: dict[str, List[str]] = {
|
| 388 |
-
"NAVAL": ["vessel", "ship", "boat", "buoy", "person"],
|
| 389 |
-
"AERIAL": ["aircraft", "helicopter", "drone", "airplane"],
|
| 390 |
-
"GROUND": ["vehicle", "car", "truck", "person", "building"],
|
| 391 |
-
"URBAN": ["person", "vehicle", "car", "bicycle"],
|
| 392 |
-
"GENERIC": ["object"],
|
| 393 |
-
}
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
def build_broad_queries(
|
| 397 |
-
detector_key: str, mission_spec: MissionSpecification
|
| 398 |
-
) -> List[str]:
|
| 399 |
-
"""Build broad detector queries for LLM post-filter mode.
|
| 400 |
-
|
| 401 |
-
For FAST_PATH: return object_classes directly (unchanged behavior).
|
| 402 |
-
For COCO detectors (LLM_EXTRACTED): return ALL 80 COCO classes.
|
| 403 |
-
For open-vocab detectors (LLM_EXTRACTED): return LLM-extracted classes
|
| 404 |
-
PLUS broad domain categories to maximize recall.
|
| 405 |
-
"""
|
| 406 |
-
if mission_spec.parse_mode == "FAST_PATH":
|
| 407 |
-
return mission_spec.object_classes
|
| 408 |
-
|
| 409 |
-
# LLM_EXTRACTED path: detect broadly
|
| 410 |
-
if _is_coco_only(detector_key):
|
| 411 |
-
# COCO detectors ignore queries anyway (DETR detects all 80;
|
| 412 |
-
# YOLO11 falls back to all if no matches). Send everything.
|
| 413 |
-
return list(COCO_CLASSES)
|
| 414 |
-
|
| 415 |
-
# Open-vocab detector (e.g. Grounding DINO):
|
| 416 |
-
# Combine LLM-extracted classes with domain-specific broad categories
|
| 417 |
-
broad = list(mission_spec.object_classes)
|
| 418 |
-
domain_extras = _DOMAIN_BROAD_CATEGORIES.get(
|
| 419 |
-
mission_spec.domain, _DOMAIN_BROAD_CATEGORIES["GENERIC"]
|
| 420 |
-
)
|
| 421 |
-
seen = {c.lower() for c in broad}
|
| 422 |
-
for cat in domain_extras:
|
| 423 |
-
if cat.lower() not in seen:
|
| 424 |
-
broad.append(cat)
|
| 425 |
-
seen.add(cat.lower())
|
| 426 |
-
|
| 427 |
-
logger.info("Broad queries for %s: %s", detector_key, broad)
|
| 428 |
-
return broad
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
def parse_mission_text(
|
| 432 |
-
raw_text: str,
|
| 433 |
-
detector_key: str,
|
| 434 |
-
video_path: Optional[str] = None,
|
| 435 |
-
) -> MissionSpecification:
|
| 436 |
-
"""Parse raw mission text into a validated MissionSpecification.
|
| 437 |
-
|
| 438 |
-
Args:
|
| 439 |
-
raw_text: Verbatim mission text from the operator.
|
| 440 |
-
detector_key: Detector model key (determines COCO vocabulary constraints).
|
| 441 |
-
video_path: Optional path to input video; first frame used for vision grounding.
|
| 442 |
-
|
| 443 |
-
Returns:
|
| 444 |
-
Validated MissionSpecification.
|
| 445 |
-
|
| 446 |
-
Raises:
|
| 447 |
-
MissionParseError: If mission text cannot produce a valid specification.
|
| 448 |
-
"""
|
| 449 |
-
if not raw_text or not raw_text.strip():
|
| 450 |
-
raise MissionParseError(
|
| 451 |
-
"Mission text is empty. Specify objects to detect or use the default queries."
|
| 452 |
-
)
|
| 453 |
-
|
| 454 |
-
raw_text = raw_text.strip()
|
| 455 |
-
|
| 456 |
-
# Fast-path: simple comma-separated labels -> skip LLM
|
| 457 |
-
if _is_comma_separated_labels(raw_text):
|
| 458 |
-
object_classes = [t.strip() for t in raw_text.split(",") if t.strip()]
|
| 459 |
-
logger.info(
|
| 460 |
-
"Mission fast-path: comma-separated labels %s", object_classes
|
| 461 |
-
)
|
| 462 |
-
return _build_fast_path_spec(raw_text, object_classes, detector_key)
|
| 463 |
-
|
| 464 |
-
# LLM path: natural language mission text
|
| 465 |
-
logger.info("Mission LLM-path: extracting from natural language")
|
| 466 |
-
first_frame_b64 = _extract_and_encode_first_frame(video_path)
|
| 467 |
-
if first_frame_b64:
|
| 468 |
-
logger.info("Vision grounding: first frame encoded for LLM call")
|
| 469 |
-
llm_output = _call_extraction_llm(raw_text, detector_key, first_frame_b64=first_frame_b64)
|
| 470 |
-
logger.info("Mission LLM extraction result: %s", llm_output)
|
| 471 |
-
|
| 472 |
-
mission_spec = _validate_and_build(llm_output, raw_text, detector_key)
|
| 473 |
-
logger.info(
|
| 474 |
-
"Mission parsed: classes=%s intent=%s domain=%s(%s) confidence=%s",
|
| 475 |
-
mission_spec.object_classes,
|
| 476 |
-
mission_spec.mission_intent,
|
| 477 |
-
mission_spec.domain,
|
| 478 |
-
mission_spec.domain_source,
|
| 479 |
-
mission_spec.parse_confidence,
|
| 480 |
-
)
|
| 481 |
-
return mission_spec
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/openai_client.py
DELETED
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Shared OpenAI HTTP client — single implementation of the chat-completions call.
|
| 3 |
-
|
| 4 |
-
Replaces duplicated urllib boilerplate in gpt_reasoning, relevance,
|
| 5 |
-
mission_parser, and threat_chat.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import json
|
| 9 |
-
import logging
|
| 10 |
-
import os
|
| 11 |
-
import urllib.request
|
| 12 |
-
import urllib.error
|
| 13 |
-
from typing import Dict, Optional, Tuple
|
| 14 |
-
|
| 15 |
-
logger = logging.getLogger(__name__)
|
| 16 |
-
|
| 17 |
-
_API_URL = "https://api.openai.com/v1/chat/completions"
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class OpenAIAPIError(Exception):
|
| 21 |
-
"""Raised when the OpenAI API call fails (HTTP or network error)."""
|
| 22 |
-
|
| 23 |
-
def __init__(self, message: str, status_code: Optional[int] = None):
|
| 24 |
-
self.status_code = status_code
|
| 25 |
-
super().__init__(message)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def get_api_key() -> Optional[str]:
|
| 29 |
-
"""Return the OpenAI API key from the environment, or None."""
|
| 30 |
-
return os.environ.get("OPENAI_API_KEY")
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def chat_completion(payload: Dict, *, timeout: int = 30) -> Dict:
|
| 34 |
-
"""Send a chat-completion request and return the parsed JSON response.
|
| 35 |
-
|
| 36 |
-
Args:
|
| 37 |
-
payload: Full request body (model, messages, etc.).
|
| 38 |
-
timeout: HTTP timeout in seconds.
|
| 39 |
-
|
| 40 |
-
Returns:
|
| 41 |
-
Parsed response dict.
|
| 42 |
-
|
| 43 |
-
Raises:
|
| 44 |
-
OpenAIAPIError: On HTTP or network failure.
|
| 45 |
-
"""
|
| 46 |
-
api_key = get_api_key()
|
| 47 |
-
if not api_key:
|
| 48 |
-
raise OpenAIAPIError("OPENAI_API_KEY not set")
|
| 49 |
-
|
| 50 |
-
headers = {
|
| 51 |
-
"Content-Type": "application/json",
|
| 52 |
-
"Authorization": f"Bearer {api_key}",
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
try:
|
| 56 |
-
req = urllib.request.Request(
|
| 57 |
-
_API_URL,
|
| 58 |
-
data=json.dumps(payload).encode("utf-8"),
|
| 59 |
-
headers=headers,
|
| 60 |
-
method="POST",
|
| 61 |
-
)
|
| 62 |
-
with urllib.request.urlopen(req, timeout=timeout) as response:
|
| 63 |
-
return json.loads(response.read().decode("utf-8"))
|
| 64 |
-
except urllib.error.HTTPError as e:
|
| 65 |
-
raise OpenAIAPIError(
|
| 66 |
-
f"HTTP {e.code}: {e.reason}", status_code=e.code
|
| 67 |
-
) from e
|
| 68 |
-
except urllib.error.URLError as e:
|
| 69 |
-
raise OpenAIAPIError(f"URL error: {e.reason}") from e
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def extract_content(resp_data: Dict) -> Tuple[Optional[str], Optional[str]]:
|
| 73 |
-
"""Safely extract content and refusal from a chat-completion response.
|
| 74 |
-
|
| 75 |
-
Returns:
|
| 76 |
-
(content, refusal) — either may be None.
|
| 77 |
-
"""
|
| 78 |
-
choice = resp_data.get("choices", [{}])[0]
|
| 79 |
-
message = choice.get("message", {})
|
| 80 |
-
return message.get("content"), message.get("refusal")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/relevance.py
DELETED
|
@@ -1,141 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Object relevance evaluation — deterministic gate between detection and GPT assessment.
|
| 3 |
-
|
| 4 |
-
Public functions:
|
| 5 |
-
evaluate_relevance(detection, criteria) -> RelevanceDecision (deterministic)
|
| 6 |
-
evaluate_relevance_llm(detected_labels, mission_text) -> set[str] (LLM post-filter)
|
| 7 |
-
|
| 8 |
-
INVARIANT INV-13 enforcement: evaluate_relevance() accepts RelevanceCriteria, NOT
|
| 9 |
-
MissionSpecification. It cannot see context_phrases, stripped_modifiers, or any
|
| 10 |
-
LLM-derived field. This is structural, not by convention.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
import json
|
| 14 |
-
import logging
|
| 15 |
-
from typing import Any, Dict, List, NamedTuple, Set
|
| 16 |
-
|
| 17 |
-
from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
|
| 18 |
-
|
| 19 |
-
from coco_classes import canonicalize_coco_name
|
| 20 |
-
from utils.schemas import RelevanceCriteria
|
| 21 |
-
|
| 22 |
-
logger = logging.getLogger(__name__)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
class RelevanceDecision(NamedTuple):
|
| 26 |
-
relevant: bool
|
| 27 |
-
reason: str # "ok" | "label_not_in_required_classes" | "below_confidence"
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def evaluate_relevance(
|
| 31 |
-
detection: Dict[str, Any],
|
| 32 |
-
criteria: RelevanceCriteria,
|
| 33 |
-
) -> RelevanceDecision:
|
| 34 |
-
"""Evaluate whether a detection is relevant to the mission.
|
| 35 |
-
|
| 36 |
-
Pure deterministic predicate — no LLM involvement.
|
| 37 |
-
|
| 38 |
-
Args:
|
| 39 |
-
detection: Detection dict with at least 'label' and 'score' keys.
|
| 40 |
-
criteria: RelevanceCriteria with required_classes and min_confidence.
|
| 41 |
-
|
| 42 |
-
Returns:
|
| 43 |
-
RelevanceDecision(relevant=bool, reason=str).
|
| 44 |
-
"""
|
| 45 |
-
label = (detection.get("label") or "").lower().strip()
|
| 46 |
-
confidence = detection.get("score", 0.0)
|
| 47 |
-
|
| 48 |
-
if not label:
|
| 49 |
-
return RelevanceDecision(False, "label_not_in_required_classes")
|
| 50 |
-
|
| 51 |
-
# Build lowercase set of required classes for comparison
|
| 52 |
-
required_lower = {c.lower() for c in criteria.required_classes}
|
| 53 |
-
|
| 54 |
-
# Direct match
|
| 55 |
-
if label in required_lower:
|
| 56 |
-
if confidence < criteria.min_confidence:
|
| 57 |
-
return RelevanceDecision(False, "below_confidence")
|
| 58 |
-
return RelevanceDecision(True, "ok")
|
| 59 |
-
|
| 60 |
-
# Synonym match via COCO canonicalization
|
| 61 |
-
canonical = canonicalize_coco_name(label)
|
| 62 |
-
if canonical and canonical.lower() in required_lower:
|
| 63 |
-
if confidence < criteria.min_confidence:
|
| 64 |
-
return RelevanceDecision(False, "below_confidence")
|
| 65 |
-
return RelevanceDecision(True, "ok")
|
| 66 |
-
|
| 67 |
-
# Check if any required class canonicalizes to the same COCO class as the label
|
| 68 |
-
if canonical:
|
| 69 |
-
for req in criteria.required_classes:
|
| 70 |
-
req_canonical = canonicalize_coco_name(req)
|
| 71 |
-
if req_canonical and req_canonical.lower() == canonical.lower():
|
| 72 |
-
if confidence < criteria.min_confidence:
|
| 73 |
-
return RelevanceDecision(False, "below_confidence")
|
| 74 |
-
return RelevanceDecision(True, "ok")
|
| 75 |
-
|
| 76 |
-
return RelevanceDecision(False, "label_not_in_required_classes")
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def evaluate_relevance_llm(
|
| 80 |
-
detected_labels: List[str],
|
| 81 |
-
mission_text: str,
|
| 82 |
-
) -> Set[str]:
|
| 83 |
-
"""Ask GPT which detected labels are relevant to the mission.
|
| 84 |
-
|
| 85 |
-
Called ONCE on frame 0 with the unique labels found by the detector.
|
| 86 |
-
Returns a set of relevant label strings (lowercased).
|
| 87 |
-
|
| 88 |
-
On API failure, falls back to accepting all labels (fail-open, logged).
|
| 89 |
-
"""
|
| 90 |
-
if not detected_labels:
|
| 91 |
-
return set()
|
| 92 |
-
|
| 93 |
-
if not get_api_key():
|
| 94 |
-
logger.warning(
|
| 95 |
-
"OPENAI_API_KEY not set — LLM relevance filter falling back to accept-all"
|
| 96 |
-
)
|
| 97 |
-
return set(detected_labels)
|
| 98 |
-
|
| 99 |
-
prompt = (
|
| 100 |
-
f"Given this mission: \"{mission_text}\"\n\n"
|
| 101 |
-
f"Which of these detected object classes are relevant to the mission?\n"
|
| 102 |
-
f"{json.dumps(detected_labels)}\n\n"
|
| 103 |
-
"Return JSON: {\"relevant_labels\": [...]}\n"
|
| 104 |
-
"Only include labels from the provided list that are relevant to "
|
| 105 |
-
"accomplishing the mission. Be inclusive — if in doubt, include it."
|
| 106 |
-
)
|
| 107 |
-
|
| 108 |
-
payload = {
|
| 109 |
-
"model": "gpt-4o-mini",
|
| 110 |
-
"temperature": 0.0,
|
| 111 |
-
"max_tokens": 200,
|
| 112 |
-
"response_format": {"type": "json_object"},
|
| 113 |
-
"messages": [
|
| 114 |
-
{"role": "system", "content": "You are a mission relevance filter. Return only JSON."},
|
| 115 |
-
{"role": "user", "content": prompt},
|
| 116 |
-
],
|
| 117 |
-
}
|
| 118 |
-
|
| 119 |
-
try:
|
| 120 |
-
resp_data = chat_completion(payload)
|
| 121 |
-
content, _refusal = extract_content(resp_data)
|
| 122 |
-
if not content:
|
| 123 |
-
logger.warning("GPT returned empty content for relevance filter — accept-all")
|
| 124 |
-
return set(detected_labels)
|
| 125 |
-
|
| 126 |
-
result = json.loads(content)
|
| 127 |
-
relevant = result.get("relevant_labels", detected_labels)
|
| 128 |
-
relevant_set = {label.lower() for label in relevant}
|
| 129 |
-
|
| 130 |
-
logger.info(
|
| 131 |
-
"LLM relevance filter: mission=%r detected=%s relevant=%s",
|
| 132 |
-
mission_text, detected_labels, relevant_set,
|
| 133 |
-
)
|
| 134 |
-
return relevant_set
|
| 135 |
-
|
| 136 |
-
except OpenAIAPIError as e:
|
| 137 |
-
logger.warning("LLM relevance API call failed: %s — accept-all fallback", e)
|
| 138 |
-
return set(detected_labels)
|
| 139 |
-
except (json.JSONDecodeError, KeyError, TypeError) as e:
|
| 140 |
-
logger.warning("LLM relevance response parse failed: %s — accept-all fallback", e)
|
| 141 |
-
return set(detected_labels)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/schemas.py
DELETED
|
@@ -1,115 +0,0 @@
|
|
| 1 |
-
from pydantic import BaseModel, Field
|
| 2 |
-
from typing import List, Literal
|
| 3 |
-
|
| 4 |
-
# --- Mission-Driven Abstractions ---
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
class RelevanceCriteria(BaseModel):
|
| 8 |
-
"""Deterministic boolean predicate for filtering detections against a mission.
|
| 9 |
-
|
| 10 |
-
This is the ONLY input to evaluate_relevance(). It intentionally excludes
|
| 11 |
-
context_phrases, stripped_modifiers, and all LLM-derived context so that
|
| 12 |
-
relevance filtering remains purely deterministic (INV-13).
|
| 13 |
-
"""
|
| 14 |
-
required_classes: List[str] = Field(
|
| 15 |
-
..., min_length=1,
|
| 16 |
-
description="Object categories that satisfy the mission. "
|
| 17 |
-
"Detections whose label is not in this list are excluded."
|
| 18 |
-
)
|
| 19 |
-
min_confidence: float = Field(
|
| 20 |
-
default=0.0, ge=0.0, le=1.0,
|
| 21 |
-
description="Minimum detector confidence to consider a detection relevant."
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
class MissionSpecification(BaseModel):
|
| 26 |
-
"""Structured representation of operator intent.
|
| 27 |
-
|
| 28 |
-
Created once from raw mission text at the API boundary (app.py).
|
| 29 |
-
Forwarded to: detector (object_classes), GPT (full spec), chat (full spec),
|
| 30 |
-
relevance gate (relevance_criteria only — INV-13).
|
| 31 |
-
|
| 32 |
-
INVARIANT INV-13: context_phrases are forwarded to LLM reasoning layers
|
| 33 |
-
(GPT threat assessment, threat chat) as situational context ONLY.
|
| 34 |
-
They must NEVER be used in evaluate_relevance(), prioritization,
|
| 35 |
-
or any deterministic filtering/sorting logic.
|
| 36 |
-
"""
|
| 37 |
-
|
| 38 |
-
# --- Extracted by LLM or fast-path ---
|
| 39 |
-
object_classes: List[str] = Field(
|
| 40 |
-
..., min_length=1,
|
| 41 |
-
description="Concrete, visually detectable object categories to detect. "
|
| 42 |
-
"These become detector queries. Must be nouns, not adjectives or verbs."
|
| 43 |
-
)
|
| 44 |
-
mission_intent: Literal[
|
| 45 |
-
"DETECT", "CLASSIFY", "TRACK", "ASSESS_THREAT", "MONITOR"
|
| 46 |
-
] = Field(
|
| 47 |
-
...,
|
| 48 |
-
description="Operator purpose. DETECT=find objects, CLASSIFY=identify type, "
|
| 49 |
-
"TRACK=follow over time, ASSESS_THREAT=evaluate danger, MONITOR=passive watch."
|
| 50 |
-
)
|
| 51 |
-
domain: Literal[
|
| 52 |
-
"NAVAL", "GROUND", "AERIAL", "URBAN", "GENERIC"
|
| 53 |
-
] = Field(
|
| 54 |
-
...,
|
| 55 |
-
description="Operational domain. Selects the GPT assessment schema and system prompt."
|
| 56 |
-
)
|
| 57 |
-
domain_source: Literal["INFERRED", "OPERATOR_SET"] = Field(
|
| 58 |
-
default="INFERRED",
|
| 59 |
-
description="Whether domain was LLM-inferred or explicitly set by operator."
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
-
# --- Deterministic (derived from object_classes) ---
|
| 63 |
-
relevance_criteria: RelevanceCriteria = Field(
|
| 64 |
-
...,
|
| 65 |
-
description="Boolean predicate for filtering detections. "
|
| 66 |
-
"Built deterministically from object_classes after extraction."
|
| 67 |
-
)
|
| 68 |
-
|
| 69 |
-
# --- Context preservation ---
|
| 70 |
-
context_phrases: List[str] = Field(
|
| 71 |
-
default_factory=list,
|
| 72 |
-
description="Non-class contextual phrases from mission text. "
|
| 73 |
-
"E.g., 'approaching from the east', 'near the harbor'. "
|
| 74 |
-
"Forwarded to GPT as situational context, NOT used for detection."
|
| 75 |
-
)
|
| 76 |
-
stripped_modifiers: List[str] = Field(
|
| 77 |
-
default_factory=list,
|
| 78 |
-
description="Adjectives/modifiers removed during extraction. "
|
| 79 |
-
"E.g., 'hostile', 'suspicious', 'friendly'. Logged for audit."
|
| 80 |
-
)
|
| 81 |
-
operator_text: str = Field(
|
| 82 |
-
...,
|
| 83 |
-
description="Original unmodified mission text from the operator. Preserved for audit."
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
# --- Parse mode ---
|
| 87 |
-
parse_mode: Literal["FAST_PATH", "LLM_EXTRACTED"] = Field(
|
| 88 |
-
default="FAST_PATH",
|
| 89 |
-
description="How this spec was created. FAST_PATH = comma-separated labels, "
|
| 90 |
-
"LLM_EXTRACTED = natural language parsed by GPT."
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
# --- LLM self-assessment ---
|
| 94 |
-
parse_confidence: Literal["HIGH", "MEDIUM", "LOW"] = Field(
|
| 95 |
-
...,
|
| 96 |
-
description="Confidence in the extraction. "
|
| 97 |
-
"LOW = could not reliably extract classes -> triggers rejection."
|
| 98 |
-
)
|
| 99 |
-
parse_warnings: List[str] = Field(
|
| 100 |
-
default_factory=list,
|
| 101 |
-
description="Specific issues encountered during extraction. "
|
| 102 |
-
"E.g., 'term \"threat\" is not a visual class, stripped'."
|
| 103 |
-
)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
class AssessmentStatus:
|
| 107 |
-
"""Canonical string constants for detection assessment lifecycle."""
|
| 108 |
-
ASSESSED = "ASSESSED"
|
| 109 |
-
UNASSESSED = "UNASSESSED"
|
| 110 |
-
PENDING_GPT = "PENDING_GPT"
|
| 111 |
-
SKIPPED_POLICY = "SKIPPED_POLICY"
|
| 112 |
-
REFUSED = "REFUSED"
|
| 113 |
-
ERROR = "ERROR"
|
| 114 |
-
NO_RESPONSE = "NO_RESPONSE"
|
| 115 |
-
STALE = "STALE"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/threat_chat.py
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Threat Chat Module - GPT-powered Q&A about detected threats.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import logging
|
| 6 |
-
from typing import List, Dict, Any
|
| 7 |
-
|
| 8 |
-
from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError
|
| 9 |
-
from utils.gpt_reasoning import _DOMAIN_ROLES
|
| 10 |
-
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def chat_about_threats(
|
| 15 |
-
question: str,
|
| 16 |
-
detections: List[Dict[str, Any]],
|
| 17 |
-
mission_spec_dict: Dict[str, Any] = None,
|
| 18 |
-
) -> str:
|
| 19 |
-
"""
|
| 20 |
-
Answer user questions about detected threats using GPT.
|
| 21 |
-
|
| 22 |
-
Args:
|
| 23 |
-
question: User's question about the current threat situation.
|
| 24 |
-
detections: List of detection dicts with gpt_raw threat analysis.
|
| 25 |
-
mission_spec_dict: Optional dict of mission specification fields.
|
| 26 |
-
|
| 27 |
-
Returns:
|
| 28 |
-
GPT's response as a string.
|
| 29 |
-
"""
|
| 30 |
-
if not get_api_key():
|
| 31 |
-
logger.warning("OPENAI_API_KEY not set. Cannot process threat chat.")
|
| 32 |
-
return "Error: OpenAI API key not configured."
|
| 33 |
-
|
| 34 |
-
if not detections:
|
| 35 |
-
return "No threats detected yet. Run detection first to analyze the scene."
|
| 36 |
-
|
| 37 |
-
# Build threat context from detections
|
| 38 |
-
threat_context = _build_threat_context(detections)
|
| 39 |
-
|
| 40 |
-
# Domain-aware role selection
|
| 41 |
-
domain = "GENERIC"
|
| 42 |
-
if mission_spec_dict:
|
| 43 |
-
domain = mission_spec_dict.get("domain", "GENERIC")
|
| 44 |
-
role_label = _DOMAIN_ROLES.get(domain, _DOMAIN_ROLES["GENERIC"])
|
| 45 |
-
|
| 46 |
-
# Build mission context block (INV-8: mission context forwarded to LLM calls)
|
| 47 |
-
mission_block = ""
|
| 48 |
-
if mission_spec_dict:
|
| 49 |
-
mission_block = "\nMISSION CONTEXT:\n"
|
| 50 |
-
if mission_spec_dict.get("mission_intent"):
|
| 51 |
-
mission_block += f"- Intent: {mission_spec_dict['mission_intent']}\n"
|
| 52 |
-
if mission_spec_dict.get("domain"):
|
| 53 |
-
mission_block += f"- Domain: {mission_spec_dict['domain']}\n"
|
| 54 |
-
if mission_spec_dict.get("object_classes"):
|
| 55 |
-
mission_block += f"- Target Classes: {', '.join(mission_spec_dict['object_classes'])}\n"
|
| 56 |
-
if mission_spec_dict.get("context_phrases"):
|
| 57 |
-
mission_block += f"- Situation: {'; '.join(mission_spec_dict['context_phrases'])}\n"
|
| 58 |
-
mission_block += "\n"
|
| 59 |
-
|
| 60 |
-
system_prompt = (
|
| 61 |
-
f"You are a {role_label} providing real-time threat analysis support. "
|
| 62 |
-
"You have access to the current threat assessment data from optical surveillance. "
|
| 63 |
-
"Answer questions concisely and tactically. Use military terminology where appropriate. "
|
| 64 |
-
"If asked about engagement recommendations, always note that final decisions rest with the commanding officer.\n\n"
|
| 65 |
-
f"{mission_block}"
|
| 66 |
-
"CURRENT THREAT PICTURE:\n"
|
| 67 |
-
f"{threat_context}\n\n"
|
| 68 |
-
"Respond to the operator's question based on this threat data."
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
payload = {
|
| 72 |
-
"model": "gpt-4o",
|
| 73 |
-
"messages": [
|
| 74 |
-
{"role": "system", "content": system_prompt},
|
| 75 |
-
{"role": "user", "content": question}
|
| 76 |
-
],
|
| 77 |
-
"max_tokens": 500,
|
| 78 |
-
"temperature": 0.3,
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
try:
|
| 82 |
-
resp_data = chat_completion(payload)
|
| 83 |
-
content, _refusal = extract_content(resp_data)
|
| 84 |
-
return content.strip() if content else "No response generated."
|
| 85 |
-
|
| 86 |
-
except OpenAIAPIError as e:
|
| 87 |
-
logger.error("OpenAI API error: %s", e)
|
| 88 |
-
return f"API Error: {e}"
|
| 89 |
-
except Exception as e:
|
| 90 |
-
logger.error("Threat chat failed: %s", e)
|
| 91 |
-
return f"Error processing question: {str(e)}"
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def _build_threat_context(detections: List[Dict[str, Any]]) -> str:
|
| 95 |
-
"""Build a text summary of all detected threats for GPT context."""
|
| 96 |
-
lines = []
|
| 97 |
-
|
| 98 |
-
for det in detections:
|
| 99 |
-
obj_id = det.get("id", "Unknown")
|
| 100 |
-
label = det.get("label", "object")
|
| 101 |
-
|
| 102 |
-
# Extract GPT raw data if available
|
| 103 |
-
gpt_raw = det.get("gpt_raw") or det.get("features") or {}
|
| 104 |
-
|
| 105 |
-
# Universal schema fields (with fallbacks to legacy names)
|
| 106 |
-
obj_type = gpt_raw.get("object_type") or gpt_raw.get("vessel_category", label)
|
| 107 |
-
size = gpt_raw.get("size", "")
|
| 108 |
-
threat_score = (
|
| 109 |
-
det.get("threat_level_score")
|
| 110 |
-
or gpt_raw.get("threat_level")
|
| 111 |
-
or gpt_raw.get("threat_level_score", "?")
|
| 112 |
-
)
|
| 113 |
-
threat_class = (
|
| 114 |
-
det.get("threat_classification")
|
| 115 |
-
or gpt_raw.get("threat_classification", "Unknown")
|
| 116 |
-
)
|
| 117 |
-
weapons = gpt_raw.get("visible_weapons", [])
|
| 118 |
-
weapon_ready = gpt_raw.get("weapon_readiness") or det.get("weapon_readiness", "Unknown")
|
| 119 |
-
motion = gpt_raw.get("motion_status", "Unknown")
|
| 120 |
-
range_est = gpt_raw.get("range_estimate") or gpt_raw.get("range_estimation_nm", "")
|
| 121 |
-
bearing = gpt_raw.get("bearing") or gpt_raw.get("bearing_clock") or det.get("gpt_direction", "")
|
| 122 |
-
intent = gpt_raw.get("tactical_intent", "")
|
| 123 |
-
dynamic_features = gpt_raw.get("dynamic_features", [])
|
| 124 |
-
|
| 125 |
-
# Build entry
|
| 126 |
-
entry = f"[{obj_id}] {obj_type}"
|
| 127 |
-
if size and size != "Unknown":
|
| 128 |
-
entry += f" ({size})"
|
| 129 |
-
entry += f"\n - Threat: {threat_class} (Score: {threat_score}/10)"
|
| 130 |
-
|
| 131 |
-
if range_est:
|
| 132 |
-
entry += f"\n - Range: {range_est}"
|
| 133 |
-
if bearing and bearing != "Unknown":
|
| 134 |
-
entry += f", Bearing: {bearing}"
|
| 135 |
-
if motion and motion != "Unknown":
|
| 136 |
-
entry += f"\n - Motion: {motion}"
|
| 137 |
-
if weapons:
|
| 138 |
-
entry += f"\n - Weapons: {', '.join(weapons) if isinstance(weapons, list) else weapons}"
|
| 139 |
-
if weapon_ready and weapon_ready != "Unknown":
|
| 140 |
-
entry += f" ({weapon_ready})"
|
| 141 |
-
if intent:
|
| 142 |
-
entry += f"\n - Assessed Intent: {intent}"
|
| 143 |
-
|
| 144 |
-
# Append dynamic features
|
| 145 |
-
for feat in dynamic_features:
|
| 146 |
-
if isinstance(feat, dict):
|
| 147 |
-
key = feat.get("key", "")
|
| 148 |
-
value = feat.get("value", "")
|
| 149 |
-
if key and value:
|
| 150 |
-
entry += f"\n - {key}: {value}"
|
| 151 |
-
|
| 152 |
-
lines.append(entry)
|
| 153 |
-
|
| 154 |
-
return "\n\n".join(lines) if lines else "No threat data available."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|