Spaces:

BiasLab2025
/

detection_base

Paused

File size: 19,714 Bytes

"""
Mission text parser — converts raw operator text into a validated MissionSpecification.

Main entry point: parse_mission_text(raw_text, detector_key, video_path=None) -> MissionSpecification
Also public: build_broad_queries(detector_key, mission_spec) -> List[str]

Internal flow:
1. Fast-path regex check -> skip LLM if comma-separated labels
2. LLM extraction call (GPT-4o, temperature 0.0)
3. Deterministic validation pipeline
4. COCO vocabulary mapping for COCO-only detectors
5. Build RelevanceCriteria deterministically from mapped classes
6. Return validated MissionSpecification or raise MissionParseError
"""

import json
import logging
import re
from typing import List, Optional

from utils.openai_client import chat_completion, extract_content, get_api_key, OpenAIAPIError

from coco_classes import COCO_CLASSES, canonicalize_coco_name, coco_class_catalog
from utils.schemas import MissionSpecification, RelevanceCriteria

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Detector keys that only support the COCO class vocabulary.
# Requested classes for these detectors are mapped onto COCO names (or rejected);
# any key not listed here is treated as open-vocabulary by this module.
_COCO_ONLY_DETECTORS = frozenset({"yolo11", "detr_resnet50"})


class MissionParseError(ValueError):
    """Raised when mission text cannot be parsed into a valid MissionSpecification.

    The ``warnings`` attribute is always a list owned by the exception itself,
    holding any parser warnings collected before the failure.
    """

    def __init__(self, message: str, warnings: Optional[List[str]] = None):
        """Create the error.

        Args:
            message: Human-readable explanation; forwarded to ``ValueError``.
            warnings: Optional warnings gathered before parsing failed.
        """
        if not warnings:
            collected: List[str] = []
        elif isinstance(warnings, str):
            # A lone string would otherwise be stored as-is and iterate per character.
            collected = [warnings]
        else:
            # Copy so later mutation of the caller's list cannot alter this error.
            collected = list(warnings)
        self.warnings = collected
        super().__init__(message)


def _is_comma_separated_labels(text: str) -> bool:
    """Fast-path: detect simple comma-separated class labels (no LLM needed)."""
    # Match: word tokens separated by commas, each token <= 3 words
    pattern = r"^[\w\s]+(,\s*[\w\s]+)*$"
    if not re.match(pattern, text.strip()):
        return False
    tokens = [t.strip() for t in text.split(",") if t.strip()]
    return all(len(t.split()) <= 3 for t in tokens)


def _is_coco_only(detector_key: str) -> bool:
    """Report whether *detector_key* is listed in ``_COCO_ONLY_DETECTORS``."""
    listed = detector_key in _COCO_ONLY_DETECTORS
    return listed


def _map_coco_classes(
    object_classes: List[str], detector_key: str
) -> tuple[List[str], List[str], List[str]]:
    """Translate requested classes into COCO names when the detector requires it.

    For detectors that are not COCO-only the input list is handed back
    unchanged with no unmappable entries and no warnings.

    Args:
        object_classes: Class labels requested by the operator or the LLM.
        detector_key: Detector model key used to decide whether mapping applies.

    Returns:
        (mapped_classes, unmappable_classes, warnings). ``mapped_classes``
        holds each canonical COCO name once, in first-seen order.
    """
    if not _is_coco_only(detector_key):
        return object_classes, [], []

    accepted: List[str] = []
    rejected: List[str] = []
    notes: List[str] = []
    already_added = set()

    for label in object_classes:
        coco_name = canonicalize_coco_name(label)

        # None from the canonicalizer is treated as "no COCO equivalent".
        if coco_name is None:
            rejected.append(label)
            notes.append(
                f"'{label}' is not in COCO vocabulary. Will not be detected by {detector_key}."
            )
            continue

        if coco_name not in already_added:
            accepted.append(coco_name)
            already_added.add(coco_name)

        # Tell the operator whenever their wording was rewritten (case aside).
        if coco_name.lower() != label.lower():
            notes.append(
                f"'{label}' mapped to COCO class '{coco_name}'."
            )

    return accepted, rejected, notes


def _build_fast_path_spec(
    raw_text: str, object_classes: List[str], detector_key: str
) -> MissionSpecification:
    """Assemble the MissionSpecification for a plain label list (no LLM involved).

    Args:
        raw_text: Operator text, stored verbatim on the specification.
        object_classes: Labels already split out of ``raw_text``.
        detector_key: Detector model key; COCO-only detectors get their labels
            mapped to COCO names.

    Returns:
        A specification with intent DETECT, domain GENERIC, parse mode
        FAST_PATH and HIGH confidence.

    Raises:
        MissionParseError: If the detector is COCO-only and none of the labels
            map to a COCO class.
    """
    mapped, _unmapped, notes = _map_coco_classes(object_classes, detector_key)
    coco_only = _is_coco_only(detector_key)

    if coco_only and not mapped:
        requested = ", ".join(object_classes)
        raise MissionParseError(
            f"None of the requested objects ({requested}) match the "
            f"{detector_key} vocabulary. This detector supports: "
            f"{coco_class_catalog()}. "
            f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
            warnings=notes,
        )

    # COCO-only detectors use the canonical names; others keep the operator's labels.
    if coco_only:
        labels = mapped
    else:
        labels = object_classes

    criteria = RelevanceCriteria(
        required_classes=labels,
        min_confidence=0.0,
    )

    return MissionSpecification(
        object_classes=labels,
        mission_intent="DETECT",
        domain="GENERIC",
        domain_source="INFERRED",
        relevance_criteria=criteria,
        context_phrases=[],
        stripped_modifiers=[],
        operator_text=raw_text,
        parse_mode="FAST_PATH",
        parse_confidence="HIGH",
        parse_warnings=notes,
    )


# --- LLM Extraction ---

# System prompt for the extraction call. It fixes the JSON output schema and the
# rules for filling each field (object classes, intent, domain, confidence).
# The enum values named in rules 3-5 are the same ones re-checked in code by
# _validate_and_build, so keep the two in sync when editing either.
_SYSTEM_PROMPT = (
    "You are a mission text parser for an object detection system. Your ONLY job is to extract "
    "structured fields from operator mission text. You do NOT assess threats. You do NOT reason "
    "about tactics. You extract and classify.\n\n"
    "OUTPUT SCHEMA (strict JSON):\n"
    "{\n"
    '  "object_classes": ["string"],\n'
    '  "mission_intent": "ENUM",\n'
    '  "domain": "ENUM",\n'
    '  "context_phrases": ["string"],\n'
    '  "stripped_modifiers": ["string"],\n'
    '  "parse_confidence": "ENUM",\n'
    '  "parse_warnings": ["string"]\n'
    "}\n\n"
    "EXTRACTION RULES:\n\n"
    "1. OBJECT_CLASSES — What to extract:\n"
    "   - Extract nouns and noun phrases that refer to PHYSICAL, VISUALLY DETECTABLE objects.\n"
    "   - Keep visual descriptors that narrow the category: 'small boat', 'military vehicle', 'cargo ship'.\n"
    "   - Use singular form: 'vessels' -> 'vessel', 'people' -> 'person'.\n"
    "   - If the input is already comma-separated class labels (e.g., 'person, car, boat'),\n"
    "     use them directly without modification.\n\n"
    "2. OBJECT_CLASSES — What to strip:\n"
    "   - Remove threat/intent adjectives: 'hostile', 'suspicious', 'friendly', 'dangerous', 'enemy'.\n"
    "     -> Move these to stripped_modifiers.\n"
    "   - Remove action verbs: 'approaching', 'fleeing', 'attacking'.\n"
    "     -> Move the full phrase to context_phrases.\n"
    "   - Remove spatial/temporal phrases: 'from the east', 'near the harbor', 'at night'.\n"
    "     -> Move to context_phrases.\n"
    "   - Do NOT extract abstract concepts: 'threat', 'danger', 'hazard', 'risk' are not objects.\n\n"
    "3. MISSION_INTENT — Infer from verbs:\n"
    "   - 'detect', 'find', 'locate', 'spot', 'search for' -> DETECT\n"
    "   - 'classify', 'identify', 'determine type of' -> CLASSIFY\n"
    "   - 'track', 'follow', 'monitor movement of' -> TRACK\n"
    "   - 'assess threat', 'evaluate danger', 'threat assessment' -> ASSESS_THREAT\n"
    "   - 'monitor', 'watch', 'observe', 'surveil' -> MONITOR\n"
    "   - If no verb present (bare class list), default to DETECT.\n\n"
    "4. DOMAIN — Infer from contextual clues:\n"
    "   - Maritime vocabulary (vessel, ship, boat, harbor, naval, maritime, wake, sea) -> NAVAL\n"
    "   - Ground vocabulary (vehicle, convoy, checkpoint, road, building, infantry) -> GROUND\n"
    "   - Aerial vocabulary (aircraft, drone, UAV, airspace, altitude, flight) -> AERIAL\n"
    "   - Urban vocabulary (pedestrian, intersection, storefront, crowd, building) -> URBAN\n"
    "   - If no domain clues present -> GENERIC\n\n"
    "5. PARSE_CONFIDENCE:\n"
    "   - HIGH: Clear object classes extracted, domain identifiable.\n"
    "   - MEDIUM: Some ambiguity but reasonable extraction possible. Include warnings.\n"
    "   - LOW: Cannot extract meaningful object classes. Input is too abstract,\n"
    "     contradictory, or contains no visual object references.\n"
    "     Examples of LOW: 'keep us safe', 'do your job', 'analyze everything'.\n\n"
    "FORBIDDEN:\n"
    "- Do NOT infer object classes not implied by the text. If the text says 'boats',\n"
    "  do not add 'person' or 'vehicle' unless mentioned.\n"
    "- Do NOT add threat scores, engagement rules, or tactical recommendations.\n"
    "- Do NOT interpret what 'threat' or 'danger' means in terms of specific objects.\n"
    "  If the operator writes 'detect threats', set parse_confidence to LOW and warn:\n"
    "  \"'threats' is not a visual object class. Specify what objects to detect.\""
)

# Extra system-prompt section appended to _SYSTEM_PROMPT only when a first-frame
# image accompanies the request. It tells the model how to use the image to
# refine object classes and domain without replacing the text-derived rules.
_VISION_GROUNDING_ADDENDUM = (
    "\n\nVISION GROUNDING (when an image is provided):\n"
    "You may receive the first frame of the operator's video feed as an image.\n"
    "Use it to REFINE your object_classes extraction:\n\n"
    "1. If the operator uses a general term (e.g., 'vessels', 'vehicles'),\n"
    "   inspect the image and add MORE SPECIFIC subcategories visible in the scene.\n"
    "   Example: operator says 'detect vessels', image shows a speedboat and a cargo ship\n"
    "   -> object_classes: ['vessel', 'speedboat', 'cargo ship']\n\n"
    "2. If the operator mentions objects NOT visible in the first frame,\n"
    "   still include them (later frames may contain them), but add a\n"
    "   parse_warning noting they were not visible in the first frame.\n\n"
    "3. Use the image to CONFIRM or REFINE the domain. If the text is ambiguous\n"
    "   but the image clearly shows open water, set domain to NAVAL.\n\n"
    "4. Do NOT hallucinate objects. Only add specific subcategories if clearly\n"
    "   identifiable. When uncertain, keep the general term.\n\n"
    "5. The same OUTPUT SCHEMA and all EXTRACTION RULES still apply.\n"
    "   The image is supplementary context, not a replacement for the text.\n"
)


def _extract_and_encode_first_frame(video_path: Optional[str]) -> Optional[str]:
    """Extract the first frame from a video and return it as a base64-encoded JPEG.

    Never raises — returns None on any failure so the caller can fall back
    to text-only parsing.
    """
    if not video_path:
        return None
    try:
        from inference import extract_first_frame
        from utils.gpt_reasoning import encode_frame_to_b64

        frame, _fps, _w, _h = extract_first_frame(video_path)
        return encode_frame_to_b64(frame, quality=85)
    except Exception:
        logger.warning("Failed to extract/encode first frame for vision grounding", exc_info=True)
        return None


def _call_extraction_llm(raw_text: str, detector_key: str, first_frame_b64: Optional[str] = None) -> dict:
    """Call GPT-4o to extract structured mission fields from natural language.

    Args:
        raw_text: Operator mission text, embedded verbatim in the user prompt.
        detector_key: Detector model key; reported to the model as COCO_ONLY
            or OPEN_VOCAB.
        first_frame_b64: Optional base64-encoded JPEG of the first video frame.
            When given, the vision addendum is appended to the system prompt
            and the image is attached to the user message.

    Returns:
        The decoded JSON object produced by the model (not yet validated).

    Raises:
        MissionParseError: If no API key is configured, the API call fails,
            the response is empty, or the response is not a JSON object.
    """
    if not get_api_key():
        raise MissionParseError(
            "OPENAI_API_KEY not set. Cannot parse natural language mission text. "
            "Use comma-separated class labels instead (e.g., 'person, car, boat')."
        )

    detector_type = "COCO_ONLY" if _is_coco_only(detector_key) else "OPEN_VOCAB"

    user_prompt_text = (
        f'OPERATOR MISSION TEXT:\n"{raw_text}"\n\n'
        f"DETECTOR TYPE: {detector_type}\n\n"
        "Extract the structured mission specification from the above text."
    )

    if first_frame_b64:
        # Image available: extend the system prompt and send mixed text+image content.
        system_content = _SYSTEM_PROMPT + _VISION_GROUNDING_ADDENDUM
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt_text},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{first_frame_b64}",
                        "detail": "low",
                    },
                },
            ],
        }
        # Larger token and time budgets for the image-bearing request.
        max_tokens = 700
        timeout_s = 45
    else:
        system_content = _SYSTEM_PROMPT
        user_message = {"role": "user", "content": user_prompt_text}
        max_tokens = 500
        timeout_s = 30

    payload = {
        "model": "gpt-4o",
        "temperature": 0.0,
        "max_tokens": max_tokens,
        "response_format": {"type": "json_object"},
        "messages": [
            {"role": "system", "content": system_content},
            user_message,
        ],
    }

    try:
        resp_data = chat_completion(payload, timeout=timeout_s)
        content, _refusal = extract_content(resp_data)
        if not content:
            raise MissionParseError("GPT returned empty content during mission parsing.")

        parsed = json.loads(content)

    except OpenAIAPIError as e:
        # Chain explicitly so the underlying API failure stays in the traceback.
        raise MissionParseError(f"Mission parsing API call failed: {e}") from e
    except json.JSONDecodeError as e:
        raise MissionParseError(
            "GPT returned invalid JSON. Please rephrase your mission."
        ) from e

    # Valid JSON can still be a list, string or number; downstream validation
    # needs a mapping, so reject anything else here with a parse error.
    if not isinstance(parsed, dict):
        raise MissionParseError(
            "GPT returned JSON that is not an object. Please rephrase your mission."
        )
    return parsed


def _validate_and_build(
    llm_output: dict, raw_text: str, detector_key: str
) -> MissionSpecification:
    """Deterministic validation pipeline (Section 7.3 decision tree).

    Turns the raw LLM output into a MissionSpecification. The model's output
    is treated as untrusted: list fields are copied and reduced to their
    string items, enum fields fall back to defaults, and ``llm_output``
    itself is never mutated.

    Args:
        llm_output: Decoded JSON object returned by the extraction call.
        raw_text: Operator text, stored verbatim on the specification.
        detector_key: Detector model key; COCO-only detectors get their
            classes mapped to COCO names.

    Returns:
        A specification with parse mode LLM_EXTRACTED.

    Raises:
        MissionParseError: If the output is not a JSON object, confidence is
            LOW, no usable object classes remain, or a COCO-only detector
            matches none of the requested classes.
    """

    def _string_items(value) -> List[str]:
        """Return a new list holding only the string entries of *value*."""
        if isinstance(value, str):
            # A bare string means one item; iterating it would yield characters.
            return [value]
        if isinstance(value, (list, tuple)):
            return [item for item in value if isinstance(item, str)]
        # null, numbers, objects: nothing usable.
        return []

    if not isinstance(llm_output, dict):
        raise MissionParseError(
            "GPT returned JSON that is not an object. Please rephrase your mission."
        )

    # Step 2: Extract fields with defaults
    object_classes = _string_items(llm_output.get("object_classes"))
    mission_intent = llm_output.get("mission_intent", "DETECT")
    domain = llm_output.get("domain", "GENERIC")
    context_phrases = _string_items(llm_output.get("context_phrases"))
    stripped_modifiers = _string_items(llm_output.get("stripped_modifiers"))
    parse_confidence = llm_output.get("parse_confidence", "LOW")
    parse_warnings = _string_items(llm_output.get("parse_warnings"))

    # Validate enum values. The isinstance guard keeps unhashable values
    # (e.g. a list) from raising TypeError on the set membership test.
    valid_intents = {"DETECT", "CLASSIFY", "TRACK", "ASSESS_THREAT", "MONITOR"}
    if not isinstance(mission_intent, str) or mission_intent not in valid_intents:
        mission_intent = "DETECT"
        parse_warnings.append(f"Invalid mission_intent '{llm_output.get('mission_intent')}', defaulted to DETECT.")

    valid_domains = {"NAVAL", "GROUND", "AERIAL", "URBAN", "GENERIC"}
    if not isinstance(domain, str) or domain not in valid_domains:
        domain = "GENERIC"
        parse_warnings.append(f"Invalid domain '{llm_output.get('domain')}', defaulted to GENERIC.")

    valid_confidence = {"HIGH", "MEDIUM", "LOW"}
    if not isinstance(parse_confidence, str) or parse_confidence not in valid_confidence:
        # An unrecognized confidence is treated as LOW, which rejects the parse below.
        parse_confidence = "LOW"

    # Step 3: Parse confidence check
    if parse_confidence == "LOW":
        warnings_str = "; ".join(parse_warnings) if parse_warnings else "No details"
        raise MissionParseError(
            f"Could not extract object classes from mission text. "
            f"Warnings: {warnings_str}. "
            f"Please specify concrete objects to detect (e.g., 'vessel, small boat').",
            warnings=parse_warnings,
        )

    # Validate object_classes is non-empty
    if not object_classes:
        raise MissionParseError(
            "Mission text produced no detectable object classes. "
            "Please specify concrete objects to detect.",
            warnings=parse_warnings,
        )

    # Filter out empty strings
    object_classes = [c.strip() for c in object_classes if c.strip()]
    if not object_classes:
        raise MissionParseError(
            "All extracted object classes were empty after cleanup.",
            warnings=parse_warnings,
        )

    # Step 4: COCO vocabulary mapping
    mapped, _unmappable, coco_warnings = _map_coco_classes(object_classes, detector_key)
    parse_warnings.extend(coco_warnings)

    if _is_coco_only(detector_key):
        if not mapped:
            raise MissionParseError(
                f"None of the requested objects ({', '.join(object_classes)}) match the "
                f"{detector_key} vocabulary. "
                f"This detector supports: {coco_class_catalog()}. "
                f"Use an open-vocabulary detector (Grounding DINO) or adjust your mission.",
                warnings=parse_warnings,
            )
        final_classes = mapped
    else:
        final_classes = object_classes

    # Step 5: Build RelevanceCriteria deterministically
    relevance_criteria = RelevanceCriteria(
        required_classes=final_classes,
        min_confidence=0.0,
    )

    # Step 6: Construct MissionSpecification
    return MissionSpecification(
        object_classes=final_classes,
        mission_intent=mission_intent,
        domain=domain,
        domain_source="INFERRED",
        relevance_criteria=relevance_criteria,
        # INVARIANT INV-13: context_phrases are forwarded to LLM reasoning layers
        # (GPT threat assessment, threat chat) as situational context ONLY.
        # They must NEVER be used in evaluate_relevance(), prioritization,
        # or any deterministic filtering/sorting logic.
        context_phrases=context_phrases,
        stripped_modifiers=stripped_modifiers,
        operator_text=raw_text,
        parse_mode="LLM_EXTRACTED",
        parse_confidence=parse_confidence,
        parse_warnings=parse_warnings,
    )


# Extra generic query terms per mission domain. build_broad_queries appends
# these to the LLM-extracted classes for open-vocabulary detectors; a domain
# missing from this table falls back to the "GENERIC" entry there.
_DOMAIN_BROAD_CATEGORIES: dict[str, List[str]] = {
    "NAVAL": ["vessel", "ship", "boat", "buoy", "person"],
    "AERIAL": ["aircraft", "helicopter", "drone", "airplane"],
    "GROUND": ["vehicle", "car", "truck", "person", "building"],
    "URBAN": ["person", "vehicle", "car", "bicycle"],
    "GENERIC": ["object"],
}


def build_broad_queries(
    detector_key: str, mission_spec: MissionSpecification
) -> List[str]:
    """Produce the detector query list used ahead of LLM post-filtering.

    Three cases, checked in order:
      * FAST_PATH specs: the spec's own ``object_classes`` are returned as-is.
      * COCO-only detectors: every entry of ``COCO_CLASSES`` is returned.
      * Open-vocabulary detectors: the spec's classes followed by the broad
        categories for its domain, skipping case-insensitive duplicates.

    Args:
        detector_key: Detector model key.
        mission_spec: Parsed mission specification.

    Returns:
        The list of query strings to send to the detector.
    """
    if mission_spec.parse_mode == "FAST_PATH":
        return mission_spec.object_classes

    # Everything below is the LLM_EXTRACTED path, where detection is kept broad.
    if _is_coco_only(detector_key):
        # Original author's note: COCO detectors ignore queries anyway (DETR
        # detects all 80; YOLO11 falls back to all if no matches), so send all.
        return list(COCO_CLASSES)

    # Open-vocab detector (e.g. Grounding DINO): extracted classes first,
    # then the domain's broad categories.
    queries = list(mission_spec.object_classes)
    extras = _DOMAIN_BROAD_CATEGORIES.get(
        mission_spec.domain, _DOMAIN_BROAD_CATEGORIES["GENERIC"]
    )
    lowered = {q.lower() for q in queries}
    for extra in extras:
        key = extra.lower()
        if key in lowered:
            continue
        queries.append(extra)
        lowered.add(key)

    logger.info("Broad queries for %s: %s", detector_key, queries)
    return queries


def parse_mission_text(
    raw_text: str,
    detector_key: str,
    video_path: Optional[str] = None,
) -> MissionSpecification:
    """Turn operator mission text into a validated MissionSpecification.

    Plain comma-separated label lists are handled locally; anything else is
    sent to the LLM extractor (optionally with the video's first frame) and
    the result is run through the deterministic validation pipeline.

    Args:
        raw_text: Verbatim mission text from the operator.
        detector_key: Detector model key (determines COCO vocabulary constraints).
        video_path: Optional path to input video; first frame used for vision grounding.

    Returns:
        Validated MissionSpecification.

    Raises:
        MissionParseError: If mission text cannot produce a valid specification.
    """
    if not (raw_text and raw_text.strip()):
        raise MissionParseError(
            "Mission text is empty. Specify objects to detect or use the default queries."
        )

    mission_text = raw_text.strip()

    # Fast path: a bare label list needs no LLM round-trip.
    if _is_comma_separated_labels(mission_text):
        labels = []
        for piece in mission_text.split(","):
            piece = piece.strip()
            if piece:
                labels.append(piece)
        logger.info(
            "Mission fast-path: comma-separated labels %s", labels
        )
        return _build_fast_path_spec(mission_text, labels, detector_key)

    # LLM path: free-form text, optionally grounded on the first video frame.
    logger.info("Mission LLM-path: extracting from natural language")
    frame_b64 = _extract_and_encode_first_frame(video_path)
    if frame_b64:
        logger.info("Vision grounding: first frame encoded for LLM call")
    extracted = _call_extraction_llm(mission_text, detector_key, first_frame_b64=frame_b64)
    logger.info("Mission LLM extraction result: %s", extracted)

    spec = _validate_and_build(extracted, mission_text, detector_key)
    logger.info(
        "Mission parsed: classes=%s intent=%s domain=%s(%s) confidence=%s",
        spec.object_classes,
        spec.mission_intent,
        spec.domain,
        spec.domain_source,
        spec.parse_confidence,
    )
    return spec