Spaces:
Sleeping
Sleeping
Zhen Ye
commited on
Commit
·
b30e7a3
1
Parent(s):
1ea465c
initial commit
Browse files- .gitignore +8 -0
- Dockerfile +21 -0
- app.py +208 -0
- coco_classes.py +163 -0
- demo.html +618 -0
- inference.py +182 -0
- models/detectors/base.py +19 -0
- models/detectors/detr.py +48 -0
- models/detectors/grounding_dino.py +56 -0
- models/detectors/owlv2.py +56 -0
- models/detectors/yolov8.py +69 -0
- models/model_loader.py +43 -0
- models/segmenters/__init__.py +10 -0
- models/segmenters/base.py +29 -0
- models/segmenters/model_loader.py +44 -0
- models/segmenters/sam3.py +134 -0
- requirements.txt +13 -0
- utils/video.py +79 -0
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.venv/
|
| 3 |
+
*.mp4
|
| 4 |
+
*.log
|
| 5 |
+
*.tmp
|
| 6 |
+
.DS_Store
|
| 7 |
+
.env
|
| 8 |
+
*.md
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Unbuffered stdout/stderr so logs appear immediately; skip .pyc generation.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Copy only requirements.txt first so the dependency layer below stays cached
# across source-code changes.
COPY requirements.txt ./
# System packages: libgl1/libglib2.0-0 presumably for an OpenCV-style video
# stack and ffmpeg for video encoding — confirm against requirements.txt.
# apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . .

# Port the uvicorn server below listens on (matches app.py's __main__ runner).
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
| 9 |
+
import uvicorn
|
| 10 |
+
|
| 11 |
+
from inference import run_inference, run_segmentation
|
| 12 |
+
|
| 13 |
+
# Root logger at INFO so request/processing logs are visible in the container.
logging.basicConfig(level=logging.INFO)

app = FastAPI(title="Video Object Detection")
# NOTE(review): CORS is wide open (any origin, with credentials) — fine for a
# public demo, but tighten allow_origins before any production deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Valid detection modes
VALID_MODES = {"object_detection", "segmentation", "drone_detection"}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _save_upload_to_tmp(upload: UploadFile) -> str:
|
| 29 |
+
"""Save uploaded file to temporary location."""
|
| 30 |
+
suffix = Path(upload.filename or "upload.mp4").suffix or ".mp4"
|
| 31 |
+
fd, path = tempfile.mkstemp(prefix="input_", suffix=suffix, dir="/tmp")
|
| 32 |
+
os.close(fd)
|
| 33 |
+
with open(path, "wb") as buffer:
|
| 34 |
+
data = upload.file.read()
|
| 35 |
+
buffer.write(data)
|
| 36 |
+
return path
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _safe_delete(path: str) -> None:
|
| 40 |
+
"""Safely delete a file, ignoring errors."""
|
| 41 |
+
try:
|
| 42 |
+
os.remove(path)
|
| 43 |
+
except FileNotFoundError:
|
| 44 |
+
return
|
| 45 |
+
except Exception:
|
| 46 |
+
logging.exception("Failed to remove temporary file: %s", path)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _schedule_cleanup(background_tasks: BackgroundTasks, path: str) -> None:
    """Queue deletion of *path* to run once the response has been sent."""
    # add_task binds the argument now, so no closure is needed to capture path.
    background_tasks.add_task(_safe_delete, path)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@app.get("/", response_class=HTMLResponse)
async def demo_page() -> str:
    """Serve the bundled single-page demo UI (or a stub if the file is gone)."""
    page = Path(__file__).with_name("demo.html")
    try:
        html = page.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Keep the endpoint alive even when demo.html was not deployed.
        html = "<h1>Demo page missing</h1>"
    return html
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.post("/detect")
async def detect_endpoint(
    background_tasks: BackgroundTasks,
    video: UploadFile = File(...),
    mode: str = Form(...),
    queries: str = Form(""),
    detector: str = Form("owlv2_base"),
    segmenter: str = Form("sam3"),
):
    """
    Main detection endpoint.

    Args:
        video: Video file to process
        mode: Detection mode (object_detection, segmentation, drone_detection)
        queries: Comma-separated object classes / text prompts
        detector: Model to use (owlv2_base, hf_yolov8, detr_resnet50, grounding_dino)
        segmenter: Segmentation model to use (sam3)

    Returns:
        - For object_detection: Processed video with bounding boxes
        - For segmentation: Processed video with masks rendered
        - For drone_detection: JSON with "coming_soon" status
    """
    # Validate mode
    if mode not in VALID_MODES:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid mode '{mode}'. Must be one of: {', '.join(VALID_MODES)}"
        )

    # Drone detection is not implemented; short-circuit before touching the
    # upload at all.
    if mode == "drone_detection":
        return JSONResponse(
            status_code=200,
            content={
                "status": "coming_soon",
                "message": "Drone detection mode is under development. Stay tuned!",
                "mode": "drone_detection"
            }
        )

    # Shared ingest for both video-processing modes (this block used to be
    # duplicated verbatim in the segmentation and object-detection branches).
    input_path = await _ingest_upload(video)
    fd, output_path = tempfile.mkstemp(prefix="output_", suffix=".mp4", dir="/tmp")
    os.close(fd)

    # Parse queries
    query_list = [q.strip() for q in queries.split(",") if q.strip()]

    if mode == "segmentation":
        # Segmentation needs at least one text prompt; fall back to a generic
        # one.  (Object detection deliberately passes an empty list through.)
        if not query_list:
            query_list = ["object"]
        run = run_segmentation
        run_kwargs = {"segmenter_name": segmenter}
        download_name = "segmented.mp4"
    else:  # object_detection
        run = run_inference
        run_kwargs = {"detector_name": detector}
        download_name = "processed.mp4"

    try:
        output_path = run(input_path, output_path, query_list, **run_kwargs)
    except ValueError as exc:
        # Known bad-input/configuration errors surface as a clean HTTP error.
        logging.exception("Video processing failed.")
        _safe_delete(input_path)
        _safe_delete(output_path)
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    except Exception as exc:
        # Unexpected model failures: report the message but keep the server up.
        logging.exception("Inference failed.")
        _safe_delete(input_path)
        _safe_delete(output_path)
        return JSONResponse(status_code=500, content={"error": str(exc)})

    # Temp files are deleted only after the response body has been streamed.
    _schedule_cleanup(background_tasks, input_path)
    _schedule_cleanup(background_tasks, output_path)

    # Return processed video
    return FileResponse(
        path=output_path,
        media_type="video/mp4",
        filename=download_name,
    )


async def _ingest_upload(video: UploadFile) -> str:
    """Save the upload to /tmp, always closing the handle; raise 500 on failure."""
    try:
        return _save_upload_to_tmp(video)
    except Exception:
        logging.exception("Failed to save uploaded file.")
        raise HTTPException(status_code=500, detail="Failed to save uploaded video.")
    finally:
        await video.close()
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# Local development entry point; in the container uvicorn is launched via the
# Dockerfile CMD instead.
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
|
coco_classes.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import difflib
|
| 4 |
+
import re
|
| 5 |
+
from typing import Dict, Tuple
|
| 6 |
+
|
| 7 |
+
# The 80 object-class names of the COCO detection dataset (80 entries below).
# Kept as a tuple so the catalog is immutable and safely shareable.
COCO_CLASSES: Tuple[str, ...] = (
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "dining table",
    "toilet",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush",
)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def coco_class_catalog() -> str:
    """Join every COCO class name into one comma-separated catalog string."""
    return ", ".join(COCO_CLASSES)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _normalize(label: str) -> str:
|
| 98 |
+
return re.sub(r"[^a-z0-9]+", " ", label.lower()).strip()
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Normalized COCO name -> canonical spelling (identity mapping over COCO_CLASSES,
# keyed by _normalize output so lookups match normalized user input).
_CANONICAL_LOOKUP: Dict[str, str] = {_normalize(name): name for name in COCO_CLASSES}
# Common colloquial aliases mapped to their canonical COCO class name.
_COCO_SYNONYMS: Dict[str, str] = {
    "people": "person",
    "man": "person",
    "woman": "person",
    "men": "person",
    "women": "person",
    "motorbike": "motorcycle",
    "motor bike": "motorcycle",
    "bike": "bicycle",
    "aircraft": "airplane",
    "plane": "airplane",
    "jet": "airplane",
    "aeroplane": "airplane",
    "pickup": "truck",
    "pickup truck": "truck",
    "semi": "truck",
    "lorry": "truck",
    "tractor trailer": "truck",
    "coach": "bus",
    "television": "tv",
    "tv monitor": "tv",
    "mobile phone": "cell phone",
    "smartphone": "cell phone",
    "cellphone": "cell phone",
    "dinner table": "dining table",
    "sofa": "couch",
    "cooker": "oven",
}
# Same alias table re-keyed by its normalized form so alias lookups and
# canonical lookups use identical keys.
_ALIAS_LOOKUP: Dict[str, str] = {_normalize(alias): canonical for alias, canonical in _COCO_SYNONYMS.items()}
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def canonicalize_coco_name(value: str | None) -> str | None:
    """Map an arbitrary string to the closest COCO class name if possible.

    Fallback chain, cheapest and most exact first:
      1. exact match against a canonical name, then a known alias;
      2. substring containment (aliases first, then canonical names);
      3. token-by-token exact lookup;
      4. fuzzy match via difflib (cutoff 0.82) against canonical names only.
    Returns None when nothing plausible is found.
    """

    if not value:
        return None
    normalized = _normalize(value)
    if not normalized:
        return None
    # Step 1: exact matches.
    if normalized in _CANONICAL_LOOKUP:
        return _CANONICAL_LOOKUP[normalized]
    if normalized in _ALIAS_LOOKUP:
        return _ALIAS_LOOKUP[normalized]

    # Step 2: substring containment, e.g. "red sports car" -> "car".
    # NOTE(review): this is a raw `in` test, so short aliases can false-positive
    # on unrelated words (e.g. "coach" inside "coaching") — worth confirming
    # this is acceptable for the prompt vocabulary.
    for alias_norm, canonical in _ALIAS_LOOKUP.items():
        if alias_norm and alias_norm in normalized:
            return canonical
    for canonical_norm, canonical in _CANONICAL_LOOKUP.items():
        if canonical_norm and canonical_norm in normalized:
            return canonical

    # Step 3: per-token exact lookup catches cases the substring pass missed.
    tokens = normalized.split()
    for token in tokens:
        if token in _CANONICAL_LOOKUP:
            return _CANONICAL_LOOKUP[token]
        if token in _ALIAS_LOOKUP:
            return _ALIAS_LOOKUP[token]

    # Step 4: last resort — fuzzy match against canonical names.
    close = difflib.get_close_matches(normalized, list(_CANONICAL_LOOKUP.keys()), n=1, cutoff=0.82)
    if close:
        return _CANONICAL_LOOKUP[close[0]]
    return None
|
demo.html
ADDED
|
@@ -0,0 +1,618 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
|
| 3 |
+
<html lang="en">
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>Video Object Detection</title>
|
| 8 |
+
<style>
|
| 9 |
+
* {
|
| 10 |
+
margin: 0;
|
| 11 |
+
padding: 0;
|
| 12 |
+
box-sizing: border-box;
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
body {
|
| 16 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica', 'Arial', sans-serif;
|
| 17 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 18 |
+
min-height: 100vh;
|
| 19 |
+
padding: 20px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.container {
|
| 23 |
+
max-width: 1200px;
|
| 24 |
+
margin: 0 auto;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
h1 {
|
| 28 |
+
color: white;
|
| 29 |
+
text-align: center;
|
| 30 |
+
margin-bottom: 30px;
|
| 31 |
+
font-size: 2.5rem;
|
| 32 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
.main-card {
|
| 36 |
+
background: white;
|
| 37 |
+
border-radius: 16px;
|
| 38 |
+
box-shadow: 0 20px 60px rgba(0,0,0,0.3);
|
| 39 |
+
padding: 40px;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
.section {
|
| 43 |
+
margin-bottom: 30px;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
.section-title {
|
| 47 |
+
font-size: 1.2rem;
|
| 48 |
+
font-weight: 600;
|
| 49 |
+
color: #333;
|
| 50 |
+
margin-bottom: 15px;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
/* Mode selector */
|
| 54 |
+
.mode-selector {
|
| 55 |
+
display: grid;
|
| 56 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 57 |
+
gap: 15px;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
.mode-card {
|
| 61 |
+
position: relative;
|
| 62 |
+
padding: 20px;
|
| 63 |
+
border: 2px solid #e0e0e0;
|
| 64 |
+
border-radius: 12px;
|
| 65 |
+
cursor: pointer;
|
| 66 |
+
transition: all 0.3s ease;
|
| 67 |
+
text-align: center;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
.mode-card:hover {
|
| 71 |
+
border-color: #667eea;
|
| 72 |
+
transform: translateY(-2px);
|
| 73 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2);
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
.mode-card.selected {
|
| 77 |
+
border-color: #667eea;
|
| 78 |
+
background: #f0f4ff;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
.mode-card.disabled {
|
| 82 |
+
opacity: 0.5;
|
| 83 |
+
cursor: not-allowed;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.mode-card input[type="radio"] {
|
| 87 |
+
position: absolute;
|
| 88 |
+
opacity: 0;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
.mode-icon {
|
| 92 |
+
font-size: 2rem;
|
| 93 |
+
margin-bottom: 10px;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.mode-title {
|
| 97 |
+
font-weight: 600;
|
| 98 |
+
color: #333;
|
| 99 |
+
margin-bottom: 5px;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
.mode-badge {
|
| 103 |
+
display: inline-block;
|
| 104 |
+
padding: 4px 8px;
|
| 105 |
+
background: #ffc107;
|
| 106 |
+
color: white;
|
| 107 |
+
font-size: 0.7rem;
|
| 108 |
+
border-radius: 4px;
|
| 109 |
+
font-weight: 600;
|
| 110 |
+
margin-top: 8px;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
/* Input fields */
|
| 114 |
+
.input-group {
|
| 115 |
+
margin-bottom: 20px;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.input-group label {
|
| 119 |
+
display: block;
|
| 120 |
+
font-weight: 500;
|
| 121 |
+
color: #555;
|
| 122 |
+
margin-bottom: 8px;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.input-group input[type="text"],
|
| 126 |
+
.input-group select {
|
| 127 |
+
width: 100%;
|
| 128 |
+
padding: 12px;
|
| 129 |
+
border: 2px solid #e0e0e0;
|
| 130 |
+
border-radius: 8px;
|
| 131 |
+
font-size: 1rem;
|
| 132 |
+
transition: border-color 0.3s;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.input-group input[type="text"]:focus,
|
| 136 |
+
.input-group select:focus {
|
| 137 |
+
outline: none;
|
| 138 |
+
border-color: #667eea;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.file-input-wrapper {
|
| 142 |
+
position: relative;
|
| 143 |
+
display: inline-block;
|
| 144 |
+
width: 100%;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
.file-input-label {
|
| 148 |
+
display: block;
|
| 149 |
+
padding: 15px;
|
| 150 |
+
background: #f8f9fa;
|
| 151 |
+
border: 2px dashed #ccc;
|
| 152 |
+
border-radius: 8px;
|
| 153 |
+
text-align: center;
|
| 154 |
+
cursor: pointer;
|
| 155 |
+
transition: all 0.3s;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.file-input-label:hover {
|
| 159 |
+
border-color: #667eea;
|
| 160 |
+
background: #f0f4ff;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
.file-input-label.has-file {
|
| 164 |
+
border-color: #28a745;
|
| 165 |
+
background: #d4edda;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
input[type="file"] {
|
| 169 |
+
position: absolute;
|
| 170 |
+
opacity: 0;
|
| 171 |
+
width: 0;
|
| 172 |
+
height: 0;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
/* Buttons */
|
| 176 |
+
.btn {
|
| 177 |
+
padding: 14px 28px;
|
| 178 |
+
font-size: 1rem;
|
| 179 |
+
font-weight: 600;
|
| 180 |
+
border: none;
|
| 181 |
+
border-radius: 8px;
|
| 182 |
+
cursor: pointer;
|
| 183 |
+
transition: all 0.3s;
|
| 184 |
+
width: 100%;
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
.btn-primary {
|
| 188 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 189 |
+
color: white;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.btn-primary:hover:not(:disabled) {
|
| 193 |
+
transform: translateY(-2px);
|
| 194 |
+
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.btn:disabled {
|
| 198 |
+
opacity: 0.5;
|
| 199 |
+
cursor: not-allowed;
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
/* Results */
|
| 203 |
+
.results-grid {
|
| 204 |
+
display: grid;
|
| 205 |
+
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
| 206 |
+
gap: 20px;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.video-card {
|
| 210 |
+
border: 1px solid #e0e0e0;
|
| 211 |
+
border-radius: 8px;
|
| 212 |
+
overflow: hidden;
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.video-card-header {
|
| 216 |
+
background: #f8f9fa;
|
| 217 |
+
padding: 12px 16px;
|
| 218 |
+
font-weight: 600;
|
| 219 |
+
color: #333;
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
.video-card-body {
|
| 223 |
+
padding: 16px;
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
video {
|
| 227 |
+
width: 100%;
|
| 228 |
+
border-radius: 8px;
|
| 229 |
+
background: #000;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
.download-btn {
|
| 233 |
+
margin-top: 12px;
|
| 234 |
+
padding: 10px 16px;
|
| 235 |
+
background: #28a745;
|
| 236 |
+
color: white;
|
| 237 |
+
text-decoration: none;
|
| 238 |
+
border-radius: 6px;
|
| 239 |
+
display: inline-block;
|
| 240 |
+
font-size: 0.9rem;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
.download-btn:hover {
|
| 244 |
+
background: #218838;
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
/* Loading spinner */
|
| 248 |
+
.loading {
|
| 249 |
+
display: none;
|
| 250 |
+
text-align: center;
|
| 251 |
+
padding: 20px;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
.loading.show {
|
| 255 |
+
display: block;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
.spinner {
|
| 259 |
+
border: 4px solid #f3f3f3;
|
| 260 |
+
border-top: 4px solid #667eea;
|
| 261 |
+
border-radius: 50%;
|
| 262 |
+
width: 40px;
|
| 263 |
+
height: 40px;
|
| 264 |
+
animation: spin 1s linear infinite;
|
| 265 |
+
margin: 0 auto 10px;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
@keyframes spin {
|
| 269 |
+
0% { transform: rotate(0deg); }
|
| 270 |
+
100% { transform: rotate(360deg); }
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
.hidden {
|
| 274 |
+
display: none;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
/* Modal */
|
| 278 |
+
.modal {
|
| 279 |
+
display: none;
|
| 280 |
+
position: fixed;
|
| 281 |
+
z-index: 1000;
|
| 282 |
+
left: 0;
|
| 283 |
+
top: 0;
|
| 284 |
+
width: 100%;
|
| 285 |
+
height: 100%;
|
| 286 |
+
background: rgba(0,0,0,0.5);
|
| 287 |
+
align-items: center;
|
| 288 |
+
justify-content: center;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
.modal.show {
|
| 292 |
+
display: flex;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.modal-content {
|
| 296 |
+
background: white;
|
| 297 |
+
padding: 30px;
|
| 298 |
+
border-radius: 12px;
|
| 299 |
+
max-width: 500px;
|
| 300 |
+
text-align: center;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
.modal-content h2 {
|
| 304 |
+
margin-bottom: 15px;
|
| 305 |
+
color: #333;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
.modal-content p {
|
| 309 |
+
margin-bottom: 20px;
|
| 310 |
+
color: #666;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.modal-btn {
|
| 314 |
+
padding: 10px 24px;
|
| 315 |
+
background: #667eea;
|
| 316 |
+
color: white;
|
| 317 |
+
border: none;
|
| 318 |
+
border-radius: 6px;
|
| 319 |
+
cursor: pointer;
|
| 320 |
+
font-size: 1rem;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.modal-btn:hover {
|
| 324 |
+
background: #5568d3;
|
| 325 |
+
}
|
| 326 |
+
</style>
|
| 327 |
+
</head>
|
| 328 |
+
<body>
|
| 329 |
+
<div class="container">
|
| 330 |
+
<h1>🎥 Video Object Detection</h1>
|
| 331 |
+
|
| 332 |
+
<div class="main-card">
|
| 333 |
+
<!-- Mode Selection -->
|
| 334 |
+
<div class="section">
|
| 335 |
+
<div class="section-title">1. Select Detection Mode</div>
|
| 336 |
+
<div class="mode-selector">
|
| 337 |
+
<label class="mode-card selected">
|
| 338 |
+
<input type="radio" name="mode" value="object_detection" checked>
|
| 339 |
+
<div class="mode-icon">🎯</div>
|
| 340 |
+
<div class="mode-title">Object Detection</div>
|
| 341 |
+
</label>
|
| 342 |
+
|
| 343 |
+
<label class="mode-card">
|
| 344 |
+
<input type="radio" name="mode" value="segmentation">
|
| 345 |
+
<div class="mode-icon">🎨</div>
|
| 346 |
+
<div class="mode-title">Segmentation</div>
|
| 347 |
+
</label>
|
| 348 |
+
|
| 349 |
+
<label class="mode-card disabled">
|
| 350 |
+
<input type="radio" name="mode" value="drone_detection">
|
| 351 |
+
<div class="mode-icon">🚁</div>
|
| 352 |
+
<div class="mode-title">Drone Detection</div>
|
| 353 |
+
<span class="mode-badge">COMING SOON</span>
|
| 354 |
+
</label>
|
| 355 |
+
</div>
|
| 356 |
+
</div>
|
| 357 |
+
|
| 358 |
+
<!-- Text Prompts Input (for all modes) -->
|
| 359 |
+
<div class="section" id="queriesSection">
|
| 360 |
+
<div class="input-group">
|
| 361 |
+
<label for="queries" id="queriesLabel">Text Prompts (comma-separated)</label>
|
| 362 |
+
<input
|
| 363 |
+
type="text"
|
| 364 |
+
id="queries"
|
| 365 |
+
placeholder="person, car, dog, bicycle"
|
| 366 |
+
>
|
| 367 |
+
<small id="queriesHint" style="color: #666; display: block; margin-top: 5px;">
|
| 368 |
+
Enter objects to detect or segment
|
| 369 |
+
</small>
|
| 370 |
+
</div>
|
| 371 |
+
</div>
|
| 372 |
+
|
| 373 |
+
<!-- Detector Selection -->
|
| 374 |
+
<div class="section" id="detectorSection">
|
| 375 |
+
<div class="input-group">
|
| 376 |
+
<label for="detector">2. Select Detection Model</label>
|
| 377 |
+
<select id="detector">
|
| 378 |
+
<option value="owlv2_base">OWLv2 (Open-vocabulary, Default)</option>
|
| 379 |
+
<option value="hf_yolov8">YOLOv8 (Fast, COCO classes)</option>
|
| 380 |
+
<option value="detr_resnet50">DETR ResNet-50 (Transformer-based)</option>
|
| 381 |
+
<option value="grounding_dino">Grounding DINO (Open-vocabulary)</option>
|
| 382 |
+
</select>
|
| 383 |
+
</div>
|
| 384 |
+
</div>
|
| 385 |
+
|
| 386 |
+
<!-- Segmenter Selection -->
|
| 387 |
+
<div class="section hidden" id="segmenterSection">
|
| 388 |
+
<div class="input-group">
|
| 389 |
+
<label for="segmenter">2. Select Segmentation Model</label>
|
| 390 |
+
<select id="segmenter">
|
| 391 |
+
<option value="sam3">SAM3 (Segment Anything Model 3)</option>
|
| 392 |
+
</select>
|
| 393 |
+
</div>
|
| 394 |
+
</div>
|
| 395 |
+
|
| 396 |
+
<!-- Video Upload -->
|
| 397 |
+
<div class="section">
|
| 398 |
+
<div class="input-group">
|
| 399 |
+
<label>3. Upload Video</label>
|
| 400 |
+
<div class="file-input-wrapper">
|
| 401 |
+
<label class="file-input-label" id="fileLabel" for="videoFile">
|
| 402 |
+
📁 Click to select video file (MP4)
|
| 403 |
+
</label>
|
| 404 |
+
<input type="file" id="videoFile" accept="video/*">
|
| 405 |
+
</div>
|
| 406 |
+
</div>
|
| 407 |
+
</div>
|
| 408 |
+
|
| 409 |
+
<!-- Process Button -->
|
| 410 |
+
<div class="section">
|
| 411 |
+
<button class="btn btn-primary" id="processBtn" disabled>
|
| 412 |
+
🚀 Process Video
|
| 413 |
+
</button>
|
| 414 |
+
</div>
|
| 415 |
+
|
| 416 |
+
<!-- Loading -->
|
| 417 |
+
<div class="loading" id="loading">
|
| 418 |
+
<div class="spinner"></div>
|
| 419 |
+
<p>Processing video... This may take a while depending on video length.</p>
|
| 420 |
+
</div>
|
| 421 |
+
|
| 422 |
+
<!-- Results -->
|
| 423 |
+
<div class="section hidden" id="resultsSection">
|
| 424 |
+
<div class="section-title">Results</div>
|
| 425 |
+
<div class="results-grid">
|
| 426 |
+
<div class="video-card">
|
| 427 |
+
<div class="video-card-header">Original Video</div>
|
| 428 |
+
<div class="video-card-body">
|
| 429 |
+
<video id="originalVideo" controls></video>
|
| 430 |
+
</div>
|
| 431 |
+
</div>
|
| 432 |
+
<div class="video-card">
|
| 433 |
+
<div class="video-card-header">Processed Video</div>
|
| 434 |
+
<div class="video-card-body">
|
| 435 |
+
<video id="processedVideo" controls autoplay loop></video>
|
| 436 |
+
<a id="downloadBtn" class="download-btn" download="processed.mp4">
|
| 437 |
+
⬇️ Download Processed Video
|
| 438 |
+
</a>
|
| 439 |
+
</div>
|
| 440 |
+
</div>
|
| 441 |
+
</div>
|
| 442 |
+
</div>
|
| 443 |
+
</div>
|
| 444 |
+
</div>
|
| 445 |
+
|
| 446 |
+
<!-- Coming Soon Modal -->
|
| 447 |
+
<div class="modal" id="comingSoonModal">
|
| 448 |
+
<div class="modal-content">
|
| 449 |
+
<h2>Coming Soon!</h2>
|
| 450 |
+
<p id="modalMessage"></p>
|
| 451 |
+
<button class="modal-btn" id="modalClose">Got it</button>
|
| 452 |
+
</div>
|
| 453 |
+
</div>
|
| 454 |
+
|
| 455 |
+
<script>
    // ---- UI state ----
    let selectedMode = 'object_detection';
    let videoFile = null;
    // Blob URLs currently attached to the <video> elements. Tracked so each
    // can be revoked before being replaced: the previous version called
    // URL.createObjectURL on every upload / processing run without ever
    // revoking, leaking memory over long sessions.
    let originalBlobUrl = null;
    let processedBlobUrl = null;

    // ---- Element handles ----
    const modeCards = document.querySelectorAll('.mode-card');
    const queriesSection = document.getElementById('queriesSection');
    const queriesLabel = document.getElementById('queriesLabel');
    const queriesHint = document.getElementById('queriesHint');
    const detectorSection = document.getElementById('detectorSection');
    const segmenterSection = document.getElementById('segmenterSection');
    const fileInput = document.getElementById('videoFile');
    const fileLabel = document.getElementById('fileLabel');
    const processBtn = document.getElementById('processBtn');
    const loading = document.getElementById('loading');
    const resultsSection = document.getElementById('resultsSection');
    const originalVideo = document.getElementById('originalVideo');
    const processedVideo = document.getElementById('processedVideo');
    const downloadBtn = document.getElementById('downloadBtn');
    const modal = document.getElementById('comingSoonModal');
    const modalMessage = document.getElementById('modalMessage');
    const modalClose = document.getElementById('modalClose');

    // Apply the per-mode query labels and section visibility. Shared by the
    // mode-card click handler and the modal-close reset path (previously
    // duplicated inline in both places).
    function applyModeUI(mode) {
        if (mode === 'object_detection') {
            queriesLabel.textContent = 'Objects to Detect (comma-separated)';
            queriesHint.textContent = 'Example: person, car, dog, bicycle';
            detectorSection.classList.remove('hidden');
            segmenterSection.classList.add('hidden');
        } else if (mode === 'segmentation') {
            queriesLabel.textContent = 'Objects to Segment (comma-separated)';
            queriesHint.textContent = 'Example: person, car, building, tree';
            detectorSection.classList.add('hidden');
            segmenterSection.classList.remove('hidden');
        } else if (mode === 'drone_detection') {
            queriesLabel.textContent = 'Drone Types to Detect (comma-separated)';
            queriesHint.textContent = 'Example: quadcopter, fixed-wing, drone';
            detectorSection.classList.add('hidden');
            segmenterSection.classList.add('hidden');
        }
    }

    // Mode selection handler
    modeCards.forEach(card => {
        card.addEventListener('click', (e) => {
            const input = card.querySelector('input[type="radio"]');
            const mode = input.value;

            // Disabled modes only show the "coming soon" modal.
            if (card.classList.contains('disabled')) {
                e.preventDefault();
                showComingSoonModal(mode);
                return;
            }

            // Update selected state
            modeCards.forEach(c => c.classList.remove('selected'));
            card.classList.add('selected');
            selectedMode = mode;
            applyModeUI(mode);

            // Always show queries section
            queriesSection.classList.remove('hidden');
        });
    });

    // File input handler
    fileInput.addEventListener('change', (e) => {
        videoFile = e.target.files[0];
        if (videoFile) {
            fileLabel.textContent = `✅ ${videoFile.name}`;
            fileLabel.classList.add('has-file');
            processBtn.disabled = false;

            // Preview original video; revoke the previous preview URL first
            // so repeated selections don't leak blob memory.
            if (originalBlobUrl) {
                URL.revokeObjectURL(originalBlobUrl);
            }
            originalBlobUrl = URL.createObjectURL(videoFile);
            originalVideo.src = originalBlobUrl;
        }
    });

    // Process button handler
    processBtn.addEventListener('click', async () => {
        if (!videoFile) {
            alert('Please select a video file first.');
            return;
        }

        // Show loading
        processBtn.disabled = true;
        loading.classList.add('show');
        resultsSection.classList.add('hidden');

        // Prepare form data
        const formData = new FormData();
        formData.append('video', videoFile);
        formData.append('mode', selectedMode);
        formData.append('queries', document.getElementById('queries').value);
        formData.append('detector', document.getElementById('detector').value);
        formData.append('segmenter', document.getElementById('segmenter').value);

        try {
            const response = await fetch('/detect', {
                method: 'POST',
                body: formData
            });

            if (response.ok) {
                const contentType = response.headers.get('content-type');

                if (contentType && contentType.includes('application/json')) {
                    // A JSON body on 2xx means the mode is "coming soon".
                    const data = await response.json();
                    showComingSoonModal(data.mode);
                } else {
                    // Binary video payload; revoke the previous result URL
                    // before binding the new one.
                    const blob = await response.blob();
                    if (processedBlobUrl) {
                        URL.revokeObjectURL(processedBlobUrl);
                    }
                    processedBlobUrl = URL.createObjectURL(blob);
                    processedVideo.src = processedBlobUrl;
                    downloadBtn.href = processedBlobUrl;
                    resultsSection.classList.remove('hidden');
                }
            } else {
                const error = await response.json();
                alert(`Error: ${error.detail || error.error || 'Processing failed'}`);
            }
        } catch (error) {
            console.error('Error:', error);
            alert('Network error: ' + error.message);
        } finally {
            loading.classList.remove('show');
            processBtn.disabled = false;
        }
    });

    // Coming soon modal
    function showComingSoonModal(mode) {
        const messages = {
            'drone_detection': 'Drone detection mode is under development. Stay tuned for specialized UAV and aerial object detection!'
        };
        modalMessage.textContent = messages[mode] || 'This feature is coming soon!';
        modal.classList.add('show');
    }

    modalClose.addEventListener('click', () => {
        modal.classList.remove('show');
        // Reset to object detection
        document.querySelector('input[value="object_detection"]').checked = true;
        modeCards.forEach(c => c.classList.remove('selected'));
        document.querySelector('input[value="object_detection"]').closest('.mode-card').classList.add('selected');
        selectedMode = 'object_detection';
        applyModeUI('object_detection');
    });

    // Close modal on background click
    modal.addEventListener('click', (e) => {
        if (e.target === modal) {
            modalClose.click();
        }
    });
</script>
|
| 617 |
+
</body>
|
| 618 |
+
</html>
|
inference.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Any, Dict, List, Optional, Sequence
|
| 3 |
+
|
| 4 |
+
import cv2
|
| 5 |
+
import numpy as np
|
| 6 |
+
from models.model_loader import load_detector
|
| 7 |
+
from models.segmenters.model_loader import load_segmenter
|
| 8 |
+
from utils.video import extract_frames, write_video
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def draw_boxes(frame: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """Return a copy of *frame* with each box drawn as a green rectangle.

    Boxes are consumed as (x1, y1, x2, y2) rows; ``None`` yields an
    unmodified copy of the frame.
    """
    annotated = frame.copy()
    if boxes is None:
        return annotated
    for box in boxes:
        x1, y1, x2, y2 = (int(coord) for coord in box)
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), thickness=2)
    return annotated
|
| 20 |
+
|
| 21 |
+
def draw_masks(frame: np.ndarray, masks: np.ndarray, alpha: float = 0.45) -> np.ndarray:
    """Return a copy of *frame* with each mask alpha-blended in a cycling color.

    Masks may be (H, W) or (1, H, W); a mask whose size differs from the
    frame is resized with nearest-neighbor interpolation before blending.
    ``None`` entries are skipped.
    """
    palette = (
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (0, 255, 255),
        (255, 0, 255),
    )
    blended = frame.copy()
    if masks is None or len(masks) == 0:
        return blended
    for position, raw_mask in enumerate(masks):
        if raw_mask is None:
            continue
        mask = raw_mask[0] if raw_mask.ndim == 3 else raw_mask
        if mask.shape[:2] != blended.shape[:2]:
            mask = cv2.resize(
                mask,
                (blended.shape[1], blended.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )
        selected = mask.astype(bool)
        layer = np.zeros_like(blended, dtype=np.uint8)
        layer[selected] = palette[position % len(palette)]
        blended = cv2.addWeighted(blended, 1.0, layer, alpha, 0)
    return blended
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _build_detection_records(
|
| 48 |
+
boxes: np.ndarray,
|
| 49 |
+
scores: Sequence[float],
|
| 50 |
+
labels: Sequence[int],
|
| 51 |
+
queries: Sequence[str],
|
| 52 |
+
label_names: Optional[Sequence[str]] = None,
|
| 53 |
+
) -> List[Dict[str, Any]]:
|
| 54 |
+
detections: List[Dict[str, Any]] = []
|
| 55 |
+
for idx, box in enumerate(boxes):
|
| 56 |
+
if label_names is not None and idx < len(label_names):
|
| 57 |
+
label = label_names[idx]
|
| 58 |
+
else:
|
| 59 |
+
label_idx = int(labels[idx]) if idx < len(labels) else -1
|
| 60 |
+
if 0 <= label_idx < len(queries):
|
| 61 |
+
label = queries[label_idx]
|
| 62 |
+
else:
|
| 63 |
+
label = f"label_{label_idx}"
|
| 64 |
+
detections.append(
|
| 65 |
+
{
|
| 66 |
+
"label": label,
|
| 67 |
+
"score": float(scores[idx]) if idx < len(scores) else 0.0,
|
| 68 |
+
"bbox": [int(coord) for coord in box.tolist()],
|
| 69 |
+
}
|
| 70 |
+
)
|
| 71 |
+
return detections
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def infer_frame(
    frame: np.ndarray,
    queries: Sequence[str],
    detector_name: Optional[str] = None,
) -> tuple[np.ndarray, List[Dict[str, Any]]]:
    """Detect *queries* in one frame; return (annotated frame, detection dicts).

    An empty query list falls back to the generic prompt ["object"].
    Exceptions from the detector are logged with context and re-raised.
    """
    detector = load_detector(detector_name)
    prompts = list(queries) or ["object"]
    try:
        result = detector.predict(frame, prompts)
        records = _build_detection_records(
            result.boxes, result.scores, result.labels, prompts, result.label_names
        )
    except Exception:
        logging.exception("Inference failed for queries %s", prompts)
        raise
    return draw_boxes(frame, result.boxes), records
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def infer_segmentation_frame(
    frame: np.ndarray,
    text_queries: Optional[List[str]] = None,
    segmenter_name: Optional[str] = None,
) -> tuple[np.ndarray, Any]:
    """Segment one frame; return (mask-overlaid frame, raw segmenter result)."""
    model = load_segmenter(segmenter_name)
    outcome = model.predict(frame, text_prompts=text_queries)
    return draw_masks(frame, outcome.masks), outcome
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def run_inference(
    input_video_path: str,
    output_video_path: str,
    queries: List[str],
    max_frames: Optional[int] = None,
    detector_name: Optional[str] = None,
) -> str:
    """
    Run object detection inference on a video.

    Args:
        input_video_path: Path to input video
        output_video_path: Path to write processed video
        queries: List of object classes to detect (e.g., ["person", "car"])
        max_frames: Optional frame limit for testing
        detector_name: Detector to use (default: owlv2_base)

    Returns:
        Path to processed output video

    Raises:
        ValueError: if the input video cannot be decoded.
    """
    try:
        frames, fps, width, height = extract_frames(input_video_path)
    except ValueError:  # fix: dropped the unused `as exc` binding
        logging.exception("Failed to decode video at %s", input_video_path)
        raise

    # Use provided queries or default to common objects
    if not queries:
        queries = ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
        logging.info("No queries provided, using defaults: %s", queries)

    logging.info("Detection queries: %s", queries)

    # Resolve the detector name once; every frame reuses the same cached model.
    active_detector = detector_name or "owlv2_base"
    logging.info("Using detector: %s", active_detector)

    # Process frames, honoring the optional max_frames cap.
    processed_frames: List[np.ndarray] = []
    for idx, frame in enumerate(frames):
        if max_frames is not None and idx >= max_frames:
            break
        logging.debug("Processing frame %d", idx)
        annotated, _ = infer_frame(frame, queries, detector_name=active_detector)
        processed_frames.append(annotated)

    # Write output video
    write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
    logging.info("Processed video written to: %s", output_video_path)

    return output_video_path
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def run_segmentation(
    input_video_path: str,
    output_video_path: str,
    queries: List[str],
    max_frames: Optional[int] = None,
    segmenter_name: Optional[str] = None,
) -> str:
    """
    Run text-prompted segmentation on a video.

    Args:
        input_video_path: Path to input video
        output_video_path: Path to write the mask-overlaid video
        queries: Text prompts describing what to segment
        max_frames: Optional frame limit for testing
        segmenter_name: Segmenter to use (default: sam3)

    Returns:
        Path to processed output video

    Raises:
        ValueError: if the input video cannot be decoded.
    """
    try:
        frames, fps, width, height = extract_frames(input_video_path)
    except ValueError:  # fix: dropped the unused `as exc` binding
        logging.exception("Failed to decode video at %s", input_video_path)
        raise

    # Resolve the segmenter name once; every frame reuses the same cached model.
    active_segmenter = segmenter_name or "sam3"
    logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)

    # Process frames, honoring the optional max_frames cap.
    processed_frames: List[np.ndarray] = []
    for idx, frame in enumerate(frames):
        if max_frames is not None and idx >= max_frames:
            break
        logging.debug("Processing frame %d", idx)
        overlaid, _ = infer_segmentation_frame(
            frame, text_queries=queries, segmenter_name=active_segmenter
        )
        processed_frames.append(overlaid)

    write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
    logging.info("Segmented video written to: %s", output_video_path)

    return output_video_path
|
models/detectors/base.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import NamedTuple, Optional, Sequence
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class DetectionResult(NamedTuple):
    """Uniform output shared by every detector backend.

    ``boxes`` rows are consumed downstream as (x1, y1, x2, y2) pixel
    coordinates; ``scores`` and ``labels`` are per-box confidences and
    integer class ids, and ``label_names`` optionally carries
    human-readable names aligned index-wise with ``boxes``.
    """

    boxes: np.ndarray
    scores: Sequence[float]
    labels: Sequence[int]
    label_names: Optional[Sequence[str]] = None
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ObjectDetector:
    """Detector interface to keep inference agnostic to model details."""

    # Short backend identifier (e.g. "owlv2_base"); set by each subclass.
    name: str

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        """Detect objects described by *queries* in one image frame.

        Args:
            frame: image array (H, W, C). Color order depends on the caller
                (frames presumably come from OpenCV upstream, i.e. BGR —
                confirm against utils.video).
            queries: text class names; closed-set backends may ignore them.

        Returns:
            DetectionResult with boxes, scores, labels and optional names.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
|
models/detectors/detr.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Sequence
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import DetrForObjectDetection, DetrImageProcessor
|
| 7 |
+
|
| 8 |
+
from models.detectors.base import DetectionResult, ObjectDetector
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DetrDetector(ObjectDetector):
    """Wrapper around facebook/detr-resnet-50 for mission-aligned detection.

    DETR is a closed-set detector: the text *queries* passed to ``predict``
    are not used for conditioning; labels come from the model's own
    id2label table.
    """

    MODEL_NAME = "facebook/detr-resnet-50"

    def __init__(self, score_threshold: float = 0.3) -> None:
        """Load processor and weights onto CUDA when available, else CPU."""
        self.name = "detr_resnet50"
        self.score_threshold = score_threshold
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
        self.processor = DetrImageProcessor.from_pretrained(self.MODEL_NAME)
        self.model = DetrForObjectDetection.from_pretrained(self.MODEL_NAME)
        self.model.to(self.device)
        self.model.eval()

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        """Detect objects in one frame; boxes are scaled to frame size."""
        batch = self.processor(images=frame, return_tensors="pt")
        batch = {name: tensor.to(self.device) for name, tensor in batch.items()}
        with torch.no_grad():
            raw_outputs = self.model(**batch)
        sizes = torch.tensor([frame.shape[:2]], device=self.device)
        detections = self.processor.post_process_object_detection(
            raw_outputs,
            threshold=self.score_threshold,
            target_sizes=sizes,
        )[0]
        class_ids = detections["labels"].cpu().tolist()
        readable = []
        for class_id in class_ids:
            readable.append(
                self.model.config.id2label.get(int(class_id), f"class_{class_id}")
            )
        return DetectionResult(
            boxes=detections["boxes"].cpu().numpy(),
            scores=detections["scores"].cpu().tolist(),
            labels=class_ids,
            label_names=readable,
        )
)
|
models/detectors/grounding_dino.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Sequence
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import GroundingDinoForObjectDetection, GroundingDinoProcessor
|
| 7 |
+
|
| 8 |
+
from models.detectors.base import DetectionResult, ObjectDetector
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class GroundingDinoDetector(ObjectDetector):
    """IDEA-Research Grounding DINO-B detector for open-vocabulary missions."""

    MODEL_NAME = "IDEA-Research/grounding-dino-base"

    def __init__(self, box_threshold: float = 0.35, text_threshold: float = 0.25) -> None:
        """Load Grounding DINO onto CUDA when available, else CPU."""
        self.name = "grounding_dino"
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
        self.processor = GroundingDinoProcessor.from_pretrained(self.MODEL_NAME)
        self.model = GroundingDinoForObjectDetection.from_pretrained(self.MODEL_NAME)
        self.model.to(self.device)
        self.model.eval()

    def _build_prompt(self, queries: Sequence[str]) -> str:
        """Build the dot-terminated text prompt format Grounding DINO expects.

        Empty/whitespace-only queries are dropped; when nothing remains, the
        generic prompt "object." is used.
        """
        terms = []
        for raw in queries:
            if not raw:
                continue
            cleaned = raw.strip()
            if cleaned:
                terms.append(cleaned)
        if not terms:
            return "object."
        return " ".join(term + "." for term in terms)

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        """Ground the text prompt in *frame*; return matched boxes/phrases."""
        prompt = self._build_prompt(queries)
        batch = self.processor(images=frame, text=prompt, return_tensors="pt")
        batch = {name: tensor.to(self.device) for name, tensor in batch.items()}
        with torch.no_grad():
            raw_outputs = self.model(**batch)
        sizes = torch.tensor([frame.shape[:2]], device=self.device)
        grounded = self.processor.post_process_grounded_object_detection(
            raw_outputs,
            batch["input_ids"],
            box_threshold=self.box_threshold,
            text_threshold=self.text_threshold,
            target_sizes=sizes,
        )[0]
        # Matched phrases double as label names; synthetic sequential ids
        # stand in for class indices since the vocabulary is open.
        phrases = list(grounded.get("labels") or [])
        return DetectionResult(
            boxes=grounded["boxes"].cpu().numpy(),
            scores=grounded["scores"].cpu().tolist(),
            labels=list(range(len(phrases))),
            label_names=phrases,
        )
|
models/detectors/owlv2.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Sequence
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import Owlv2ForObjectDetection, Owlv2Processor
|
| 7 |
+
|
| 8 |
+
from models.detectors.base import DetectionResult, ObjectDetector
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Owlv2Detector(ObjectDetector):
    """Open-vocabulary OWLv2 detector (google/owlv2-base-patch32).

    Runs in float16 on CUDA and float32 on CPU.
    """

    MODEL_NAME = "google/owlv2-base-patch32"

    def __init__(self, score_threshold: float = 0.3) -> None:
        """Load processor and model.

        Args:
            score_threshold: minimum detection confidence (previously a
                hard-coded 0.3 inside predict; same default, now tunable).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
        self.processor = Owlv2Processor.from_pretrained(self.MODEL_NAME)
        # Remember the weight dtype so predict() can cast image tensors to
        # match: fp16 weights fed float32 pixel_values fail on CUDA.
        self.torch_dtype = torch.float16 if self.device.type == "cuda" else torch.float32
        self.model = Owlv2ForObjectDetection.from_pretrained(
            self.MODEL_NAME, torch_dtype=self.torch_dtype
        )
        self.model.to(self.device)
        self.model.eval()
        self.name = "owlv2_base"
        self.score_threshold = score_threshold

    def _move_inputs(self, inputs):
        """Move processor outputs to the model device, casting floating
        tensors (pixel_values) to the model dtype.

        Bug fix: the previous version only moved tensors to the device, so
        on CUDA the float32 pixel values hit float16 weights and raised a
        dtype mismatch at runtime.
        """
        moved = {}
        for key, value in inputs.items():
            if hasattr(value, "to"):
                if hasattr(value, "is_floating_point") and value.is_floating_point():
                    moved[key] = value.to(self.device, self.torch_dtype)
                else:
                    moved[key] = value.to(self.device)
            else:
                moved[key] = value
        return moved

    @staticmethod
    def _as_list(values):
        """Normalize a tensor / ndarray / sequence into a plain Python list."""
        if hasattr(values, "cpu"):
            return values.cpu().numpy().tolist()
        if isinstance(values, np.ndarray):
            return values.tolist()
        return list(values)

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        """Detect *queries* in one frame; boxes are scaled to frame size."""
        inputs = self.processor(text=queries, images=frame, return_tensors="pt")
        inputs = self._move_inputs(inputs)
        with torch.no_grad():
            outputs = self.model(**inputs)
        processed = self.processor.post_process_object_detection(
            outputs, threshold=self.score_threshold, target_sizes=[frame.shape[:2]]
        )[0]
        boxes = processed["boxes"]
        boxes_np = boxes.cpu().numpy() if hasattr(boxes, "cpu") else np.asarray(boxes)
        return DetectionResult(
            boxes=boxes_np,
            scores=self._as_list(processed.get("scores", [])),
            labels=self._as_list(processed.get("labels", [])),
        )
|
models/detectors/yolov8.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List, Sequence
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from huggingface_hub import hf_hub_download
|
| 7 |
+
from ultralytics import YOLO
|
| 8 |
+
|
| 9 |
+
from models.detectors.base import DetectionResult, ObjectDetector
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class HuggingFaceYoloV8Detector(ObjectDetector):
    """YOLOv8 detector whose weights are fetched from the Hugging Face Hub."""

    REPO_ID = "Ultralytics/YOLOv8"
    WEIGHT_FILE = "yolov8s.pt"

    def __init__(self, score_threshold: float = 0.3) -> None:
        """Download (or reuse cached) yolov8s weights and load them."""
        self.name = "hf_yolov8"
        self.score_threshold = score_threshold
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        logging.info(
            "Loading Hugging Face YOLOv8 weights %s/%s onto %s",
            self.REPO_ID,
            self.WEIGHT_FILE,
            self.device,
        )
        checkpoint = hf_hub_download(repo_id=self.REPO_ID, filename=self.WEIGHT_FILE)
        self.model = YOLO(checkpoint)
        self.model.to(self.device)
        self.class_names = self.model.names

    def _filter_indices(self, label_names: Sequence[str], queries: Sequence[str]) -> List[int]:
        """Return indices of detections whose class name matches a query.

        NOTE(review): when queries are supplied but none match, every index
        is returned — apparently a deliberate show-everything fallback;
        confirm this is intended.
        """
        everything = list(range(len(label_names)))
        if not queries:
            return everything
        wanted = {query.lower().strip() for query in queries if query}
        matching = [pos for pos, name in enumerate(label_names) if name.lower() in wanted]
        return matching if matching else everything

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        """Run YOLOv8 on one frame and keep only query-matching classes."""
        target_device = 0 if self.device.startswith("cuda") else "cpu"
        prediction = self.model.predict(
            source=frame,
            device=target_device,
            conf=self.score_threshold,
            verbose=False,
        )[0]
        detected = prediction.boxes
        if detected is None or detected.xyxy is None:
            no_boxes = np.empty((0, 4), dtype=np.float32)
            return DetectionResult(no_boxes, [], [], [])

        coords = detected.xyxy.cpu().numpy()
        confidences = detected.conf.cpu().numpy().tolist()
        class_ids = detected.cls.cpu().numpy().astype(int).tolist()
        names = [self.class_names.get(cid, f"class_{cid}") for cid in class_ids]
        selected = self._filter_indices(names, queries)
        coords = coords[selected] if len(coords) else coords
        return DetectionResult(
            boxes=coords,
            scores=[confidences[i] for i in selected],
            labels=[class_ids[i] for i in selected],
            label_names=[names[i] for i in selected],
        )
|
| 69 |
+
|
models/model_loader.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from typing import Callable, Dict, Optional
|
| 4 |
+
|
| 5 |
+
from models.detectors.base import ObjectDetector
|
| 6 |
+
from models.detectors.detr import DetrDetector
|
| 7 |
+
from models.detectors.grounding_dino import GroundingDinoDetector
|
| 8 |
+
from models.detectors.owlv2 import Owlv2Detector
|
| 9 |
+
from models.detectors.yolov8 import HuggingFaceYoloV8Detector
|
| 10 |
+
|
| 11 |
+
DEFAULT_DETECTOR = "owlv2_base"
|
| 12 |
+
|
| 13 |
+
_REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
|
| 14 |
+
"owlv2_base": Owlv2Detector,
|
| 15 |
+
"hf_yolov8": HuggingFaceYoloV8Detector,
|
| 16 |
+
"detr_resnet50": DetrDetector,
|
| 17 |
+
"grounding_dino": GroundingDinoDetector,
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _create_detector(name: str) -> ObjectDetector:
    """Instantiate the detector registered under *name*.

    Raises:
        ValueError: for unknown names (chained from the registry KeyError).
    """
    try:
        builder = _REGISTRY[name]
    except KeyError as err:
        options = ", ".join(sorted(_REGISTRY))
        raise ValueError(f"Unknown detector '{name}'. Available: {options}") from err
    return builder()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@lru_cache(maxsize=None)
def _get_cached_detector(name: str) -> ObjectDetector:
    """Return one detector instance per name for the process lifetime.

    Model weights are expensive to load, so the cache deliberately never
    evicts.
    """
    return _create_detector(name)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def load_detector(name: Optional[str] = None) -> ObjectDetector:
    """Return a cached detector instance selected via arg or OBJECT_DETECTOR env.

    Resolution order: explicit *name*, then the OBJECT_DETECTOR environment
    variable, then the module default.
    """
    chosen = name if name else os.getenv("OBJECT_DETECTOR", DEFAULT_DETECTOR)
    return _get_cached_detector(chosen)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Backwards compatibility for existing callers.
def load_model():
    """Deprecated alias: return the default detector via load_detector()."""
    return load_detector()
|
models/segmenters/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import Segmenter, SegmentationResult
|
| 2 |
+
from .model_loader import load_segmenter
|
| 3 |
+
from .sam3 import SAM3Segmenter
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"Segmenter",
|
| 7 |
+
"SegmentationResult",
|
| 8 |
+
"load_segmenter",
|
| 9 |
+
"SAM3Segmenter",
|
| 10 |
+
]
|
models/segmenters/base.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import NamedTuple, Optional
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SegmentationResult(NamedTuple):
    """Result from segmentation inference.

    ``scores`` and ``boxes`` are optional per-instance metadata, presumably
    index-aligned with ``masks`` — confirm against the concrete segmenter.
    """

    masks: np.ndarray  # NxHxW binary or soft masks
    scores: Optional[np.ndarray] = None  # Confidence scores
    boxes: Optional[np.ndarray] = None  # Bounding boxes (xyxy)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Segmenter:
    """Base interface for segmentation models."""

    # Short backend identifier (e.g. "sam3"); set by each subclass.
    name: str

    def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
        """
        Run segmentation on a single frame.

        Args:
            frame: Input image as numpy array (HxWxC)
            text_prompts: Optional list of text prompts for segmentation

        Returns:
            SegmentationResult with masks and optional metadata

        Raises:
            NotImplementedError: always, in this base class; subclasses
                must override.
        """
        raise NotImplementedError
|
models/segmenters/model_loader.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from typing import Callable, Dict, Optional
|
| 4 |
+
|
| 5 |
+
from .base import Segmenter
|
| 6 |
+
from .sam3 import SAM3Segmenter
|
| 7 |
+
|
| 8 |
+
# Backend used when neither the caller nor the SEGMENTER env var picks one.
DEFAULT_SEGMENTER = "sam3"

# Maps segmenter names to zero-argument factories producing fresh instances.
_REGISTRY: Dict[str, Callable[[], Segmenter]] = {
    "sam3": SAM3Segmenter,
}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _create_segmenter(name: str) -> Segmenter:
    """Instantiate a fresh segmenter for the given registry name.

    Raises:
        ValueError: If *name* is not registered, listing the valid choices.
    """
    try:
        factory = _REGISTRY[name]
    except KeyError as exc:
        choices = ", ".join(sorted(_REGISTRY))
        message = f"Unknown segmenter '{name}'. Available: {choices}"
        raise ValueError(message) from exc
    return factory()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@lru_cache(maxsize=None)
def _get_cached_segmenter(name: str) -> Segmenter:
    """Get or create cached segmenter instance.

    Keyed on *name*, so each backend (and its model weights) is constructed
    at most once per process. The registry is a small fixed set, so the
    unbounded cache cannot grow without limit.
    """
    return _create_segmenter(name)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_segmenter(name: Optional[str] = None) -> Segmenter:
    """Resolve and return a (cached) segmenter instance.

    Args:
        name: Explicit segmenter name. When omitted or empty, the
            ``SEGMENTER`` environment variable is consulted, falling back
            to ``DEFAULT_SEGMENTER``.

    Returns:
        Cached segmenter instance.
    """
    if name:
        selected = name
    else:
        selected = os.getenv("SEGMENTER", DEFAULT_SEGMENTER)
    return _get_cached_segmenter(selected)
|
models/segmenters/sam3.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from transformers import Sam3Model, Sam3Processor
|
| 8 |
+
|
| 9 |
+
from .base import Segmenter, SegmentationResult
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SAM3Segmenter(Segmenter):
    """
    SAM3 (Segment Anything Model 3) segmenter.

    Performs text-prompted instance segmentation on single images using the
    facebook/sam3 checkpoint from HuggingFace Transformers. Weights and the
    processor are loaded once in __init__ and reused across predict() calls.
    """

    # Registry key used by models/segmenters/model_loader.py.
    name = "sam3"

    def __init__(
        self,
        model_id: str = "facebook/sam3",
        device: Optional[str] = None,
        threshold: float = 0.5,
        mask_threshold: float = 0.5,
    ):
        """
        Initialize SAM3 segmenter.

        Args:
            model_id: HuggingFace model ID to load weights/processor from.
            device: Device to run on (cuda/cpu), auto-detected if None.
            threshold: Confidence threshold for filtering instances.
            mask_threshold: Threshold for binarizing masks.

        Raises:
            Exception: Re-raises whatever ``from_pretrained`` raises
                (network error, missing checkpoint, ...) after logging it.
        """
        self.device = device or (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.threshold = threshold
        self.mask_threshold = mask_threshold

        logging.info(
            "Loading SAM3 model %s on device %s", model_id, self.device
        )

        try:
            self.model = Sam3Model.from_pretrained(model_id).to(self.device)
            self.processor = Sam3Processor.from_pretrained(model_id)
            self.model.eval()
        except Exception:
            # Log with traceback, then propagate: the app cannot work
            # without a usable segmenter.
            logging.exception("Failed to load SAM3 model")
            raise

        logging.info("SAM3 model loaded successfully")

    def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
        """
        Run SAM3 segmentation on a frame.

        Args:
            frame: Input image (HxWx3 numpy array in RGB). uint8 is passed
                through; any other dtype is assumed to hold floats in
                [0, 1] and is scaled by 255 — TODO confirm with callers.
            text_prompts: List of text prompts for segmentation; falls back
                to ["object"] when None or empty.

        Returns:
            SegmentationResult with instance masks; an empty result
            (zero masks) if post-processing fails.
        """
        # Convert numpy array to PIL Image
        if frame.dtype == np.uint8:
            pil_image = Image.fromarray(frame)
        else:
            # Normalize to 0-255 if needed
            # (assumes float input in [0, 1]; values above 1 would wrap
            # after the uint8 cast — TODO confirm)
            frame_uint8 = (frame * 255).astype(np.uint8)
            pil_image = Image.fromarray(frame_uint8)

        # Use default prompts if none provided
        if not text_prompts:
            text_prompts = ["object"]

        # Process image with text prompts
        inputs = self.processor(
            images=pil_image, text=text_prompts, return_tensors="pt"
        ).to(self.device)

        # Run inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Post-process to get instance masks
        try:
            # NOTE(review): assumes the processor output contains
            # "original_sizes"; .tolist() would raise on None, which is
            # caught by the broad except below.
            results = self.processor.post_process_instance_segmentation(
                outputs,
                threshold=self.threshold,
                mask_threshold=self.mask_threshold,
                target_sizes=inputs.get("original_sizes").tolist(),
            )[0]

            # Extract results
            masks = results.get("masks", [])
            scores = results.get("scores", None)
            boxes = results.get("boxes", None)

            # Convert to numpy arrays
            if len(masks) > 0:
                # Stack masks: list of (H, W) -> (N, H, W)
                masks_array = np.stack([m.cpu().numpy() for m in masks])
            else:
                # No objects detected
                masks_array = np.zeros(
                    (0, frame.shape[0], frame.shape[1]), dtype=bool
                )

            scores_array = (
                scores.cpu().numpy() if scores is not None else None
            )
            boxes_array = (
                boxes.cpu().numpy() if boxes is not None else None
            )

            return SegmentationResult(
                masks=masks_array,
                scores=scores_array,
                boxes=boxes_array,
            )

        except Exception:
            # Deliberate best-effort: a post-processing failure degrades to
            # "nothing segmented" rather than crashing the request.
            logging.exception("SAM3 post-processing failed")
            # Return empty result
            return SegmentationResult(
                masks=np.zeros((0, frame.shape[0], frame.shape[1]), dtype=bool),
                scores=None,
                boxes=None,
            )
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
torch
|
| 4 |
+
transformers
|
| 5 |
+
opencv-python-headless
|
| 6 |
+
python-multipart
|
| 7 |
+
accelerate
|
| 8 |
+
pillow
|
| 9 |
+
scipy
|
| 10 |
+
huggingface-hub
|
| 11 |
+
ultralytics
|
| 12 |
+
timm
|
| 13 |
+
ffmpeg-python
|
utils/video.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import shutil
|
| 4 |
+
import subprocess
|
| 5 |
+
import tempfile
|
| 6 |
+
from typing import List, Tuple
|
| 7 |
+
|
| 8 |
+
import cv2
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def extract_frames(video_path: str) -> Tuple[List[np.ndarray], float, int, int]:
    """Decode every frame of a video file into memory.

    Args:
        video_path: Path to the input video.

    Returns:
        Tuple of (frames as numpy arrays, fps, width, height).

    Raises:
        ValueError: If the file cannot be opened or yields no frames.
    """
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        raise ValueError("Unable to open video.")

    fps = capture.get(cv2.CAP_PROP_FPS) or 0.0
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))

    frames: List[np.ndarray] = []
    while True:
        ok, frame = capture.read()
        if not ok:
            break
        frames.append(frame)

    capture.release()

    if not frames:
        raise ValueError("Video decode produced zero frames.")

    return frames, fps, width, height
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _transcode_with_ffmpeg(src_path: str, dst_path: str) -> None:
|
| 36 |
+
cmd = [
|
| 37 |
+
"ffmpeg",
|
| 38 |
+
"-y",
|
| 39 |
+
"-i",
|
| 40 |
+
src_path,
|
| 41 |
+
"-c:v",
|
| 42 |
+
"libx264",
|
| 43 |
+
"-preset",
|
| 44 |
+
"veryfast",
|
| 45 |
+
"-pix_fmt",
|
| 46 |
+
"yuv420p",
|
| 47 |
+
"-movflags",
|
| 48 |
+
"+faststart",
|
| 49 |
+
dst_path,
|
| 50 |
+
]
|
| 51 |
+
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False)
|
| 52 |
+
if process.returncode != 0:
|
| 53 |
+
raise RuntimeError(process.stderr.decode("utf-8", errors="ignore"))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def write_video(frames: List[np.ndarray], output_path: str, fps: float, width: int, height: int) -> None:
    """Encode frames to an MP4 at *output_path*.

    Frames are first written with OpenCV's MP4V codec to a temporary file,
    then transcoded to H.264 with ffmpeg for browser compatibility. If
    ffmpeg is missing or fails, the MP4V file is moved into place as-is.

    Args:
        frames: Frames to encode; each is expected to match (height, width)
            — NOTE(review): cv2.VideoWriter silently drops mismatched
            frames, confirm callers pass consistent sizes.
        output_path: Destination path for the final MP4.
        fps: Frames per second; falsy values fall back to 1.0.
        width: Frame width in pixels.
        height: Frame height in pixels.

    Raises:
        ValueError: If *frames* is empty or the VideoWriter cannot open.
    """
    if not frames:
        raise ValueError("No frames available for writing.")
    temp_fd, temp_path = tempfile.mkstemp(prefix="raw_", suffix=".mp4")
    os.close(temp_fd)
    writer = cv2.VideoWriter(temp_path, cv2.VideoWriter_fourcc(*"mp4v"), fps or 1.0, (width, height))
    if not writer.isOpened():
        os.remove(temp_path)
        raise ValueError("Failed to open VideoWriter.")

    # Fix: release the writer even if a frame write raises, so the encoder
    # handle is not leaked and the container is properly flushed.
    try:
        for frame in frames:
            writer.write(frame)
    finally:
        writer.release()

    try:
        _transcode_with_ffmpeg(temp_path, output_path)
        logging.debug("Transcoded video to H.264 for browser compatibility.")
        os.remove(temp_path)
    except FileNotFoundError:
        logging.warning("ffmpeg not found; serving fallback MP4V output.")
        shutil.move(temp_path, output_path)
    except RuntimeError as exc:
        logging.warning("ffmpeg transcode failed (%s); serving fallback MP4V output.", exc)
        shutil.move(temp_path, output_path)
|