Spaces:

Utkarshres32
/

Sentinelai_api

Running

App Files Files Community

Utkarshres32 committed on Apr 10

Commit

2758540

1 Parent(s): e2b09b1

Deploy Sentinelai API backend

Browse files

Files changed (30) hide show

.gitignore +8 -0
Dockerfile +43 -0
app.py +215 -0
audio/__init__.py +0 -0
audio/audio_module.py +141 -0
config.py +107 -0
database/__init__.py +0 -0
database/crud.py +164 -0
database/models.py +100 -0
database/session.py +46 -0
graph/__init__.py +0 -0
graph/movement_graph.py +236 -0
nlp/__init__.py +0 -0
nlp/qa.py +103 -0
nlp/report.py +114 -0
nlp/search.py +112 -0
nlp/summarizer.py +75 -0
requirements.txt +96 -0
routes/__init__.py +0 -0
routes/nlp_routes.py +235 -0
routes/stream_routes.py +69 -0
routes/vision_routes.py +421 -0
static/thumbnails/4f42294d-cdc1-4c64-abae-71a2891167b2.jpg +0 -0
vision/__init__.py +0 -0
vision/attributes.py +163 -0
vision/detector.py +91 -0
vision/pipeline.py +177 -0
vision/reid.py +144 -0
vision/stream_manager.py +151 -0
vision/tracker.py +186 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+.env*
+__pycache__/
+*.pyc
+.venv/
+venv/
+env/

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# Set environment variables for Python
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# Hugging Face Spaces uses port 7860 by default
+ENV PORT=7860
+# Install system dependencies required for OpenCV and AI models.
+# libgl1 is used instead of libgl1-mesa-glx: the latter is only a transitional
+# package and is no longer shipped by newer Debian releases, which makes the
+# build fail once the python:3.10-slim tag moves to such a base.
+RUN apt-get update && apt-get install -y \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Set the working directory in the container
+WORKDIR /app
+# Copy the requirements file into the container
+COPY requirements.txt .
+# Install Python dependencies
+# We specifically install the CPU-only version of PyTorch to save space and avoid GPU driver issues on the free tier
+RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the current directory contents into the container at /app
+COPY . .
+# Create directories for static files, databases, and the FAISS/model caches
+# that config.py creates under the application directory at import time
+RUN mkdir -p static/thumbnails static/anomalies database/data faiss_indexes model_cache
+RUN chmod -R 777 static database faiss_indexes model_cache
+# Expose the port Hugging Face expects
+EXPOSE 7860
+# Command to run the FastAPI server.
+# Exec form with an explicit `exec` so uvicorn replaces the shell and receives
+# stop signals directly, while $PORT is still expanded at container start.
+CMD ["sh", "-c", "exec uvicorn app:app --host 0.0.0.0 --port ${PORT}"]

app.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""
+app.py - Main FastAPI Application Entry Point
+Multimodal Surveillance Intelligence System
+Run with: uvicorn app:app --host 0.0.0.0 --port 8000 --reload
+"""
+import asyncio
+import time
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from fastapi.staticfiles import StaticFiles
+from loguru import logger
+import psutil
+import os
+from config import settings, DEVICE
+# ── Global Singletons (initialized on startup) ────────────────────────────────
+# All of these start as None and are assigned inside lifespan() during startup.
+# audio_asr / audio_classifier are only assigned when settings.ENABLE_AUDIO is set.
+vision_pipeline = None
+search_engine = None
+qa_system = None
+report_generator = None
+summarizer = None
+movement_graph = None
+audio_asr = None
+audio_classifier = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application startup/shutdown lifecycle.
+    Startup (code before ``yield``): ensures the database tables exist, then
+    constructs the vision, NLP, graph and (optionally) audio components and
+    stores them in this module's global singletons.
+    Shutdown (code after ``yield``): shuts down the shared stream manager.
+    """
+    global vision_pipeline, search_engine, qa_system, report_generator
+    global summarizer, movement_graph, audio_asr, audio_classifier
+    logger.info("=" * 60)
+    logger.info(f"🚀 Starting {settings.APP_NAME} v{settings.APP_VERSION}")
+    logger.info(f"   Device: {DEVICE}")
+    logger.info("=" * 60)
+    # NOTE(review): project modules are imported inside this function rather than
+    # at module top level — presumably to defer heavy model imports until startup.
+    # 1. Initialize database tables
+    logger.info("📦 Initializing database...")
+    from database.session import create_tables
+    await create_tables()
+    # 2. Load Vision Pipeline
+    logger.info("🎥 Loading Vision Pipeline...")
+    from vision.pipeline import VisionPipeline
+    vision_pipeline = VisionPipeline()
+    # 3. Load NLP Components
+    logger.info("💬 Loading NLP: Semantic Search...")
+    from nlp.search import SemanticSearchEngine
+    search_engine = SemanticSearchEngine()
+    logger.info("❓ Loading NLP: QA System...")
+    from nlp.qa import SurveillanceQA
+    qa_system = SurveillanceQA()
+    logger.info("📝 Loading NLP: Report Generator...")
+    from nlp.report import IncidentReportGenerator
+    report_generator = IncidentReportGenerator()
+    logger.info("📋 Loading NLP: Summarizer...")
+    from nlp.summarizer import SurveillanceSummarizer
+    summarizer = SurveillanceSummarizer()
+    # 4. Load Graph Module
+    logger.info("🕸️  Initializing Movement Graph...")
+    from graph.movement_graph import MovementGraph
+    movement_graph = MovementGraph()
+    # 5. Load Audio (optional)
+    if settings.ENABLE_AUDIO:
+        logger.info("🎙️  Loading Audio Module...")
+        from audio.audio_module import WhisperASR, AudioClassifier
+        audio_asr = WhisperASR()
+        audio_classifier = AudioClassifier()
+    logger.info("✅ All components loaded successfully!")
+    # Resident set size of this process, reported in megabytes (bytes / 1e6).
+    logger.info(f"📊 Memory usage: {psutil.Process().memory_info().rss / 1e6:.1f} MB")
+    yield  # App is running
+    # Shutdown
+    logger.info("🛑 Shutting down Surveillance System...")
+    from vision.stream_manager import stream_manager
+    stream_manager.shutdown()
+# ── FastAPI App ────────────────────────────────────────────────────────────────
+# The FastAPI instance served by uvicorn as "app:app"; startup and shutdown
+# work is delegated to lifespan().
+app = FastAPI(
+    title=settings.APP_NAME,
+    version=settings.APP_VERSION,
+    description="""
+    **Multimodal Surveillance Intelligence System**
+    Capabilities:
+    - 🎥 Real-time multi-camera video processing
+    - 👤 Person detection (DETR) + Multi-object tracking (ByteTrack)
+    - 🔍 Cross-camera Re-Identification (ViT + FAISS)
+    - 👗 Clothing/attribute recognition (CLIP zero-shot)
+    - 💬 Semantic search over surveillance logs
+    - ❓ Natural language Q&A over events
+    - 📝 Automated incident report generation
+    - 🕸️ Movement graph anomaly detection
+    """,
+    lifespan=lifespan,
+    docs_url="/docs",
+    redoc_url="/redoc",
+)
+# ── CORS ───────────────────────────────────────────────────────────────────────
+# NOTE(review): "*" is appended to the configured origins, so the explicit
+# settings.CORS_ORIGINS entries presumably add no restriction. Combined with
+# allow_credentials=True this is very permissive — confirm it is intentional
+# before exposing authenticated endpoints.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=settings.CORS_ORIGINS + ["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ── Request Latency Logging Middleware ─────────────────────────────────────────
+@app.middleware("http")
+async def log_requests(request: Request, call_next):
+    t0 = time.perf_counter()
+    response = await call_next(request)
+    latency = (time.perf_counter() - t0) * 1000
+    logger.debug(f"{request.method} {request.url.path} → {response.status_code} ({latency:.1f}ms)")
+    response.headers["X-Process-Time-Ms"] = f"{latency:.2f}"
+    return response
+# ── Register Routers ───────────────────────────────────────────────────────────
+# The route modules are imported here, after `app` and its middleware exist.
+from routes.vision_routes import router as vision_router
+from routes.nlp_routes import router as nlp_router
+from routes.stream_routes import router as stream_router
+app.include_router(vision_router)
+app.include_router(nlp_router)
+app.include_router(stream_router)
+# ── Static Files ───────────────────────────────────────────────────────────────
+# Create static directory if it doesn't exist
+# (StaticFiles is pointed at "static", so the directory is created first.)
+os.makedirs("static/thumbnails", exist_ok=True)
+app.mount("/static", StaticFiles(directory="static"), name="static")
+# ── Health & Status Routes ─────────────────────────────────────────────────────
+@app.get("/", tags=["Health"])
+async def root():
+    return {
+        "system": settings.APP_NAME,
+        "version": settings.APP_VERSION,
+        "status": "operational",
+        "device": str(DEVICE),
+        "docs": "/docs",
+    }
+@app.get("/health", tags=["Health"])
+async def health_check():
+    mem = psutil.Process().memory_info().rss / 1e6
+    cpu = psutil.cpu_percent(interval=0.1)
+    import torch
+    gpu_info = {}
+    if torch.cuda.is_available():
+        gpu_info = {
+            "name": torch.cuda.get_device_name(0),
+            "memory_allocated_mb": round(torch.cuda.memory_allocated(0) / 1e6, 1),
+            "memory_reserved_mb": round(torch.cuda.memory_reserved(0) / 1e6, 1),
+        }
+    return {
+        "status": "healthy",
+        "device": str(DEVICE),
+        "memory_mb": round(mem, 1),
+        "cpu_percent": cpu,
+        "gpu": gpu_info,
+        "models_loaded": {
+            "vision_pipeline": vision_pipeline is not None,
+            "search_engine": search_engine is not None,
+            "qa_system": qa_system is not None,
+            "report_generator": report_generator is not None,
+            "summarizer": summarizer is not None,
+            "movement_graph": movement_graph is not None,
+            "audio": audio_asr is not None,
+        },
+    }
+@app.get("/metrics", tags=["Health"])
+async def prometheus_metrics():
+    """Basic Prometheus-style metrics."""
+    from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
+    from starlette.responses import Response
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+# ── Entry point ────────────────────────────────────────────────────────────────
+# Direct execution (`python app.py`): start uvicorn with host/port/reload taken
+# from settings instead of command-line flags.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(
+        "app:app",
+        host=settings.HOST,
+        port=settings.PORT,
+        reload=settings.DEBUG,
+        workers=1,  # 1 worker required for shared model singletons
+        log_level="info",
+    )

audio/__init__.py ADDED Viewed

File without changes

audio/audio_module.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""
+audio/audio_module.py - Optional audio components
+- WhisperASR: Automatic Speech Recognition using openai/whisper-small
+- AudioClassifier: Audio classification using wav2vec2
+- SpeechSynthesizer: Text-to-Speech using microsoft/speecht5_tts
+"""
+import time
+import torch
+import numpy as np
+from typing import Dict, Optional, Union
+from loguru import logger
+from config import settings, DEVICE
+# ═══════════════════════════════════════════════════════════════════════════
+# ASR - Automatic Speech Recognition
+# ═══════════════════════════════════════════════════════════════════════════
+class WhisperASR:
+    """Transcribes audio using openai/whisper-small."""
+    def __init__(self):
+        if not settings.ENABLE_AUDIO:
+            logger.info("Audio module disabled. Set ENABLE_AUDIO=true to activate.")
+            self._ready = False
+            return
+        logger.info(f"Loading Whisper ASR model: {settings.WHISPER_MODEL}")
+        try:
+            import whisper
+            model_name = settings.WHISPER_MODEL.split("/")[-1]  # "whisper-small" → "small"
+            self.model = whisper.load_model(model_name, device=str(DEVICE))
+            self._ready = True
+            logger.info("✅ WhisperASR ready.")
+        except ImportError:
+            logger.warning("openai-whisper not installed. ASR unavailable.")
+            self._ready = False
+    def transcribe(self, audio_path: str, language: Optional[str] = None) -> Dict:
+        if not self._ready:
+            return {"text": "", "error": "ASR not available"}
+        t0 = time.perf_counter()
+        opts = {}
+        if language:
+            opts["language"] = language
+        result = self.model.transcribe(audio_path, **opts)
+        latency_ms = (time.perf_counter() - t0) * 1000
+        return {
+            "text": result["text"],
+            "language": result.get("language", "unknown"),
+            "segments": result.get("segments", []),
+            "latency_ms": round(latency_ms, 2),
+        }
+    def transcribe_bytes(self, audio_bytes: bytes, sample_rate: int = 16000) -> Dict:
+        import tempfile, soundfile as sf, os
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            tmp_path = f.name
+        try:
+            audio_array = np.frombuffer(audio_bytes, dtype=np.float32)
+            sf.write(tmp_path, audio_array, sample_rate)
+            return self.transcribe(tmp_path)
+        finally:
+            os.unlink(tmp_path)
+# ═══════════════════════════════════════════════════════════════════════════
+# Audio Classifier
+# ═══════════════════════════════════════════════════════════════════════════
+class AudioClassifier:
+    """Classifies audio clips with a Hugging Face ``audio-classification`` pipeline.
+    The model is taken from ``settings.AUDIO_CLASS_MODEL``.
+    NOTE(review): the configured default ("superb/wav2vec2-base-superb-ks") looks
+    like a keyword-spotting model, which matches KEYWORDS below — it does not
+    appear to detect events such as gunshots or screams; confirm the intended model.
+    """
+    # Spoken-command labels; not referenced elsewhere in the visible part of this module.
+    KEYWORDS = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"]
+    def __init__(self):
+        # When audio is disabled no pipeline is built and classify() returns an error dict.
+        if not settings.ENABLE_AUDIO:
+            self._ready = False
+            return
+        logger.info(f"Loading audio classifier: {settings.AUDIO_CLASS_MODEL}")
+        try:
+            from transformers import pipeline
+            # transformers pipelines take a device index: 0 for CUDA, -1 for CPU.
+            device_id = 0 if str(DEVICE) == "cuda" else -1
+            self.classifier = pipeline(
+                "audio-classification",
+                model=settings.AUDIO_CLASS_MODEL,
+                device=device_id,
+            )
+            self._ready = True
+            logger.info("✅ AudioClassifier ready.")
+        except Exception as e:
+            # Best effort: any failure leaves the classifier disabled instead of raising.
+            logger.warning(f"AudioClassifier init failed: {e}")
+            self._ready = False
+    def classify(self, audio_path: str, top_k: int = 5) -> Dict:
+        """Return the ``top_k`` predicted labels for the audio file at ``audio_path``.
+        Returns ``{"classes": [{"label", "score"}, ...], "latency_ms": float}``, or
+        ``{"classes": [], "error": ...}`` when the classifier is not available.
+        """
+        if not self._ready:
+            return {"classes": [], "error": "Audio classifier not available"}
+        t0 = time.perf_counter()
+        results = self.classifier(audio_path, top_k=top_k)
+        latency_ms = (time.perf_counter() - t0) * 1000
+        return {
+            "classes": [{"label": r["label"], "score": round(r["score"], 4)} for r in results],
+            "latency_ms": round(latency_ms, 2),
+        }
+# ═══════════════════════════════════════════════════════════════════════════
+# TTS - Text to Speech
+# ═══════════════════════════════════════════════════════════════════════════
+class SpeechSynthesizer:
+    """Generates speech from text using microsoft/speecht5_tts."""
+    def __init__(self):
+        if not settings.ENABLE_AUDIO:
+            self._ready = False
+            return
+        logger.info(f"Loading TTS model: {settings.TTS_MODEL}")
+        try:
+            from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+            from datasets import load_dataset
+            self.processor = SpeechT5Processor.from_pretrained(settings.TTS_MODEL)
+            self.model = SpeechT5ForTextToSpeech.from_pretrained(settings.TTS_MODEL).to(DEVICE)
+            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(DEVICE)
+            # Load speaker embeddings
+            ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+            self.speaker_embeddings = torch.tensor(ds[7306]["xvector"]).unsqueeze(0).to(DEVICE)
+            self._ready = True
+            logger.info("✅ SpeechSynthesizer ready.")
+        except Exception as e:
+            logger.warning(f"TTS init failed: {e}")
+            self._ready = False
+    def synthesize(self, text: str) -> Optional[np.ndarray]:
+        """Synthesize text to audio. Returns numpy array (float32) or None."""
+        if not self._ready:
+            return None
+        inputs = self.processor(text=text, return_tensors="pt").to(DEVICE)
+        with torch.inference_mode():
+            speech = self.model.generate_speech(
+                inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder
+            )
+        return speech.cpu().numpy()

config.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""
+config.py - Central configuration for Multimodal Surveillance Intelligence System
+"""
+import os
+import torch
+from pathlib import Path
+from pydantic_settings import BaseSettings
+from pydantic import Field
+from loguru import logger
+class Settings(BaseSettings):
+    # ── App ───────────────────────────────────────────────
+    APP_NAME: str = "Multimodal Surveillance Intelligence System"
+    APP_VERSION: str = "1.0.0"
+    DEBUG: bool = False
+    HOST: str = "0.0.0.0"
+    PORT: int = 8000
+    # ── Database (NeonDB PostgreSQL) ───────────────────────
+    DATABASE_URL: str = Field(
+        default="postgresql+asyncpg://user:password@hostname/dbname?ssl=require",
+        description="Database connection URL"
+    )
+    DB_ECHO: bool = False
+    # ── Security ───────────────────────────────────────────
+    SECRET_KEY: str = Field(default="change-in-production-super-secret-key", env="SECRET_KEY")
+    API_KEY: str = Field(default="surveillance-api-key-2024", env="API_KEY")
+    # ── Model Paths / Cache ───────────────────────────────
+    MODEL_CACHE_DIR: str = Field(default="./model_cache", env="HF_HOME")
+    # ── Vision Models ─────────────────────────────────────
+    DETECTION_MODEL: str = "facebook/detr-resnet-50"
+    REID_MODEL: str = "google/vit-base-patch16-224"
+    CLIP_MODEL: str = "openai/clip-vit-base-patch32"
+    DETECTION_CONFIDENCE: float = 0.7
+    DETECTION_BATCH_SIZE: int = 4
+    # ── NLP Models ────────────────────────────────────────
+    SEMANTIC_SEARCH_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
+    QA_MODEL: str = "deepset/roberta-base-squad2"
+    REPORT_MODEL: str = "google/flan-t5-base"
+    SUMMARIZER_MODEL: str = "facebook/bart-large-cnn"
+    # ── Audio Models (Optional) ───────────────────────────
+    WHISPER_MODEL: str = "openai/whisper-small"
+    AUDIO_CLASS_MODEL: str = "superb/wav2vec2-base-superb-ks"
+    TTS_MODEL: str = "microsoft/speecht5_tts"
+    ENABLE_AUDIO: bool = False
+    # ── FAISS ─────────────────────────────────────────────
+    FAISS_INDEX_PATH: str = "./faiss_indexes"
+    REID_EMBEDDING_DIM: int = 768   # ViT-base hidden size
+    CLIP_EMBEDDING_DIM: int = 512   # CLIP embedding size
+    SEARCH_EMBEDDING_DIM: int = 384  # MiniLM embedding size
+    FAISS_NPROBE: int = 10
+    # ── Tracking ──────────────────────────────────────────
+    TRACKER_TYPE: str = "bytetrack"   # "bytetrack" or "deepsort"
+    TRACK_THRESH: float = 0.5
+    TRACK_BUFFER: int = 30
+    MATCH_THRESH: float = 0.8
+    # ── WebSocket / Cameras ───────────────────────────────
+    MAX_CAMERAS: int = 16
+    FRAME_QUEUE_SIZE: int = 30
+    FPS_TARGET: int = 30
+    # ── Anomaly Detection ─────────────────────────────────
+    ANOMALY_THRESHOLD: float = 0.75
+    # ── CORS ──────────────────────────────────────────────
+    CORS_ORIGINS: list = ["http://localhost:3000", "http://127.0.0.1:3000"]
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+        case_sensitive = False
+settings = Settings()
+# ── Device Detection ──────────────────────────────────────────
+def get_device() -> torch.device:
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        logger.info(f"🚀 GPU detected: {torch.cuda.get_device_name(0)} | VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
+    else:
+        device = torch.device("cpu")
+        logger.warning("⚠️  No GPU detected — running on CPU. Performance may be degraded.")
+    return device
+DEVICE = get_device()
+# ── Paths ─────────────────────────────────────────────────────
+# Both directories are resolved relative to the directory containing this file
+# and are created at import time if missing.
+BASE_DIR = Path(__file__).resolve().parent
+FAISS_DIR = BASE_DIR / settings.FAISS_INDEX_PATH
+MODEL_CACHE = BASE_DIR / settings.MODEL_CACHE_DIR
+FAISS_DIR.mkdir(parents=True, exist_ok=True)
+MODEL_CACHE.mkdir(parents=True, exist_ok=True)
+# Set HuggingFace cache
+# NOTE(review): this overwrites any HF_HOME / TRANSFORMERS_CACHE already present
+# in the environment — presumably so later model downloads land in MODEL_CACHE.
+os.environ["HF_HOME"] = str(MODEL_CACHE)
+os.environ["TRANSFORMERS_CACHE"] = str(MODEL_CACHE)

database/__init__.py ADDED Viewed

File without changes

database/crud.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""
+database/crud.py - CRUD operations for Persons, Events, and Incident Reports
+"""
+import uuid
+from datetime import datetime
+from typing import List, Optional, Dict, Any
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy import select, update, and_, desc
+from database.models import Person, Event, IncidentReport, ActivityType
+from loguru import logger
+# ─── Person Operations ────────────────────────────────────────────────────────
+async def create_person(
+    db: AsyncSession,
+    faiss_id: Optional[int] = None,
+    attributes: Optional[Dict] = None,
+    thumbnail_path: Optional[str] = None,
+) -> Person:
+    person = Person(
+        faiss_id=faiss_id,
+        attributes=attributes or {},
+        thumbnail_path=thumbnail_path,
+        track_ids=[],
+    )
+    db.add(person)
+    await db.flush()
+    logger.info(f"Created person: {person.id}")
+    return person
+async def get_person(db: AsyncSession, person_id: uuid.UUID) -> Optional[Person]:
+    result = await db.execute(select(Person).where(Person.id == person_id))
+    return result.scalar_one_or_none()
+async def get_all_persons(db: AsyncSession, limit: int = 100, offset: int = 0) -> List[Person]:
+    result = await db.execute(
+        select(Person).order_by(desc(Person.last_seen)).limit(limit).offset(offset)
+    )
+    return result.scalars().all()
+async def update_person_last_seen(db: AsyncSession, person_id: uuid.UUID) -> None:
+    await db.execute(
+        update(Person).where(Person.id == person_id).values(last_seen=datetime.utcnow())
+    )
+async def update_person_faiss_id(db: AsyncSession, person_id: uuid.UUID, faiss_id: int) -> None:
+    await db.execute(
+        update(Person).where(Person.id == person_id).values(faiss_id=faiss_id)
+    )
+async def update_person_attributes(db: AsyncSession, person_id: uuid.UUID, attributes: Dict) -> None:
+    await db.execute(
+        update(Person).where(Person.id == person_id).values(attributes=attributes)
+    )
+# ─── Event Operations ─────────────────────────────────────────────────────────
+async def create_event(
+    db: AsyncSession,
+    person_id: uuid.UUID,
+    camera_id: str,
+    activity_type: ActivityType = ActivityType.DETECTED,
+    bounding_box: Optional[Dict] = None,
+    confidence: Optional[float] = None,
+    track_id: Optional[int] = None,
+    location_zone: Optional[str] = None,
+    anomaly_score: float = 0.0,
+    description: Optional[str] = None,
+    raw_metadata: Optional[Dict] = None,
+) -> Event:
+    event = Event(
+        person_id=person_id,
+        camera_id=camera_id,
+        activity_type=activity_type,
+        bounding_box=bounding_box,
+        confidence=confidence,
+        track_id=track_id,
+        location_zone=location_zone,
+        anomaly_score=anomaly_score,
+        description=description,
+        raw_metadata=raw_metadata or {},
+    )
+    db.add(event)
+    await db.flush()
+    return event
+async def get_events_for_person(
+    db: AsyncSession,
+    person_id: uuid.UUID,
+    limit: int = 50,
+) -> List[Event]:
+    result = await db.execute(
+        select(Event)
+        .where(Event.person_id == person_id)
+        .order_by(desc(Event.timestamp))
+        .limit(limit)
+    )
+    return result.scalars().all()
+async def get_recent_events(
+    db: AsyncSession,
+    camera_id: Optional[str] = None,
+    limit: int = 100,
+) -> List[Event]:
+    query = select(Event).order_by(desc(Event.timestamp)).limit(limit)
+    if camera_id:
+        query = query.where(Event.camera_id == camera_id)
+    result = await db.execute(query)
+    return result.scalars().all()
+async def get_anomaly_events(db: AsyncSession, threshold: float = 0.75, limit: int = 50) -> List[Event]:
+    result = await db.execute(
+        select(Event)
+        .where(Event.anomaly_score >= threshold)
+        .order_by(desc(Event.timestamp))
+        .limit(limit)
+    )
+    return result.scalars().all()
+# ─── Incident Report Operations ───────────────────────────────────────────────
+async def create_incident_report(
+    db: AsyncSession,
+    person_id: Optional[uuid.UUID],
+    report_text: str,
+    summary: Optional[str] = None,
+    severity: str = "medium",
+    camera_ids: Optional[List[str]] = None,
+) -> IncidentReport:
+    report = IncidentReport(
+        person_id=person_id,
+        report_text=report_text,
+        summary=summary,
+        severity=severity,
+        camera_ids=camera_ids or [],
+    )
+    db.add(report)
+    await db.flush()
+    return report
+async def get_report(db: AsyncSession, report_id: uuid.UUID) -> Optional[IncidentReport]:
+    result = await db.execute(select(IncidentReport).where(IncidentReport.report_id == report_id))
+    return result.scalar_one_or_none()
+async def get_reports_for_person(db: AsyncSession, person_id: uuid.UUID) -> List[IncidentReport]:
+    result = await db.execute(
+        select(IncidentReport)
+        .where(IncidentReport.person_id == person_id)
+        .order_by(desc(IncidentReport.generated_at))
+    )
+    return result.scalars().all()

database/models.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+database/models.py - SQLAlchemy ORM models for Surveillance System
+"""
+import uuid
+from datetime import datetime
+from sqlalchemy import Column, String, DateTime, Float, Text, Integer, ForeignKey, JSON, Enum
+from sqlalchemy.dialects.postgresql import UUID
+from sqlalchemy.orm import relationship
+from database.session import Base
+import enum
+class ActivityType(str, enum.Enum):
+    """Closed set of activity labels stored in ``Event.activity_type``.
+    Subclassing ``str`` makes each member compare equal to its string value.
+    """
+    DETECTED = "detected"
+    TRACKED = "tracked"
+    REID_MATCH = "reid_match"
+    ANOMALY = "anomaly"
+    LOITERING = "loitering"
+    RUNNING = "running"
+    FIGHTING = "fighting"
+    TRESPASSING = "trespassing"
+class AnalysisSession(Base):
+    """One row per analysed video file, with summary statistics of the run."""
+    __tablename__ = "analysis_sessions"
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, index=True)
+    # Naive UTC timestamp assigned by default when the row is created.
+    timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
+    video_filename = Column(String(255), nullable=False)
+    thumbnail_path = Column(String(500), nullable=True)
+    # Analysis Summary Stats
+    duration_sec = Column(Float, nullable=False, default=0.0)
+    frames_processed = Column(Integer, nullable=False, default=0)
+    unique_persons = Column(Integer, nullable=False, default=0)
+    peak_count = Column(Integer, nullable=False, default=0)
+    def __repr__(self):
+        return f"<AnalysisSession id={self.id} video={self.video_filename}>"
+class Person(Base):
+    """A tracked individual; owns its events (deleted together with the person)."""
+    __tablename__ = "persons"
+    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, index=True)
+    first_seen = Column(DateTime, default=datetime.utcnow, nullable=False)
+    # onupdate: refreshed whenever the row is updated through the ORM.
+    last_seen = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    # ReID embedding reference
+    # Unique when set; presumably the row index of this person's vector in the FAISS index.
+    faiss_id = Column(Integer, nullable=True, unique=True)
+    embedding_version = Column(String(50), default="vit-base-patch16-224")
+    # Attributes from CLIP
+    attributes = Column(JSON, nullable=True)  # {"clothing": [...], "colors": [...]}
+    thumbnail_path = Column(String(500), nullable=True)
+    track_ids = Column(JSON, default=list)  # list of all track IDs assigned cross-camera
+    # "all, delete-orphan": events are removed when detached from or deleted with the person.
+    events = relationship("Event", back_populates="person", cascade="all, delete-orphan")
+    def __repr__(self):
+        return f"<Person id={self.id} first_seen={self.first_seen}>"
+class Event(Base):
+    """A single observation of a person on a camera, with tracking and anomaly data."""
+    __tablename__ = "events"
+    event_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, index=True)
+    # Required link to the person; the row is removed by the database when the person is deleted.
+    person_id = Column(UUID(as_uuid=True), ForeignKey("persons.id", ondelete="CASCADE"), nullable=False, index=True)
+    camera_id = Column(String(64), nullable=False, index=True)
+    timestamp = Column(DateTime, default=datetime.utcnow, nullable=False, index=True)
+    activity_type = Column(Enum(ActivityType), default=ActivityType.DETECTED, nullable=False)
+    # Spatial & tracking info
+    bounding_box = Column(JSON, nullable=True)       # {"x1": f, "y1": f, "x2": f, "y2": f}
+    confidence = Column(Float, nullable=True)
+    track_id = Column(Integer, nullable=True)
+    location_zone = Column(String(128), nullable=True)
+    # Additional metadata
+    anomaly_score = Column(Float, default=0.0)
+    raw_metadata = Column(JSON, nullable=True)
+    description = Column(Text, nullable=True)        # NLP-generated description
+    person = relationship("Person", back_populates="events")
+    def __repr__(self):
+        return f"<Event {self.event_id} person={self.person_id} cam={self.camera_id} type={self.activity_type}>"
+class IncidentReport(Base):
+    """A generated incident report, optionally linked to a person."""
+    __tablename__ = "incident_reports"
+    report_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, index=True)
+    # Optional link: the database sets it to NULL when the person is deleted, keeping the report.
+    person_id = Column(UUID(as_uuid=True), ForeignKey("persons.id", ondelete="SET NULL"), nullable=True)
+    generated_at = Column(DateTime, default=datetime.utcnow)
+    report_text = Column(Text, nullable=False)
+    summary = Column(Text, nullable=True)
+    severity = Column(String(20), default="medium")  # low / medium / high / critical
+    camera_ids = Column(JSON, default=list)
+    model_version = Column(String(50), default="flan-t5-base")

database/session.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+database/session.py - Async SQLAlchemy session management for NeonDB PostgreSQL
+"""
+from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
+from sqlalchemy.orm import DeclarativeBase
+from config import settings
+from loguru import logger
+# Module-level async engine shared by the whole application.
+# pool_pre_ping checks a pooled connection before handing it out.
+# NOTE(review): SSL is requested here via connect_args, and the default
+# DATABASE_URL also carries "?ssl=require" — confirm both are needed.
+engine = create_async_engine(
+    settings.DATABASE_URL,
+    echo=settings.DB_ECHO,
+    pool_size=10,
+    max_overflow=20,
+    pool_pre_ping=True,
+    connect_args={"ssl": "require"},
+)
+# Session factory; expire_on_commit=False keeps loaded attributes readable after commit.
+AsyncSessionLocal = async_sessionmaker(
+    engine,
+    class_=AsyncSession,
+    expire_on_commit=False,
+)
+class Base(DeclarativeBase):
+    """Declarative base class that all ORM models in this project inherit from."""
+    pass
+async def get_db() -> AsyncSession:
+    """Yield a database session and finish its transaction afterwards.
+    This is an async generator: it yields one session, commits when the
+    consumer finishes without error, rolls back and re-raises on any
+    exception, and closes the session in every case.
+    """
+    async with AsyncSessionLocal() as session:
+        try:
+            yield session
+            # Reached only when the consumer raised nothing.
+            await session.commit()
+        except Exception:
+            await session.rollback()
+            raise
+        finally:
+            await session.close()
+async def create_tables():
+    """Create all tables on startup."""
+    async with engine.begin() as conn:
+        from database.models import Person, Event  # noqa: F401
+        await conn.run_sync(Base.metadata.create_all)
+    logger.info("✅ Database tables ensured.")

graph/__init__.py ADDED Viewed

File without changes

graph/movement_graph.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+graph/movement_graph.py - PyTorch Geometric Movement Graph & Anomaly Detection
+Builds a directed graph of person movements across cameras and zones.
+Detects abnormal route patterns using GNN-based anomaly scoring.
+"""
+import time
+import torch
+import numpy as np
+from typing import Dict, List, Optional, Tuple
+from collections import defaultdict
+from loguru import logger
+from config import DEVICE, settings
+# torch-geometric is optional: GEO_AVAILABLE records whether it could be
+# imported, and the rest of this module checks that flag before using it.
+try:
+    import torch_geometric
+    from torch_geometric.data import Data
+    from torch_geometric.nn import GCNConv, global_mean_pool
+    GEO_AVAILABLE = True
+except ImportError:
+    GEO_AVAILABLE = False
+    logger.warning("torch-geometric not installed. Movement graph module will use fallback.")
+# ── GNN Autoencoder for Anomaly Detection ──────────────────────────────────────
+class MovementGNN(torch.nn.Module):
+    """
+    Simple GCN autoencoder to encode node features (camera visit patterns).
+    Reconstruction error → anomaly score.
+    """
+    def __init__(self, in_channels: int = 8, hidden: int = 16, out_channels: int = 8):
+        super().__init__()
+        if GEO_AVAILABLE:
+            self.enc1 = GCNConv(in_channels, hidden)
+            self.enc2 = GCNConv(hidden, out_channels)
+            self.dec1 = GCNConv(out_channels, hidden)
+            self.dec2 = GCNConv(hidden, in_channels)
+        self.relu = torch.nn.ReLU()
+    def forward(self, x, edge_index):
+        # Encode
+        z = self.relu(self.enc1(x, edge_index))
+        z = self.enc2(z, edge_index)
+        # Decode
+        x_hat = self.relu(self.dec1(z, edge_index))
+        x_hat = self.dec2(x_hat, edge_index)
+        return x_hat, z
+# ── Movement Graph Builder ──────────────────────────────────────────────────────
+class MovementGraph:
+    """
+    Maintains a person-level movement graph.
+    Nodes = cameras/zones; Edges = observed transitions.
+    """
+    def __init__(self):
+        # person_id → list of (camera_id, timestamp, zone)
+        self.person_trails: Dict[str, List[Dict]] = defaultdict(list)
+        # camera graph: edge_weights[cam_a][cam_b] = count
+        self.edge_weights: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
+        self.camera_ids: List[str] = []
+        self.cam_index: Dict[str, int] = {}
+        if GEO_AVAILABLE:
+            self.gnn = MovementGNN().to(DEVICE)
+            self.gnn.eval()
+        else:
+            self.gnn = None
+        logger.info(f"MovementGraph initialized. PyG available: {GEO_AVAILABLE}")
+    def register_camera(self, camera_id: str):
+        if camera_id not in self.cam_index:
+            self.cam_index[camera_id] = len(self.camera_ids)
+            self.camera_ids.append(camera_id)
+    def add_observation(
+        self,
+        person_id: str,
+        camera_id: str,
+        timestamp: float,
+        zone: Optional[str] = None,
+    ):
+        """Record that a person was observed at camera/zone at timestamp."""
+        self.register_camera(camera_id)
+        trail = self.person_trails[person_id]
+        # Add transition edge if person has prior observation
+        if trail:
+            last_cam = trail[-1]["camera_id"]
+            if last_cam != camera_id:
+                self.edge_weights[last_cam][camera_id] += 1
+        trail.append({"camera_id": camera_id, "timestamp": timestamp, "zone": zone})
+        # Keep last 50 observations per person
+        if len(trail) > 50:
+            self.person_trails[person_id] = trail[-50:]
+    def _build_graph(self) -> Optional["Data"]:
+        """Convert current camera graph to PyG Data object."""
+        if not GEO_AVAILABLE or len(self.camera_ids) == 0:
+            return None
+        n = len(self.camera_ids)
+        # Node features: [visit_count_normalized, in_degree, out_degree, ...]
+        node_features = np.zeros((n, 8), dtype=np.float32)
+        edge_src, edge_dst = [], []
+        # Count visits per camera
+        cam_visits = defaultdict(int)
+        for trails in self.person_trails.values():
+            for obs in trails:
+                cam_visits[obs["camera_id"]] += 1
+        max_visits = max(cam_visits.values()) if cam_visits else 1
+        for cam, idx in self.cam_index.items():
+            node_features[idx, 0] = cam_visits[cam] / max_visits
+        # Build edges and compute in/out degree
+        for src_cam, dst_dict in self.edge_weights.items():
+            for dst_cam, weight in dst_dict.items():
+                si = self.cam_index.get(src_cam)
+                di = self.cam_index.get(dst_cam)
+                if si is not None and di is not None:
+                    edge_src.append(si)
+                    edge_dst.append(di)
+                    node_features[si, 1] += 1  # out-degree
+                    node_features[di, 2] += 1  # in-degree
+        if not edge_src:
+            # Add self-loops to avoid empty graph
+            edge_src = list(range(n))
+            edge_dst = list(range(n))
+        x = torch.tensor(node_features, dtype=torch.float32).to(DEVICE)
+        edge_index = torch.tensor([edge_src, edge_dst], dtype=torch.long).to(DEVICE)
+        return Data(x=x, edge_index=edge_index)
+    @torch.inference_mode()
+    def compute_anomaly_score(self, person_id: str) -> Dict:
+        """
+        Compute anomaly score for a person's movement trail.
+        Returns:
+            {"person_id": str, "anomaly_score": float, "route": list, "verdict": str}
+        """
+        trail = self.person_trails.get(person_id, [])
+        if len(trail) < 2:
+            return {"person_id": person_id, "anomaly_score": 0.0, "verdict": "insufficient_data", "route": []}
+        t0 = time.perf_counter()
+        # Heuristic features for pattern scoring
+        cameras = [obs["camera_id"] for obs in trail]
+        timestamps = [obs["timestamp"] for obs in trail]
+        unique_cams = len(set(cameras))
+        total_obs = len(cameras)
+        # Time between observations
+        gaps = np.diff(timestamps)
+        avg_gap = float(np.mean(gaps)) if len(gaps) > 0 else 0
+        max_gap = float(np.max(gaps)) if len(gaps) > 0 else 0
+        # Suspicious patterns:
+        # 1. Too many unique cameras in short time → rapid movement
+        # 2. Very short dwell time per camera → running/fleeing behavior
+        # 3. Visiting same camera repeatedly in short time → loitering
+        rapid_movement = unique_cams / max(total_obs, 1) > 0.8
+        loitering = cameras.count(cameras[-1]) / total_obs > 0.6 if cameras else False
+        fast_dwell = avg_gap < 10 and unique_cams > 3  # under 10s per camera
+        heuristic_score = 0.0
+        if rapid_movement:
+            heuristic_score += 0.4
+        if loitering:
+            heuristic_score += 0.3
+        if fast_dwell:
+            heuristic_score += 0.3
+        # GNN-based score (if available)
+        gnn_score = 0.0
+        if GEO_AVAILABLE and self.gnn is not None:
+            graph = self._build_graph()
+            if graph is not None and graph.num_nodes > 0:
+                x_hat, _ = self.gnn(graph.x, graph.edge_index)
+                reconstruction_error = float(torch.mean((graph.x - x_hat) ** 2))
+                gnn_score = min(reconstruction_error * 5, 1.0)
+        # Combined score
+        anomaly_score = round(0.5 * heuristic_score + 0.5 * gnn_score, 4)
+        anomaly_score = min(anomaly_score, 1.0)
+        latency = (time.perf_counter() - t0) * 1000
+        if anomaly_score > settings.ANOMALY_THRESHOLD:
+            verdict = "anomalous"
+        elif anomaly_score > 0.4:
+            verdict = "suspicious"
+        else:
+            verdict = "normal"
+        return {
+            "person_id": person_id,
+            "anomaly_score": anomaly_score,
+            "verdict": verdict,
+            "route": [{"camera_id": obs["camera_id"], "timestamp": obs["timestamp"]} for obs in trail[-10:]],
+            "unique_cameras": unique_cams,
+            "total_observations": total_obs,
+            "avg_dwell_seconds": round(avg_gap, 2),
+            "flags": {
+                "rapid_movement": rapid_movement,
+                "loitering": loitering,
+                "fast_dwell": fast_dwell,
+            },
+            "latency_ms": round(latency, 2),
+        }
+    def get_all_anomalies(self, threshold: float = 0.75) -> List[Dict]:
+        """Compute anomaly scores for all tracked persons."""
+        results = []
+        for pid in self.person_trails:
+            score_data = self.compute_anomaly_score(pid)
+            if score_data["anomaly_score"] >= threshold:
+                results.append(score_data)
+        return sorted(results, key=lambda x: -x["anomaly_score"])
+    def get_movement_summary(self) -> Dict:
+        return {
+            "total_persons_tracked": len(self.person_trails),
+            "total_cameras": len(self.camera_ids),
+            "cameras": self.camera_ids,
+            "edge_count": sum(len(v) for v in self.edge_weights.values()),
+        }

nlp/__init__.py ADDED Viewed

File without changes

nlp/qa.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+nlp/qa.py - Question Answering over Surveillance Logs using deepset/roberta-base-squad2
+"""
+import time
+from typing import Optional, Dict, List
+from transformers import pipeline, Pipeline
+from loguru import logger
+from config import settings, DEVICE
+class SurveillanceQA:
+    """
+    Extractive QA system. Given a question and a context built from
+    surveillance logs/events, extracts the most relevant answer span.
+    """
+    def __init__(self):
+        logger.info(f"Loading QA model: {settings.QA_MODEL}")
+        device_id = 0 if str(DEVICE) == "cuda" else -1
+        self.qa_pipeline: Pipeline = pipeline(
+            "question-answering",
+            model=settings.QA_MODEL,
+            tokenizer=settings.QA_MODEL,
+            device=device_id,
+        )
+        logger.info("✅ SurveillanceQA ready.")
+    def _build_context(self, events: List[Dict]) -> str:
+        """Build a natural language context string from event records."""
+        lines = []
+        for e in events:
+            ts = e.get("timestamp", "unknown time")
+            cam = e.get("camera_id", "unknown camera")
+            activity = e.get("activity_type", "detected")
+            person_id = str(e.get("person_id", "unknown"))[:8]
+            attrs = e.get("attributes", {})
+            desc = e.get("description", "")
+            attr_str = ""
+            if attrs:
+                gender = attrs.get("gender", "")
+                color = attrs.get("color", "")
+                clothing = ", ".join([c.get("label", "") for c in attrs.get("clothing", [])[:2]])
+                attr_str = f"({gender}, {color} clothing, {clothing})"
+            line = f"At {ts}, camera {cam} detected person {person_id} {attr_str} with activity: {activity}."
+            if desc:
+                line += f" {desc}"
+            lines.append(line)
+        return " ".join(lines)
+    def answer(
+        self,
+        question: str,
+        events: Optional[List[Dict]] = None,
+        context: Optional[str] = None,
+        top_k: int = 3,
+    ) -> Dict:
+        """
+        Answer a natural language question about surveillance data.
+        Args:
+            question: User's question
+            events: List of event dicts (auto-builds context)
+            context: Pre-built context string
+            top_k: Number of answer candidates
+        Returns:
+            {"answer": str, "score": float, "start": int, "end": int, "context": str, "latency_ms": float}
+        """
+        if context is None:
+            if not events:
+                return {"answer": "No surveillance data available to answer from.", "score": 0.0}
+            context = self._build_context(events)
+        if not context.strip():
+            return {"answer": "No context available.", "score": 0.0}
+        # Truncate context to model max (512 tokens ≈ ~2000 chars)
+        context = context[:4000]
+        t0 = time.perf_counter()
+        result = self.qa_pipeline(
+            question=question,
+            context=context,
+            top_k=top_k,
+            handle_impossible_answer=True,
+        )
+        latency_ms = (time.perf_counter() - t0) * 1000
+        if isinstance(result, list):
+            best = result[0]
+        else:
+            best = result
+        logger.debug(f"QA answered '{question[:50]}' in {latency_ms:.1f}ms | score={best.get('score', 0):.3f}")
+        return {
+            "answer": best.get("answer", ""),
+            "score": round(best.get("score", 0.0), 4),
+            "start": best.get("start", 0),
+            "end": best.get("end", 0),
+            "context_used": context[:500] + "..." if len(context) > 500 else context,
+            "latency_ms": round(latency_ms, 2),
+            "all_answers": result if isinstance(result, list) else [result],
+        }

nlp/report.py ADDED Viewed

	@@ -0,0 +1,114 @@

+"""
+nlp/report.py - Incident Report Generation using google/flan-t5-base
+"""
+import time
+from typing import Dict, List, Optional
+from transformers import pipeline, Pipeline
+from loguru import logger
+from config import settings, DEVICE
+REPORT_PROMPT_TEMPLATE = """Generate a structured surveillance incident report based on the following events:
+Events:
+{events_summary}
+Format the report as:
+INCIDENT REPORT
+Date/Time: [datetime]
+Cameras Involved: [cameras]
+Subject Description: [physical description]
+Activity Observed: [description of events]
+Anomaly Level: [low/medium/high]
+Recommended Action: [action]
+"""
+class IncidentReportGenerator:
+    """
+    Generates structured incident reports from surveillance events using Flan-T5.
+    """
+    def __init__(self):
+        logger.info(f"Loading report generation model: {settings.REPORT_MODEL}")
+        device_id = 0 if str(DEVICE) == "cuda" else -1
+        self.generator: Pipeline = pipeline(
+            "text2text-generation",
+            model=settings.REPORT_MODEL,
+            tokenizer=settings.REPORT_MODEL,
+            device=device_id,
+        )
+        logger.info("✅ IncidentReportGenerator ready.")
+    def _format_events(self, events: List[Dict]) -> str:
+        """Format events list into a readable string for the prompt."""
+        lines = []
+        for i, e in enumerate(events, 1):
+            ts = e.get("timestamp", "unknown time")
+            cam = e.get("camera_id", "unknown camera")
+            activity = e.get("activity_type", "detected")
+            attrs = e.get("attributes", {})
+            gender = attrs.get("gender", "") if attrs else ""
+            color = attrs.get("color", "") if attrs else ""
+            anomaly_score = e.get("anomaly_score", 0.0)
+            lines.append(
+                f"{i}. [{ts}] Camera {cam}: {gender} person in {color} clothing, "
+                f"activity={activity}, anomaly_score={anomaly_score:.2f}"
+            )
+        return "\n".join(lines)
+    def generate(
+        self,
+        events: List[Dict],
+        person_id: Optional[str] = None,
+        max_length: int = 512,
+        severity_hint: Optional[str] = None,
+    ) -> Dict:
+        """
+        Generate a structured incident report from a list of event records.
+        Returns:
+            {"report_text": str, "severity": str, "latency_ms": float}
+        """
+        if not events:
+            return {"report_text": "No events provided for report generation.", "severity": "low"}
+        events_summary = self._format_events(events[:20])  # limit for token budget
+        prompt = REPORT_PROMPT_TEMPLATE.format(events_summary=events_summary)
+        if person_id:
+            prompt = f"Person ID: {person_id[:8]}\n" + prompt
+        t0 = time.perf_counter()
+        outputs = self.generator(
+            prompt,
+            max_new_tokens=max_length,
+            num_beams=4,
+            early_stopping=True,
+            no_repeat_ngram_size=3,
+        )
+        latency_ms = (time.perf_counter() - t0) * 1000
+        report_text = outputs[0]["generated_text"]
+        # Determine severity from anomaly scores
+        scores = [e.get("anomaly_score", 0.0) for e in events]
+        avg_anomaly = sum(scores) / max(len(scores), 1)
+        if avg_anomaly > 0.8:
+            severity = "critical"
+        elif avg_anomaly > 0.6:
+            severity = "high"
+        elif avg_anomaly > 0.3:
+            severity = "medium"
+        else:
+            severity = "low"
+        severity = severity_hint or severity
+        logger.info(f"Report generated in {latency_ms:.1f}ms | severity={severity}")
+        return {
+            "report_text": report_text,
+            "severity": severity,
+            "event_count": len(events),
+            "avg_anomaly_score": round(avg_anomaly, 4),
+            "latency_ms": round(latency_ms, 2),
+            "person_id": person_id,
+        }

nlp/search.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+nlp/search.py - Semantic Search using sentence-transformers/all-MiniLM-L6-v2 + FAISS
+Embeds natural language queries and matches against stored surveillance metadata.
+"""
+import os
+import time
+import numpy as np
+import faiss
+import torch
+from typing import List, Dict, Optional
+from sentence_transformers import SentenceTransformer
+from loguru import logger
+from config import settings, DEVICE, FAISS_DIR
+class SemanticSearchEngine:
+    """
+    Encodes surveillance metadata (event descriptions, attributes) into
+    sentence embeddings stored in FAISS. Supports natural-language querying.
+    """
+    INDEX_FILE = str(FAISS_DIR / "search_index.faiss")
+    META_FILE = str(FAISS_DIR / "search_meta.npy")
+    def __init__(self):
+        logger.info(f"Loading semantic search model: {settings.SEMANTIC_SEARCH_MODEL}")
+        self.model = SentenceTransformer(settings.SEMANTIC_SEARCH_MODEL, device=str(DEVICE))
+        self.dim = settings.SEARCH_EMBEDDING_DIM
+        self.index = self._load_or_create_index()
+        self.meta: List[Dict] = self._load_meta()
+        logger.info(f"✅ SemanticSearchEngine ready. Index size: {self.index.ntotal}")
+    def _load_or_create_index(self) -> faiss.IndexFlatIP:
+        if os.path.exists(self.INDEX_FILE):
+            logger.info("Loading existing FAISS search index.")
+            return faiss.read_index(self.INDEX_FILE)
+        return faiss.IndexFlatIP(self.dim)
+    def _load_meta(self) -> List[Dict]:
+        if os.path.exists(self.META_FILE):
+            return list(np.load(self.META_FILE, allow_pickle=True))
+        return []
+    def save(self):
+        faiss.write_index(self.index, self.INDEX_FILE)
+        np.save(self.META_FILE, np.array(self.meta, dtype=object))
+    def encode(self, texts: List[str]) -> np.ndarray:
+        """Encode texts to L2-normalized embeddings (batch)."""
+        embeddings = self.model.encode(
+            texts,
+            batch_size=32,
+            normalize_embeddings=True,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+        )
+        return embeddings.astype(np.float32)
+    def index_event(self, text: str, metadata: Dict) -> int:
+        """
+        Add a single surveillance event description to the FAISS search index.
+        Args:
+            text: Natural language description of the event
+            metadata: {"event_id", "person_id", "camera_id", "timestamp", "activity_type", ...}
+        Returns:
+            faiss_id (row index)
+        """
+        embedding = self.encode([text])
+        faiss_id = self.index.ntotal
+        self.index.add(embedding)
+        self.meta.append({**metadata, "text": text, "faiss_id": faiss_id})
+        self.save()
+        return faiss_id
+    def index_batch(self, texts: List[str], metadatas: List[Dict]):
+        """Batch indexing for bulk ingestion."""
+        embeddings = self.encode(texts)
+        base_id = self.index.ntotal
+        self.index.add(embeddings)
+        for i, (text, meta) in enumerate(zip(texts, metadatas)):
+            self.meta.append({**meta, "text": text, "faiss_id": base_id + i})
+        self.save()
+        logger.info(f"Indexed {len(texts)} events into search index.")
+    def search(self, query: str, top_k: int = 10, score_threshold: float = 0.4) -> List[Dict]:
+        """
+        Search surveillance logs by natural language query.
+        Returns:
+            List of {"text": str, "score": float, ...metadata fields}
+        """
+        if self.index.ntotal == 0:
+            return []
+        t0 = time.perf_counter()
+        query_emb = self.encode([query])
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(query_emb, k)
+        latency = (time.perf_counter() - t0) * 1000
+        results = []
+        for dist, idx in zip(distances[0], indices[0]):
+            if idx == -1 or float(dist) < score_threshold:
+                continue
+            entry = dict(self.meta[idx])
+            entry["score"] = round(float(dist), 4)
+            results.append(entry)
+        logger.debug(f"Semantic search '{query[:40]}...' → {len(results)} results in {latency:.1f}ms")
+        return sorted(results, key=lambda x: -x["score"])

nlp/summarizer.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+nlp/summarizer.py - Surveillance log summarization using facebook/bart-large-cnn
+"""
+import time
+from typing import List, Dict
+from transformers import pipeline, Pipeline
+from loguru import logger
+from config import settings, DEVICE
+class SurveillanceSummarizer:
+    """Abstractive summarization of surveillance event logs using BART."""
+    def __init__(self):
+        logger.info(f"Loading summarization model: {settings.SUMMARIZER_MODEL}")
+        device_id = 0 if str(DEVICE) == "cuda" else -1
+        self.summarizer: Pipeline = pipeline(
+            "summarization",
+            model=settings.SUMMARIZER_MODEL,
+            tokenizer=settings.SUMMARIZER_MODEL,
+            device=device_id,
+        )
+        logger.info("✅ SurveillanceSummarizer ready.")
+    def _events_to_text(self, events: List[Dict]) -> str:
+        parts = []
+        for e in events:
+            ts = e.get("timestamp", "")
+            cam = e.get("camera_id", "")
+            activity = e.get("activity_type", "")
+            anomaly = e.get("anomaly_score", 0.0)
+            attrs = e.get("attributes", {})
+            gender = attrs.get("gender", "") if attrs else ""
+            color = attrs.get("color", "") if attrs else ""
+            parts.append(
+                f"Camera {cam} at {ts}: {gender} person in {color} clothing observed {activity} "
+                f"with anomaly score {anomaly:.2f}."
+            )
+        return " ".join(parts)
+    def summarize(
+        self,
+        events: List[Dict],
+        min_length: int = 30,
+        max_length: int = 200,
+    ) -> Dict:
+        """Summarize a list of surveillance events."""
+        if not events:
+            return {"summary": "No events to summarize.", "latency_ms": 0}
+        text = self._events_to_text(events[:30])
+        # BART max input is ~1024 tokens
+        text = text[:3000]
+        t0 = time.perf_counter()
+        result = self.summarizer(
+            text,
+            min_length=min_length,
+            max_length=max_length,
+            do_sample=False,
+        )
+        latency_ms = (time.perf_counter() - t0) * 1000
+        summary = result[0]["summary_text"]
+        logger.debug(f"Summarized {len(events)} events in {latency_ms:.1f}ms")
+        return {
+            "summary": summary,
+            "event_count": len(events),
+            "latency_ms": round(latency_ms, 2),
+        }
+    def summarize_text(self, text: str, min_length: int = 30, max_length: int = 150) -> str:
+        """Summarize arbitrary text string."""
+        text = text[:3000]
+        result = self.summarizer(text, min_length=min_length, max_length=max_length, do_sample=False)
+        return result[0]["summary_text"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,96 @@

+# ============================================================
+# Multimodal Surveillance Intelligence System - Requirements
+# Python 3.10+
+# ============================================================
+# --- Core API Framework ---
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+python-multipart==0.0.9
+websockets==12.0
+httpx==0.27.2
+pydantic==2.9.2
+pydantic-settings==2.5.2
+# --- Database ---
+asyncpg==0.29.0
+sqlalchemy[asyncio]==2.0.35
+alembic==1.13.3
+psycopg2-binary==2.9.9
+# --- PyTorch (install separately for CUDA - see README) ---
+# torch==2.4.1+cu124
+# torchvision==0.19.1+cu124
+# torchaudio==2.4.1+cu124
+# --- HuggingFace Ecosystem ---
+transformers==4.45.0
+accelerate==0.34.2
+datasets==3.0.1
+tokenizers==0.20.0
+safetensors==0.4.5
+huggingface-hub==0.25.1
+sentence-transformers==3.1.1
+# --- Vision & Computer Vision ---
+Pillow==10.4.0
+opencv-python==4.10.0.84
+numpy==1.26.4
+scipy==1.14.1
+scikit-image==0.24.0
+torchvision
+timm
+# --- FAISS (install separately for GPU - see README) ---
+# faiss-gpu  -> pip install faiss-gpu  (CUDA)
+# faiss-cpu  -> pip install faiss-cpu  (CPU only)
+# --- Object Detection & Tracking ---
+# ByteTrack / DeepSORT dependencies
+filterpy==1.4.5
+lapx==0.5.9.post1
+# --- PyTorch Geometric (Graph Module) ---
+# Install separately:
+# pip install torch-geometric
+# pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.4.1+cu124.html
+torch-geometric==2.6.1
+# --- Audio Processing ---
+librosa==0.10.2
+soundfile==0.12.1
+pyaudio==0.2.14
+torchaudio
+openai-whisper
+# --- NLP & Text Processing ---
+nltk==3.9.1
+spacy==3.8.2
+tiktoken==0.7.0
+# --- Logging & Monitoring ---
+loguru==0.7.2
+prometheus-client==0.21.0
+rich==13.8.1
+# --- Utilities ---
+python-dotenv==1.0.1
+aiofiles==24.1.0
+anyio==4.6.2
+tenacity==9.0.0
+cachetools==5.5.0
+click==8.1.7
+tqdm==4.66.5
+psutil==6.0.0
+GPUtil==1.4.0
+PyYAML==6.0.2
+# --- Testing & Benchmarks ---
+pytest==8.3.3
+pytest-asyncio==0.24.0
+locust==2.31.8
+# --- Image & Annotation ---
+matplotlib==3.9.2
+seaborn==0.13.2

routes/__init__.py ADDED Viewed

File without changes

routes/nlp_routes.py ADDED Viewed

	@@ -0,0 +1,235 @@

+"""
+routes/nlp_routes.py - NLP API endpoints:
+  GET  /search              (semantic search over surveillance logs)
+  POST /ask                 (QA over logs)
+  GET  /report/{person_id}  (generate incident report)
+  GET  /anomalies           (get anomalous persons)
+  GET  /persons             (list tracked persons)
+"""
+import uuid
+from typing import Optional, List
+from fastapi import APIRouter, Depends, HTTPException, Query
+from pydantic import BaseModel
+from loguru import logger
+from sqlalchemy.ext.asyncio import AsyncSession
+from database.session import get_db
+from database import crud
+router = APIRouter(prefix="/api/v1", tags=["NLP & Intelligence"])
+# ── Dependency helpers ─────────────────────────────────────────────────────────
+def get_search_engine():
+    """FastAPI dependency: return the `search_engine` object defined in the `app` module."""
+    # Imported at call time, presumably to avoid a circular import with `app` — confirm.
+    from app import search_engine
+    return search_engine
+def get_qa_system():
+    """FastAPI dependency: return the `qa_system` object defined in the `app` module."""
+    # Imported at call time, presumably to avoid a circular import with `app` — confirm.
+    from app import qa_system
+    return qa_system
+def get_report_generator():
+    """FastAPI dependency: return the `report_generator` object defined in the `app` module."""
+    # Imported at call time, presumably to avoid a circular import with `app` — confirm.
+    from app import report_generator
+    return report_generator
+def get_summarizer():
+    """FastAPI dependency: return the `summarizer` object defined in the `app` module."""
+    # Imported at call time, presumably to avoid a circular import with `app` — confirm.
+    from app import summarizer
+    return summarizer
+def get_movement_graph():
+    """FastAPI dependency: return the `movement_graph` object defined in the `app` module."""
+    # Imported at call time, presumably to avoid a circular import with `app` — confirm.
+    from app import movement_graph
+    return movement_graph
+# ── Request models ──────────────────────────────────────────────────────────────
+# Body schema for a semantic search request.
+# NOTE(review): no route in this file references this model; GET /search takes
+# the same three values as query parameters instead.
+class SearchRequest(BaseModel):
+    query: str  # natural language search text
+    top_k: int = 10  # maximum number of hits
+    score_threshold: float = 0.35  # minimum score for a hit
+# Body schema for POST /ask.
+class AskRequest(BaseModel):
+    question: str  # question forwarded to the QA system
+    camera_id: Optional[str] = None  # filters recent events when no person_id is given
+    person_id: Optional[str] = None  # UUID string; selects that person's events
+    context: Optional[str] = None  # when non-empty, used as-is and no events are fetched
+# ── GET /search ───────────────────────────────────────────────────────────────
+@router.get("/search", summary="Semantic search over surveillance logs")
+async def semantic_search(
+    q: str = Query(..., description="Natural language search query"),
+    top_k: int = Query(10, ge=1, le=50),
+    score_threshold: float = Query(0.35, ge=0.0, le=1.0),
+    search_engine=Depends(get_search_engine),
+):
+    """
+    Search across all indexed surveillance events using natural language.
+    Example: "person in red jacket near camera 3"
+    """
+    if not q.strip():
+        raise HTTPException(status_code=400, detail="Query cannot be empty")
+    results = search_engine.search(query=q, top_k=top_k, score_threshold=score_threshold)
+    return {
+        "query": q,
+        "results": results,
+        "total": len(results),
+    }
+# ── POST /ask ─────────────────────────────────────────────────────────────────
+@router.post("/ask", summary="Ask a natural language question about surveillance data")
+async def ask_question(
+    request: AskRequest,
+    db: AsyncSession = Depends(get_db),
+    qa=Depends(get_qa_system),
+):
+    """
+    Extractive QA over recent surveillance events.
+    Example: "Which camera last detected person 123abc?"
+    """
+    events = []
+    if request.context:
+        # Use provided context directly
+        result = qa.answer(question=request.question, context=request.context)
+    else:
+        # Fetch recent events from DB
+        if request.person_id:
+            try:
+                pid = uuid.UUID(request.person_id)
+            except ValueError:
+                raise HTTPException(status_code=400, detail="Invalid person_id UUID")
+            db_events = await crud.get_events_for_person(db, pid, limit=30)
+        else:
+            db_events = await crud.get_recent_events(db, camera_id=request.camera_id, limit=50)
+        # Convert to dicts
+        events = [
+            {
+                "event_id": str(e.event_id),
+                "person_id": str(e.person_id),
+                "camera_id": e.camera_id,
+                "timestamp": str(e.timestamp),
+                "activity_type": e.activity_type.value,
+                "anomaly_score": e.anomaly_score or 0.0,
+                "bounding_box": e.bounding_box or {},
+            }
+            for e in db_events
+        ]
+        result = qa.answer(question=request.question, events=events)
+    return {
+        "question": request.question,
+        **result,
+        "events_used": len(events),
+    }
+# ── GET /report/{person_id} ───────────────────────────────────────────────────
+@router.get("/report/{person_id}", summary="Generate and store an incident report for a person")
+async def generate_report(
+    person_id: str,
+    db: AsyncSession = Depends(get_db),
+    report_gen=Depends(get_report_generator),
+    _summarizer=Depends(get_summarizer),
+):
+    """Generate a structured incident report for a tracked person."""
+    try:
+        pid = uuid.UUID(person_id)
+    except ValueError:
+        raise HTTPException(status_code=400, detail="Invalid person_id UUID")
+    person = await crud.get_person(db, pid)
+    if not person:
+        raise HTTPException(status_code=404, detail=f"Person {person_id} not found")
+    events = await crud.get_events_for_person(db, pid, limit=30)
+    if not events:
+        raise HTTPException(status_code=404, detail="No events found for this person")
+    events_dicts = [
+        {
+            "event_id": str(e.event_id),
+            "camera_id": e.camera_id,
+            "timestamp": str(e.timestamp),
+            "activity_type": e.activity_type.value,
+            "anomaly_score": e.anomaly_score or 0.0,
+            "attributes": person.attributes or {},
+        }
+        for e in events
+    ]
+    # Generate report
+    report_result = report_gen.generate(events=events_dicts, person_id=person_id)
+    summary_result = _summarizer.summarize(events=events_dicts)
+    # Store report in DB
+    camera_ids = list({e["camera_id"] for e in events_dicts})
+    db_report = await crud.create_incident_report(
+        db,
+        person_id=pid,
+        report_text=report_result["report_text"],
+        summary=summary_result["summary"],
+        severity=report_result.get("severity", "medium"),
+        camera_ids=camera_ids,
+    )
+    return {
+        "report_id": str(db_report.report_id),
+        "person_id": person_id,
+        "generated_at": str(db_report.generated_at),
+        "severity": db_report.severity,
+        "summary": summary_result["summary"],
+        "report_text": report_result["report_text"],
+        "cameras_involved": camera_ids,
+        "event_count": len(events),
+        "latency_ms": report_result.get("latency_ms"),
+    }
+# ── GET /anomalies ────────────────────────────────────────────────────────────
+@router.get("/anomalies", summary="Get persons with anomalous movement patterns")
+async def get_anomalies(
+    threshold: float = Query(0.75, ge=0.0, le=1.0),
+    graph=Depends(get_movement_graph),
+):
+    anomalies = graph.get_all_anomalies(threshold=threshold)
+    return {
+        "threshold": threshold,
+        "anomalous_persons": anomalies,
+        "count": len(anomalies),
+    }
+# ── GET /persons ───────────────────────────────────────────────────────────────
+@router.get("/persons", summary="List all tracked persons")
+async def list_persons(
+    limit: int = Query(50, ge=1, le=200),
+    offset: int = Query(0, ge=0),
+    db: AsyncSession = Depends(get_db),
+):
+    persons = await crud.get_all_persons(db, limit=limit, offset=offset)
+    return {
+        "persons": [
+            {
+                "id": str(p.id),
+                "first_seen": str(p.first_seen),
+                "last_seen": str(p.last_seen),
+                "faiss_id": p.faiss_id,
+                "attributes": p.attributes,
+            }
+            for p in persons
+        ],
+        "total": len(persons),
+    }

routes/stream_routes.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from fastapi import APIRouter, Request
+from fastapi.responses import StreamingResponse
+import asyncio
+from typing import List
+from pydantic import BaseModel
+from loguru import logger
+from vision.stream_manager import stream_manager
+# All stream endpoints in this module are mounted under /api/v1/streams.
+router = APIRouter(prefix="/api/v1/streams", tags=["Streams"])
+class StreamCreateRequest(BaseModel):
+    """Request body for ``POST /api/v1/streams/add``."""
+    # Identifier the stream is registered under in ``stream_manager``.
+    camera_id: str
+    # Passed unchanged to ``stream_manager.add_stream`` as the capture source.
+    source: str  # e.g. '0', 'rtsp://...', 'http://...'
+@router.post("/add")
+async def add_stream(request: StreamCreateRequest):
+    """Start a new background stream"""
+    stream_manager.add_stream(request.camera_id, request.source)
+    return {"status": "added", "camera_id": request.camera_id}
+@router.delete("/{camera_id}")
+async def remove_stream(camera_id: str):
+    """Stop a background stream"""
+    stream_manager.remove_stream(camera_id)
+    return {"status": "removed", "camera_id": camera_id}
+@router.get("/")
+async def list_streams():
+    """Get active streams"""
+    return {"streams": list(stream_manager.streams.keys())}
+@router.get("/results")
+async def stream_results():
+    """Get the latest inference results for all active streams"""
+    return stream_manager.results
+@router.get("/feed/{camera_id}")
+async def video_feed(camera_id: str, request: Request):
+    """
+    Multipart MJPEG video feed for `<img>` tags.
+    """
+    # If a feed is requested but not added, try a local fallback
+    if camera_id not in stream_manager.streams:
+        # Fallback to local webcam '0' for demo purposes
+        stream_manager.add_stream(camera_id, "0")
+    async def frame_generator():
+        while True:
+            if await request.is_disconnected():
+                logger.info(f"Client disconnected from feed {camera_id}")
+                break
+            frame = stream_manager.get_frame(camera_id)
+            if frame is None:
+                await asyncio.sleep(0.1)
+                continue
+            # Yield multipart boundary + image bytes
+            yield (b'--frame\r\n'
+                   b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n')
+            # Target ~30fps
+            await asyncio.sleep(0.033)
+    return StreamingResponse(
+        frame_generator(),
+        media_type="multipart/x-mixed-replace; boundary=frame"
+    )

routes/vision_routes.py ADDED Viewed

	@@ -0,0 +1,421 @@

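+"""
+routes/vision_routes.py - Vision API endpoints:
+  POST /process-frame         (single frame inference, JSON body)
+  POST /process-frame/upload  (single frame inference, multipart upload)
+  POST /analyze-video         (video upload, per-frame results streamed as SSE)
+  GET  /sessions              (list stored video analysis sessions)
+  GET  /cameras               (list active cameras)
+  WS   /live-feed             (WebSocket stream)
+"""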
+import asyncio
+import base64
+import json
+import os
+import tempfile
+import time
+import uuid
+from io import BytesIO
+from typing import Optional
+from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
+from fastapi.responses import JSONResponse, StreamingResponse
+from pydantic import BaseModel
+from PIL import Image
+from loguru import logger
+from sqlalchemy.ext.asyncio import AsyncSession
+from database.session import get_db, AsyncSessionLocal
+from database import crud
+from database.models import ActivityType, AnalysisSession
+# All vision endpoints in this module are mounted under /api/v1.
+router = APIRouter(prefix="/api/v1", tags=["Vision"])
+# ── Request / Response Models ─────────────────────────────────────────────────
+class FrameProcessRequest(BaseModel):
+    """JSON body accepted by ``POST /process-frame``."""
+    # Camera the frame belongs to; forwarded to the pipeline and stored on created events.
+    camera_id: str
+    image_b64: str              # base64-encoded JPEG/PNG
+    # Forwarded to ``pipeline.process_frame`` as ``run_attributes``.
+    run_attributes: bool = True
+    # Forwarded to ``pipeline.process_frame`` as ``run_reid``.
+    run_reid: bool = True
+    # Forwarded to ``pipeline.process_frame`` as ``reid_threshold``.
+    reid_threshold: float = 0.85
+class FrameProcessResponse(BaseModel):
+    """Declared response schema for ``POST /process-frame``."""
+    # NOTE(review): the handler returns a JSONResponse built from the pipeline result,
+    # so these fields presumably mirror that dict's keys - confirm against the pipeline.
+    camera_id: str
+    frame_id: int
+    person_count: int
+    persons: list
+    latency: dict
+    fps: float
+    timestamp: float
+# ── Dependency: get vision pipeline singleton ──────────────────────────────────
+def get_pipeline():
+    """FastAPI dependency returning the ``vision_pipeline`` object defined in ``app``."""
+    # Imported at call time rather than module import time - presumably to avoid a
+    # circular import between ``app`` and this routes module; confirm.
+    from app import vision_pipeline
+    return vision_pipeline
+def get_movement_graph():
+    """FastAPI dependency returning the ``movement_graph`` object defined in ``app``."""
+    # Imported at call time rather than module import time - presumably to avoid a
+    # circular import between ``app`` and this routes module; confirm.
+    from app import movement_graph
+    return movement_graph
+# ── POST /process-frame ───────────────────────────────────────────────────────
+@router.post("/process-frame", response_model=FrameProcessResponse, summary="Process a single camera frame")
+async def process_frame(
+    request: FrameProcessRequest,
+    db: AsyncSession = Depends(get_db),
+    pipeline=Depends(get_pipeline),
+    graph=Depends(get_movement_graph),
+):
+    """
+    Submit a single camera frame for full vision pipeline processing.
+    Returns detected persons with tracking IDs, ReID matches, and attributes.
+    """
+    try:
+        result = pipeline.process_frame(
+            image_input=request.image_b64,
+            camera_id=request.camera_id,
+            run_attributes=request.run_attributes,
+            run_reid=request.run_reid,
+            reid_threshold=request.reid_threshold,
+        )
+    except Exception as e:
+        logger.error(f"Vision pipeline error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+    # Persist events to database
+    for person in result.get("persons", []):
+        person_id_str = person.get("assigned_person_id")
+        if not person_id_str:
+            continue
+        try:
+            person_uuid = uuid.UUID(person_id_str)
+        except ValueError:
+            continue
+        # Upsert person record
+        db_person = await crud.get_person(db, person_uuid)
+        if db_person is None and person.get("is_new_person"):
+            db_person = await crud.create_person(
+                db,
+                faiss_id=person.get("faiss_id"),
+                attributes=person.get("attributes"),
+            )
+        if db_person:
+            await crud.update_person_last_seen(db, db_person.id)
+            await crud.create_event(
+                db,
+                person_id=db_person.id,
+                camera_id=request.camera_id,
+                activity_type=ActivityType.DETECTED,
+                bounding_box={"x1": person["bbox"][0], "y1": person["bbox"][1],
+                               "x2": person["bbox"][2], "y2": person["bbox"][3]},
+                confidence=person.get("score"),
+                track_id=person.get("track_id"),
+                raw_metadata={"reid_matches": person.get("reid_matches", [])},
+            )
+            # Update movement graph
+            graph.add_observation(
+                person_id=str(db_person.id),
+                camera_id=request.camera_id,
+                timestamp=result["timestamp"],
+            )
+    return JSONResponse(content=result)
+# ── POST /process-frame/upload (multipart form data) ─────────────────────────
+@router.post("/process-frame/upload", summary="Upload image file for processing")
+async def process_frame_upload(
+    camera_id: str = Form(...),
+    run_attributes: bool = Form(True),
+    run_reid: bool = Form(True),
+    image: UploadFile = File(...),
+    db: AsyncSession = Depends(get_db),
+    pipeline=Depends(get_pipeline),
+    graph=Depends(get_movement_graph),
+):
+    """Submit a frame via multipart file upload."""
+    content = await image.read()
+    b64 = base64.b64encode(content).decode()
+    # Reuse main endpoint logic via internal call
+    from routes.vision_routes import process_frame, FrameProcessRequest
+    req = FrameProcessRequest(
+        camera_id=camera_id,
+        image_b64=b64,
+        run_attributes=run_attributes,
+        run_reid=run_reid,
+    )
+    return await process_frame(req, db, pipeline, graph)
+# ── POST /analyze-video ──────────────────────────────────────────────────────
+@router.post("/analyze-video", summary="Upload a video file and stream per-frame detection results via SSE")
+async def analyze_video(
+    camera_id: str = Form("VIDEO-UPLOAD"),
+    frame_interval: int = Form(5),
+    run_attributes: bool = Form(True),
+    run_reid: bool = Form(True),
+    video: UploadFile = File(...),
+    pipeline=Depends(get_pipeline),
+):
+    """
+    Upload a video file. Frames are sampled every `frame_interval` frames.
+    Results are streamed back as Server-Sent Events (SSE).
+    SSE event format:
+      data: {"type": "frame",   "frame_id": int, "timestamp_sec": float, ...}
+      data: {"type": "summary", "total_frames": int, "unique_persons": int, ...}
+      data: {"type": "error",   "message": str}
+    """
+    # Read video bytes into a temp file (cv2 needs a real path on Windows)
+    content = await video.read()
+    filename = video.filename or "uploaded_video.mp4"
+    suffix = os.path.splitext(filename)[1] or ".mp4"
+    # Pre-generate an ID and path for the thumbnail
+    session_id = uuid.uuid4()
+    thumb_filename = f"{session_id}.jpg"
+    thumb_path = os.path.join("static", "thumbnails", thumb_filename)
+    async def event_stream():
+        tmp_path = None
+        try:
+            import cv2
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp.write(content)
+                tmp_path = tmp.name
+            cap = cv2.VideoCapture(tmp_path)
+            if not cap.isOpened():
+                yield f"data: {json.dumps({'type': 'error', 'message': 'Cannot open video file'})}\n\n"
+                return
+            fps_native  = cap.get(cv2.CAP_PROP_FPS) or 25.0
+            total_raw   = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            total_sampled = max(1, total_raw // max(1, frame_interval))
+            # Metadata handshake
+            yield f"data: {json.dumps({'type': 'meta', 'total_sampled': total_sampled, 'fps_native': fps_native, 'total_frames_raw': total_raw})}\n\n"
+            raw_idx        = 0
+            processed_idx  = 0
+            unique_ids     = set()
+            peak_count     = 0
+            t_video_start  = time.time()
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                if raw_idx % max(1, frame_interval) == 0:
+                    # Convert BGR → RGB → PIL
+                    import numpy as np
+                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    pil_img = Image.fromarray(rgb)
+                    # Save the very first processed frame as the session thumbnail
+                    if processed_idx == 0:
+                        pil_img.save(thumb_path, "JPEG", quality=85)
+                    timestamp_sec = round(raw_idx / fps_native, 3)
+                    try:
+                        result = pipeline.process_frame(
+                            image_input=pil_img,
+                            camera_id=camera_id,
+                            run_attributes=run_attributes,
+                            run_reid=run_reid,
+                        )
+                    except Exception as e:
+                        logger.warning(f"Frame {raw_idx} error: {e}")
+                        raw_idx += 1
+                        continue
+                    persons = result.get("persons", [])
+                    for p in persons:
+                        pid = p.get("assigned_person_id")
+                        if pid:
+                            unique_ids.add(pid)
+                    peak_count = max(peak_count, len(persons))
+                    # Slim down persons payload for SSE
+                    slim_persons = [
+                        {
+                            "track_id":   p.get("track_id"),
+                            "bbox":       p.get("bbox"),
+                            "score":      round(p.get("score", 0), 3),
+                            "is_new":     p.get("is_new_person", False),
+                            "attributes": p.get("attributes", {}),
+                            "reid_sim":   round(p["reid_matches"][0]["similarity"], 3)
+                                          if p.get("reid_matches") else None,
+                        }
+                        for p in persons
+                    ]
+                    event = {
+                        "type":          "frame",
+                        "frame_id":      processed_idx,
+                        "raw_frame":     raw_idx,
+                        "timestamp_sec": timestamp_sec,
+                        "person_count":  len(persons),
+                        "persons":       slim_persons,
+                        "latency_ms":    round(result["latency"]["total_ms"], 1),
+                        "progress":      round((processed_idx + 1) / total_sampled * 100, 1),
+                    }
+                    yield f"data: {json.dumps(event)}\n\n"
+                    processed_idx += 1
+                    # Yield control so FastAPI can flush the buffer
+                    await asyncio.sleep(0)
+                raw_idx += 1
+            cap.release()
+            duration_sec = round(time.time() - t_video_start, 2)
+            summary = {
+                "type":             "summary",
+                "total_frames_processed": processed_idx,
+                "unique_person_count":     len(unique_ids),
+                "peak_person_count":       peak_count,
+                "duration_sec":            duration_sec,
+                "video_duration_sec":      round(total_raw / fps_native, 2),
+            }
+            yield f"data: {json.dumps(summary)}\n\n"
+            logger.info(f"Video analysis done — {processed_idx} frames, {len(unique_ids)} unique persons")
+            # Save the session to the database
+            try:
+                # We need a fresh session here inside the generator
+                async with AsyncSessionLocal() as db:
+                    new_session = AnalysisSession(
+                        id=session_id,
+                        video_filename=filename,
+                        thumbnail_path=f"/static/thumbnails/{thumb_filename}",
+                        duration_sec=duration_sec,
+                        frames_processed=processed_idx,
+                        unique_persons=len(unique_ids),
+                        peak_count=peak_count,
+                    )
+                    db.add(new_session)
+                    await db.commit()
+            except Exception as e:
+                logger.error(f"Failed to save session to DB: {e}")
+        except Exception as e:
+            logger.error(f"analyze_video error: {e}")
+            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
+        finally:
+            if tmp_path and os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+    return StreamingResponse(
+        event_stream(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control":      "no-cache",
+            "X-Accel-Buffering": "no",
+            "Connection":         "keep-alive",
+        },
+    )
+# ── GET /sessions ─────────────────────────────────────────────────────────────
+@router.get("/sessions", summary="List historical video analysis sessions")
+async def list_sessions(db: AsyncSession = Depends(get_db)):
+    from sqlalchemy.future import select
+    query = select(AnalysisSession).order_by(AnalysisSession.timestamp.desc())
+    result = await db.execute(query)
+    sessions = result.scalars().all()
+    return [
+        {
+            "id": str(s.id),
+            "timestamp": s.timestamp.isoformat(),
+            "video_filename": s.video_filename,
+            "thumbnail_path": s.thumbnail_path,
+            "duration_sec": s.duration_sec,
+            "frames_processed": s.frames_processed,
+            "unique_persons": s.unique_persons,
+            "peak_count": s.peak_count,
+        }
+        for s in sessions
+    ]
+# ── GET /cameras ──────────────────────────────────────────────────────────────
+@router.get("/cameras", summary="List active cameras and their status")
+async def list_cameras(pipeline=Depends(get_pipeline), graph=Depends(get_movement_graph)):
+    summary = graph.get_movement_summary()
+    trackers = list(pipeline.tracker_manager._trackers.keys())
+    fps_data = {cam: pipeline._compute_fps(cam) for cam in trackers}
+    return {
+        "active_cameras": trackers,
+        "fps_per_camera": fps_data,
+        "movement_graph_summary": summary,
+    }
+# ── WebSocket /live-feed ──────────────────────────────────────────────────────
+class ConnectionManager:
+    def __init__(self):
+        self.active_connections: list[WebSocket] = []
+    async def connect(self, ws: WebSocket):
+        await ws.accept()
+        self.active_connections.append(ws)
+        logger.info(f"WebSocket connected. Total: {len(self.active_connections)}")
+    def disconnect(self, ws: WebSocket):
+        self.active_connections.remove(ws)
+        logger.info(f"WebSocket disconnected. Total: {len(self.active_connections)}")
+    async def broadcast(self, data: dict):
+        import json
+        msg = json.dumps(data)
+        for conn in self.active_connections:
+            try:
+                await conn.send_text(msg)
+            except Exception:
+                pass
+# Module-level connection registry used by the /live-feed WebSocket endpoint.
+ws_manager = ConnectionManager()
+@router.websocket("/live-feed")
+async def live_feed_websocket(websocket: WebSocket, pipeline=Depends(get_pipeline)):
+    """
+    WebSocket endpoint for realtime surveillance feed.
+    Client sends: {"camera_id": str, "image_b64": str}
+    Server broadcasts processed results to all connected clients.
+    """
+    await ws_manager.connect(websocket)
+    try:
+        while True:
+            data = await websocket.receive_json()
+            camera_id = data.get("camera_id", "cam-0")
+            image_b64 = data.get("image_b64", "")
+            if not image_b64:
+                await websocket.send_json({"error": "No image provided"})
+                continue
+            result = pipeline.process_frame(
+                image_input=image_b64,
+                camera_id=camera_id,
+                run_attributes=data.get("run_attributes", False),
+                run_reid=data.get("run_reid", True),
+            )
+            await websocket.send_json(result)
+    except WebSocketDisconnect:
+        ws_manager.disconnect(websocket)

static/thumbnails/4f42294d-cdc1-4c64-abae-71a2891167b2.jpg ADDED Viewed

vision/__init__.py ADDED Viewed

File without changes

vision/attributes.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""
+vision/attributes.py - CLIP-based Zero-Shot Clothing & Attribute Recognition
+"""
+import time
+import torch
+import numpy as np
+import faiss
+import os
+from PIL import Image
+from typing import List, Dict, Tuple, Optional
+from transformers import CLIPProcessor, CLIPModel
+from loguru import logger
+from config import settings, DEVICE, FAISS_DIR
+# Attribute taxonomies for zero-shot classification
+# Text prompts scored against a person crop by AttributeRecognizer.recognize.
+# Garment prompts: the top three with probability > 0.1 are reported verbatim.
+CLOTHING_LABELS = [
+    "wearing a red shirt", "wearing a blue shirt", "wearing a white shirt",
+    "wearing a black shirt", "wearing a yellow jacket", "wearing a green jacket",
+    "wearing jeans", "wearing formal trousers", "wearing shorts", "wearing a dress",
+    "wearing a hoodie", "wearing a suit", "wearing a uniform", "wearing a coat",
+]
+# Colour prompts: recognize() strips "person in " and " clothing" from the best match.
+COLOR_LABELS = [
+    "person in red clothing", "person in blue clothing", "person in black clothing",
+    "person in white clothing", "person in gray clothing", "person in green clothing",
+    "person in yellow clothing", "person in orange clothing", "person in brown clothing",
+]
+# Gender prompts: recognize() strips "a " and " person" from the best match.
+GENDER_LABELS = ["a male person", "a female person"]
+# Accessory prompts: every label above 0.3 is reported, except "no accessories".
+ACCESSORY_LABELS = [
+    "wearing a backpack", "carrying a bag", "wearing a hat", "wearing sunglasses",
+    "carrying an umbrella", "wearing a mask", "no accessories",
+]
+# Age prompts: recognize() strips "a " and " person" from the best match.
+AGE_LABELS = [
+    "a child person", "a teenager person", "a young adult person",
+    "a middle-aged person", "an elderly person",
+]
+class AttributeRecognizer:
+    """
+    Zero-shot attribute recognition using CLIP.
+    Generates structured attribute dict and CLIP visual embeddings per person.
+    """
+    ATTR_INDEX_FILE = str(FAISS_DIR / "attr_index.faiss")
+    ATTR_META_FILE = str(FAISS_DIR / "attr_meta.npy")
+    def __init__(self):
+        logger.info(f"Loading CLIP model: {settings.CLIP_MODEL}")
+        self.processor = CLIPProcessor.from_pretrained(settings.CLIP_MODEL)
+        self.model = CLIPModel.from_pretrained(settings.CLIP_MODEL)
+        self.model.to(DEVICE)
+        self.model.eval()
+        self.dim = settings.CLIP_EMBEDDING_DIM
+        self.index = self._load_or_create_index()
+        self.meta: List[Dict] = self._load_meta()
+        logger.info(f"✅ AttributeRecognizer ready. FAISS attr index size: {self.index.ntotal}")
+    def _load_or_create_index(self):
+        if os.path.exists(self.ATTR_INDEX_FILE):
+            return faiss.read_index(self.ATTR_INDEX_FILE)
+        return faiss.IndexFlatIP(self.dim)
+    def _load_meta(self) -> List[Dict]:
+        if os.path.exists(self.ATTR_META_FILE):
+            return list(np.load(self.ATTR_META_FILE, allow_pickle=True))
+        return []
+    def save(self):
+        faiss.write_index(self.index, self.ATTR_INDEX_FILE)
+        np.save(self.ATTR_META_FILE, np.array(self.meta, dtype=object))
+    @torch.inference_mode()
+    def _classify(self, image: Image.Image, labels: List[str]) -> List[Tuple[str, float]]:
+        """Run zero-shot CLIP classification. Returns sorted (label, prob) list."""
+        inputs = self.processor(
+            text=labels, images=image, return_tensors="pt", padding=True
+        )
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        outputs = self.model(**inputs)
+        logits = outputs.logits_per_image[0]
+        probs = torch.softmax(logits, dim=0).cpu().numpy()
+        return sorted(zip(labels, probs.tolist()), key=lambda x: -x[1])
+    @torch.inference_mode()
+    def extract_visual_embedding(self, image: Image.Image) -> np.ndarray:
+        """Extract L2-normalized CLIP visual embedding."""
+        inputs = self.processor(images=image, return_tensors="pt")
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        features = self.model.get_image_features(**inputs)
+        features = features / features.norm(dim=-1, keepdim=True)
+        return features.cpu().numpy().astype(np.float32)
+    def recognize(self, image: Image.Image) -> Dict:
+        """
+        Run all attribute classifiers on a cropped person image.
+        Returns:
+            {
+                "clothing": [{"label": str, "confidence": float}],
+                "color": str,
+                "gender": str,
+                "accessories": [str],
+                "age_group": str,
+            }
+        """
+        t0 = time.perf_counter()
+        clothing_results = self._classify(image, CLOTHING_LABELS)
+        color_results = self._classify(image, COLOR_LABELS)
+        gender_results = self._classify(image, GENDER_LABELS)
+        accessory_results = self._classify(image, ACCESSORY_LABELS)
+        age_results = self._classify(image, AGE_LABELS)
+        latency = (time.perf_counter() - t0) * 1000
+        attributes = {
+            "clothing": [
+                {"label": l, "confidence": round(p, 4)}
+                for l, p in clothing_results[:3]
+                if p > 0.1
+            ],
+            "color": color_results[0][0].replace("person in ", "").replace(" clothing", "") if color_results else "unknown",
+            "gender": gender_results[0][0].replace("a ", "").replace(" person", "") if gender_results else "unknown",
+            "accessories": [l for l, p in accessory_results if p > 0.3 and "no accessories" not in l],
+            "age_group": age_results[0][0].replace("a ", "").replace(" person", "") if age_results else "unknown",
+            "inference_ms": round(latency, 2),
+        }
+        logger.debug(f"Attributes recognized in {latency:.1f}ms")
+        return attributes
+    def add_to_gallery(self, image: Image.Image, person_id: str) -> int:
+        """Store CLIP visual embedding in FAISS for attribute-based search."""
+        embedding = self.extract_visual_embedding(image)
+        faiss_id = self.index.ntotal
+        self.index.add(embedding)
+        self.meta.append({"person_id": person_id, "faiss_id": faiss_id})
+        self.save()
+        return faiss_id
+    def search_by_attribute_query(self, text_query: str, top_k: int = 10) -> List[Dict]:
+        """Search gallery using a natural language attribute query."""
+        if self.index.ntotal == 0:
+            return []
+        inputs = self.processor(text=[text_query], return_tensors="pt", padding=True)
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        with torch.inference_mode():
+            text_features = self.model.get_text_features(**inputs)
+            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+        query = text_features.cpu().numpy().astype(np.float32)
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(query, k)
+        return [
+            {"person_id": self.meta[idx]["person_id"], "similarity": round(float(dist), 4)}
+            for dist, idx in zip(distances[0], indices[0])
+            if idx != -1
+        ]

vision/detector.py ADDED Viewed

	@@ -0,0 +1,91 @@

+"""
+vision/detector.py - Person Detection using facebook/detr-resnet-50
+"""
+import time
+import torch
+import numpy as np
+from PIL import Image
+from typing import List, Dict, Tuple, Optional
+from transformers import DetrImageProcessor, DetrForObjectDetection
+from loguru import logger
+from config import settings, DEVICE
+class PersonDetector:
+    """
+    DETR-based person detector.
+    Returns bounding boxes, confidence scores, and processing latency.
+    """
+    PERSON_LABEL = "person"
+    COCO_LABEL_MAP = None  # populated after model loads
+    def __init__(self):
+        logger.info(f"Loading detection model: {settings.DETECTION_MODEL}")
+        self.processor = DetrImageProcessor.from_pretrained(settings.DETECTION_MODEL)
+        self.model = DetrForObjectDetection.from_pretrained(settings.DETECTION_MODEL)
+        self.model.to(DEVICE)
+        self.model.eval()
+        # Build label → id map
+        self.id2label = self.model.config.id2label
+        self.person_label_ids = [
+            k for k, v in self.id2label.items() if v.lower() == self.PERSON_LABEL
+        ]
+        logger.info(f"✅ PersonDetector ready on {DEVICE}. Person class ids: {self.person_label_ids}")
+    @torch.inference_mode()
+    def detect(
+        self,
+        image: Image.Image,
+        confidence_threshold: Optional[float] = None,
+    ) -> Tuple[List[Dict], float]:
+        """
+        Detect persons in a PIL image.
+        Returns:
+            detections: list of {"bbox": [x1,y1,x2,y2], "score": float, "label": "person"}
+            latency_ms: inference time in milliseconds
+        """
+        threshold = confidence_threshold or settings.DETECTION_CONFIDENCE
+        t0 = time.perf_counter()
+        inputs = self.processor(images=image, return_tensors="pt")
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        outputs = self.model(**inputs)
+        t1 = time.perf_counter()
+        latency_ms = (t1 - t0) * 1000
+        # Post-process to original image size
+        target_sizes = torch.tensor([image.size[::-1]], device=DEVICE)  # (H, W)
+        results = self.processor.post_process_object_detection(
+            outputs, threshold=threshold, target_sizes=target_sizes
+        )[0]
+        detections = []
+        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+            label_id = label.item()
+            if label_id in self.person_label_ids:
+                x1, y1, x2, y2 = box.tolist()
+                detections.append({
+                    "bbox": [round(x1, 2), round(y1, 2), round(x2, 2), round(y2, 2)],
+                    "score": round(score.item(), 4),
+                    "label": "person",
+                })
+        logger.debug(f"Detected {len(detections)} persons in {latency_ms:.1f}ms")
+        return detections, latency_ms
+    def detect_batch(
+        self,
+        images: List[Image.Image],
+        confidence_threshold: Optional[float] = None,
+    ) -> List[Tuple[List[Dict], float]]:
+        """Batch detection for multiple frames."""
+        return [self.detect(img, confidence_threshold) for img in images]
+    @staticmethod
+    def crop_person(image: Image.Image, bbox: List[float]) -> Image.Image:
+        """Crop a person region from image given bbox [x1, y1, x2, y2]."""
+        x1, y1, x2, y2 = [int(v) for v in bbox]
+        return image.crop((x1, y1, x2, y2))

vision/pipeline.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+vision/pipeline.py - Full Vision Inference Pipeline
+Orchestrates detection → tracking → ReID → attribute recognition per frame
+"""
+import time
+import base64
+import numpy as np
+import uuid
+from io import BytesIO
+from PIL import Image
+from typing import Dict, List, Optional, Any
+from loguru import logger
+from vision.detector import PersonDetector
+from vision.tracker import TrackerManager
+from vision.reid import PersonReID
+from vision.attributes import AttributeRecognizer
+from config import settings
+class VisionPipeline:
+    """
+    End-to-end vision pipeline for a single frame from a camera.
+    All components are created eagerly in __init__; one pipeline instance is
+    intended to be shared across requests.
+    """
+    def __init__(self):
+        """Construct every pipeline stage and the per-camera bookkeeping dicts."""
+        logger.info("Initializing VisionPipeline...")
+        # Stages are built here, in this order, when the pipeline object is created.
+        self.detector = PersonDetector()
+        self.tracker_manager = TrackerManager()
+        self.reid = PersonReID()
+        self.attributes = AttributeRecognizer()
+        # camera_id -> frame count; its use lies outside the visible part of this file - confirm.
+        self._frame_counts: Dict[str, int] = {}
+        # camera_id -> recent time.perf_counter() timestamps maintained by _compute_fps.
+        self._fps_timers: Dict[str, List[float]] = {}
+        logger.info("✅ VisionPipeline ready.")
+    def _decode_image(self, image_input: Any) -> Image.Image:
+        """Accept PIL Image, numpy array, bytes, or base64 string."""
+        if isinstance(image_input, Image.Image):
+            return image_input.convert("RGB")
+        if isinstance(image_input, np.ndarray):
+            return Image.fromarray(image_input).convert("RGB")
+        if isinstance(image_input, bytes):
+            return Image.open(BytesIO(image_input)).convert("RGB")
+        if isinstance(image_input, str):
+            # base64 encoded
+            data = base64.b64decode(image_input)
+            return Image.open(BytesIO(data)).convert("RGB")
+        raise ValueError(f"Unsupported image type: {type(image_input)}")
+    def _compute_fps(self, camera_id: str) -> float:
+        """Compute rolling FPS over last 30 frames."""
+        now = time.perf_counter()
+        if camera_id not in self._fps_timers:
+            self._fps_timers[camera_id] = []
+        self._fps_timers[camera_id].append(now)
+        if len(self._fps_timers[camera_id]) > 30:
+            self._fps_timers[camera_id].pop(0)
+        times = self._fps_timers[camera_id]
+        if len(times) < 2:
+            return 0.0
+        return round((len(times) - 1) / (times[-1] - times[0]), 2)
+    def process_frame(
+        self,
+        image_input: Any,
+        camera_id: str,
+        run_attributes: bool = True,
+        run_reid: bool = True,
+        reid_threshold: float = 0.85,
+    ) -> Dict:
+        """
+        Full pipeline for a single frame.
+        Args:
+            image_input: PIL Image | numpy array | bytes | base64 str
+            camera_id: unique camera identifier
+            run_attributes: whether to run CLIP attribute recognition
+            run_reid: whether to run ReID matching
+            reid_threshold: cosine similarity threshold for ReID
+        Returns:
+            Result dict with detections, tracks, reid matches, attributes, and latency breakdown.
+        """
+        t_start = time.perf_counter()
+        # 1. Decode image
+        image = self._decode_image(image_input)
+        w, h = image.size
+        # 2. Detection
+        detections, det_ms = self.detector.detect(image)
+        # 3. Tracking
+        t_track = time.perf_counter()
+        tracks = self.tracker_manager.update(camera_id, detections)
+        track_ms = (time.perf_counter() - t_track) * 1000
+        # 4. Per-person: ReID + Attributes
+        persons_data = []
+        for track in tracks:
+            bbox = track["bbox"]
+            # Crop person region
+            try:
+                crop = PersonDetector.crop_person(image, bbox)
+                if crop.width < 10 or crop.height < 10:
+                    continue
+            except Exception:
+                continue
+            person_entry: Dict = {
+                "track_id": track["track_id"],
+                "bbox": bbox,
+                "score": track["score"],
+                "camera_id": camera_id,
+                "reid_matches": [],
+                "attributes": {},
+                "is_new_person": False,
+                "assigned_person_id": None,
+            }
+            # 4a. ReID — try to match against gallery
+            if run_reid:
+                t_reid = time.perf_counter()
+                reid_matches = self.reid.search(crop, top_k=3, similarity_threshold=reid_threshold)
+                person_entry["reid_matches"] = reid_matches
+                person_entry["reid_ms"] = round((time.perf_counter() - t_reid) * 1000, 2)
+                if reid_matches:
+                    person_entry["assigned_person_id"] = reid_matches[0]["person_id"]
+                else:
+                    # New person — register in gallery with temporary UUID
+                    new_pid = str(uuid.uuid4())
+                    faiss_id = self.reid.add_person(crop, new_pid, camera_id)
+                    person_entry["assigned_person_id"] = new_pid
+                    person_entry["is_new_person"] = True
+                    person_entry["faiss_id"] = faiss_id
+                    import os
+                    os.makedirs("static/thumbnails", exist_ok=True)
+                    try:
+                        crop.save(f"static/thumbnails/{new_pid}.jpg", "JPEG", quality=85)
+                    except Exception as e:
+                        logger.warning(f"Failed to save thumbnail for {new_pid}: {e}")
+            # 4b. Attribute recognition
+            if run_attributes:
+                t_attr = time.perf_counter()
+                attrs = self.attributes.recognize(crop)
+                person_entry["attributes"] = attrs
+                person_entry["attr_ms"] = round((time.perf_counter() - t_attr) * 1000, 2)
+                # Also store visual embedding for attribute-based search
+                if run_reid:
+                    self.attributes.add_to_gallery(crop, person_entry["assigned_person_id"])
+            persons_data.append(person_entry)
+        total_ms = (time.perf_counter() - t_start) * 1000
+        fps = self._compute_fps(camera_id)
+        self._frame_counts[camera_id] = self._frame_counts.get(camera_id, 0) + 1
+        return {
+            "camera_id": camera_id,
+            "frame_id": self._frame_counts[camera_id],
+            "image_size": {"width": w, "height": h},
+            "persons": persons_data,
+            "person_count": len(persons_data),
+            "detection_count": len(detections),
+            "latency": {
+                "detection_ms": round(det_ms, 2),
+                "tracking_ms": round(track_ms, 2),
+                "total_ms": round(total_ms, 2),
+            },
+            "fps": fps,
+            "timestamp": time.time(),
+        }

vision/reid.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+vision/reid.py - Cross-Camera Person Re-Identification using ViT + FAISS
+"""
+import os
+import time
+import numpy as np
+import faiss
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from typing import List, Dict, Optional, Tuple
+from transformers import ViTImageProcessor, ViTModel
+from loguru import logger
+from config import settings, DEVICE, FAISS_DIR
+class PersonReID:
+    """
+    Person Re-Identification using google/vit-base-patch16-224 embeddings.
+    Embeddings are stored in a FAISS IndexFlatIP (inner product = cosine after normalization).
+    """
+    INDEX_FILE = str(FAISS_DIR / "reid_index.faiss")
+    META_FILE = str(FAISS_DIR / "reid_meta.npy")
+    def __init__(self):
+        logger.info(f"Loading ReID model: {settings.REID_MODEL}")
+        self.processor = ViTImageProcessor.from_pretrained(settings.REID_MODEL)
+        self.model = ViTModel.from_pretrained(settings.REID_MODEL)
+        self.model.to(DEVICE)
+        self.model.eval()
+        self.dim = settings.REID_EMBEDDING_DIM
+        self.index = self._load_or_create_index()
+        # meta list: maps faiss internal id (row index) → {"person_id": str, "camera_id": str}
+        self.meta: List[Dict] = self._load_meta()
+        logger.info(f"✅ ReID ready. FAISS index size: {self.index.ntotal}")
+    # ── Index Management ──────────────────────────────────────────────────────
+    def _load_or_create_index(self) -> faiss.IndexFlatIP:
+        if os.path.exists(self.INDEX_FILE):
+            logger.info("Loading existing FAISS ReID index.")
+            return faiss.read_index(self.INDEX_FILE)
+        logger.info("Creating new FAISS ReID index (IndexFlatIP).")
+        return faiss.IndexFlatIP(self.dim)
+    def _load_meta(self) -> List[Dict]:
+        if os.path.exists(self.META_FILE):
+            data = np.load(self.META_FILE, allow_pickle=True)
+            return list(data)
+        return []
+    def save(self):
+        faiss.write_index(self.index, self.INDEX_FILE)
+        np.save(self.META_FILE, np.array(self.meta, dtype=object))
+        logger.debug("FAISS ReID index saved.")
+    # ── Embedding Extraction ──────────────────────────────────────────────────
+    @torch.inference_mode()
+    def extract_embedding(self, image: Image.Image) -> np.ndarray:
+        """Extract L2-normalized ViT CLS token embedding from a cropped person image."""
+        inputs = self.processor(images=image, return_tensors="pt")
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+        outputs = self.model(**inputs)
+        # CLS token → (1, hidden_size)
+        cls = outputs.last_hidden_state[:, 0, :]
+        # L2 normalize for cosine similarity via inner product
+        embedding = F.normalize(cls, p=2, dim=-1).cpu().numpy().astype(np.float32)
+        return embedding  # shape: (1, 768)
+    # ── Gallery Operations ────────────────────────────────────────────────────
+    def add_person(self, image: Image.Image, person_id: str, camera_id: str) -> int:
+        """Add a new person embedding to the FAISS gallery. Returns faiss_id."""
+        embedding = self.extract_embedding(image)
+        faiss_id = self.index.ntotal
+        self.index.add(embedding)
+        self.meta.append({"person_id": person_id, "camera_id": camera_id, "faiss_id": faiss_id})
+        self.save()
+        return faiss_id
+    def search(
+        self,
+        image: Image.Image,
+        top_k: int = 5,
+        similarity_threshold: float = 0.85,
+    ) -> List[Dict]:
+        """
+        Search gallery for matching persons.
+        Returns:
+            list of {"person_id": str, "camera_id": str, "similarity": float, "faiss_id": int}
+        """
+        if self.index.ntotal == 0:
+            return []
+        t0 = time.perf_counter()
+        query = self.extract_embedding(image)
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(query, k)
+        latency = (time.perf_counter() - t0) * 1000
+        results = []
+        for dist, idx in zip(distances[0], indices[0]):
+            if idx == -1:
+                continue
+            similarity = float(dist)
+            if similarity >= similarity_threshold:
+                meta = self.meta[idx]
+                results.append({
+                    "person_id": meta["person_id"],
+                    "camera_id": meta["camera_id"],
+                    "similarity": round(similarity, 4),
+                    "faiss_id": int(idx),
+                })
+        logger.debug(f"ReID search: {len(results)} matches in {latency:.1f}ms")
+        return results
+    def search_by_embedding(
+        self,
+        embedding: np.ndarray,
+        top_k: int = 5,
+        similarity_threshold: float = 0.85,
+    ) -> List[Dict]:
+        """Direct search with a precomputed embedding."""
+        if self.index.ntotal == 0:
+            return []
+        k = min(top_k, self.index.ntotal)
+        distances, indices = self.index.search(embedding, k)
+        results = []
+        for dist, idx in zip(distances[0], indices[0]):
+            if idx == -1 or float(dist) < similarity_threshold:
+                continue
+            meta = self.meta[idx]
+            results.append({
+                "person_id": meta["person_id"],
+                "camera_id": meta["camera_id"],
+                "similarity": round(float(dist), 4),
+                "faiss_id": int(idx),
+            })
+        return results

vision/stream_manager.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import cv2
+import threading
+import time
+import io
+from PIL import Image, ImageDraw, ImageFont
+from loguru import logger
+class StreamManager:
+    """
+    Manages background threads that constantly pull frames from RTSP streams or local webcams,
+    run the vision pipeline, and store annotated JPEG bytes for MJPEG streaming.
+    """
+    def __init__(self):
+        self.streams = {}   # camera_id -> dict with thread info
+        self.frames = {}    # camera_id -> latest JPEG bytes
+        self.results = {}   # camera_id -> latest inference result dict
+        self.running = True
+    def add_stream(self, camera_id: str, source: str):
+        """Add a new camera stream"""
+        if camera_id in self.streams:
+            logger.info(f"Stream {camera_id} is already running.")
+            return
+        # If source is just a digit like "0", handle it as an int for local webcam
+        if source.isdigit():
+            source = int(source)
+        logger.info(f"Adding stream {camera_id} from {source}")
+        thread = threading.Thread(target=self._stream_loop, args=(camera_id, source), daemon=True)
+        self.streams[camera_id] = {
+            "thread": thread,
+            "source": source,
+            "active": True
+        }
+        thread.start()
+    def remove_stream(self, camera_id: str):
+        if camera_id in self.streams:
+            logger.info(f"Removing stream {camera_id}")
+            self.streams[camera_id]["active"] = False
+            del self.streams[camera_id]
+    def _stream_loop(self, camera_id: str, source):
+        # Import inside the loop to avoid circular import issues if imported from app.py
+        from app import vision_pipeline
+        cap = cv2.VideoCapture(source)
+        if not cap.isOpened():
+            logger.error(f"Failed to open Stream: {camera_id} -> {source}")
+            self.remove_stream(camera_id)
+            return
+        fps_native = cap.get(cv2.CAP_PROP_FPS) or 25.0
+        delay = 1.0 / max(1, fps_native)
+        logger.info(f"Stream {camera_id} connected. Target FPS: {fps_native}")
+        while self.streams.get(camera_id, {}).get("active", False) and self.running:
+            start_t = time.perf_counter()
+            ret, frame = cap.read()
+            if not ret:
+                logger.warning(f"Stream {camera_id} disconnected. Attempting reconnect...")
+                time.sleep(2)
+                cap = cv2.VideoCapture(source)
+                continue
+            # Convert to PIL
+            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_img = Image.fromarray(rgb_frame)
+            # ── Inference ──
+            has_result = False
+            result_data = None
+            try:
+                if vision_pipeline:
+                    result_data = vision_pipeline.process_frame(
+                        image_input=pil_img,
+                        camera_id=camera_id,
+                        run_attributes=True,
+                        run_reid=True
+                    )
+                    self.results[camera_id] = result_data
+                    has_result = True
+            except Exception as e:
+                logger.error(f"Inference error on stream {camera_id}: {e}")
+            # ── Annotation ──
+            if has_result and result_data:
+                pil_img = self._annotate_frame(pil_img, result_data)
+            # ── Encode to JPEG ──
+            buf = io.BytesIO()
+            pil_img.save(buf, format="JPEG", quality=75)
+            self.frames[camera_id] = buf.getvalue()
+            # Enforce FPS limit
+            elapsed = time.perf_counter() - start_t
+            if elapsed < delay:
+                time.sleep(delay - elapsed)
+        cap.release()
+        logger.info(f"Stream {camera_id} loop terminated.")
+    def _annotate_frame(self, image: Image.Image, result: dict) -> Image.Image:
+        """Draw bounding boxes natively on the PIL image before encoding to MJPEG"""
+        draw = ImageDraw.Draw(image)
+        try:
+            # Using default font for robust cross-platform rendering
+            font = ImageFont.load_default()
+        except:
+            font = None
+        for p in result.get("persons", []):
+            x1, y1, x2, y2 = p["bbox"]
+            is_new = p.get("is_new_person", False)
+            # Extract ReID sim
+            reid_sim = 0
+            if p.get("reid_matches"):
+                reid_sim = p["reid_matches"][0].get("similarity", 0)
+            is_alert = not is_new and reid_sim > 0.85
+            # Colors match the frontend UI standard
+            color = "#FF1744" if is_alert else ("#FFB300" if is_new else "#00E5FF")
+            # Bounding Box
+            draw.rectangle([x1, y1, x2, y2], outline=color, width=3)
+            # Label
+            label = f"TRK-{p.get('track_id')} {(p.get('score', 0)*100):.0f}%"
+            # Draw label background
+            text_bg_y0 = max(0, y1 - 16)
+            draw.rectangle([x1, text_bg_y0, x1 + 120, text_bg_y0 + 16], fill=color)
+            if font:
+                draw.text((x1 + 4, text_bg_y0 + 2), label, fill="black", font=font)
+        return image
+    def get_frame(self, camera_id: str):
+        return self.frames.get(camera_id)
+    def shutdown(self):
+        self.running = False
+        self.streams.clear()
+# Global singleton: one module-level StreamManager instance, created at import time.
+stream_manager = StreamManager()

vision/tracker.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""
+vision/tracker.py - Multi-Object Tracking using ByteTrack algorithm
+Assigns persistent track IDs across frames for each camera.
+"""
+import numpy as np
+from typing import List, Dict, Tuple, Optional
+from dataclasses import dataclass, field
+from loguru import logger
+from config import settings
+@dataclass
+class Track:
+    track_id: int
+    bbox: List[float]       # [x1, y1, x2, y2]
+    score: float
+    age: int = 0
+    hits: int = 1
+    time_since_update: int = 0
+    state: str = "active"   # active | lost | removed
+    history: List[List[float]] = field(default_factory=list)
+    def update(self, bbox: List[float], score: float):
+        self.bbox = bbox
+        self.score = score
+        self.hits += 1
+        self.age += 1
+        self.time_since_update = 0
+        self.state = "active"
+        self.history.append(bbox)
+        if len(self.history) > 30:
+            self.history.pop(0)
+    def predict(self):
+        """Simple linear prediction (extend with Kalman for production)."""
+        self.time_since_update += 1
+        self.age += 1
+        if self.time_since_update > settings.TRACK_BUFFER:
+            self.state = "removed"
+        elif self.time_since_update > 5:
+            self.state = "lost"
+def iou(boxA: List[float], boxB: List[float]) -> float:
+    """Compute Intersection over Union between two [x1,y1,x2,y2] boxes."""
+    xA = max(boxA[0], boxB[0])
+    yA = max(boxA[1], boxB[1])
+    xB = min(boxA[2], boxB[2])
+    yB = min(boxA[3], boxB[3])
+    inter = max(0, xB - xA) * max(0, yB - yA)
+    areaA = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+    areaB = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+    union = areaA + areaB - inter
+    return inter / (union + 1e-6)
+class ByteTracker:
+    """
+    Simplified ByteTrack-style multi-object tracker.
+    Uses two-stage matching: high-confidence detections first, then low-confidence.
+    One instance per camera.
+    """
+    def __init__(self, camera_id: str):
+        self.camera_id = camera_id
+        self.tracks: List[Track] = []
+        self._next_id = 1
+        self.frame_id = 0
+        logger.info(f"ByteTracker initialized for camera: {camera_id}")
+    def _new_track(self, bbox: List[float], score: float) -> Track:
+        t = Track(track_id=self._next_id, bbox=bbox, score=score, history=[bbox])
+        self._next_id += 1
+        return t
+    def _match(
+        self,
+        detections: List[Dict],
+        threshold: float = 0.5,
+    ) -> Tuple[List[Tuple[int, int]], List[int], List[int]]:
+        """
+        Greedy IoU matching between active tracks and detections.
+        Returns: (matched pairs), (unmatched track indices), (unmatched det indices)
+        """
+        active = [i for i, t in enumerate(self.tracks) if t.state != "removed"]
+        if not active or not detections:
+            return [], active, list(range(len(detections)))
+        iou_matrix = np.zeros((len(active), len(detections)))
+        for i, ti in enumerate(active):
+            for j, det in enumerate(detections):
+                iou_matrix[i, j] = iou(self.tracks[ti].bbox, det["bbox"])
+        matched, unmatched_tracks, unmatched_dets = [], list(active), list(range(len(detections)))
+        while True:
+            if iou_matrix.size == 0:
+                break
+            flat_idx = np.argmax(iou_matrix)
+            ti_local, di = divmod(flat_idx, iou_matrix.shape[1])
+            if iou_matrix[ti_local, di] < threshold:
+                break
+            ti_global = active[ti_local]
+            matched.append((ti_global, di))
+            unmatched_tracks.remove(ti_global)
+            unmatched_dets.remove(di)
+            iou_matrix[ti_local, :] = -1
+            iou_matrix[:, di] = -1
+        return matched, unmatched_tracks, unmatched_dets
+    def update(self, detections: List[Dict]) -> List[Dict]:
+        """
+        Update tracker with new detections.
+        Args:
+            detections: list of {"bbox": [...], "score": float}
+        Returns:
+            tracked_objects: list of {"track_id": int, "bbox": [...], "score": float, "state": str}
+        """
+        self.frame_id += 1
+        # Predict existing tracks
+        for t in self.tracks:
+            t.predict()
+        # Remove permanently dead tracks
+        self.tracks = [t for t in self.tracks if t.state != "removed"]
+        # High confidence detections
+        high_dets = [d for d in detections if d["score"] >= settings.TRACK_THRESH]
+        low_dets = [d for d in detections if d["score"] < settings.TRACK_THRESH]
+        # Stage 1: Match high-confidence detections
+        matched, unmatched_tracks, unmatched_high = self._match(high_dets, threshold=settings.MATCH_THRESH)
+        for ti, di in matched:
+            self.tracks[ti].update(high_dets[di]["bbox"], high_dets[di]["score"])
+        # Stage 2: Match remaining tracks with low-confidence detections
+        remaining_unmatched = [ti for ti in unmatched_tracks if self.tracks[ti].state == "lost"]
+        if remaining_unmatched and low_dets:
+            low_iou_matrix = np.zeros((len(remaining_unmatched), len(low_dets)))
+            for i, ti in enumerate(remaining_unmatched):
+                for j, det in enumerate(low_dets):
+                    low_iou_matrix[i, j] = iou(self.tracks[ti].bbox, det["bbox"])
+            for i, ti in enumerate(remaining_unmatched):
+                best_j = int(np.argmax(low_iou_matrix[i]))
+                if low_iou_matrix[i, best_j] > 0.5:
+                    self.tracks[ti].update(low_dets[best_j]["bbox"], low_dets[best_j]["score"])
+        # Create new tracks for unmatched high-confidence detections
+        for di in unmatched_high:
+            self.tracks.append(self._new_track(high_dets[di]["bbox"], high_dets[di]["score"]))
+        # Return active tracks
+        return [
+            {
+                "track_id": t.track_id,
+                "bbox": t.bbox,
+                "score": t.score,
+                "state": t.state,
+                "age": t.age,
+                "hits": t.hits,
+            }
+            for t in self.tracks
+            if t.state == "active"
+        ]
+class TrackerManager:
+    """Manages one ByteTracker per camera."""
+    def __init__(self):
+        self._trackers: Dict[str, ByteTracker] = {}
+    def get_tracker(self, camera_id: str) -> ByteTracker:
+        if camera_id not in self._trackers:
+            self._trackers[camera_id] = ByteTracker(camera_id)
+        return self._trackers[camera_id]
+    def update(self, camera_id: str, detections: List[Dict]) -> List[Dict]:
+        return self.get_tracker(camera_id).update(detections)
+    def reset(self, camera_id: str):
+        if camera_id in self._trackers:
+            del self._trackers[camera_id]