| """Vision endpoints attēlu un kadru analīzei.""" |
|
|
| from __future__ import annotations |
|
|
| import base64 |
| import binascii |
| import io |
| import logging |
| import os |
| from collections import Counter |
| from datetime import UTC, datetime |
| from typing import Any |
| from uuid import uuid4 |
|
|
| import httpx |
| import numpy as np |
| from fastapi import APIRouter, HTTPException |
| from PIL import Image, ImageDraw, ImageStat |
| from pydantic import BaseModel, Field, field_validator, model_validator |
|
|
| from maris_core.memory_context import memory_store |
|
|
# Module-level logger plus the FastAPI router object for this module.
logger = logging.getLogger(__name__)
router = APIRouter()

# Lazily created model handles. Each ``*_FAILED`` flag is set by the matching
# ``_get_*`` loader when initialisation raises, so the load is not retried.
_DETECTOR: Any | None = None
_DETECTOR_FAILED = False

_SEGMENTER: Any | None = None
_SEGMENTER_FAILED = False

# ``_OCR_ENGINE_KIND`` records which backend ("pytesseract" or "trocr") is held
# in ``_OCR_ENGINE``.
_OCR_ENGINE: Any | None = None
_OCR_ENGINE_KIND: str | None = None
_OCR_FAILED = False

# In-process registries for live camera state.
# NOTE(review): presumably keyed by camera_id; confirm against the code that
# populates them (not visible in this part of the file).
_LIVE_CAMERAS: dict[str, dict[str, Any]] = {}
_LIVE_REID_INDEX: dict[str, list[dict[str, Any]]] = {}
|
|
# Brightness difference between consecutive frames at or above which
# ``_build_scenes`` starts a new scene.
SCENE_BRIGHTNESS_DELTA = 30.0

# Fraction of the frame diagonal used by ``_build_tracks`` as the maximum
# centre-to-centre distance for matching a detection to an existing track.
TRACKING_DISTANCE_RATIO = 0.18

# Keypoint-name pairs describing the skeleton edges attached to every pose.
POSE_CONNECTIONS = [
    # head and shoulders
    ("nose", "left_shoulder"),
    ("nose", "right_shoulder"),
    ("left_shoulder", "right_shoulder"),
    # arms
    ("left_shoulder", "left_elbow"),
    ("left_elbow", "left_wrist"),
    ("right_shoulder", "right_elbow"),
    ("right_elbow", "right_wrist"),
    # torso
    ("left_shoulder", "left_hip"),
    ("right_shoulder", "right_hip"),
    ("left_hip", "right_hip"),
    # legs
    ("left_hip", "left_knee"),
    ("left_knee", "left_ankle"),
    ("right_hip", "right_knee"),
    ("right_knee", "right_ankle"),
]
|
|
|
|
class BoundingBox(BaseModel):
    """Rectangle given by its origin corner (``x``, ``y``) and its size.

    Elsewhere in this module the origin is filled from the minimum x/y of a
    detection box, and the centre is computed as ``x + width / 2``.
    """

    x: float
    y: float
    width: float
    height: float
|
|
|
|
class VisionDetection(BaseModel):
    """One detected object: its class label, score and bounding box."""

    label: str
    confidence: float
    bbox: BoundingBox
|
|
|
|
class ImageSourceRequest(BaseModel):
    """Request body naming a single image, either by URL or as base64.

    Exactly one of ``image_url`` / ``image_base64`` must be non-blank.
    ``session_id`` and ``camera_id`` are stripped, and blank values become
    ``None``.
    """

    image_url: str | None = None
    image_base64: str | None = None
    session_id: str | None = Field(default=None, max_length=120)
    camera_id: str | None = Field(default=None, max_length=120)
    max_detections: int = Field(default=10, ge=1, le=50)
    confidence_threshold: float = Field(default=0.25, ge=0.0, le=1.0)

    @field_validator("session_id", "camera_id")
    @classmethod
    def normalize_optional_text(cls, value: str | None) -> str | None:
        """Strip surrounding whitespace; collapse empty results to ``None``."""
        if value is None:
            return None
        stripped = value.strip()
        return stripped if stripped else None

    @model_validator(mode="after")
    def validate_source(self) -> ImageSourceRequest:
        """Reject requests that give both sources or neither."""
        url_given = bool(self.image_url and self.image_url.strip())
        base64_given = bool(self.image_base64 and self.image_base64.strip())
        # XOR is true only when exactly one source is present.
        if url_given ^ base64_given:
            return self
        raise ValueError("Norādi tieši vienu no image_url vai image_base64.")
|
|
|
|
class FrameSequenceRequest(BaseModel):
    """Request body carrying between 1 and 24 base64-encoded frames."""

    frames_base64: list[str] = Field(min_length=1, max_length=24)
    max_detections: int = Field(default=10, ge=1, le=50)
    confidence_threshold: float = Field(default=0.25, ge=0.0, le=1.0)

    @field_validator("frames_base64")
    @classmethod
    def validate_frames(cls, value: list[str]) -> list[str]:
        """Strip every entry, drop blank ones, and require at least one left."""
        cleaned = [stripped for item in value if (stripped := item.strip())]
        if cleaned:
            return cleaned
        raise ValueError("frames_base64 nedrīkst būt tukšs.")
|
|
|
|
class VisionAnalyzeResponse(BaseModel):
    """Object-detection response for a single image."""

    summary: str
    detections: list[VisionDetection]
    width: int
    height: int
    model: str
    # Defaults to False; set by callers when a fallback path produced the result.
    fallback_used: bool = False
|
|
|
|
class OCRTextBlock(BaseModel):
    """A recognised piece of text with its score, box and language tag."""

    text: str
    confidence: float
    bbox: BoundingBox
    language: str
|
|
|
|
class VisionOCRResponse(BaseModel):
    """OCR response: recognised text blocks plus image size and model name."""

    summary: str
    results: list[OCRTextBlock]
    width: int
    height: int
    model: str
    fallback_used: bool = False
|
|
|
|
class PoseKeypoint(BaseModel):
    """A named body keypoint with coordinates and a confidence value."""

    name: str
    x: float
    y: float
    confidence: float
|
|
|
|
class PoseConnection(BaseModel):
    """Skeleton edge between two keypoints, referenced by keypoint name."""

    start: str
    end: str
|
|
|
|
class PoseDetection(BaseModel):
    """Pose for one person: bounding box, keypoints and skeleton edges."""

    person_id: int
    confidence: float
    bbox: BoundingBox
    keypoints: list[PoseKeypoint]
    connections: list[PoseConnection]
|
|
|
|
class VisionPoseResponse(BaseModel):
    """Pose-estimation response for a single image."""

    summary: str
    poses: list[PoseDetection]
    width: int
    height: int
    model: str
    fallback_used: bool = False
|
|
|
|
class SegmentationMask(BaseModel):
    """One segment: label, score, mask image as a data URL, box and area."""

    label: str
    confidence: float
    mask_data_url: str
    bbox: BoundingBox
    # Count of mask pixels greater than zero, as computed by the producers here.
    area_pixels: int
|
|
|
|
class VisionSegmentationResponse(BaseModel):
    """Segmentation response for a single image."""

    summary: str
    masks: list[SegmentationMask]
    width: int
    height: int
    model: str
    fallback_used: bool = False
|
|
|
|
class ActionPrediction(BaseModel):
    """A guessed action for one subject, with a human-readable rationale."""

    action: str
    confidence: float
    subject_label: str
    rationale: str
|
|
|
|
class VisionActionResponse(BaseModel):
    """Action-recognition response for a single image."""

    summary: str
    actions: list[ActionPrediction]
    width: int
    height: int
    model: str
    fallback_used: bool = False
|
|
|
|
class TrackObservation(BaseModel):
    """A single sighting of a tracked object in one frame."""

    frame_index: int
    confidence: float
    bbox: BoundingBox
|
|
|
|
class TrackedObject(BaseModel):
    """An object followed across frames, with its per-frame observations."""

    track_id: int
    label: str
    average_confidence: float
    observations: list[TrackObservation]
|
|
|
|
class VisionTrackingResponse(BaseModel):
    """Tracking response for a frame sequence."""

    summary: str
    tracks: list[TrackedObject]
    frame_count: int
    model: str
    fallback_used: bool = False
|
|
|
|
class FrameAnalysis(BaseModel):
    """Per-frame result: detections, most common labels and brightness."""

    frame_index: int
    summary: str
    detections: list[VisionDetection]
    dominant_labels: list[str]
    # Mean of the frame's grayscale conversion (see ``_frame_brightness``).
    brightness: float
|
|
|
|
class VisionFrameAnalysisResponse(BaseModel):
    """Frame-by-frame analysis response for a frame sequence."""

    summary: str
    frames: list[FrameAnalysis]
    frame_count: int
    model: str
    fallback_used: bool = False
|
|
|
|
class SceneSegment(BaseModel):
    """A run of consecutive frames grouped into one scene."""

    scene_index: int
    # Frame indices of the first and last frame in the scene (both inclusive).
    start_frame: int
    end_frame: int
    summary: str
    dominant_labels: list[str]
    average_brightness: float
|
|
|
|
class VisionSceneTimelineResponse(BaseModel):
    """Scene-timeline response for a frame sequence."""

    summary: str
    scenes: list[SceneSegment]
    frame_count: int
    model: str
    fallback_used: bool = False
|
|
|
|
class CameraResolution(BaseModel):
    """Camera frame size; defaults to 1280x720, each side limited to 1..8192."""

    width: int = Field(default=1280, ge=1, le=8192)
    height: int = Field(default=720, ge=1, le=8192)
|
|
|
|
class CameraHealth(BaseModel):
    """Health flags, counters and last-seen markers for a live camera."""

    connected: bool = True
    analysis_active: bool = False

    # Non-negative counters.
    reconnect_attempts: int = Field(default=0, ge=0)
    dropped_frames: int = Field(default=0, ge=0)
    events_emitted: int = Field(default=0, ge=0)

    # Stored as strings.
    # NOTE(review): presumably ISO timestamps from ``_utc_now_iso``; confirm.
    last_frame_at: str | None = None
    last_event_at: str | None = None
    last_error: str | None = None

    ingest_mode: str = "client_push"
|
|
|
|
class LiveCameraConnectRequest(BaseModel):
    """Request body for connecting a live camera.

    At least one of ``url`` / ``device_id`` must be truthy.
    """

    camera_id: str | None = Field(default=None, min_length=1, max_length=120)
    source_type: str = Field(min_length=2, max_length=40)
    transport: str = Field(min_length=2, max_length=40)
    url: str | None = None
    device_id: str | None = None
    auth: dict[str, str] = Field(default_factory=dict)
    resolution: CameraResolution = Field(default_factory=CameraResolution)
    fps: float = Field(default=10.0, ge=0.1, le=120.0)
    enabled_pipelines: list[str] = Field(default_factory=list)
    detection_stride: int = Field(default=3, ge=1, le=10)
    ocr_interval: int = Field(default=12, ge=1, le=120)
    fps_budget: float = Field(default=6.0, ge=0.5, le=60.0)
    roi_zones: list[dict[str, Any]] = Field(default_factory=list)
    alert_rules: list[str] = Field(default_factory=list)

    @model_validator(mode="after")
    def validate_source(self) -> LiveCameraConnectRequest:
        """Require a camera source: either a URL or a device id."""
        if self.url or self.device_id:
            return self
        raise ValueError("Norādi url vai device_id kamerai.")
|
|
|
|
class LiveSessionCommandRequest(BaseModel):
    """Command for a live session; every tuning field is optional."""

    camera_id: str = Field(min_length=1, max_length=120)
    enabled_pipelines: list[str] | None = None
    detection_stride: int | None = Field(default=None, ge=1, le=10)
    ocr_interval: int | None = Field(default=None, ge=1, le=120)
    fps_budget: float | None = Field(default=None, ge=0.5, le=60.0)
|
|
|
|
class LiveCameraConfigRequest(BaseModel):
    """Configuration payload (ROI zones, alert rules, pipelines) for a camera."""

    camera_id: str = Field(min_length=1, max_length=120)
    roi_zones: list[dict[str, Any]] = Field(default_factory=list)
    alert_rules: list[str] = Field(default_factory=list)
    enabled_pipelines: list[str] | None = None
    fps_budget: float | None = Field(default=None, ge=0.5, le=60.0)
|
|
|
|
class LiveFrameRequest(BaseModel):
    """A single base64 frame submitted for a given camera."""

    camera_id: str = Field(min_length=1, max_length=120)
    image_base64: str = Field(min_length=8)
    frame_index: int | None = Field(default=None, ge=0)
    timestamp_ms: int | None = Field(default=None, ge=0)
|
|
|
|
class LiveEvent(BaseModel):
    """Event record for a camera; mirrors the dict built by ``_create_live_event``."""

    event_id: str
    camera_id: str
    type: str
    severity: str
    timestamp: str
    summary: str
    payload: dict[str, Any] = Field(default_factory=dict)
|
|
|
|
class LiveCameraSession(BaseModel):
    """Public view of a live camera session.

    ``auth`` holds the redacted summary produced by ``_public_auth`` when the
    session is built through ``_session_to_response``.
    """

    # Identity and source.
    camera_id: str
    source_type: str
    transport: str
    url: str | None = None
    device_id: str | None = None
    auth: dict[str, Any] = Field(default_factory=dict)

    # Stream parameters and state.
    resolution: CameraResolution
    fps: float
    status: str
    health: CameraHealth

    # Analysis configuration.
    enabled_pipelines: list[str]
    detection_stride: int
    ocr_interval: int
    fps_budget: float
    roi_zones: list[dict[str, Any]] = Field(default_factory=list)
    alert_rules: list[str] = Field(default_factory=list)

    # Latest outputs.
    latest_snapshot: str | None = None
    latest_result: dict[str, Any] = Field(default_factory=dict)
    recent_events: list[LiveEvent] = Field(default_factory=list)
    timeline: list[SceneSegment] = Field(default_factory=list)
    tracks: list[TrackedObject] = Field(default_factory=list)
|
|
|
|
class LiveCameraCatalogResponse(BaseModel):
    """Response listing camera sessions together with a summary string."""

    summary: str
    cameras: list[LiveCameraSession]
|
|
|
|
class LiveCameraResponse(BaseModel):
    """Response carrying one camera session and a summary string."""

    summary: str
    camera: LiveCameraSession
|
|
|
|
class LiveEventsResponse(BaseModel):
    """Response carrying the events of one camera."""

    summary: str
    camera_id: str
    events: list[LiveEvent]
|
|
|
|
class LiveSnapshotResponse(BaseModel):
    """Response carrying a camera snapshot as a data URL, when one is set."""

    summary: str
    camera_id: str
    snapshot_data_url: str | None = None
|
|
|
|
class LiveFrameResponse(BaseModel):
    """Response to a submitted frame: the camera session plus a list of events."""

    summary: str
    camera: LiveCameraSession
    events: list[LiveEvent] = Field(default_factory=list)
|
|
|
|
| def _decode_base64_payload(value: str) -> bytes: |
| payload = value.strip() |
| if payload.startswith("data:"): |
| _, _, payload = payload.partition(",") |
| try: |
| return base64.b64decode(payload, validate=True) |
| except (ValueError, binascii.Error) as exc: |
| raise HTTPException(status_code=400, detail="Nederīgs base64 saturs.") from exc |
|
|
|
|
async def _load_image_from_source(image_url: str | None, image_base64: str | None) -> Image.Image:
    """Load an image from base64, a ``data:`` URL, or an http(s) URL.

    Both inputs are stripped before use, so a whitespace-only ``image_base64``
    no longer shadows a valid ``image_url`` (the request validator treats
    blank values as absent, and this function now agrees with it).

    Args:
        image_url: ``data:`` URL or http(s) URL, or ``None``.
        image_base64: base64 payload (optionally a ``data:`` URL), or ``None``.

    Returns:
        The decoded image converted to RGB.

    Raises:
        HTTPException: 400 when the source is unsupported, the download
            fails, the base64 payload is invalid, or the bytes cannot be
            read as an image.
    """
    base64_payload = (image_base64 or "").strip()
    url = (image_url or "").strip()

    if base64_payload:
        image_bytes = _decode_base64_payload(base64_payload)
    elif url.startswith("data:"):
        image_bytes = _decode_base64_payload(url)
    elif url.startswith(("http://", "https://")):
        # NOTE(security): this fetches a caller-supplied URL and follows
        # redirects, so it can reach internal hosts (SSRF). Restrict the
        # allowed destinations at deployment level if that matters.
        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                response = await client.get(url)
                response.raise_for_status()
                image_bytes = response.content
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            # Transport errors, timeouts and non-2xx statuses would otherwise
            # surface as an unhandled 500.
            logger.warning("Image download failed: %s", exc)
            raise HTTPException(
                status_code=400,
                detail="Neizdevās lejupielādēt attēlu no norādītā URL.",
            ) from exc
    else:
        raise HTTPException(
            status_code=400,
            detail="Atbalstīts ir tikai http(s) URL vai base64 attēls.",
        )

    try:
        return Image.open(io.BytesIO(image_bytes)).convert("RGB")
    except Exception as exc:
        raise HTTPException(status_code=400, detail="Neizdevās nolasīt attēlu.") from exc
|
|
|
|
async def _load_image(req: ImageSourceRequest) -> Image.Image:
    """Load the image referenced by a single-image request."""
    image = await _load_image_from_source(req.image_url, req.image_base64)
    return image
|
|
|
|
async def _load_frames(req: FrameSequenceRequest) -> list[Image.Image]:
    """Decode every base64 frame of the request, one after another, in order."""
    frames: list[Image.Image] = []
    for encoded_frame in req.frames_base64:
        frames.append(await _load_image_from_source(None, encoded_frame))
    return frames
|
|
|
|
def _image_to_data_url(image: Image.Image) -> str:
    """Encode an image as PNG and return it as a base64 ``data:`` URL."""
    with io.BytesIO() as sink:
        image.save(sink, format="PNG")
        png_bytes = sink.getvalue()
    # Base64 output is pure ASCII.
    return "data:image/png;base64," + base64.b64encode(png_bytes).decode("ascii")
|
|
|
|
| def _color_tone_name(rgb: tuple[int, int, int]) -> str: |
| red, green, blue = rgb |
| if max(rgb) - min(rgb) < 20: |
| return "neitrāli" |
| if red >= green and red >= blue: |
| return "silti" |
| if blue >= red and blue >= green: |
| return "vēsi" |
| return "zaļgani" |
|
|
|
|
def _fallback_summary(image: Image.Image, reason: str) -> str:
    """Describe an image by orientation, size, exposure and colour tone.

    Used when no detections are available; ``reason`` is appended verbatim.
    """
    width, height = image.size

    # Mean of the grayscale version decides the exposure word.
    brightness = ImageStat.Stat(image.convert("L")).mean[0]
    if brightness >= 150:
        light = "gaišu"
    elif brightness <= 85:
        light = "tumšu"
    else:
        light = "vidēji apgaismotu"

    orientation = "vertikāls" if height > width else "horizontāls"

    # Shrinking to one pixel gives a single representative colour.
    tone = _color_tone_name(image.resize((1, 1)).getpixel((0, 0)))

    return (
        f"Fallback vision summary: {orientation} {width}x{height} attēls ar {light} ekspozīciju "
        f"un {tone} krāsu toni. {reason}"
    )
|
|
|
|
def _detection_center(detection: VisionDetection) -> tuple[float, float]:
    """Return the (x, y) centre point of a detection's bounding box."""
    box = detection.bbox
    center_x = box.x + box.width / 2.0
    center_y = box.y + box.height / 2.0
    return center_x, center_y
|
|
|
|
def _build_detection_summary(detections: list[VisionDetection], width: int, height: int) -> str:
    """Build the summary sentence for a detection result.

    Lists up to five of the most frequent labels, with a ``×count`` suffix
    for labels seen more than once. With no detections it returns a hint to
    lower the threshold instead.
    """
    if detections:
        label_counts = Counter(item.label for item in detections)
        parts = []
        for label, count in label_counts.most_common(5):
            parts.append(label if count <= 1 else f"{label}×{count}")
        ordered = ", ".join(parts)
        return f"Analīze pabeigta: attēlā ({width}x{height}) atrasti {len(detections)} objekti — {ordered}."

    return (
        f"Vision model neredzēja objektus virs sliekšņa šajā attēlā ({width}x{height}). "
        "Pamēģini zemāku confidence_threshold vai citu kadru."
    )
|
|
|
|
def _dominant_labels(detections: list[VisionDetection], limit: int = 4) -> list[str]:
    """Return up to ``limit`` labels, most frequent first."""
    frequency = Counter(detection.label for detection in detections)
    return [pair[0] for pair in frequency.most_common(limit)]
|
|
|
|
def _frame_brightness(image: Image.Image) -> float:
    """Return the mean value of the image's grayscale ("L") conversion."""
    grayscale = image.convert("L")
    mean_per_band = ImageStat.Stat(grayscale).mean
    return float(mean_per_band[0])
|
|
|
|
def _get_detector() -> tuple[Any | None, str]:
    """Return the cached object-detection pipeline and the model name.

    The pipeline is created on first use. The model name is read from
    ``VISION_DETECTION_MODEL`` on every call. After one failed load the
    failure is remembered and ``None`` is returned without retrying.
    """
    global _DETECTOR, _DETECTOR_FAILED

    model_name = os.getenv("VISION_DETECTION_MODEL", "facebook/detr-resnet-50")

    needs_loading = _DETECTOR is None and not _DETECTOR_FAILED
    if needs_loading:
        try:
            # Imported lazily so a missing optional dependency only disables
            # detection instead of breaking module import.
            import torch
            from transformers import pipeline

            if torch.cuda.is_available():
                device = 0
            else:
                device = -1
            _DETECTOR = pipeline("object-detection", model=model_name, device=device)
        except Exception as exc:
            logger.warning("Vision detector unavailable, using fallback summary: %s", exc)
            _DETECTOR_FAILED = True

    return _DETECTOR, model_name
|
|
|
|
def _run_detection(
    detector: Any,
    image: Image.Image,
    *,
    threshold: float,
    max_detections: int,
) -> list[VisionDetection]:
    """Run the detector and convert its output to ``VisionDetection`` items.

    Args:
        detector: Callable detection pipeline. It is called as
            ``detector(image, threshold=threshold)`` and must return an
            iterable of dicts with ``score``, ``label`` and ``box`` keys.
        image: Image to analyse.
        threshold: Minimum score a detection needs to be kept.
        max_detections: Maximum number of detections returned.

    Returns:
        Up to ``max_detections`` detections, highest score first.
        Zero-area boxes are dropped.
    """
    # Forward the caller's threshold. Without it the transformers
    # object-detection pipeline applies its own default cut-off (0.5), so a
    # lower request threshold would have no effect.
    raw_detections = detector(image, threshold=threshold)

    def _score(item: Any) -> float:
        # A missing or None score counts as 0.0.
        return float(item.get("score") or 0.0)

    # Rank by score so truncation keeps the most confident detections.
    ranked = sorted(raw_detections, key=_score, reverse=True)

    detections: list[VisionDetection] = []
    for item in ranked:
        score = _score(item)
        if score < threshold:
            continue

        box = item.get("box") or {}
        xmin = float(box.get("xmin", 0.0))
        ymin = float(box.get("ymin", 0.0))
        # Missing max coordinates default to the min ones, giving a
        # zero-size box that is skipped below.
        xmax = float(box.get("xmax", xmin))
        ymax = float(box.get("ymax", ymin))
        width = max(0.0, xmax - xmin)
        height = max(0.0, ymax - ymin)
        if width <= 0.0 or height <= 0.0:
            continue

        detections.append(
            VisionDetection(
                label=str(item.get("label", "unknown")).strip() or "unknown",
                confidence=score,
                bbox=BoundingBox(x=xmin, y=ymin, width=width, height=height),
            )
        )
        if len(detections) >= max_detections:
            break

    return detections
|
|
|
|
def _detect_image_payload(
    image: Image.Image,
    *,
    threshold: float,
    max_detections: int,
) -> tuple[list[VisionDetection], str, bool]:
    """Detect objects in an image, degrading to an empty result on failure.

    Returns:
        ``(detections, model_label, fallback_used)``. When the detector is
        unavailable or raises, detections are empty and the flag is True.
    """
    detector, model_name = _get_detector()
    if detector is None:
        return [], "fallback/basic-image-summary", True

    try:
        found = _run_detection(
            detector,
            image,
            threshold=threshold,
            max_detections=max_detections,
        )
    except Exception as exc:
        logger.warning("Vision detection failed, using fallback: %s", exc)
        return [], f"{model_name} (fallback)", True

    return found, model_name, False
|
|
|
|
def _get_segmenter() -> tuple[Any | None, str]:
    """Return the cached image-segmentation pipeline and the model name.

    The pipeline is created on first use. The model name is read from
    ``VISION_SEGMENTATION_MODEL`` on every call. After one failed load the
    failure is remembered and ``None`` is returned without retrying.
    """
    global _SEGMENTER, _SEGMENTER_FAILED

    model_name = os.getenv(
        "VISION_SEGMENTATION_MODEL",
        "facebook/mask2former-swin-small-coco-instance",
    )

    needs_loading = _SEGMENTER is None and not _SEGMENTER_FAILED
    if needs_loading:
        try:
            # Lazy import: these are optional heavy dependencies.
            import torch
            from transformers import pipeline

            if torch.cuda.is_available():
                device = 0
            else:
                device = -1
            _SEGMENTER = pipeline("image-segmentation", model=model_name, device=device)
        except Exception as exc:
            logger.warning("Vision segmenter unavailable, using bbox masks: %s", exc)
            _SEGMENTER_FAILED = True

    return _SEGMENTER, model_name
|
|
|
|
def _bbox_from_mask(mask_array: np.ndarray) -> BoundingBox | None:
    """Return the tight bounding box of the positive pixels of a 2-D mask.

    Returns ``None`` when no pixel is greater than zero. Width and height
    are inclusive pixel counts, at least 1.0.
    """
    rows, cols = np.nonzero(mask_array > 0)
    if cols.size == 0 or rows.size == 0:
        return None

    left, right = float(cols.min()), float(cols.max())
    top, bottom = float(rows.min()), float(rows.max())
    # +1 because both end pixels belong to the box.
    box_width = max(1.0, right - left + 1.0)
    box_height = max(1.0, bottom - top + 1.0)
    return BoundingBox(x=left, y=top, width=box_width, height=box_height)
|
|
|
|
def _mask_to_data_url(mask_array: np.ndarray) -> str:
    """Render a mask as a black/white PNG data URL (positive pixels white)."""
    # Positive pixels become 255, everything else 0.
    binary = (mask_array > 0).astype(np.uint8) * 255
    rendered = Image.fromarray(binary, mode="L")
    return _image_to_data_url(rendered)
|
|
|
|
def _bbox_mask(width: int, height: int, bbox: BoundingBox) -> np.ndarray:
    """Return a ``uint8`` mask of the given size with the bbox filled with 255."""
    canvas = Image.new("L", (width, height), 0)
    left, top = bbox.x, bbox.y
    right, bottom = bbox.x + bbox.width, bbox.y + bbox.height
    ImageDraw.Draw(canvas).rectangle([left, top, right, bottom], fill=255)
    return np.array(canvas, dtype=np.uint8)
|
|
|
|
| def _coerce_mask_array(mask: Any) -> np.ndarray | None: |
| if isinstance(mask, Image.Image): |
| return np.array(mask.convert("L"), dtype=np.uint8) |
| if isinstance(mask, np.ndarray): |
| return mask.astype(np.uint8) |
| try: |
| array = np.asarray(mask, dtype=np.uint8) |
| except Exception: |
| return None |
| if array.ndim < 2: |
| return None |
| return array |
|
|
|
|
def _segment_from_detections(
    image: Image.Image, detections: list[VisionDetection]
) -> list[SegmentationMask]:
    """Build rectangular stand-in masks, one per detection bounding box."""
    image_width, image_height = image.size

    def _rectangle_mask(detection: VisionDetection) -> SegmentationMask:
        filled = _bbox_mask(image_width, image_height, detection.bbox)
        return SegmentationMask(
            label=detection.label,
            confidence=detection.confidence,
            mask_data_url=_mask_to_data_url(filled),
            bbox=detection.bbox,
            area_pixels=int((filled > 0).sum()),
        )

    return [_rectangle_mask(detection) for detection in detections]
|
|
|
|
def _extract_segmentation_masks(
    image: Image.Image,
    detections: list[VisionDetection],
) -> tuple[list[SegmentationMask], str, bool]:
    """Segment an image, falling back to bounding-box masks when needed.

    Args:
        image: Image to segment.
        detections: Detections used to build rectangular masks when the
            segmenter is unavailable, fails, or yields no usable mask.

    Returns:
        ``(masks, model_label, fallback_used)``.
    """
    segmenter, model_name = _get_segmenter()
    if segmenter is None:
        return _segment_from_detections(image, detections), "bbox-mask-fallback", True

    try:
        raw_masks = segmenter(image)
        masks: list[SegmentationMask] = []
        for item in raw_masks:
            mask_array = _coerce_mask_array(item.get("mask"))
            if mask_array is None:
                continue
            bbox = _bbox_from_mask(mask_array)
            if bbox is None:
                continue
            # The segmentation pipeline may return ``score: None``.
            # ``item.get("score", 0.0)`` would pass that None to float() and
            # abort the whole segmentation, so treat it as 0.0.
            score = item.get("score")
            masks.append(
                SegmentationMask(
                    label=str(item.get("label", "segment")).strip() or "segment",
                    confidence=float(score) if score is not None else 0.0,
                    mask_data_url=_mask_to_data_url(mask_array),
                    bbox=bbox,
                    area_pixels=int((mask_array > 0).sum()),
                )
            )
        if masks:
            return masks, model_name, False
    except Exception as exc:
        logger.warning("Vision segmentation failed, using bbox masks: %s", exc)

    return _segment_from_detections(image, detections), f"{model_name} (fallback)", True
|
|
|
|
def _get_ocr_engine() -> tuple[tuple[str, Any] | None, str]:
    """Return the cached OCR engine and the name of the model in use.

    pytesseract is tried first, then TrOCR (model name taken from
    ``VISION_OCR_MODEL``). When both fail, the failure is remembered and
    later calls return ``None`` without retrying.

    Returns:
        ``((kind, engine), model_name)`` where ``kind`` is "pytesseract" or
        "trocr", or ``(None, trocr_model_name)`` when no engine is available.
        The model name is "pytesseract" for the pytesseract engine on every
        call. Previously the cached path reported the TrOCR model name for it.
    """
    global _OCR_ENGINE, _OCR_ENGINE_KIND, _OCR_FAILED

    trocr_model = os.getenv("VISION_OCR_MODEL", "microsoft/trocr-base-printed")
    if _OCR_ENGINE is not None and _OCR_ENGINE_KIND is not None:
        # Report the same name the first call returned for this engine.
        cached_name = "pytesseract" if _OCR_ENGINE_KIND == "pytesseract" else trocr_model
        return (_OCR_ENGINE_KIND, _OCR_ENGINE), cached_name
    if _OCR_FAILED:
        return None, trocr_model

    try:
        import pytesseract

        _OCR_ENGINE = pytesseract
        _OCR_ENGINE_KIND = "pytesseract"
        return (_OCR_ENGINE_KIND, _OCR_ENGINE), "pytesseract"
    except Exception as exc:
        # Best effort: pytesseract is optional, TrOCR is tried next.
        logger.debug("pytesseract unavailable, trying TrOCR: %s", exc)

    try:
        import torch
        from transformers import TrOCRProcessor, VisionEncoderDecoderModel

        processor = TrOCRProcessor.from_pretrained(trocr_model)
        model = VisionEncoderDecoderModel.from_pretrained(trocr_model)
        if torch.cuda.is_available():
            model = model.to("cuda")
        _OCR_ENGINE = {"processor": processor, "model": model, "torch": torch}
        _OCR_ENGINE_KIND = "trocr"
        return (_OCR_ENGINE_KIND, _OCR_ENGINE), trocr_model
    except Exception as exc:
        logger.warning("Vision OCR engine unavailable, using fallback summary: %s", exc)
        _OCR_FAILED = True
        return None, trocr_model
|
|
|
|
def _extract_ocr_blocks(image: Image.Image) -> tuple[list[OCRTextBlock], str, bool]:
    """Run OCR on an image with whichever engine is available.

    Returns:
        ``(blocks, model_label, fallback_used)``. pytesseract yields one
        block per non-empty text entry. TrOCR yields a single block covering
        the whole image with a fixed confidence of 0.65. Any failure, or an
        empty TrOCR result, gives an empty list with the fallback flag set.
    """
    engine, model_name = _get_ocr_engine()
    width, height = image.size
    if engine is None:
        return [], "fallback/ocr-unavailable", True

    engine_kind, payload = engine

    if engine_kind == "pytesseract":
        try:
            data = payload.image_to_data(image, output_type=payload.Output.DICT)
            blocks: list[OCRTextBlock] = []
            for index, raw_text in enumerate(data.get("text", [])):
                text = str(raw_text).strip()
                if not text:
                    continue
                confidence_raw = str(data.get("conf", ["0"])[index]).strip()
                try:
                    # The raw value is divided by 100 and clamped to [0, 1].
                    confidence = max(0.0, min(1.0, float(confidence_raw) / 100.0))
                except ValueError:
                    confidence = 0.0
                region = BoundingBox(
                    x=float(data["left"][index]),
                    y=float(data["top"][index]),
                    width=float(data["width"][index]),
                    height=float(data["height"][index]),
                )
                blocks.append(
                    OCRTextBlock(
                        text=text,
                        confidence=confidence,
                        bbox=region,
                        language="unknown",
                    )
                )
            return blocks, model_name, False
        except Exception as exc:
            logger.warning("pytesseract OCR failed, falling back: %s", exc)
    elif engine_kind == "trocr":
        try:
            processor = payload["processor"]
            model = payload["model"]
            torch = payload["torch"]
            pixel_values = processor(images=image, return_tensors="pt").pixel_values
            if torch.cuda.is_available():
                pixel_values = pixel_values.to("cuda")
            generated_ids = model.generate(pixel_values)
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
            text = decoded[0].strip()
            if text:
                whole_image = BoundingBox(x=0.0, y=0.0, width=float(width), height=float(height))
                single_block = OCRTextBlock(
                    text=text,
                    confidence=0.65,
                    bbox=whole_image,
                    language="auto",
                )
                return [single_block], model_name, False
        except Exception as exc:
            logger.warning("TrOCR inference failed, falling back: %s", exc)

    return [], f"{model_name} (fallback)", True
|
|
|
|
def _keypoint(x: float, y: float, confidence: float, name: str) -> PoseKeypoint:
    """Positional-argument shorthand for building a ``PoseKeypoint``."""
    point = PoseKeypoint(x=x, y=y, name=name, confidence=confidence)
    return point
|
|
|
|
def _estimate_pose_from_detections(detections: list[VisionDetection]) -> list[PoseDetection]:
    """Synthesise a pose for every "person" detection from its bounding box.

    Keypoints are placed at fixed fractions of the box; no pose model is
    involved. Person ids start at 1, in detection order.
    """
    # (name, x fraction, y fraction, confidence factor) for each keypoint.
    layout = (
        ("nose", 0.50, 0.12, 1.0),
        ("left_shoulder", 0.32, 0.26, 1.0),
        ("right_shoulder", 0.68, 0.26, 1.0),
        ("left_elbow", 0.24, 0.44, 0.95),
        ("right_elbow", 0.76, 0.44, 0.95),
        ("left_wrist", 0.18, 0.62, 0.88),
        ("right_wrist", 0.82, 0.62, 0.88),
        ("left_hip", 0.38, 0.56, 1.0),
        ("right_hip", 0.62, 0.56, 1.0),
        ("left_knee", 0.36, 0.77, 0.9),
        ("right_knee", 0.64, 0.77, 0.9),
        ("left_ankle", 0.34, 0.97, 0.82),
        ("right_ankle", 0.66, 0.97, 0.82),
    )

    poses: list[PoseDetection] = []
    person_id = 0
    for candidate in detections:
        if candidate.label.lower() != "person":
            continue
        person_id += 1
        box = candidate.bbox
        # Slightly discounted detection score, kept within [0.2, 1.0].
        confidence = max(0.2, min(1.0, candidate.confidence * 0.92))
        keypoints = [
            PoseKeypoint(
                name=name,
                x=box.x + box.width * fraction_x,
                y=box.y + box.height * fraction_y,
                confidence=confidence * factor,
            )
            for name, fraction_x, fraction_y, factor in layout
        ]
        skeleton = [PoseConnection(start=first, end=second) for first, second in POSE_CONNECTIONS]
        poses.append(
            PoseDetection(
                person_id=person_id,
                confidence=confidence,
                bbox=box,
                keypoints=keypoints,
                connections=skeleton,
            )
        )
    return poses
|
|
|
|
def _predict_actions(detections: list[VisionDetection]) -> list[ActionPrediction]:
    """Guess one action per detected person using simple heuristics.

    A phone or sports ball anywhere in the frame takes precedence for every
    person. Otherwise the height/width ratio of the person's box decides
    between standing, walking and sitting/crouching.
    """
    scene_labels = {item.label.lower() for item in detections}
    phone_present = "cell phone" in scene_labels
    ball_present = "sports ball" in scene_labels

    actions: list[ActionPrediction] = []
    for person in detections:
        if person.label.lower() != "person":
            continue

        # Width is floored at 1.0 to avoid dividing by a tiny or zero width.
        ratio = person.bbox.height / max(person.bbox.width, 1.0)
        score = person.confidence

        if phone_present:
            verdict = (
                "using_phone",
                min(0.97, 0.55 + score * 0.35),
                "Persona ir kopā ar phone tipa objektu vienā kadrā.",
            )
        elif ball_present:
            verdict = (
                "playing_ball",
                min(0.94, 0.5 + score * 0.3),
                "Kadrā redzams cilvēks un sporta bumba.",
            )
        elif ratio > 2.3:
            verdict = (
                "standing",
                min(0.9, 0.45 + score * 0.4),
                "Cilvēka bbox ir izteikti vertikāls, kas atbilst stāvēšanai.",
            )
        elif ratio > 1.6:
            verdict = (
                "walking",
                min(0.84, 0.4 + score * 0.32),
                "Cilvēka siluets izskatās kustībā vai solī.",
            )
        else:
            verdict = (
                "sitting_or_crouching",
                min(0.78, 0.38 + score * 0.28),
                "Cilvēka bbox proporcijas norāda uz sēdošu vai pietupušos pozu.",
            )

        action, confidence, rationale = verdict
        actions.append(
            ActionPrediction(
                action=action,
                confidence=confidence,
                subject_label=person.label,
                rationale=rationale,
            )
        )

    return actions
|
|
|
|
def _frame_detections(
    image: Image.Image,
    *,
    threshold: float,
    max_detections: int,
) -> tuple[list[VisionDetection], str, bool]:
    """Detect objects in one frame; delegates to ``_detect_image_payload``."""
    options = {"threshold": threshold, "max_detections": max_detections}
    return _detect_image_payload(image, **options)
|
|
|
|
def _build_frame_analysis(
    frames: list[Image.Image],
    *,
    threshold: float,
    max_detections: int,
) -> tuple[list[FrameAnalysis], str, bool]:
    """Run detection on every frame and collect per-frame analyses.

    Args:
        frames: Frames in order; the list position becomes ``frame_index``.
        threshold: Minimum detection score.
        max_detections: Maximum detections kept per frame.

    Returns:
        ``(analyses, model_label, fallback_used)``. ``model_label`` is the
        label reported most often across frames, and ``fallback_used`` is
        True if any frame used the fallback path.
    """
    analyses: list[FrameAnalysis] = []
    model_names: list[str] = []
    fallback_used = False

    for index, frame in enumerate(frames):
        detections, model_name, frame_fallback = _frame_detections(
            frame,
            threshold=threshold,
            max_detections=max_detections,
        )
        model_names.append(model_name)
        fallback_used = fallback_used or frame_fallback

        # Only claim the model is unavailable when the fallback path really
        # ran. A working detector that found nothing gets the regular
        # "nothing above threshold" summary.
        if frame_fallback:
            summary = _fallback_summary(
                frame, "Objektu noteikšanas modelis šim kadrām nav pieejams."
            )
        else:
            summary = _build_detection_summary(detections, frame.size[0], frame.size[1])

        analyses.append(
            FrameAnalysis(
                frame_index=index,
                summary=summary,
                detections=detections,
                dominant_labels=_dominant_labels(detections),
                brightness=_frame_brightness(frame),
            )
        )

    model_name = (
        Counter(model_names).most_common(1)[0][0] if model_names else "fallback/basic-image-summary"
    )
    return analyses, model_name, fallback_used
|
|
|
|
def _build_tracks(
    frames: list[FrameAnalysis],
    frame_size: tuple[int, int],
) -> list[TrackedObject]:
    """Link detections across consecutive frames into tracks.

    A detection continues a track when the track was seen in the previous
    frame with the same label, its centre lies within
    ``TRACKING_DISTANCE_RATIO`` of the frame diagonal, and no other detection
    of the current frame has claimed it. Otherwise a new track is started.
    Matching is greedy, in detection order.

    Args:
        frames: Per-frame analyses, in order.
        frame_size: ``(width, height)`` used to scale the matching distance.

    Returns:
        One ``TrackedObject`` per track, in order of first appearance.
    """
    width, height = frame_size
    max_distance = ((width**2 + height**2) ** 0.5) * TRACKING_DISTANCE_RATIO

    # Tracks seen in the previous frame: track_id -> (label, centre).
    active_tracks: dict[int, tuple[str, tuple[float, float]]] = {}
    labels: dict[int, str] = {}
    observations: dict[int, list[TrackObservation]] = {}
    next_track_id = 1

    for frame in frames:
        frame_active: dict[int, tuple[str, tuple[float, float]]] = {}
        for detection in frame.detections:
            center = _detection_center(detection)
            track_id: int | None = None
            best_distance = float("inf")
            for candidate_id, (candidate_label, candidate_center) in active_tracks.items():
                # A track takes at most one detection per frame. Without this
                # check two nearby objects collapse into one track with
                # duplicate observations for the same frame.
                if candidate_id in frame_active:
                    continue
                if candidate_label != detection.label:
                    continue
                distance = (
                    (candidate_center[0] - center[0]) ** 2 + (candidate_center[1] - center[1]) ** 2
                ) ** 0.5
                if distance <= max_distance and distance < best_distance:
                    best_distance = distance
                    track_id = candidate_id

            if track_id is None:
                track_id = next_track_id
                next_track_id += 1
                labels[track_id] = detection.label

            frame_active[track_id] = (detection.label, center)
            observations.setdefault(track_id, []).append(
                TrackObservation(
                    frame_index=frame.frame_index,
                    confidence=detection.confidence,
                    bbox=detection.bbox,
                )
            )
        # Tracks not matched in this frame are dropped.
        active_tracks = frame_active

    tracks: list[TrackedObject] = []
    for track_id, items in observations.items():
        tracks.append(
            TrackedObject(
                track_id=track_id,
                label=labels[track_id],
                average_confidence=sum(item.confidence for item in items) / len(items),
                observations=items,
            )
        )
    return tracks
|
|
|
|
def _build_scenes(frame_analyses: list[FrameAnalysis]) -> list[SceneSegment]:
    """Group consecutive frames into scenes.

    A new scene starts when the label overlap with the previous frame
    (intersection over union of dominant labels) drops below 0.4, or the
    brightness changes by at least ``SCENE_BRIGHTNESS_DELTA``. Two frames
    without any labels count as fully overlapping.
    """
    if not frame_analyses:
        return []

    grouped: list[list[FrameAnalysis]] = [[frame_analyses[0]]]
    for current in frame_analyses[1:]:
        previous = grouped[-1][-1]
        before = set(previous.dominant_labels)
        after = set(current.dominant_labels)
        combined = before | after
        overlap = len(before & after) / len(combined) if combined else 1.0
        brightness_jump = abs(current.brightness - previous.brightness)

        same_scene = overlap >= 0.4 and brightness_jump < SCENE_BRIGHTNESS_DELTA
        if same_scene:
            grouped[-1].append(current)
        else:
            grouped.append([current])

    segments: list[SceneSegment] = []
    for scene_index, members in enumerate(grouped):
        label_votes = Counter(label for member in members for label in member.dominant_labels)
        dominant_labels = [label for label, _ in label_votes.most_common(4)]
        first_index = members[0].frame_index
        last_index = members[-1].frame_index
        label_text = ", ".join(dominant_labels) if dominant_labels else "nav noteikts"
        segments.append(
            SceneSegment(
                scene_index=scene_index,
                start_frame=first_index,
                end_frame=last_index,
                summary=(
                    f"Scene {scene_index + 1}: kadri {first_index}-{last_index} "
                    f"ar dominējošiem elementiem {label_text}."
                ),
                dominant_labels=dominant_labels,
                average_brightness=sum(member.brightness for member in members) / len(members),
            )
        )
    return segments
|
|
|
|
| def _utc_now_iso() -> str: |
| return datetime.now(UTC).isoformat().replace("+00:00", "Z") |
|
|
|
|
| def _default_live_pipelines() -> list[str]: |
| return [ |
| "object_detection", |
| "tracking", |
| "action_recognition", |
| "scene_timeline", |
| "ocr", |
| "pose_estimation", |
| "segmentation", |
| "anomaly_detection", |
| ] |
|
|
|
|
| def _public_auth(auth: dict[str, str]) -> dict[str, Any]: |
| username = (auth.get("username") or "").strip() |
| return { |
| "username_hint": f"{username[:2]}***" if username else None, |
| "token_present": bool(auth.get("token")), |
| "password_present": bool(auth.get("password")), |
| } |
|
|
|
|
def _create_live_event(
    camera_id: str,
    event_type: str,
    summary: str,
    *,
    severity: str = "info",
    payload: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build an event dict with a new ``evt_<hex>`` id and a UTC timestamp.

    A missing or empty ``payload`` is stored as a new empty dict.
    """
    event: dict[str, Any] = {}
    event["event_id"] = f"evt_{uuid4().hex}"
    event["camera_id"] = camera_id
    event["type"] = event_type
    event["severity"] = severity
    event["timestamp"] = _utc_now_iso()
    event["summary"] = summary
    event["payload"] = payload if payload else {}
    return event
|
|
|
|
def _camera_health(session: dict[str, Any]) -> CameraHealth:
    """Build a ``CameraHealth`` model from the session's ``health`` mapping."""
    health_fields = session["health"]
    return CameraHealth(**health_fields)
|
|
|
|
def _camera_resolution(session: dict[str, Any]) -> CameraResolution:
    """Build a ``CameraResolution`` model from the session's ``resolution`` mapping."""
    resolution_fields = session["resolution"]
    return CameraResolution(**resolution_fields)
|
|
|
|
| def _camera_events(session: dict[str, Any]) -> list[LiveEvent]: |
| return [LiveEvent(**item) for item in session.get("recent_events", [])] |
|
|
|
|
def _session_to_response(session: dict[str, Any]) -> LiveCameraSession:
    """Project an in-memory camera session dict onto the ``LiveCameraSession`` model.

    Credentials are reduced to hints via ``_public_auth``. List and dict values
    are shallow-copied so the response does not share containers with the
    session. Optional tuning keys fall back to the defaults written below.
    """
    fields: dict[str, Any] = {
        "camera_id": session["camera_id"],
        "source_type": session["source_type"],
        "transport": session["transport"],
        "url": session.get("url"),
        "device_id": session.get("device_id"),
        "auth": _public_auth(session.get("auth", {})),
        "resolution": _camera_resolution(session),
        "fps": float(session["fps"]),
        "status": session["status"],
        "health": _camera_health(session),
        "enabled_pipelines": list(session.get("enabled_pipelines", [])),
        "detection_stride": int(session.get("detection_stride", 3)),
        "ocr_interval": int(session.get("ocr_interval", 12)),
        "fps_budget": float(session.get("fps_budget", 6.0)),
        "roi_zones": list(session.get("roi_zones", [])),
        "alert_rules": list(session.get("alert_rules", [])),
        "latest_snapshot": session.get("latest_snapshot"),
        "latest_result": dict(session.get("latest_result", {})),
        "recent_events": _camera_events(session),
        "timeline": list(session.get("timeline", [])),
        "tracks": list(session.get("tracks", [])),
    }
    return LiveCameraSession(**fields)
|
|
|
|
def _append_session_event(session: dict[str, Any], event: dict[str, Any]) -> LiveEvent:
    """Record ``event`` on the session and return it as a ``LiveEvent`` model.

    Side effects on ``session``:
      * ``recent_events`` gains the event and is trimmed to the newest 40;
      * ``health["events_emitted"]`` is incremented;
      * ``health["last_event_at"]`` is set to the event's timestamp.
    """
    history = session.setdefault("recent_events", [])
    history.append(event)
    # Keep only the newest 40 entries.
    session["recent_events"] = history[-40:]

    stats = session["health"]
    emitted_so_far = int(stats.get("events_emitted", 0))
    stats["events_emitted"] = emitted_so_far + 1
    stats["last_event_at"] = event["timestamp"]
    return LiveEvent(**event)
|
|
|
|
| def _scene_changed( |
| previous: FrameAnalysis | None, |
| current: FrameAnalysis, |
| threshold: float, |
| ) -> bool: |
| if previous is None: |
| return True |
|
|
| previous_labels = set(previous.dominant_labels) |
| current_labels = set(current.dominant_labels) |
| overlap = ( |
| len(previous_labels & current_labels) / max(len(previous_labels | current_labels), 1) |
| if previous_labels or current_labels |
| else 1.0 |
| ) |
| brightness_delta = abs(current.brightness - previous.brightness) |
| return overlap < 0.4 or brightness_delta >= threshold |
|
|
|
|
| def _live_alerts(frame: FrameAnalysis, scene_changed_flag: bool) -> list[dict[str, Any]]: |
| alerts: list[dict[str, Any]] = [] |
| detection_count = len(frame.detections) |
| person_count = sum(1 for item in frame.detections if item.label.lower() == "person") |
| if frame.brightness < 45: |
| alerts.append( |
| { |
| "rule": "low_light", |
| "severity": "warning", |
| "summary": "Kamera redz ļoti tumšu ainu; kvalitāte var kristies.", |
| } |
| ) |
| if person_count >= 4 or detection_count >= 8: |
| alerts.append( |
| { |
| "rule": "crowded_scene", |
| "severity": "warning", |
| "summary": "Ainā ir liela objektu koncentrācija; var būt vajadzīga prioritizācija.", |
| } |
| ) |
| if scene_changed_flag: |
| alerts.append( |
| { |
| "rule": "scene_change", |
| "severity": "info", |
| "summary": "Atklāta būtiska ainas maiņa; timeline un OCR tiek atsvaidzināti.", |
| } |
| ) |
| return alerts |
|
|
|
|
| def _bbox_intersects_roi(bbox: BoundingBox, roi: dict[str, Any]) -> bool: |
| roi_x = float(roi.get("x", 0.0)) |
| roi_y = float(roi.get("y", 0.0)) |
| roi_width = float(roi.get("width", 0.0)) |
| roi_height = float(roi.get("height", 0.0)) |
| if roi_width <= 0 or roi_height <= 0: |
| return True |
|
|
| return not ( |
| bbox.x + bbox.width < roi_x |
| or roi_x + roi_width < bbox.x |
| or bbox.y + bbox.height < roi_y |
| or roi_y + roi_height < bbox.y |
| ) |
|
|
|
|
| def _apply_roi_zones( |
| detections: list[VisionDetection], |
| roi_zones: list[dict[str, Any]], |
| ) -> list[VisionDetection]: |
| if not roi_zones: |
| return detections |
|
|
| filtered: list[VisionDetection] = [] |
| for detection in detections: |
| if any(_bbox_intersects_roi(detection.bbox, roi) for roi in roi_zones): |
| filtered.append(detection) |
| return filtered |
|
|
|
|
| def _evaluate_alert_rules( |
| detections: list[VisionDetection], |
| alert_rules: list[str], |
| camera_id: str, |
| ) -> list[dict[str, Any]]: |
| if not alert_rules: |
| return [] |
|
|
| alerts: list[dict[str, Any]] = [] |
| for raw_rule in alert_rules: |
| parts = [part.strip() for part in raw_rule.split(":") if part.strip()] |
| if len(parts) < 2: |
| continue |
| rule_name = parts[0] |
| target_label = parts[1].lower() |
| min_count = int(parts[2]) if len(parts) > 2 and parts[2].isdigit() else 1 |
| confidence_floor = float(parts[3]) if len(parts) > 3 else 0.0 |
| matched = [ |
| detection |
| for detection in detections |
| if detection.label.lower() == target_label and detection.confidence >= confidence_floor |
| ] |
| if len(matched) >= min_count: |
| alerts.append( |
| { |
| "rule": rule_name, |
| "severity": "warning", |
| "summary": ( |
| f"Alert rule `{rule_name}` aktivizējās kamerai {camera_id}: " |
| f"{len(matched)}× {target_label} virs sliekšņa." |
| ), |
| "target_label": target_label, |
| "count": len(matched), |
| "confidence_floor": confidence_floor, |
| } |
| ) |
| return alerts |
|
|
|
|
| def _reid_signature( |
| camera_id: str, |
| track: TrackedObject, |
| frame_analysis: FrameAnalysis, |
| ) -> dict[str, Any] | None: |
| if not track.observations: |
| return None |
| latest = track.observations[-1] |
| bbox = latest.bbox |
| width_norm = bbox.width / max( |
| frame_analysis.detections[0].bbox.width if frame_analysis.detections else 1.0, 1.0 |
| ) |
| height_norm = bbox.height / max(frame_analysis.brightness, 1.0) |
| center_x = bbox.x + bbox.width / 2.0 |
| center_y = bbox.y + bbox.height / 2.0 |
| return { |
| "camera_id": camera_id, |
| "track_id": track.track_id, |
| "label": track.label, |
| "vector": [ |
| round(track.average_confidence, 4), |
| round(width_norm, 4), |
| round(height_norm, 4), |
| round(center_x / max(bbox.x + bbox.width, 1.0), 4), |
| round(center_y / max(bbox.y + bbox.height, 1.0), 4), |
| round(frame_analysis.brightness / 255.0, 4), |
| ], |
| } |
|
|
|
|
| def _vector_similarity(left: list[float], right: list[float]) -> float: |
| left_norm = sum(value * value for value in left) ** 0.5 |
| right_norm = sum(value * value for value in right) ** 0.5 |
| if left_norm == 0 or right_norm == 0: |
| return 0.0 |
| dot = sum(a * b for a, b in zip(left, right, strict=False)) |
| return dot / (left_norm * right_norm) |
|
|
|
|
def _update_reid_index(
    session: dict[str, Any],
    tracks: list[TrackedObject],
    frame_analysis: FrameAnalysis,
) -> list[dict[str, Any]]:
    """Match this camera's tracks against other cameras, then refresh its index entry.

    A signature is built for every track that has observations. Each one is
    compared with the signatures stored in ``_LIVE_REID_INDEX`` for every
    *other* camera. A candidate with the same label (case-insensitive) and a
    similarity of at least 0.94 is reported as a match. Afterwards the index
    entry for this camera is replaced by its newest 12 signatures.

    Returns:
        The list of match dicts (possibly empty).
    """
    camera_id = session["camera_id"]

    fresh_signatures: list[dict[str, Any]] = []
    for track in tracks:
        signature = _reid_signature(camera_id, track, frame_analysis)
        if signature is not None:
            fresh_signatures.append(signature)

    found: list[dict[str, Any]] = []
    for signature in fresh_signatures:
        for peer_camera_id, peer_signatures in _LIVE_REID_INDEX.items():
            if peer_camera_id == camera_id:
                # Only cross-camera comparisons are of interest.
                continue
            for candidate in peer_signatures:
                same_label = candidate["label"].lower() == signature["label"].lower()
                if not same_label:
                    continue
                score = _vector_similarity(signature["vector"], candidate["vector"])
                if score >= 0.94:
                    description = (
                        f"Iespējams cross-camera match starp {camera_id} track {signature['track_id']} "
                        f"un {peer_camera_id} track {candidate['track_id']}."
                    )
                    found.append(
                        {
                            "target_camera_id": peer_camera_id,
                            "source_track_id": signature["track_id"],
                            "target_track_id": candidate["track_id"],
                            "source_label": signature["label"],
                            "target_label": candidate["label"],
                            "similarity_score": round(score, 4),
                            "summary": description,
                        }
                    )

    _LIVE_REID_INDEX[camera_id] = fresh_signatures[-12:]
    return found
|
|
|
|
def _build_live_frame_payload(
    session: dict[str, Any],
    frame: Image.Image,
    frame_index: int,
    detections: list[VisionDetection],
    model_name: str,
    fallback_used: bool,
) -> tuple[dict[str, Any], list[LiveEvent]]:
    """Run the enabled live pipelines for one frame and record the outcome on the session.

    Steps, in order:
      1. Filter ``detections`` by the session's ROI zones.
      2. Append a ``FrameAnalysis`` to ``session["frame_analyses"]`` (newest 24 kept).
      3. Decide whether the scene changed relative to the previous analysis.
      4. Rebuild tracks / timeline when those pipelines are enabled; otherwise
         reuse the values already stored on the session.
      5. Run OCR when enabled and either the scene changed or the frame index
         is a multiple of ``ocr_interval``.
      6. Run pose, segmentation, action and anomaly pipelines when enabled;
         user-defined alert rules are always evaluated.
      7. Update the cross-camera re-identification index when tracks exist.
      8. Store ``latest_snapshot`` / ``latest_result`` and emit events.

    Returns:
        ``(session["latest_result"], events)`` where ``events`` are the
        ``LiveEvent`` models emitted for this frame, in emission order.
    """

    def emit(
        event_type: str,
        text: str,
        *,
        severity: str = "info",
        payload: dict[str, Any] | None = None,
    ) -> LiveEvent:
        # Create an event for this camera and record it on the session.
        return _append_session_event(
            session,
            _create_live_event(
                session["camera_id"],
                event_type,
                text,
                severity=severity,
                payload=payload,
            ),
        )

    detections = _apply_roi_zones(detections, list(session.get("roi_zones", [])))
    history: list[FrameAnalysis] = session.setdefault("frame_analyses", [])

    if detections:
        frame_summary = _build_detection_summary(detections, frame.size[0], frame.size[1])
    else:
        frame_summary = _fallback_summary(
            frame, "Live stream šajā kadrā nedeva stabilus objektus."
        )
    frame_analysis = FrameAnalysis(
        frame_index=frame_index,
        summary=frame_summary,
        detections=detections,
        dominant_labels=_dominant_labels(detections),
        brightness=_frame_brightness(frame),
    )

    # Capture the predecessor before appending, then cap the window at 24.
    previous = history[-1] if history else None
    history.append(frame_analysis)
    session["frame_analyses"] = history[-24:]

    scene_changed_flag = _scene_changed(
        previous,
        frame_analysis,
        float(session.get("scene_change_threshold", SCENE_BRIGHTNESS_DELTA)),
    )

    pipelines = session["enabled_pipelines"]

    if "tracking" in pipelines:
        tracks = _build_tracks(session["frame_analyses"], frame.size)
    else:
        tracks = session.get("tracks", [])
    if "scene_timeline" in pipelines:
        timeline = _build_scenes(session["frame_analyses"])
    else:
        timeline = session.get("timeline", [])
    session["tracks"] = tracks
    session["timeline"] = timeline

    ocr_results: list[OCRTextBlock] = []
    ocr_model = "disabled"
    ocr_fallback = False
    if "ocr" in pipelines and (
        scene_changed_flag or frame_index % max(int(session.get("ocr_interval", 12)), 1) == 0
    ):
        ocr_results, ocr_model, ocr_fallback = _extract_ocr_blocks(frame)

    poses = []
    if "pose_estimation" in pipelines:
        poses = _estimate_pose_from_detections(detections)

    masks = []
    if "segmentation" in pipelines:
        masks = _segment_from_detections(frame, detections)

    actions = []
    if "action_recognition" in pipelines:
        actions = _predict_actions(detections)

    alerts: list[dict[str, Any]] = []
    if "anomaly_detection" in pipelines:
        alerts = _live_alerts(frame_analysis, scene_changed_flag)
    # User-defined rules apply regardless of the anomaly pipeline switch.
    alerts.extend(
        _evaluate_alert_rules(
            detections, list(session.get("alert_rules", [])), session["camera_id"]
        )
    )

    reid_matches = _update_reid_index(session, tracks, frame_analysis) if tracks else []

    session["latest_snapshot"] = _image_to_data_url(frame)
    session["latest_result"] = {
        "summary": frame_analysis.summary,
        "frame_index": frame_index,
        "model": model_name,
        "fallback_used": fallback_used,
        "width": frame.size[0],
        "height": frame.size[1],
        "detections": [item.model_dump() for item in detections],
        "results": [item.model_dump() for item in ocr_results],
        "poses": [item.model_dump() for item in poses],
        "masks": [item.model_dump() for item in masks],
        "actions": [item.model_dump() for item in actions],
        "tracks": [item.model_dump() for item in tracks],
        "scenes": [item.model_dump() for item in timeline],
        "alerts": alerts,
        "reid_matches": reid_matches,
        "ocr_model": ocr_model,
        "ocr_fallback_used": ocr_fallback,
    }

    events: list[LiveEvent] = [
        emit(
            "analysis_result",
            f"Live frame {frame_index} analizēts ar {len(detections)} detekcijām.",
            payload={
                "frame_index": frame_index,
                "detection_count": len(detections),
                "scene_changed": scene_changed_flag,
            },
        )
    ]

    if tracks:
        events.append(
            emit(
                "track_update",
                f"Track layer atjaunināts ar {len(tracks)} aktīvām trajektorijām.",
                payload={"track_count": len(tracks)},
            )
        )
    if timeline:
        latest_scene = timeline[-1]
        events.append(
            emit(
                "timeline_update",
                latest_scene.summary,
                payload=latest_scene.model_dump(),
            )
        )
    events.extend(
        emit("alert", alert["summary"], severity=alert["severity"], payload=alert)
        for alert in alerts
    )
    events.extend(
        emit("reid_match", match["summary"], severity="info", payload=match)
        for match in reid_matches
    )

    return session["latest_result"], events
|
|
|
|
async def _save_generation(event: str, metadata: dict[str, Any]) -> None:
    """Forward a vision event and its metadata to ``HFIntegration.save_generation``.

    The call is made under the fixed ``"vision"`` category.
    """
    # Imported at call time rather than at module import, as in the original.
    # NOTE(review): presumably to avoid an import cycle or start-up cost - confirm.
    from maris_core.utils.hf_integration import HFIntegration

    await HFIntegration().save_generation("vision", event, metadata)
|
|
|
|
@router.post("/analyze", response_model=VisionAnalyzeResponse)
async def analyze_image(req: ImageSourceRequest) -> VisionAnalyzeResponse:
    """Analyze an image with object detection.

    Loads the image, runs the detector, records the run via
    ``_save_generation`` and, when a session id is supplied, stores the summary
    in the memory store.
    """
    picture = await _load_image(req)
    found, detector_name, used_fallback = _detect_image_payload(
        picture,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    width, height = picture.size

    if used_fallback and not found:
        summary = _fallback_summary(picture, "Objekta noteikšanas modelis šobrīd nav pieejams.")
    else:
        summary = _build_detection_summary(found, width, height)

    run_metadata = {
        "model": detector_name,
        "width": width,
        "height": height,
        "detections": len(found),
        "fallback_used": used_fallback,
        "session_id": req.session_id,
        "camera_id": req.camera_id,
    }
    await _save_generation("vision/analyze", run_metadata)

    if req.session_id:
        # The source tag records whether the request came from a camera.
        memory_store.remember_message(
            req.session_id,
            "assistant",
            summary,
            source="vision_camera" if req.camera_id else "vision_analyze",
        )

    return VisionAnalyzeResponse(
        summary=summary,
        detections=found,
        width=width,
        height=height,
        model=detector_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/ocr", response_model=VisionOCRResponse)
async def ocr_image(req: ImageSourceRequest) -> VisionOCRResponse:
    """Read text from an image.

    Returns the extracted text blocks; when none are found the summary falls
    back to a generic description of the image.
    """
    picture = await _load_image(req)
    width, height = picture.size
    blocks, engine_name, used_fallback = _extract_ocr_blocks(picture)

    if blocks:
        summary = f"OCR pabeigts: atrasti {len(blocks)} teksta bloki attēlā ({width}x{height})."
    else:
        summary = _fallback_summary(
            picture, "OCR modelis šobrīd nav pieejams vai teksts nav atrasts."
        )

    run_metadata = {
        "model": engine_name,
        "width": width,
        "height": height,
        "blocks": len(blocks),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/ocr", run_metadata)

    return VisionOCRResponse(
        summary=summary,
        results=blocks,
        width=width,
        height=height,
        model=engine_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/pose-estimate", response_model=VisionPoseResponse)
async def estimate_pose(req: ImageSourceRequest) -> VisionPoseResponse:
    """Compute approximate body keypoints from detected persons.

    Poses are derived from the detector's output. The response is flagged as a
    fallback when detection fell back or no pose was produced.
    """
    picture = await _load_image(req)
    found, _, detector_fell_back = _detect_image_payload(
        picture,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    skeletons = _estimate_pose_from_detections(found)
    width, height = picture.size
    used_fallback = detector_fell_back or not skeletons
    pose_model = "bbox-derived-pose-v1"

    if skeletons:
        summary = (
            f"Pose estimation pabeigta: atrasti {len(skeletons)} cilvēku skeleti attēlā ({width}x{height})."
        )
    else:
        summary = _fallback_summary(
            picture, "Pose estimation nevarēja atrast personu bbox, no kā atvasināt skeletu."
        )

    run_metadata = {
        "model": pose_model,
        "width": width,
        "height": height,
        "poses": len(skeletons),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/pose-estimate", run_metadata)

    return VisionPoseResponse(
        summary=summary,
        poses=skeletons,
        width=width,
        height=height,
        model=pose_model,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/segment", response_model=VisionSegmentationResponse)
async def segment_image(req: ImageSourceRequest) -> VisionSegmentationResponse:
    """Return object segmentation masks.

    Detection runs first and its output is passed to the mask extractor. The
    fallback flag is set when either stage reported a fallback.
    """
    picture = await _load_image(req)
    found, _, detector_fell_back = _detect_image_payload(
        picture,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    mask_list, segmenter_name, segmenter_fell_back = _extract_segmentation_masks(picture, found)
    width, height = picture.size
    used_fallback = segmenter_fell_back or detector_fell_back

    if mask_list:
        summary = (
            f"Segmentation pabeigta: ģenerētas {len(mask_list)} maskas attēlā ({width}x{height})."
        )
    else:
        summary = _fallback_summary(picture, "Segmentācijas modelis nevarēja izveidot maskas.")

    run_metadata = {
        "model": segmenter_name,
        "width": width,
        "height": height,
        "masks": len(mask_list),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/segment", run_metadata)

    return VisionSegmentationResponse(
        summary=summary,
        masks=mask_list,
        width=width,
        height=height,
        model=segmenter_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/action-recognize", response_model=VisionActionResponse)
async def recognize_action(req: ImageSourceRequest) -> VisionActionResponse:
    """Return action predictions from a single frame.

    Predictions are derived from the detector's output. The response is flagged
    as a fallback when detection fell back or no action was produced.
    """
    picture = await _load_image(req)
    found, _, detector_fell_back = _detect_image_payload(
        picture,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    predicted = _predict_actions(found)
    width, height = picture.size
    used_fallback = detector_fell_back or not predicted
    action_model = "vision-action-heuristics-v1"

    if predicted:
        summary = (
            f"Action recognition pabeigta: atrastas {len(predicted)} darbību hipotēzes attēlā ({width}x{height})."
        )
    else:
        summary = _fallback_summary(
            picture, "Darbību noteikšanai vajadzīgs vismaz viens person objekts kadrā."
        )

    run_metadata = {
        "model": action_model,
        "width": width,
        "height": height,
        "actions": len(predicted),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/action-recognize", run_metadata)

    return VisionActionResponse(
        summary=summary,
        actions=predicted,
        width=width,
        height=height,
        model=action_model,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/tracking", response_model=VisionTrackingResponse)
async def track_objects(req: FrameSequenceRequest) -> VisionTrackingResponse:
    """Track objects across a frame sequence.

    Every frame is analysed, then tracks are built using the first frame's size.
    NOTE(review): ``frames[0]`` assumes ``_load_frames`` never returns an empty
    list - confirm against that helper.
    """
    frames = await _load_frames(req)
    per_frame, detector_name, used_fallback = _build_frame_analysis(
        frames,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    trajectories = _build_tracks(per_frame, frames[0].size)
    total_frames = len(frames)

    if trajectories:
        summary = f"Tracking pabeigts: {len(trajectories)} trajektorijas pāri {total_frames} kadriem."
    else:
        summary = "Tracking pabeigts bez stabilām trajektorijām — pārbaudi ievades kadrus vai modeļa pieejamību."

    run_metadata = {
        "model": detector_name,
        "frame_count": total_frames,
        "tracks": len(trajectories),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/tracking", run_metadata)

    return VisionTrackingResponse(
        summary=summary,
        tracks=trajectories,
        frame_count=total_frames,
        model=detector_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/frame-analysis", response_model=VisionFrameAnalysisResponse)
async def analyze_frames(req: FrameSequenceRequest) -> VisionFrameAnalysisResponse:
    """Analyze each video frame separately.

    The recorded metadata counts the loaded frames; the response counts the
    analyses that were produced.
    """
    frames = await _load_frames(req)
    per_frame, detector_name, used_fallback = _build_frame_analysis(
        frames,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    analysed_count = len(per_frame)
    summary = f"Frame-by-frame analīze pabeigta {analysed_count} kadriem."

    run_metadata = {
        "model": detector_name,
        "frame_count": len(frames),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/frame-analysis", run_metadata)

    return VisionFrameAnalysisResponse(
        summary=summary,
        frames=per_frame,
        frame_count=analysed_count,
        model=detector_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.post("/scene-timeline", response_model=VisionSceneTimelineResponse)
async def scene_timeline(req: FrameSequenceRequest) -> VisionSceneTimelineResponse:
    """Compress the frame analysis into a scene timeline.

    Frames are analysed individually and then grouped into scenes by
    ``_build_scenes``.
    """
    frames = await _load_frames(req)
    per_frame, detector_name, used_fallback = _build_frame_analysis(
        frames,
        threshold=req.confidence_threshold,
        max_detections=req.max_detections,
    )
    segments = _build_scenes(per_frame)

    if segments:
        summary = f"Scene timeline pabeigta: {len(segments)} ainas pāri {len(frames)} kadriem."
    else:
        summary = "Scene timeline nevarēja atrast ainu robežas dotajos kadros."

    run_metadata = {
        "model": detector_name,
        "frame_count": len(frames),
        "scenes": len(segments),
        "fallback_used": used_fallback,
    }
    await _save_generation("vision/scene-timeline", run_metadata)

    return VisionSceneTimelineResponse(
        summary=summary,
        scenes=segments,
        frame_count=len(per_frame),
        model=detector_name,
        fallback_used=used_fallback,
    )
|
|
|
|
@router.get("/live/cameras", response_model=LiveCameraCatalogResponse)
async def list_live_cameras() -> LiveCameraCatalogResponse:
    """Return the public state of every camera in the in-memory live registry."""
    catalog = []
    for session in _LIVE_CAMERAS.values():
        catalog.append(_session_to_response(session))
    return LiveCameraCatalogResponse(
        summary=f"Live camera registry satur {len(catalog)} kameras.",
        cameras=catalog,
    )
|
|
|
|
@router.post("/live/connect", response_model=LiveCameraResponse)
async def connect_live_camera(req: LiveCameraConnectRequest) -> LiveCameraResponse:
    """Register a live camera session and return its public state.

    The session is stored in ``_LIVE_CAMERAS`` under the requested camera id,
    or under a generated ``cam_<hex>`` id when none (or only whitespace) is
    supplied. An existing session with the same id is replaced. The new
    session starts as ``connected`` with analysis inactive, and a
    ``camera_connected`` event is recorded on it.
    """
    # Strip first, then fall back: a whitespace-only id is truthy, so stripping
    # after the fallback would register the camera under an empty-string key.
    camera_id = (req.camera_id or "").strip() or f"cam_{uuid4().hex[:10]}"
    session: dict[str, Any] = {
        "camera_id": camera_id,
        "source_type": req.source_type,
        "transport": req.transport,
        "url": req.url,
        "device_id": req.device_id,
        "auth": dict(req.auth),
        "resolution": req.resolution.model_dump(),
        "fps": req.fps,
        "status": "connected",
        "health": CameraHealth().model_dump(),
        "enabled_pipelines": req.enabled_pipelines or _default_live_pipelines(),
        "detection_stride": req.detection_stride,
        "ocr_interval": req.ocr_interval,
        "fps_budget": req.fps_budget,
        "scene_change_threshold": SCENE_BRIGHTNESS_DELTA,
        "roi_zones": req.roi_zones,
        "alert_rules": req.alert_rules,
        "latest_snapshot": None,
        "latest_result": {},
        "recent_events": [],
        "frame_analyses": [],
        "timeline": [],
        "tracks": [],
        "frame_counter": 0,
    }
    session["health"]["connected"] = True
    session["health"]["analysis_active"] = False
    # NOTE(review): the event payload carries the raw URL; if stream URLs can
    # embed credentials they would be visible through the events endpoint.
    _append_session_event(
        session,
        _create_live_event(
            camera_id,
            "camera_connected",
            f"Kamera {camera_id} piereģistrēta ar transportu {req.transport}.",
            payload={
                "source_type": req.source_type,
                "transport": req.transport,
                "device_id": req.device_id,
                "url": req.url,
            },
        ),
    )
    _LIVE_CAMERAS[camera_id] = session
    await _save_generation(
        "vision/live-connect",
        {
            "camera_id": camera_id,
            "source_type": req.source_type,
            "transport": req.transport,
        },
    )
    return LiveCameraResponse(
        summary=f"Live kamera {camera_id} ir savienota.",
        camera=_session_to_response(session),
    )
|
|
|
|
def _require_live_camera(camera_id: str) -> dict[str, Any]:
    """Return the registered session for ``camera_id``.

    Raises:
        HTTPException: 404 when no session is stored under that id.
    """
    if (session := _LIVE_CAMERAS.get(camera_id)) is not None:
        return session
    raise HTTPException(status_code=404, detail="Live kamera nav atrasta.")
|
|
|
|
@router.post("/live/start", response_model=LiveCameraResponse)
async def start_live_camera(req: LiveSessionCommandRequest) -> LiveCameraResponse:
    """Start live analysis for a registered camera.

    Optional request fields override the session's pipelines, detection stride,
    OCR interval and FPS budget. An explicitly empty pipeline list restores the
    defaults. The session is marked ``streaming`` with analysis active.
    """
    session = _require_live_camera(req.camera_id)

    if req.enabled_pipelines is not None:
        session["enabled_pipelines"] = req.enabled_pipelines or _default_live_pipelines()
    # Copy over only the tuning values the caller actually supplied.
    for option in ("detection_stride", "ocr_interval", "fps_budget"):
        supplied = getattr(req, option)
        if supplied is not None:
            session[option] = supplied

    session["status"] = "streaming"
    session["health"]["analysis_active"] = True

    started_event = _create_live_event(
        req.camera_id,
        "analysis_started",
        "Live analīzes sesija ir startēta.",
        payload={"enabled_pipelines": session["enabled_pipelines"]},
    )
    _append_session_event(session, started_event)

    await _save_generation(
        "vision/live-start",
        {"camera_id": req.camera_id, "pipelines": session["enabled_pipelines"]},
    )
    return LiveCameraResponse(
        summary=f"Live analīze kamerai {req.camera_id} ir palaista.",
        camera=_session_to_response(session),
    )
|
|
|
|
@router.post("/live/config", response_model=LiveCameraResponse)
async def configure_live_camera(req: LiveCameraConfigRequest) -> LiveCameraResponse:
    """Update ROI zones, alert rules and optional pipeline settings of a camera.

    ROI zones and alert rules are always overwritten with the request values.
    Pipelines and FPS budget change only when supplied. A ``config_updated``
    event is recorded on the session.
    """
    session = _require_live_camera(req.camera_id)
    session["roi_zones"] = req.roi_zones
    session["alert_rules"] = req.alert_rules
    if req.enabled_pipelines is not None:
        session["enabled_pipelines"] = req.enabled_pipelines or _default_live_pipelines()
    if req.fps_budget is not None:
        session["fps_budget"] = req.fps_budget

    config_event = _create_live_event(
        req.camera_id,
        "config_updated",
        "Kameras ROI, rules vai pipeline konfigurācija tika atjaunināta.",
        payload={
            "roi_zone_count": len(req.roi_zones),
            "alert_rule_count": len(req.alert_rules),
            "enabled_pipelines": session["enabled_pipelines"],
            "fps_budget": session["fps_budget"],
        },
    )
    _append_session_event(session, config_event)

    run_metadata = {
        "camera_id": req.camera_id,
        "roi_zone_count": len(req.roi_zones),
        "alert_rule_count": len(req.alert_rules),
    }
    await _save_generation("vision/live-config", run_metadata)

    return LiveCameraResponse(
        summary=f"Kameras {req.camera_id} konfigurācija ir atjaunināta.",
        camera=_session_to_response(session),
    )
|
|
|
|
@router.post("/live/pause", response_model=LiveCameraResponse)
async def pause_live_camera(req: LiveSessionCommandRequest) -> LiveCameraResponse:
    """Pause live analysis for a camera and record an ``analysis_paused`` event."""
    session = _require_live_camera(req.camera_id)
    session["status"] = "paused"
    session["health"]["analysis_active"] = False

    paused_event = _create_live_event(
        req.camera_id, "analysis_paused", "Live analīze ir pauzēta."
    )
    _append_session_event(session, paused_event)

    return LiveCameraResponse(
        summary=f"Live analīze kamerai {req.camera_id} ir pauzēta.",
        camera=_session_to_response(session),
    )
|
|
|
|
@router.post("/live/stop", response_model=LiveCameraResponse)
async def stop_live_camera(req: LiveSessionCommandRequest) -> LiveCameraResponse:
    """Stop live analysis for a camera.

    Marks the session ``stopped``, records an ``analysis_stopped`` event and
    reports the stop via ``_save_generation``. The session stays registered.
    """
    session = _require_live_camera(req.camera_id)
    session["status"] = "stopped"
    session["health"]["analysis_active"] = False

    stopped_event = _create_live_event(
        req.camera_id, "analysis_stopped", "Live analīze ir apturēta."
    )
    _append_session_event(session, stopped_event)

    await _save_generation("vision/live-stop", {"camera_id": req.camera_id})
    return LiveCameraResponse(
        summary=f"Live analīze kamerai {req.camera_id} ir apturēta.",
        camera=_session_to_response(session),
    )
|
|
|
|
@router.get("/live/{camera_id}/state", response_model=LiveCameraResponse)
async def live_camera_state(camera_id: str) -> LiveCameraResponse:
    """Return the current public state of a registered camera (404 if unknown)."""
    current = _session_to_response(_require_live_camera(camera_id))
    return LiveCameraResponse(
        summary=f"Stāvoklis kamerai {camera_id} ir atjaunots.",
        camera=current,
    )
|
|
|
|
@router.get("/live/{camera_id}/snapshot", response_model=LiveSnapshotResponse)
async def live_camera_snapshot(camera_id: str) -> LiveSnapshotResponse:
    """Return the camera's most recent snapshot data URL.

    The value is ``None`` when the session has no ``latest_snapshot`` stored.
    """
    session = _require_live_camera(camera_id)
    latest = session.get("latest_snapshot")
    return LiveSnapshotResponse(
        summary=f"Atgriezts pēdējais snapshot kamerai {camera_id}.",
        camera_id=camera_id,
        snapshot_data_url=latest,
    )
|
|
|
|
@router.get("/live/{camera_id}/events", response_model=LiveEventsResponse)
async def live_camera_events(camera_id: str) -> LiveEventsResponse:
    """Return the recent events stored on a camera session (404 if unknown)."""
    session = _require_live_camera(camera_id)
    stored_count = len(session.get("recent_events", []))
    return LiveEventsResponse(
        summary=f"Atgriezti {stored_count} live notikumi kamerai {camera_id}.",
        camera_id=camera_id,
        events=_camera_events(session),
    )
|
|
|
|
@router.post("/live/frame", response_model=LiveFrameResponse)
async def process_live_frame(req: LiveFrameRequest) -> LiveFrameResponse:
    """Process one frame pushed for a live camera session.

    Flow:
      1. Reject with 409 unless analysis is active for the camera.
      2. Resolve the frame index (request value or the session counter) and
         advance the counter.
      3. When a timestamp is supplied and the frame arrives sooner than the
         FPS budget allows after the last accepted timestamp, count it as
         dropped, emit a ``frame_dropped`` event and return early.
      4. Decode the image. Run detection when the stride is due, no cached
         detections exist, or tracking is enabled; otherwise reuse the cache.
      5. Hand over to ``_build_live_frame_payload`` and return the events.

    Raises:
        HTTPException: 404 for an unknown camera, 409 when analysis is inactive.
    """
    session = _require_live_camera(req.camera_id)
    health = session["health"]
    if not health["analysis_active"]:
        raise HTTPException(status_code=409, detail="Live sesija nav palaista.")

    if req.frame_index is None:
        frame_index = int(session["frame_counter"])
    else:
        frame_index = req.frame_index
    session["frame_counter"] = frame_index + 1

    if req.timestamp_ms is not None:
        previous_ts = session.get("last_timestamp_ms")
        # The budget is floored at 0.5 FPS, so the minimum gap is at most 2000 ms.
        min_gap_ms = 1000.0 / max(float(session.get("fps_budget", 6.0)), 0.5)
        arrived_too_soon = (
            isinstance(previous_ts, int)
            and req.timestamp_ms >= previous_ts
            and req.timestamp_ms - previous_ts < min_gap_ms
        )
        if arrived_too_soon:
            health["dropped_frames"] = int(health.get("dropped_frames", 0)) + 1
            drop_event = _append_session_event(
                session,
                _create_live_event(
                    req.camera_id,
                    "frame_dropped",
                    "Kadrs tika atmests, lai ievērotu FPS budget un backpressure politiku.",
                    severity="warning",
                    payload={
                        "frame_index": frame_index,
                        "fps_budget": session.get("fps_budget", 6.0),
                    },
                ),
            )
            return LiveFrameResponse(
                summary=f"Live frame {frame_index} tika atmests backpressure dēļ.",
                camera=_session_to_response(session),
                events=[drop_event],
            )
        # Only accepted frames move the throttle reference point.
        session["last_timestamp_ms"] = int(req.timestamp_ms)

    picture = await _load_image_from_source(None, req.image_base64)
    health["last_frame_at"] = _utc_now_iso()

    stride = max(int(session.get("detection_stride", 3)), 1)
    detection_due = (
        frame_index % stride == 0
        or not session.get("latest_detections")
        or "tracking" in session["enabled_pipelines"]
    )
    if not detection_due:
        # Reuse the detections cached by the last detector run.
        detections = [
            VisionDetection.model_validate(cached)
            for cached in session.get("latest_detections", [])
        ]
        model_name = str(session.get("latest_model_name", "scheduled-cache"))
        fallback_used = bool(session.get("latest_fallback_used", False))
    else:
        detections, model_name, fallback_used = _frame_detections(
            picture,
            threshold=0.25,
            max_detections=10,
        )
        session["latest_detections"] = [item.model_dump() for item in detections]
        session["latest_model_name"] = model_name
        session["latest_fallback_used"] = fallback_used

    _, frame_events = _build_live_frame_payload(
        session,
        picture,
        frame_index,
        detections,
        model_name,
        fallback_used,
    )
    return LiveFrameResponse(
        summary=f"Live frame {frame_index} apstrādāts kamerai {req.camera_id}.",
        camera=_session_to_response(session),
        events=frame_events,
    )
|
|