"""
models/inference.py — Pydantic models for the Inference Engine.

Covers request, response, session history, and pipeline stage telemetry.
"""

from __future__ import annotations

import time
import uuid
from enum import Enum
from typing import Any, Literal

from pydantic import BaseModel, Field

# NOTE(review): several models below declare fields named ``model_id`` /
# ``model_name``. Some Pydantic v2 releases warn about field names that start
# with ``model_`` (protected namespace) — confirm against the pinned Pydantic
# version whether a ``protected_namespaces`` override is wanted.


class AdapterType(str, Enum):
    """Adapter backends a request can target."""

    YOLO = "yolo"
    TRANSFORMERS = "transformers"
    ONNX = "onnx"
    CUSTOM = "custom"


class InferencePrecision(str, Enum):
    """Numeric precision requested for an inference run."""

    FP32 = "FP32"
    FP16 = "FP16"
    INT8 = "INT8"


class YOLOConfig(BaseModel):
    """Options for the YOLO adapter; every field has a bounded default."""

    confidence: float = Field(0.25, ge=0.0, le=1.0)
    iou_threshold: float = Field(0.45, ge=0.1, le=0.9)
    # Empty list is the default; presumably means "no filtering" — confirm
    # against the adapter that consumes this config.
    class_filter: list[str] = Field(default_factory=list)
    max_detections: int = Field(300, ge=1, le=1000)


class TransformersConfig(BaseModel):
    """Generation options for the Transformers adapter."""

    max_new_tokens: int = Field(256, ge=1, le=4096)
    temperature: float = Field(0.7, ge=0.0, le=2.0)
    top_p: float = Field(0.9, ge=0.0, le=1.0)
    top_k: int = Field(50, ge=0, le=200)
    beam_width: int = Field(1, ge=1, le=8)
    do_sample: bool = True


class ONNXConfig(BaseModel):
    """Options for the ONNX adapter."""

    # Only these two provider names are accepted; CUDA is the default.
    execution_provider: Literal[
        "CUDAExecutionProvider", "CPUExecutionProvider"
    ] = "CUDAExecutionProvider"
    input_size: int = Field(640, ge=32, le=1280)
    normalize: bool = True


class CustomConfig(BaseModel):
    """Options for the custom adapter; both scripts default to empty strings."""

    preprocess_script: str = ""
    postprocess_script: str = ""


class InferenceRequest(BaseModel):
    """Payload describing a single inference call.

    ``model_id`` and ``adapter_type`` are required; everything else has a
    default.
    """

    model_id: str
    adapter_type: AdapterType
    precision: InferencePrecision = InferencePrecision.FP16

    # Input — one of these must be set.
    # NOTE: this model declares no validator for that rule; both fields
    # default to None, so the check has to happen in the consumer.
    image_base64: str | None = None  # base64-encoded image
    text_input: str | None = None  # text/prompt

    # Per-adapter config
    yolo_config: YOLOConfig | None = None
    transformers_config: TransformersConfig | None = None
    onnx_config: ONNXConfig | None = None
    custom_config: CustomConfig | None = None

    # Execution
    run_mode: Literal["single", "stream"] = "single"


class PipelineStage(BaseModel):
    """Telemetry record for one named pipeline stage."""

    name: str
    status: Literal["pending", "running", "done", "error"] = "pending"
    latency_ms: float | None = None
    detail: str | None = None


class Detection(BaseModel):
    """One detected object: box corners, score, and class identity."""

    # Box corners; coordinate system and units are not fixed by this model.
    x1: float
    y1: float
    x2: float
    y2: float
    confidence: float
    class_id: int
    class_name: str


class InferenceResult(BaseModel):
    """Outcome of an inference call, including timing and pipeline trace."""

    # Identity
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    adapter_type: AdapterType
    timestamp: float = Field(default_factory=time.time)

    # Timing
    preprocess_ms: float = 0.0
    inference_ms: float = 0.0
    postprocess_ms: float = 0.0
    total_ms: float = 0.0

    # Output — adapter-specific, all optional
    detections: list[Detection] = Field(default_factory=list)
    text_output: str | None = None
    class_label: str | None = None
    confidence: float | None = None
    embeddings: list[float] | None = None
    raw_output: Any = None  # raw JSON for inspector

    # Pipeline trace
    pipeline: list[PipelineStage] = Field(default_factory=list)

    # Quality score (0–5) derived from confidence mean.
    # The 0–5 range is not enforced by a constraint on this field.
    quality_score: float | None = None

    # Error
    error: str | None = None
    status: Literal["ok", "error"] = "ok"


class InferenceHistoryEntry(BaseModel):
    """Compact session-history record of a past inference run."""

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    model_name: str
    adapter_type: AdapterType
    timestamp: float = Field(default_factory=time.time)
    total_ms: float
    # Defaults to None, matching InferenceResult.quality_score, so an entry
    # can be built without a score.
    quality_score: float | None = None
    status: Literal["ok", "error"]

    # Compact snapshot of the request, kept for re-run
    request_snapshot: dict[str, Any] = Field(default_factory=dict)


class SystemVitals(BaseModel):
    """Point-in-time system metrics sample."""

    ts: float
    latency_ms: float
    fps: float
    vram_used_gb: float
    vram_total_gb: float
    gpu_temp_c: float | None = None
    cpu_pct: float = 0.0