# mlforge/models/inference.py
# senthil2421's picture
# Deploy cloud brain to HF Spaces
# ac5551d
"""
models/inference.py — Pydantic models for the Inference Engine.
Covers request, response, session history, and pipeline stage telemetry.
"""
from __future__ import annotations
from enum import Enum
from typing import Any, Literal
from pydantic import BaseModel, Field
import time
import uuid
class AdapterType(str, Enum):
    """Closed set of adapter identifiers used by the inference models.

    The ``str`` mixin makes every member compare equal to its plain string
    value (for example ``AdapterType.YOLO == "yolo"``).
    """

    YOLO = "yolo"
    TRANSFORMERS = "transformers"
    ONNX = "onnx"
    CUSTOM = "custom"
class InferencePrecision(str, Enum):
    """Precision labels a request may select.

    Members are ``str`` subclasses, so each one compares equal to its
    upper-case string value (for example ``InferencePrecision.FP16 == "FP16"``).
    """

    FP32 = "FP32"
    FP16 = "FP16"
    INT8 = "INT8"
class YOLOConfig(BaseModel):
    """Per-adapter options for YOLO requests; every field has a default."""

    # Confidence value, validated to lie in [0.0, 1.0].
    confidence: float = Field(default=0.25, ge=0.0, le=1.0)
    # IoU threshold, validated to lie in [0.1, 0.9].
    iou_threshold: float = Field(default=0.45, ge=0.1, le=0.9)
    # Class names to filter on; each instance gets its own fresh empty list.
    # NOTE(review): presumably an empty list means "no filtering" — confirm
    # against the adapter that consumes this config.
    class_filter: list[str] = Field(default_factory=list)
    # Cap on the number of detections, validated to lie in [1, 1000].
    max_detections: int = Field(default=300, ge=1, le=1000)
class TransformersConfig(BaseModel):
    """Per-adapter generation options for Transformers requests.

    All fields are optional; numeric fields are range-checked on validation.
    """

    # Number of new tokens allowed, validated to lie in [1, 4096].
    max_new_tokens: int = Field(default=256, ge=1, le=4096)
    # Sampling temperature, validated to lie in [0.0, 2.0].
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    # top-p value, validated to lie in [0.0, 1.0].
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    # top-k value, validated to lie in [0, 200].
    top_k: int = Field(default=50, ge=0, le=200)
    # Beam width, validated to lie in [1, 8].
    beam_width: int = Field(default=1, ge=1, le=8)
    # Sampling toggle; enabled unless the caller turns it off.
    do_sample: bool = True
class ONNXConfig(BaseModel):
    """Per-adapter options for ONNX requests; every field has a default."""

    # Only these two provider names are accepted; CUDA is the default.
    execution_provider: Literal[
        "CUDAExecutionProvider", "CPUExecutionProvider"
    ] = "CUDAExecutionProvider"
    # Input size, validated to lie in [32, 1280].
    # NOTE(review): presumably a square edge length in pixels — confirm.
    input_size: int = Field(default=640, ge=32, le=1280)
    # Normalization toggle; enabled by default.
    normalize: bool = True
class CustomConfig(BaseModel):
    """Per-adapter options for custom requests.

    Both fields default to the empty string.
    """

    # NOTE(review): it is not visible here whether these hold file paths or
    # inline script source — confirm against the custom adapter.
    preprocess_script: str = ""
    postprocess_script: str = ""
class InferenceRequest(BaseModel):
    """Payload describing a single inference call.

    Names the model and adapter, carries the input, and optionally supplies
    one configuration object per adapter type.
    """

    model_id: str
    adapter_type: AdapterType
    # FP16 is used when the caller does not pick a precision.
    precision: InferencePrecision = InferencePrecision.FP16

    # Input — one of these must be set.
    # NOTE(review): no validator in this class enforces that rule, so a
    # request with both inputs left as None still validates here.
    image_base64: str | None = None  # base64-encoded image
    text_input: str | None = None  # text/prompt

    # Per-adapter config; each one is optional and defaults to None.
    yolo_config: YOLOConfig | None = None
    transformers_config: TransformersConfig | None = None
    onnx_config: ONNXConfig | None = None
    custom_config: CustomConfig | None = None

    # Execution mode; only "single" and "stream" are accepted.
    run_mode: Literal["single", "stream"] = "single"
class PipelineStage(BaseModel):
    """Telemetry record for one named stage of a pipeline."""

    name: str
    # Stage state; starts as "pending" when not given.
    status: Literal["pending", "running", "done", "error"] = "pending"
    # Stage latency in milliseconds; None when not recorded.
    latency_ms: float | None = None
    # Optional free-form detail text.
    detail: str | None = None
class Detection(BaseModel):
    """One detection record: four coordinates, a confidence and a class.

    All fields are required.
    """

    # NOTE(review): presumably (x1, y1) and (x2, y2) are opposite corners of
    # a bounding box; the coordinate convention (pixels vs. normalized) is
    # not visible here — confirm against the producing adapter.
    x1: float
    y1: float
    x2: float
    y2: float
    # No range constraint is declared on the confidence value.
    confidence: float
    class_id: int
    class_name: str
class InferenceResult(BaseModel):
    """Outcome of one inference call: identity, timings, outputs and trace.

    Only ``model_id`` and ``adapter_type`` are required; every other field
    has a default.
    """

    # Identity
    # A fresh UUID4 string is generated per instance when none is supplied.
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    adapter_type: AdapterType
    # Defaults to the value of time.time() at construction.
    timestamp: float = Field(default_factory=time.time)

    # Timing (milliseconds); every counter starts at zero.
    preprocess_ms: float = 0.0
    inference_ms: float = 0.0
    postprocess_ms: float = 0.0
    total_ms: float = 0.0

    # Output — adapter-specific, all optional
    detections: list[Detection] = Field(default_factory=list)
    text_output: str | None = None
    class_label: str | None = None
    confidence: float | None = None
    embeddings: list[float] | None = None
    raw_output: Any = None  # raw JSON for inspector

    # Pipeline trace; each instance gets its own fresh empty list.
    pipeline: list[PipelineStage] = Field(default_factory=list)

    # Quality score (0–5) derived from confidence mean.
    # The 0–5 range is not validated by this field.
    quality_score: float | None = None

    # Error message, if any, alongside the overall status flag.
    error: str | None = None
    status: Literal["ok", "error"] = "ok"
class InferenceHistoryEntry(BaseModel):
    """Summary row for one past inference, kept in the session history."""

    # A fresh UUID4 string is generated per instance when none is supplied.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    model_name: str
    adapter_type: AdapterType
    # Defaults to the value of time.time() at construction.
    timestamp: float = Field(default_factory=time.time)
    total_ms: float
    # NOTE(review): no explicit default is declared; whether an omitted value
    # is accepted differs between pydantic major versions — confirm.
    quality_score: float | None
    status: Literal["ok", "error"]

    # Compact snapshot kept for re-run; defaults to a fresh empty dict.
    request_snapshot: dict[str, Any] = Field(default_factory=dict)
class SystemVitals(BaseModel):
    """One sample of system telemetry.

    Units follow the field-name suffixes (ms, GB, °C, percent).
    NOTE(review): units are inferred from the names only — confirm against
    the code that produces these samples.
    """

    # Required readings.
    ts: float
    latency_ms: float
    fps: float
    vram_used_gb: float
    vram_total_gb: float

    # Optional readings.
    gpu_temp_c: float | None = None
    cpu_pct: float = 0.0