# Spaces: Sleeping | File size: 4,164 Bytes | revision ac5551d
"""
models/inference.py — Pydantic models for the Inference Engine.
Covers request, response, session history, and pipeline stage telemetry.
"""
from __future__ import annotations
from enum import Enum
from typing import Any, Literal
from pydantic import BaseModel, Field
import time
import uuid
class AdapterType(str, Enum):
    """Adapter identifiers used by the ``adapter_type`` field of the models in this module.

    The ``str`` mixin makes every member compare equal to its plain string
    value (e.g. ``AdapterType.YOLO == "yolo"``).
    """

    YOLO = "yolo"
    TRANSFORMERS = "transformers"
    ONNX = "onnx"
    CUSTOM = "custom"
class InferencePrecision(str, Enum):
    """Precision labels accepted by ``InferenceRequest.precision``.

    Member values are the upper-case label strings themselves; the ``str``
    mixin lets members compare equal to those strings.
    """

    FP32 = "FP32"
    FP16 = "FP16"
    INT8 = "INT8"
class YOLOConfig(BaseModel):
    """Per-adapter options carried in ``InferenceRequest.yolo_config``."""

    # Confidence value, validated to lie in [0.0, 1.0]; defaults to 0.25.
    confidence: float = Field(default=0.25, ge=0.0, le=1.0)
    # IoU threshold, validated to lie in [0.1, 0.9]; defaults to 0.45.
    iou_threshold: float = Field(default=0.45, ge=0.1, le=0.9)
    # Class names to filter on; a fresh empty list per instance by default.
    # NOTE(review): presumably an empty list means "no filtering" — confirm.
    class_filter: list[str] = Field(default_factory=list)
    # Upper bound on detections, validated to lie in [1, 1000]; defaults to 300.
    max_detections: int = Field(default=300, ge=1, le=1000)
class TransformersConfig(BaseModel):
    """Per-adapter options carried in ``InferenceRequest.transformers_config``."""

    # Validated to lie in [1, 4096]; defaults to 256.
    max_new_tokens: int = Field(default=256, ge=1, le=4096)
    # Validated to lie in [0.0, 2.0]; defaults to 0.7.
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    # Validated to lie in [0.0, 1.0]; defaults to 0.9.
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    # Validated to lie in [0, 200]; defaults to 50.
    # NOTE(review): 0 is allowed — presumably "top-k disabled"; confirm.
    top_k: int = Field(default=50, ge=0, le=200)
    # Validated to lie in [1, 8]; defaults to 1.
    beam_width: int = Field(default=1, ge=1, le=8)
    # Sampling flag; enabled by default.
    do_sample: bool = True
class ONNXConfig(BaseModel):
    """Per-adapter options carried in ``InferenceRequest.onnx_config``."""

    # Only these two provider names are accepted; the CUDA one is the default.
    execution_provider: Literal[
        "CUDAExecutionProvider", "CPUExecutionProvider"
    ] = "CUDAExecutionProvider"
    # Validated to lie in [32, 1280]; defaults to 640.
    # NOTE(review): presumably the side length of a square input — confirm.
    input_size: int = Field(default=640, ge=32, le=1280)
    # Normalization flag; enabled by default.
    normalize: bool = True
class CustomConfig(BaseModel):
    """Per-adapter options carried in ``InferenceRequest.custom_config``.

    Both fields default to the empty string.
    NOTE(review): it is not visible here whether the strings hold script
    source or a path to a script — confirm against the consumer.
    """

    preprocess_script: str = ""
    postprocess_script: str = ""
class InferenceRequest(BaseModel):
    """Request payload for a single inference call.

    ``model_id`` and ``adapter_type`` are required; every other field has a
    default.

    NOTE(review): the "one of these must be set" rule on the input fields is
    not enforced by this model — a request with neither input validates.
    Presumably the caller checks it; confirm.
    """

    model_id: str
    adapter_type: AdapterType
    precision: InferencePrecision = InferencePrecision.FP16

    # Input — one of these must be set
    image_base64: str | None = None  # base64-encoded image
    text_input: str | None = None  # text/prompt

    # Per-adapter config
    yolo_config: YOLOConfig | None = None
    transformers_config: TransformersConfig | None = None
    onnx_config: ONNXConfig | None = None
    custom_config: CustomConfig | None = None

    # Execution
    run_mode: Literal["single", "stream"] = "single"
class PipelineStage(BaseModel):
    """One named stage in the ``InferenceResult.pipeline`` trace."""

    name: str
    # Restricted to these four values; a new stage starts as "pending".
    status: Literal["pending", "running", "done", "error"] = "pending"
    # Stage latency in milliseconds; None when not set.
    latency_ms: float | None = None
    # Optional free-form detail text; None when not set.
    detail: str | None = None
class Detection(BaseModel):
    """A single detection entry of ``InferenceResult.detections``.

    All fields are required.
    NOTE(review): ``x1, y1, x2, y2`` are presumably two opposite box corners;
    the coordinate convention (pixels vs. normalized) is not visible here.
    """

    x1: float
    y1: float
    x2: float
    y2: float
    confidence: float
    class_id: int
    class_name: str
class InferenceResult(BaseModel):
    """Response payload for an inference call.

    Only ``model_id`` and ``adapter_type`` are required; all remaining fields
    have defaults, so a result can be built incrementally.
    """

    # Identity
    # A fresh UUID4 string is generated per instance when not supplied.
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    adapter_type: AdapterType
    # Defaults to time.time() at instantiation.
    timestamp: float = Field(default_factory=time.time)

    # Timing
    preprocess_ms: float = 0.0
    inference_ms: float = 0.0
    postprocess_ms: float = 0.0
    total_ms: float = 0.0

    # Output — adapter-specific, all optional
    detections: list[Detection] = Field(default_factory=list)
    text_output: str | None = None
    class_label: str | None = None
    confidence: float | None = None
    embeddings: list[float] | None = None
    raw_output: Any = None  # raw JSON for inspector

    # Pipeline trace
    pipeline: list[PipelineStage] = Field(default_factory=list)

    # Quality score (0–5) derived from confidence mean
    # NOTE(review): the 0–5 range is not validated by this model.
    quality_score: float | None = None

    # Error
    error: str | None = None
    status: Literal["ok", "error"] = "ok"
class InferenceHistoryEntry(BaseModel):
    """One record in the session's inference history."""

    # A fresh UUID4 string is generated per instance when not supplied.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    model_name: str
    adapter_type: AdapterType
    # Defaults to time.time() at instantiation.
    timestamp: float = Field(default_factory=time.time)
    total_ms: float
    # NOTE(review): nullable but declared without a default; whether callers
    # must pass it explicitly depends on the pydantic major version — confirm.
    quality_score: float | None
    status: Literal["ok", "error"]

    # Compact snapshot of result for re-run
    request_snapshot: dict[str, Any] = Field(default_factory=dict)
class SystemVitals(BaseModel):
    """A single sample of system metrics.

    ``ts``, ``latency_ms``, ``fps`` and the two VRAM figures are required;
    the GPU temperature and CPU percentage have defaults.
    """

    # NOTE(review): presumably a Unix timestamp like the other models — confirm.
    ts: float
    latency_ms: float
    fps: float
    vram_used_gb: float
    vram_total_gb: float
    # None when no temperature reading is supplied.
    gpu_temp_c: float | None = None
    cpu_pct: float = 0.0
|