File size: 4,164 Bytes
ac5551d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
models/inference.py — Pydantic models for the Inference Engine.
Covers request, response, session history, and pipeline stage telemetry.
"""
from __future__ import annotations

from enum import Enum
from typing import Any, Literal
from pydantic import BaseModel, Field
import time
import uuid


class AdapterType(str, Enum):
    """Closed set of adapter kinds used to tag requests, results and history.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value.
    """

    YOLO = "yolo"
    TRANSFORMERS = "transformers"
    ONNX = "onnx"
    CUSTOM = "custom"


class InferencePrecision(str, Enum):
    """Numeric precision labels selectable on an inference request.

    Members also subclass ``str``; each value is identical to its member name.
    """

    FP32 = "FP32"
    FP16 = "FP16"
    INT8 = "INT8"


class YOLOConfig(BaseModel):
    """Per-request settings for the YOLO adapter.

    Every field has a default, so the model can be built with no arguments.
    """

    # Validated to lie in [0.0, 1.0]; presumably the minimum detection score
    # to keep — confirm against the adapter.
    confidence: float = Field(default=0.25, ge=0.0, le=1.0)
    # Validated to lie in [0.1, 0.9].
    iou_threshold: float = Field(default=0.45, ge=0.1, le=0.9)
    # Fresh empty list per instance; presumably empty means "no class
    # filtering" — confirm against the adapter.
    class_filter: list[str] = Field(default_factory=list)
    # Validated to lie in [1, 1000].
    max_detections: int = Field(default=300, ge=1, le=1000)


class TransformersConfig(BaseModel):
    """Per-request generation settings for the Transformers adapter.

    Every field has a default; numeric fields are range-checked on validation.
    """

    # Validated to lie in [1, 4096].
    max_new_tokens: int = Field(default=256, ge=1, le=4096)
    # Validated to lie in [0.0, 2.0].
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
    # Validated to lie in [0.0, 1.0].
    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
    # Validated to lie in [0, 200].
    top_k: int = Field(default=50, ge=0, le=200)
    # Validated to lie in [1, 8].
    beam_width: int = Field(default=1, ge=1, le=8)
    # Sampling flag, on by default.
    do_sample: bool = True


class ONNXConfig(BaseModel):
    """Per-request settings for the ONNX adapter."""

    # Only these two provider names are accepted; the CUDA one is the default.
    execution_provider: Literal[
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ] = "CUDAExecutionProvider"
    # Validated to lie in [32, 1280]; presumably a side length in pixels —
    # confirm against the adapter.
    input_size: int = Field(default=640, ge=32, le=1280)
    # Normalisation flag, on by default.
    normalize: bool = True


class CustomConfig(BaseModel):
    """Per-request settings for the custom adapter.

    Both fields are plain strings that default to the empty string. How they
    are interpreted (path vs. inline source) is not visible in this module.
    """

    preprocess_script: str = ""
    postprocess_script: str = ""


class InferenceRequest(BaseModel):
    """Payload describing a single inference call.

    ``model_id`` and ``adapter_type`` are required; every other field has a
    default.
    """

    model_id: str
    adapter_type: AdapterType
    # Defaults to FP16 when the caller does not choose a precision.
    precision: InferencePrecision = InferencePrecision.FP16

    # Input: the caller is expected to set one of these two fields.
    # NOTE: nothing in this model enforces that; both default to None.
    image_base64: str | None = None  # base64-encoded image
    text_input: str | None = None  # text/prompt

    # Per-adapter config: one optional slot per adapter kind, all unset by
    # default.
    yolo_config: YOLOConfig | None = None
    transformers_config: TransformersConfig | None = None
    onnx_config: ONNXConfig | None = None
    custom_config: CustomConfig | None = None

    # Execution: only "single" and "stream" are accepted.
    run_mode: Literal["single", "stream"] = "single"


class PipelineStage(BaseModel):
    """One named step in a result's pipeline trace.

    Only ``name`` is required. A stage starts as "pending" with no latency
    and no detail text.
    """

    name: str
    # Restricted to these four states.
    status: Literal["pending", "running", "done", "error"] = "pending"
    # Stage duration in milliseconds; None until a value is supplied.
    latency_ms: float | None = None
    # Optional free-form text attached to the stage.
    detail: str | None = None


class Detection(BaseModel):
    """A single detection: a box, a score and a class.

    All fields are required and none is range-checked by this model.
    """

    # Box corners. Presumably (x1, y1) is top-left and (x2, y2) bottom-right;
    # coordinate units are not stated here — confirm against the producer.
    x1: float
    y1: float
    x2: float
    y2: float
    confidence: float
    # Numeric class index and its matching human-readable label.
    class_id: int
    class_name: str


class InferenceResult(BaseModel):
    """Outcome of one inference call: identity, timings, outputs and trace.

    Only ``model_id`` and ``adapter_type`` are required; every other field
    has a default.
    """

    # Identity
    # A new random UUID4 string is generated for each instance.
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    adapter_type: AdapterType
    # Filled from time.time() when the instance is created.
    timestamp: float = Field(default_factory=time.time)

    # Timing: all in milliseconds, all starting at zero.
    preprocess_ms: float = 0.0
    inference_ms: float = 0.0
    postprocess_ms: float = 0.0
    total_ms: float = 0.0

    # Output: adapter-specific, all optional.
    detections: list[Detection] = Field(default_factory=list)
    text_output: str | None = None
    class_label: str | None = None
    confidence: float | None = None
    embeddings: list[float] | None = None
    raw_output: Any = None  # raw JSON for inspector

    # Pipeline trace: a fresh empty list per instance.
    pipeline: list[PipelineStage] = Field(default_factory=list)

    # Quality score (0–5) derived from confidence mean.
    # NOTE: that range is not validated by this model.
    quality_score: float | None = None

    # Error: message text, plus an overall status that defaults to "ok".
    error: str | None = None
    status: Literal["ok", "error"] = "ok"


class InferenceHistoryEntry(BaseModel):
    """Compact record of one past inference, as kept in the session history."""

    # A new random UUID4 string is generated for each entry.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    model_id: str
    model_name: str
    adapter_type: AdapterType
    # Filled from time.time() when the entry is created.
    timestamp: float = Field(default_factory=time.time)
    total_ms: float
    # Nullable but declared without a default, unlike the other optional
    # fields in this file. NOTE(review): whether callers may omit it depends
    # on the installed pydantic major version — confirm.
    quality_score: float | None
    status: Literal["ok", "error"]
    # Compact snapshot kept for re-runs: arbitrary string-keyed data, empty
    # by default.
    request_snapshot: dict[str, Any] = Field(default_factory=dict)


class SystemVitals(BaseModel):
    """One sample of system health metrics.

    The first five fields are required. GPU temperature may be absent and
    CPU usage defaults to zero.
    """

    # Sample time; presumably seconds since the epoch, like the other
    # timestamps in this module — confirm against the producer.
    ts: float
    latency_ms: float
    fps: float
    # Used and total VRAM, in gigabytes.
    vram_used_gb: float
    vram_total_gb: float
    # GPU temperature in degrees Celsius; None when no reading is supplied.
    gpu_temp_c: float | None = None
    # CPU utilisation as a percentage.
    cpu_pct: float = 0.0