jkottu's picture
Initial commit: LLM Inference Dashboard
aefabf0
"""Data models for storage layer."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, Dict, Any
import json
@dataclass
class MetricRecord:
    """One sample of a named metric, shaped for the storage layer.

    Only ``metric_name`` and ``value`` are required.  ``timestamp`` defaults
    to ``datetime.now()`` at construction time, ``labels`` to a fresh empty
    dict, and ``id`` to ``None``.
    """

    metric_name: str
    value: float
    timestamp: datetime = field(default_factory=datetime.now)
    labels: Dict[str, str] = field(default_factory=dict)
    id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a dict with the timestamp as an ISO string.

        ``labels`` is the same dict object held by the record, not a copy.
        """
        payload: Dict[str, Any] = {"id": self.id}
        payload["metric_name"] = self.metric_name
        payload["value"] = self.value
        payload["timestamp"] = self.timestamp.isoformat()
        payload["labels"] = self.labels
        return payload

    @classmethod
    def from_row(cls, row: tuple) -> "MetricRecord":
        """Build a record from a row tuple.

        Columns are read by position: ``id``, ISO timestamp string,
        ``metric_name``, ``value``, labels.  The labels column is parsed
        with ``json.loads``; a falsy value (``None`` or ``""``) becomes an
        empty dict.
        """
        record_id = row[0]
        recorded_at = datetime.fromisoformat(row[1])
        name = row[2]
        reading = row[3]

        if row[4]:
            parsed_labels = json.loads(row[4])
        else:
            parsed_labels = {}

        return cls(
            metric_name=name,
            value=reading,
            timestamp=recorded_at,
            labels=parsed_labels,
            id=record_id,
        )
@dataclass
class AlertRecord:
    """A fired alert, shaped for the storage layer.

    ``timestamp`` defaults to ``datetime.now()`` at construction time;
    ``resolved_at`` and ``id`` default to ``None``.
    """

    rule_name: str
    severity: str
    metric_name: str
    value: float
    threshold: float
    message: str
    timestamp: datetime = field(default_factory=datetime.now)
    resolved_at: Optional[datetime] = None
    id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the alert as a dict with datetimes as ISO strings.

        ``resolved_at`` is ``None`` in the output when the field is falsy.
        """
        if self.resolved_at:
            resolved_text = self.resolved_at.isoformat()
        else:
            resolved_text = None

        payload: Dict[str, Any] = {"id": self.id}
        payload["rule_name"] = self.rule_name
        payload["severity"] = self.severity
        payload["metric_name"] = self.metric_name
        payload["value"] = self.value
        payload["threshold"] = self.threshold
        payload["message"] = self.message
        payload["timestamp"] = self.timestamp.isoformat()
        payload["resolved_at"] = resolved_text
        return payload

    @classmethod
    def from_row(cls, row: tuple) -> "AlertRecord":
        """Build an alert from a row tuple.

        Columns are read by position: ``id``, ISO timestamp string,
        ``rule_name``, ``severity``, ``metric_name``, ``value``,
        ``threshold``, then optionally ``message`` and an ISO
        ``resolved_at`` string.  Rows too short to carry the optional
        columns get ``""`` for the message and ``None`` for ``resolved_at``;
        a falsy ``resolved_at`` column also yields ``None``.
        """
        alert_id = row[0]
        fired_at = datetime.fromisoformat(row[1])
        rule = row[2]
        level = row[3]
        metric = row[4]
        observed = row[5]
        limit = row[6]

        # Trailing columns are optional, so only index them when present.
        text = ""
        if len(row) > 7:
            text = row[7]

        cleared_at = None
        if len(row) > 8 and row[8]:
            cleared_at = datetime.fromisoformat(row[8])

        return cls(
            rule_name=rule,
            severity=level,
            metric_name=metric,
            value=observed,
            threshold=limit,
            message=text,
            timestamp=fired_at,
            resolved_at=cleared_at,
            id=alert_id,
        )
@dataclass
class RequestTrace:
    """Per-request timing breakdown used for latency analysis.

    The two ``gpu_memory_*`` fields default to ``0.0``, ``is_slow`` to
    ``False``, ``timestamp`` to ``datetime.now()`` at construction time, and
    ``id`` to ``None``.
    """

    request_id: str
    prompt_tokens: int
    output_tokens: int
    queue_time_ms: float
    prefill_time_ms: float
    decode_time_ms: float
    total_time_ms: float
    tokens_per_second: float
    gpu_memory_at_start: float = 0.0
    gpu_memory_at_end: float = 0.0
    is_slow: bool = False
    timestamp: datetime = field(default_factory=datetime.now)
    id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the trace as a dict with timings rounded to two decimals.

        The timestamp is emitted as an ISO string.  The ``gpu_memory_*``
        fields are not part of the output.
        """
        payload: Dict[str, Any] = {
            "id": self.id,
            "request_id": self.request_id,
            "timestamp": self.timestamp.isoformat(),
            "prompt_tokens": self.prompt_tokens,
            "output_tokens": self.output_tokens,
        }
        # These float measurements are all reported at two-decimal precision.
        for key in (
            "queue_time_ms",
            "prefill_time_ms",
            "decode_time_ms",
            "total_time_ms",
            "tokens_per_second",
        ):
            payload[key] = round(getattr(self, key), 2)
        payload["is_slow"] = self.is_slow
        return payload

    @classmethod
    def from_row(cls, row: tuple) -> "RequestTrace":
        """Build a trace from a row tuple.

        Columns are read by position: ``id``, ``request_id``, ISO timestamp
        string, ``prompt_tokens``, ``output_tokens``, ``queue_time_ms``,
        ``prefill_time_ms``, ``decode_time_ms``, ``total_time_ms``, then
        optionally ``tokens_per_second`` and ``is_slow``.  Rows too short to
        carry the optional columns get ``0`` and ``False`` respectively.
        The ``gpu_memory_*`` fields are not read and keep their defaults.
        """
        trace_id = row[0]
        req = row[1]
        seen_at = datetime.fromisoformat(row[2])
        n_prompt = row[3]
        n_output = row[4]
        queued = row[5]
        prefill = row[6]
        decode = row[7]
        total = row[8]

        # Trailing columns are optional, so only index them when present.
        rate = 0
        if len(row) > 9:
            rate = row[9]

        slow_flag = False
        if len(row) > 10:
            slow_flag = bool(row[10])

        return cls(
            request_id=req,
            prompt_tokens=n_prompt,
            output_tokens=n_output,
            queue_time_ms=queued,
            prefill_time_ms=prefill,
            decode_time_ms=decode,
            total_time_ms=total,
            tokens_per_second=rate,
            is_slow=slow_flag,
            timestamp=seen_at,
            id=trace_id,
        )
@dataclass
class LoadTestResult:
    """Summary figures from one load test run.

    ``saturation_point`` and ``id`` default to ``None``; ``timestamp``
    defaults to ``datetime.now()`` at construction time.
    """

    test_id: str
    target_endpoint: str
    concurrent_users: int
    requests_per_second: float
    duration_seconds: int
    total_requests: int
    successful_requests: int
    failed_requests: int
    avg_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    throughput_rps: float
    saturation_point: Optional[float] = None
    timestamp: datetime = field(default_factory=datetime.now)
    id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict.

        Latency and throughput figures are rounded to two decimals and the
        timestamp is emitted as an ISO string.  ``id`` is not part of the
        output.
        """
        verbatim_keys = (
            "test_id",
            "target_endpoint",
            "concurrent_users",
            "requests_per_second",
            "duration_seconds",
            "total_requests",
            "successful_requests",
            "failed_requests",
        )
        rounded_keys = (
            "avg_latency_ms",
            "p50_latency_ms",
            "p95_latency_ms",
            "p99_latency_ms",
            "throughput_rps",
        )

        payload: Dict[str, Any] = {key: getattr(self, key) for key in verbatim_keys}
        for key in rounded_keys:
            payload[key] = round(getattr(self, key), 2)
        payload["saturation_point"] = self.saturation_point
        payload["timestamp"] = self.timestamp.isoformat()
        return payload