| """ | |
| Worker Schemas for Distributed Worker Coordination | |
| Defines Pydantic schemas for worker registration, heartbeat, and status management. | |
| """ | |
| import uuid | |
| from datetime import datetime | |
| from enum import Enum | |
| from typing import Any, Dict, List, Optional | |
| from pydantic import BaseModel, Field | |
| class WorkerStatus(str, Enum): | |
| """Worker status enumeration.""" | |
| REGISTERED = "registered" | |
| ACTIVE = "active" | |
| DEGRADED = "degraded" | |
| OFFLINE = "offline" | |
| class GPUInfo(BaseModel): | |
| """GPU information for a worker.""" | |
| gpu_index: int = Field(description="GPU index") | |
| gpu_name: str = Field(description="GPU name") | |
| memory_total_mb: int = Field(description="Total memory in MB") | |
| memory_used_mb: int = Field(description="Used memory in MB") | |
| utilization_percent: float = Field(default=0.0, description="GPU utilization %") | |
| class WorkerRegistrationRequest(BaseModel): | |
| """Request model for worker registration.""" | |
| hostname: str = Field(description="Worker hostname") | |
| gpu_count: int = Field(default=0, description="Number of GPUs") | |
| gpu_info: Optional[List[GPUInfo]] = Field(default=None, description="GPU information") | |
| gpu_memory_total: int = Field(default=0, description="Total GPU memory in MB") | |
| max_concurrent_jobs: int = Field(default=1, description="Max concurrent jobs") | |
| capabilities: Optional[Dict[str, Any]] = Field( | |
| default=None, | |
| description="Worker capabilities (supported models, etc.)" | |
| ) | |
| worker_metadata: Optional[Dict[str, Any]] = Field( | |
| default=None, | |
| description="Additional worker metadata" | |
| ) | |
| class WorkerRegistrationResponse(BaseModel): | |
| """Response model for worker registration.""" | |
| worker_id: str = Field(description="Assigned worker ID") | |
| status: WorkerStatus = Field(description="Worker status") | |
| registered_at: datetime = Field(description="Registration timestamp") | |
| heartbeat_interval: int = Field( | |
| default=30, | |
| description="Heartbeat interval in seconds" | |
| ) | |
| heartbeat_timeout: int = Field( | |
| default=120, | |
| description="Heartbeat timeout in seconds (worker marked offline after this)" | |
| ) | |
| class HeartbeatRequest(BaseModel): | |
| """Request model for worker heartbeat.""" | |
| worker_id: str = Field(description="Worker ID") | |
| gpu_usage: Optional[float] = Field( | |
| default=None, | |
| description="Current GPU usage percentage" | |
| ) | |
| gpu_memory_used: Optional[int] = Field( | |
| default=None, | |
| description="Current GPU memory used in MB" | |
| ) | |
| active_jobs: Optional[int] = Field( | |
| default=None, | |
| description="Number of active jobs" | |
| ) | |
| gpu_info: Optional[List[GPUInfo]] = Field( | |
| default=None, | |
| description="Updated GPU information" | |
| ) | |
| status: Optional[WorkerStatus] = Field( | |
| default=None, | |
| description="Worker status (optional override)" | |
| ) | |
| class HeartbeatResponse(BaseModel): | |
| """Response model for worker heartbeat.""" | |
| worker_id: str = Field(description="Worker ID") | |
| status: WorkerStatus = Field(description="Worker status") | |
| timestamp: datetime = Field(description="Heartbeat timestamp") | |
| assigned_jobs: int = Field(description="Number of assigned jobs") | |
| class WorkerStatusResponse(BaseModel): | |
| """Response model for worker status.""" | |
| worker_id: str = Field(description="Worker ID") | |
| hostname: str = Field(description="Worker hostname") | |
| status: WorkerStatus = Field(description="Worker status") | |
| gpu_count: int = Field(description="Number of GPUs") | |
| gpu_memory_total: int = Field(description="Total GPU memory in MB") | |
| gpu_memory_used: int = Field(description="Used GPU memory in MB") | |
| gpu_usage_percent: float = Field(description="GPU usage percentage") | |
| active_jobs: int = Field(description="Number of active jobs") | |
| max_concurrent_jobs: int = Field(description="Max concurrent jobs") | |
| load_factor: float = Field( | |
| description="Load factor (active_jobs / max_concurrent_jobs)" | |
| ) | |
| last_heartbeat: datetime = Field(description="Last heartbeat timestamp") | |
| registered_at: datetime = Field(description="Registration timestamp") | |
| capabilities: Optional[Dict[str, Any]] = Field( | |
| default=None, | |
| description="Worker capabilities" | |
| ) | |
| uptime_seconds: int = Field(description="Worker uptime in seconds") | |
| class WorkerListResponse(BaseModel): | |
| """Response model for listing workers.""" | |
| workers: List[WorkerStatusResponse] = Field(description="List of workers") | |
| total: int = Field(description="Total number of workers") | |
| active: int = Field(description="Number of active workers") | |
| offline: int = Field(description="Number of offline workers") | |
| class WorkerMetricsResponse(BaseModel): | |
| """Response model for worker cluster metrics.""" | |
| total_workers: int = Field(description="Total number of workers") | |
| active_workers: int = Field(description="Number of active workers") | |
| offline_workers: int = Field(description="Number of offline workers") | |
| degraded_workers: int = Field(description="Number of degraded workers") | |
| total_gpus: int = Field(description="Total number of GPUs") | |
| total_gpu_memory_mb: int = Field( | |
| description="Total GPU memory in MB" | |
| ) | |
| used_gpu_memory_mb: int = Field( | |
| description="Used GPU memory in MB" | |
| ) | |
| total_active_jobs: int = Field( | |
| description="Total number of active jobs across all workers" | |
| ) | |
| average_load_factor: float = Field( | |
| description="Average load factor across workers" | |
| ) | |
| queue_length: int = Field(description="Number of pending jobs in queue") | |
| __all__ = [ | |
| "WorkerStatus", | |
| "GPUInfo", | |
| "WorkerRegistrationRequest", | |
| "WorkerRegistrationResponse", | |
| "HeartbeatRequest", | |
| "HeartbeatResponse", | |
| "WorkerStatusResponse", | |
| "WorkerListResponse", | |
| "WorkerMetricsResponse", | |
| ] | |