Bc-AI commited on
Commit
74ffe1c
·
verified ·
1 Parent(s): f937aaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -424
app.py CHANGED
@@ -1,7 +1,9 @@
1
  """
2
- SAM-Z-1 Distributed Compute Cluster Head Node
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
 
 
5
  """
6
 
7
  from fastapi import FastAPI, HTTPException, WebSocket
@@ -15,7 +17,7 @@ from typing import List, Optional, Dict
15
  from collections import deque
16
  import random
17
 
18
- app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="4.0.0")
19
 
20
  # ============================================================================
21
  # Configuration
@@ -30,13 +32,21 @@ WORKER_URLS = [
30
  "https://bc-ai-worker-5.hf.space"
31
  ]
32
 
33
- HEALTH_CHECK_INTERVAL = 5 # faster checks for real-time dashboard
34
  LOAD_CHECK_WINDOW = 10
35
 
36
  LIGHT_LOAD_THRESHOLD = 2
37
  HEAVY_LOAD_THRESHOLD = 5
38
 
39
- # Worker state
 
 
 
 
 
 
 
 
40
  worker_health = {
41
  url: {
42
  "healthy": True,
@@ -45,12 +55,14 @@ worker_health = {
45
  "total_requests": 0,
46
  "total_tokens": 0,
47
  "avg_latency": 0,
48
- "role": "idle" # "generator", "decoder", "full", "idle"
 
 
49
  } for url in WORKER_URLS
50
  }
51
 
52
  request_timestamps = deque(maxlen=100)
53
- current_load_mode = "light" # "light", "medium", "heavy"
54
  cluster_stats = {
55
  "total_requests": 0,
56
  "successful_requests": 0,
@@ -58,7 +70,6 @@ cluster_stats = {
58
  "uptime_start": time.time()
59
  }
60
 
61
- # Active WebSocket connections for real-time updates
62
  active_connections = set()
63
 
64
  # ============================================================================
@@ -73,6 +84,7 @@ class GenerateRequest(BaseModel):
73
  top_p: float = 0.9
74
  repetition_penalty: float = 1.1
75
  stream: bool = True
 
76
 
77
  class ChatMessage(BaseModel):
78
  role: str
@@ -86,6 +98,138 @@ class ChatRequest(BaseModel):
86
  top_p: float = 0.9
87
  repetition_penalty: float = 1.1
88
  stream: bool = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # ============================================================================
91
  # Load Management
@@ -100,21 +244,20 @@ def update_load_mode():
100
  load = get_current_load()
101
  healthy_count = len(get_healthy_workers())
102
 
103
- # Adjust thresholds based on available workers
104
  if healthy_count >= 5:
105
  if load <= LIGHT_LOAD_THRESHOLD:
106
- current_load_mode = "light" # 1 gen + 4 decoders
107
- elif load <= MEDIUM_LOAD_THRESHOLD:
108
- current_load_mode = "medium" # 2 gens + 3 decoders OR parallel requests
109
  else:
110
- current_load_mode = "heavy" # all workers independent
111
  elif healthy_count >= 3:
112
  if load <= 2:
113
- current_load_mode = "light" # 1 gen + 2 decoders
114
  else:
115
- current_load_mode = "heavy" # distribute requests
116
  else:
117
- current_load_mode = "heavy" # fallback to simple distribution
118
 
119
  return current_load_mode, load
120
 
@@ -122,42 +265,11 @@ def track_request():
122
  request_timestamps.append(time.time())
123
  cluster_stats["total_requests"] += 1
124
 
125
- def get_healthy_workers() -> List[str]:
126
- return [url for url, status in worker_health.items() if status["healthy"]]
127
-
128
- def get_least_busy_worker() -> Optional[str]:
129
- healthy = get_healthy_workers()
130
- if not healthy:
131
- return None
132
- return min(healthy, key=lambda url: worker_health[url]["active_requests"])
133
-
134
- def select_distributed_workers() -> tuple:
135
- """
136
- Select workers for distributed compute
137
- Returns: (generators: List[str], decoders: List[str])
138
- """
139
- healthy = get_healthy_workers()
140
- if len(healthy) < 2:
141
- return ([healthy[0]], []) if len(healthy) == 1 else ([], [])
142
-
143
- # Sort by least busy
144
- sorted_workers = sorted(healthy, key=lambda url: worker_health[url]["active_requests"])
145
-
146
- if len(healthy) >= 5:
147
- # OPTIMAL: 1 generator, 4 decoders
148
- return ([sorted_workers[0]], sorted_workers[1:5])
149
- elif len(healthy) == 4:
150
- # 1 generator, 3 decoders
151
- return ([sorted_workers[0]], sorted_workers[1:4])
152
- elif len(healthy) == 3:
153
- # 1 generator, 2 decoders
154
- return ([sorted_workers[0]], sorted_workers[1:3])
155
- else:
156
- # 1 generator, 1 decoder
157
- return ([sorted_workers[0]], [sorted_workers[1]])
158
 
159
  async def broadcast_stats():
160
- """Broadcast stats to all connected WebSocket clients"""
161
  if not active_connections:
162
  return
163
 
@@ -170,13 +282,15 @@ async def broadcast_stats():
170
  "load": load,
171
  "workers": [
172
  {
173
- "url": url.split("//")[1].split(".")[0], # shorter name
174
  "healthy": status["healthy"],
175
  "active": status["active_requests"],
176
  "total": status["total_requests"],
177
  "tokens": status["total_tokens"],
178
  "latency": round(status["avg_latency"], 2),
179
- "role": status["role"]
 
 
180
  }
181
  for url, status in worker_health.items()
182
  ],
@@ -189,7 +303,6 @@ async def broadcast_stats():
189
  }
190
  }
191
 
192
- # Broadcast to all connections
193
  disconnected = set()
194
  for ws in active_connections:
195
  try:
@@ -197,36 +310,24 @@ async def broadcast_stats():
197
  except:
198
  disconnected.add(ws)
199
 
200
- # Remove disconnected
201
  active_connections.difference_update(disconnected)
202
 
203
- async def check_worker_health(worker_url: str) -> bool:
 
 
 
 
204
  try:
205
- async with httpx.AsyncClient(timeout=5.0) as client:
206
- response = await client.get(f"{worker_url}/health")
207
- return response.status_code == 200
208
- except:
209
- return False
210
-
211
- async def health_check_loop():
212
- while True:
213
- # Check all workers
214
- for worker_url in WORKER_URLS:
215
- healthy = await check_worker_health(worker_url)
216
- worker_health[worker_url]["healthy"] = healthy
217
- worker_health[worker_url]["last_check"] = time.time()
218
-
219
- # Always broadcast stats to connected clients
220
  await broadcast_stats()
221
-
222
- await asyncio.sleep(HEALTH_CHECK_INTERVAL)
223
-
224
- @app.on_event("startup")
225
- async def startup_event():
226
- asyncio.create_task(health_check_loop())
227
 
228
  # ============================================================================
229
- # Distributed Compute Generation
230
  # ============================================================================
231
 
232
  async def distributed_generation(
@@ -235,11 +336,7 @@ async def distributed_generation(
235
  request_data: dict,
236
  endpoint: str = "generate"
237
  ):
238
- """
239
- DISTRIBUTED COMPUTE MODE
240
- - Generator(s) produce token IDs
241
- - Multiple decoders process in parallel (load balanced)
242
- """
243
 
244
  if not generators or not decoders:
245
  return
@@ -247,15 +344,13 @@ async def distributed_generation(
247
  token_queue = asyncio.Queue(maxsize=50)
248
  text_queue = asyncio.Queue(maxsize=50)
249
 
250
- # Mark roles
251
  for gen_url in generators:
252
  worker_health[gen_url]["role"] = "generator"
253
  for dec_url in decoders:
254
  worker_health[dec_url]["role"] = "decoder"
255
 
256
  async def generate_tokens():
257
- """Generator worker(s)"""
258
- gen_url = generators[0] # primary generator
259
  try:
260
  worker_health[gen_url]["active_requests"] += 1
261
  request_data_tokens = {**request_data, "return_token_ids": True}
@@ -273,7 +368,6 @@ async def distributed_generation(
273
  if "token_id" in data:
274
  await token_queue.put(data["token_id"])
275
  elif "done" in data:
276
- # Send done signal for each decoder
277
  for _ in decoders:
278
  await token_queue.put(None)
279
  break
@@ -288,18 +382,16 @@ async def distributed_generation(
288
  worker_health[gen_url]["role"] = "idle"
289
 
290
  async def decode_tokens(decoder_url: str, decoder_id: int):
291
- """Decoder worker - processes tokens from shared queue"""
292
  try:
293
  worker_health[decoder_url]["active_requests"] += 1
294
  batch = []
295
- batch_size = 2 # smaller batches for faster streaming
296
 
297
  while True:
298
  try:
299
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
300
 
301
  if token_id is None:
302
- # Decode remaining batch
303
  if batch:
304
  async with httpx.AsyncClient(timeout=10.0) as client:
305
  response = await client.post(
@@ -315,7 +407,6 @@ async def distributed_generation(
315
 
316
  batch.append(token_id)
317
 
318
- # Decode when batch is full
319
  if len(batch) >= batch_size:
320
  async with httpx.AsyncClient(timeout=10.0) as client:
321
  response = await client.post(
@@ -338,16 +429,12 @@ async def distributed_generation(
338
  worker_health[decoder_url]["active_requests"] -= 1
339
  worker_health[decoder_url]["role"] = "idle"
340
 
341
- # Start generator
342
  gen_task = asyncio.create_task(generate_tokens())
343
-
344
- # Start all decoders
345
  decoder_tasks = [
346
  asyncio.create_task(decode_tokens(dec_url, i))
347
  for i, dec_url in enumerate(decoders)
348
  ]
349
 
350
- # Stream results
351
  accumulated_text = ""
352
  decoders_done = 0
353
  total_decoders = len(decoders)
@@ -393,24 +480,19 @@ async def heavy_load_generation(worker_url: str, request_data: dict, endpoint: s
393
  worker_health[worker_url]["role"] = "idle"
394
 
395
  # ============================================================================
396
- # Dashboard
397
  # ============================================================================
398
 
399
  @app.get("/", response_class=HTMLResponse)
400
  async def dashboard():
401
- """Real-time futuristic dashboard"""
402
  return """
403
  <!DOCTYPE html>
404
  <html>
405
  <head>
406
- <title>SAM-Z-1 Cluster Control</title>
407
  <style>
408
- * {
409
- margin: 0;
410
- padding: 0;
411
- box-sizing: border-box;
412
- }
413
-
414
  body {
415
  font-family: 'Courier New', monospace;
416
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
@@ -419,14 +501,12 @@ async def dashboard():
419
  overflow-x: hidden;
420
  overflow-y: auto;
421
  }
422
-
423
  .container {
424
  padding: 20px;
425
  max-width: 1400px;
426
  margin: 0 auto;
427
  padding-bottom: 40px;
428
  }
429
-
430
  .header {
431
  text-align: center;
432
  margin-bottom: 30px;
@@ -436,7 +516,6 @@ async def dashboard():
436
  border-radius: 10px;
437
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
438
  }
439
-
440
  .header h1 {
441
  font-size: 2.5em;
442
  text-transform: uppercase;
@@ -444,18 +523,33 @@ async def dashboard():
444
  text-shadow: 0 0 10px #00ff88;
445
  animation: glow 2s ease-in-out infinite alternate;
446
  }
447
-
448
  @keyframes glow {
449
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
450
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
451
  }
452
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  .status-bar {
454
  display: flex;
455
  gap: 20px;
456
  margin-bottom: 30px;
457
  }
458
-
459
  .stat-card {
460
  flex: 1;
461
  background: rgba(0, 255, 136, 0.05);
@@ -465,83 +559,22 @@ async def dashboard():
465
  position: relative;
466
  overflow: hidden;
467
  }
468
-
469
- .stat-card::before {
470
- content: '';
471
- position: absolute;
472
- top: 0;
473
- left: -100%;
474
- width: 100%;
475
- height: 100%;
476
- background: linear-gradient(90deg, transparent, rgba(0, 255, 136, 0.2), transparent);
477
- animation: scan 3s infinite;
478
- }
479
-
480
- @keyframes scan {
481
- 0% { left: -100%; }
482
- 100% { left: 100%; }
483
- }
484
-
485
  .stat-label {
486
  font-size: 0.8em;
487
  opacity: 0.7;
488
  text-transform: uppercase;
489
  }
490
-
491
  .stat-value {
492
  font-size: 2em;
493
  font-weight: bold;
494
  margin-top: 5px;
495
  }
496
-
497
- .mode-badge {
498
- display: inline-block;
499
- padding: 5px 15px;
500
- border-radius: 20px;
501
- font-size: 0.9em;
502
- font-weight: bold;
503
- text-transform: uppercase;
504
- margin-top: 10px;
505
- }
506
-
507
- .mode-light {
508
- background: rgba(0, 255, 136, 0.2);
509
- border: 1px solid #00ff88;
510
- color: #00ff88;
511
- }
512
-
513
- .mode-heavy {
514
- background: rgba(255, 68, 68, 0.2);
515
- border: 1px solid #ff4444;
516
- color: #ff4444;
517
- }
518
-
519
  .workers-grid {
520
  display: grid;
521
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
522
  gap: 20px;
523
  margin-bottom: 30px;
524
  }
525
-
526
- @media (max-width: 768px) {
527
- .workers-grid {
528
- grid-template-columns: 1fr;
529
- }
530
-
531
- .status-bar {
532
- flex-direction: column;
533
- }
534
-
535
- .info-grid {
536
- grid-template-columns: repeat(2, 1fr);
537
- }
538
-
539
- .header h1 {
540
- font-size: 1.5em;
541
- letter-spacing: 2px;
542
- }
543
- }
544
-
545
  .worker-card {
546
  background: rgba(10, 14, 39, 0.8);
547
  border: 2px solid #00ff88;
@@ -550,166 +583,89 @@ async def dashboard():
550
  position: relative;
551
  transition: all 0.3s;
552
  }
553
-
554
  .worker-card:hover {
555
  transform: translateY(-5px);
556
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
557
  }
558
-
559
  .worker-card.offline {
560
  border-color: #ff4444;
561
  opacity: 0.6;
562
  }
563
-
564
  .worker-header {
565
  display: flex;
566
  justify-content: space-between;
567
  align-items: center;
568
  margin-bottom: 15px;
569
  }
570
-
571
  .worker-name {
572
  font-size: 1.2em;
573
  font-weight: bold;
574
  }
575
-
576
  .status-dot {
577
  width: 12px;
578
  height: 12px;
579
  border-radius: 50%;
580
  animation: pulse 2s infinite;
581
  }
582
-
583
  .status-dot.online {
584
  background: #00ff88;
585
  box-shadow: 0 0 10px #00ff88;
586
  }
587
-
588
  .status-dot.offline {
589
  background: #ff4444;
590
  box-shadow: 0 0 10px #ff4444;
591
  }
592
-
593
  @keyframes pulse {
594
  0%, 100% { opacity: 1; }
595
  50% { opacity: 0.5; }
596
  }
597
-
598
  .worker-stats {
599
  display: grid;
600
  grid-template-columns: repeat(2, 1fr);
601
  gap: 10px;
602
  margin-top: 15px;
603
  }
604
-
605
  .worker-stat {
606
  background: rgba(0, 255, 136, 0.05);
607
  padding: 10px;
608
  border-radius: 5px;
609
  }
610
-
611
  .worker-stat-label {
612
  font-size: 0.7em;
613
  opacity: 0.7;
614
  }
615
-
616
  .worker-stat-value {
617
  font-size: 1.3em;
618
  font-weight: bold;
619
  margin-top: 3px;
620
  }
621
-
622
- .role-badge {
623
- display: inline-block;
624
- padding: 3px 10px;
625
- border-radius: 12px;
626
- font-size: 0.75em;
627
- margin-top: 10px;
628
- font-weight: bold;
629
- }
630
-
631
- .role-generator {
632
- background: rgba(255, 165, 0, 0.2);
633
- border: 1px solid #ffa500;
634
- color: #ffa500;
635
- }
636
-
637
- .role-decoder {
638
- background: rgba(0, 191, 255, 0.2);
639
- border: 1px solid #00bfff;
640
- color: #00bfff;
641
- }
642
-
643
- .role-full {
644
- background: rgba(138, 43, 226, 0.2);
645
- border: 1px solid #8a2be2;
646
- color: #8a2be2;
647
- }
648
-
649
- .role-idle {
650
- background: rgba(128, 128, 128, 0.2);
651
- border: 1px solid #808080;
652
- color: #808080;
653
- }
654
-
655
- .progress-bar {
656
- width: 100%;
657
- height: 4px;
658
- background: rgba(0, 255, 136, 0.1);
659
- border-radius: 2px;
660
- margin-top: 10px;
661
- overflow: hidden;
662
- }
663
-
664
- .progress-fill {
665
- height: 100%;
666
- background: linear-gradient(90deg, #00ff88, #00ffff);
667
- transition: width 0.3s;
668
- box-shadow: 0 0 10px #00ff88;
669
- }
670
-
671
- .cluster-info {
672
- background: rgba(0, 255, 136, 0.05);
673
- border: 1px solid #00ff88;
674
- border-radius: 8px;
675
- padding: 20px;
676
- }
677
-
678
- .info-grid {
679
- display: grid;
680
- grid-template-columns: repeat(4, 1fr);
681
- gap: 20px;
682
- }
683
-
684
- .info-item {
685
- text-align: center;
686
- }
687
-
688
  .timestamp {
689
  text-align: center;
690
  margin-top: 20px;
691
  opacity: 0.5;
692
  font-size: 0.9em;
693
  }
 
 
 
 
694
  </style>
695
  </head>
696
  <body>
697
  <div class="container">
698
  <div class="header">
699
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
700
- <div>DISTRIBUTED COMPUTE SYSTEM v4.0</div>
701
  </div>
702
 
703
  <div class="status-bar">
704
  <div class="stat-card">
705
  <div class="stat-label">Load Mode</div>
706
  <div class="stat-value" id="mode">--</div>
707
- <div class="mode-badge" id="mode-badge">INITIALIZING</div>
708
  </div>
709
  <div class="stat-card">
710
  <div class="stat-label">Current Load</div>
711
  <div class="stat-value" id="load">0</div>
712
- <div class="stat-label">requests / 10s</div>
713
  </div>
714
  <div class="stat-card">
715
  <div class="stat-label">Total Requests</div>
@@ -721,149 +677,47 @@ async def dashboard():
721
  </div>
722
  </div>
723
 
724
- <div class="workers-grid" id="workers">
725
- <!-- Workers populated by JS -->
726
- </div>
727
-
728
- <div class="cluster-info">
729
- <div class="stat-label" style="margin-bottom: 15px;">CLUSTER STATISTICS</div>
730
- <div class="info-grid">
731
- <div class="info-item">
732
- <div class="stat-label">Successful</div>
733
- <div class="stat-value" style="font-size: 1.5em;" id="success">0</div>
734
- </div>
735
- <div class="info-item">
736
- <div class="stat-label">Failed</div>
737
- <div class="stat-value" style="font-size: 1.5em;" id="failed">0</div>
738
- </div>
739
- <div class="info-item">
740
- <div class="stat-label">Uptime</div>
741
- <div class="stat-value" style="font-size: 1.5em;" id="uptime">0s</div>
742
- </div>
743
- <div class="info-item">
744
- <div class="stat-label">Healthy Workers</div>
745
- <div class="stat-value" style="font-size: 1.5em;" id="healthy">0</div>
746
- </div>
747
- </div>
748
- </div>
749
 
750
  <div class="timestamp" id="timestamp">Last update: --</div>
751
  </div>
752
 
753
  <script>
754
- // Use wss:// for HTTPS, ws:// for HTTP
755
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
756
  let ws;
757
- let usePolling = false;
758
 
759
  function connectWebSocket() {
760
  try {
761
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
762
 
763
- ws.onopen = () => {
764
- console.log('✅ WebSocket connected');
765
- usePolling = false;
766
- };
767
-
768
- ws.onmessage = (event) => {
769
- const data = JSON.parse(event.data);
770
- updateDashboard(data);
771
- };
772
-
773
- ws.onerror = (error) => {
774
- console.error('❌ WebSocket error, switching to polling');
775
- usePolling = true;
776
- startPolling();
777
- };
778
-
779
- ws.onclose = () => {
780
- console.log('🔌 WebSocket disconnected');
781
- if (!usePolling) {
782
- setTimeout(connectWebSocket, 3000);
783
- }
784
- };
785
  } catch (e) {
786
- console.error('Failed to connect WebSocket, using polling');
787
- usePolling = true;
788
- startPolling();
789
  }
790
  }
791
 
792
- async function pollStats() {
793
- if (!usePolling) return;
794
-
795
- try {
796
- const response = await fetch('/api/status');
797
- const data = await response.json();
798
-
799
- // Fetch worker stats too
800
- const workersRes = await fetch('/workers');
801
- const workersData = await workersRes.json();
802
-
803
- // Format data like WebSocket
804
- const formattedData = {
805
- timestamp: Date.now() / 1000,
806
- mode: data.mode,
807
- load: data.current_load,
808
- workers: workersData.workers.map(w => ({
809
- url: w.url.split("//")[1].split(".")[0],
810
- healthy: w.healthy,
811
- active: w.active_requests || 0,
812
- total: 0,
813
- tokens: 0,
814
- latency: 0,
815
- role: "idle"
816
- })),
817
- cluster: {
818
- total_requests: 0,
819
- successful: 0,
820
- failed: 0,
821
- uptime: 0,
822
- rps: 0
823
- }
824
- };
825
-
826
- updateDashboard(formattedData);
827
- } catch (e) {
828
- console.error('Polling error:', e);
829
- }
830
- }
831
-
832
- function startPolling() {
833
- pollStats();
834
- setInterval(pollStats, 1000);
835
- }
836
-
837
- // Try WebSocket first
838
  connectWebSocket();
839
 
840
  function updateDashboard(data) {
841
- // Mode
842
  document.getElementById('mode').textContent = data.mode.toUpperCase();
843
- const modeBadge = document.getElementById('mode-badge');
844
- modeBadge.textContent = `${data.mode.toUpperCase()} MODE`;
845
- modeBadge.className = `mode-badge mode-${data.mode}`;
846
-
847
- // Stats
848
  document.getElementById('load').textContent = data.load;
849
  document.getElementById('total-req').textContent = data.cluster.total_requests;
850
  document.getElementById('rps').textContent = data.cluster.rps;
851
- document.getElementById('success').textContent = data.cluster.successful;
852
- document.getElementById('failed').textContent = data.cluster.failed;
853
- document.getElementById('uptime').textContent = formatUptime(data.cluster.uptime);
854
 
855
- // Workers
856
  const workersDiv = document.getElementById('workers');
857
- const healthyCount = data.workers.filter(w => w.healthy).length;
858
- document.getElementById('healthy').textContent = `${healthyCount}/${data.workers.length}`;
859
-
860
  workersDiv.innerHTML = data.workers.map(worker => `
861
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
862
  <div class="worker-header">
863
- <div class="worker-name">${worker.url}</div>
 
 
 
 
864
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
865
  </div>
866
- <div class="role-badge role-${worker.role}">${worker.role.toUpperCase()}</div>
867
  <div class="worker-stats">
868
  <div class="worker-stat">
869
  <div class="worker-stat-label">Active</div>
@@ -878,69 +732,46 @@ async def dashboard():
878
  <div class="worker-stat-value">${worker.tokens}</div>
879
  </div>
880
  <div class="worker-stat">
881
- <div class="worker-stat-label">Latency</div>
882
- <div class="worker-stat-value">${worker.latency}ms</div>
883
  </div>
884
  </div>
885
- <div class="progress-bar">
886
- <div class="progress-fill" style="width: ${Math.min(worker.active * 33, 100)}%"></div>
887
- </div>
888
  </div>
889
  `).join('');
890
 
891
- // Timestamp
892
- const now = new Date();
893
  document.getElementById('timestamp').textContent =
894
- `Last update: ${now.toLocaleTimeString()}`;
895
- }
896
-
897
- function formatUptime(seconds) {
898
- const h = Math.floor(seconds / 3600);
899
- const m = Math.floor((seconds % 3600) / 60);
900
- const s = Math.floor(seconds % 60);
901
- return `${h}h ${m}m ${s}s`;
902
  }
903
  </script>
904
  </body>
905
  </html>
906
  """
907
 
908
- @app.websocket("/ws")
909
- async def websocket_endpoint(websocket: WebSocket):
910
- """WebSocket for real-time dashboard updates"""
911
- await websocket.accept()
912
- active_connections.add(websocket)
913
-
914
- try:
915
- # Send initial data
916
- await broadcast_stats()
917
-
918
- # Keep connection alive
919
- while True:
920
- await websocket.receive_text()
921
- except:
922
- pass
923
- finally:
924
- active_connections.discard(websocket)
925
-
926
- # ============================================================================
927
- # API Endpoints
928
- # ============================================================================
929
-
930
  @app.get("/api/status")
931
  async def api_status():
932
- """JSON API for status"""
933
  mode, load = update_load_mode()
934
  healthy_count = len(get_healthy_workers())
935
 
 
 
 
 
936
  return {
937
  "name": "SAM-Z-1 Distributed Cluster",
938
- "version": "4.0.0",
939
  "mode": mode,
940
  "current_load": load,
941
  "workers": len(WORKER_URLS),
942
  "healthy_workers": healthy_count,
943
- "features": ["distributed_compute", "smart_load_balancing", "real_time_dashboard"]
 
 
 
 
 
 
 
 
944
  }
945
 
946
  @app.get("/health")
@@ -951,16 +782,34 @@ async def health():
951
  "workers_healthy": healthy_count
952
  }
953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  @app.post("/v1/generate")
955
  async def generate(request: GenerateRequest):
956
- """Generate text with distributed compute"""
957
  track_request()
958
  mode, load = update_load_mode()
959
 
960
- healthy = get_healthy_workers()
961
- if not healthy:
 
 
962
  cluster_stats["failed_requests"] += 1
963
- raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
964
 
965
  request_data = {
966
  "prompt": request.prompt,
@@ -972,12 +821,15 @@ async def generate(request: GenerateRequest):
972
  "stream": True
973
  }
974
 
975
- print(f"🎯 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
976
 
977
  try:
978
- if mode == "light" and len(healthy) >= 2:
979
- # DISTRIBUTED MODE - 1 gen + multiple decoders
980
- generators, decoders = select_distributed_workers()
981
  if decoders:
982
  cluster_stats["successful_requests"] += 1
983
  return StreamingResponse(
@@ -985,8 +837,7 @@ async def generate(request: GenerateRequest):
985
  media_type="text/event-stream"
986
  )
987
 
988
- # HEAVY/FALLBACK - single worker
989
- worker = get_least_busy_worker()
990
  cluster_stats["successful_requests"] += 1
991
  return StreamingResponse(
992
  heavy_load_generation(worker, request_data, "generate"),
@@ -998,14 +849,19 @@ async def generate(request: GenerateRequest):
998
 
999
  @app.post("/v1/chat")
1000
  async def chat(request: ChatRequest):
1001
- """Chat with distributed compute"""
1002
  track_request()
1003
  mode, load = update_load_mode()
1004
 
1005
- healthy = get_healthy_workers()
1006
- if not healthy:
 
 
1007
  cluster_stats["failed_requests"] += 1
1008
- raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
1009
 
1010
  request_data = {
1011
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
@@ -1017,12 +873,15 @@ async def chat(request: ChatRequest):
1017
  "stream": True
1018
  }
1019
 
1020
- print(f"💬 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
1021
 
1022
  try:
1023
- if mode == "light" and len(healthy) >= 2:
1024
- # DISTRIBUTED MODE - 1 gen + multiple decoders
1025
- generators, decoders = select_distributed_workers()
1026
  if decoders:
1027
  cluster_stats["successful_requests"] += 1
1028
  return StreamingResponse(
@@ -1030,8 +889,7 @@ async def chat(request: ChatRequest):
1030
  media_type="text/event-stream"
1031
  )
1032
 
1033
- # HEAVY/FALLBACK - single worker
1034
- worker = get_least_busy_worker()
1035
  cluster_stats["successful_requests"] += 1
1036
  return StreamingResponse(
1037
  heavy_load_generation(worker, request_data, "chat"),
 
1
  """
2
+ SAM-Z-1 Distributed Compute Cluster Head Node v5.0
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
5
+ - Auto-detects worker version (v4 vs v5)
6
+ - Supports 4 new models with backward compatibility
7
  """
8
 
9
  from fastapi import FastAPI, HTTPException, WebSocket
 
17
  from collections import deque
18
  import random
19
 
20
+ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
21
 
22
  # ============================================================================
23
  # Configuration
 
32
  "https://bc-ai-worker-5.hf.space"
33
  ]
34
 
35
+ HEALTH_CHECK_INTERVAL = 5
36
  LOAD_CHECK_WINDOW = 10
37
 
38
  LIGHT_LOAD_THRESHOLD = 2
39
  HEAVY_LOAD_THRESHOLD = 5
40
 
41
+ # New models added in v5
42
+ NEW_MODELS = [
43
+ "SAM-X-1-Large",
44
+ "SAM-X-1-Fast",
45
+ "SAM-X-1-Mini",
46
+ "SAM-X-1-Nano"
47
+ ]
48
+
49
+ # Worker state with version detection
50
  worker_health = {
51
  url: {
52
  "healthy": True,
 
55
  "total_requests": 0,
56
  "total_tokens": 0,
57
  "avg_latency": 0,
58
+ "role": "idle",
59
+ "version": None, # Will be auto-detected: "v4" or "v5"
60
+ "supports_models": [] # Models this worker supports
61
  } for url in WORKER_URLS
62
  }
63
 
64
  request_timestamps = deque(maxlen=100)
65
+ current_load_mode = "light"
66
  cluster_stats = {
67
  "total_requests": 0,
68
  "successful_requests": 0,
 
70
  "uptime_start": time.time()
71
  }
72
 
 
73
  active_connections = set()
74
 
75
  # ============================================================================
 
84
  top_p: float = 0.9
85
  repetition_penalty: float = 1.1
86
  stream: bool = True
87
+ model: Optional[str] = None # NEW: Model selection
88
 
89
  class ChatMessage(BaseModel):
90
  role: str
 
98
  top_p: float = 0.9
99
  repetition_penalty: float = 1.1
100
  stream: bool = True
101
+ model: Optional[str] = None # NEW: Model selection
102
+
103
+ # ============================================================================
104
+ # Worker Version Detection
105
+ # ============================================================================
106
+
107
async def detect_worker_version(worker_url: str) -> tuple:
    """Probe a worker to determine its protocol version and model list.

    Best-effort strategy (this function never raises):
      1. GET /info   -> v5 workers report {"version": ..., "models": [...]}.
      2. GET /models -> older v5 builds expose only the model list.
      3. Otherwise   -> assume a v4 worker (no model-selection support).

    Args:
        worker_url: Base URL of the worker (no trailing slash).

    Returns:
        (version, supported_models) where version is "v4" or "v5" and
        supported_models is a (possibly empty) list of model names.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Preferred probe: dedicated /info endpoint (v5 feature).
            try:
                response = await client.get(f"{worker_url}/info")
                if response.status_code == 200:
                    data = response.json()
                    version = data.get("version", "v5")
                    models = data.get("models", NEW_MODELS)
                    return version, models
            except Exception:
                # /info missing or malformed -- fall through to /models.
                pass

            # Fallback probe: bare /models listing (v5 feature).
            try:
                response = await client.get(f"{worker_url}/models")
                if response.status_code == 200:
                    data = response.json()
                    models = data.get("models", [])
                    if models:
                        return "v5", models
            except Exception:
                pass

            # Neither endpoint exists: worker is v4 (no model selection).
            return "v4", []

    except Exception as e:
        print(f"⚠️ Version detection failed for {worker_url}: {e}")
        return "v4", []
142
+
143
async def check_worker_health(worker_url: str) -> bool:
    """Return True if the worker's /health endpoint answers HTTP 200.

    Any failure (timeout, connection refused, DNS error, ...) counts as
    unhealthy; this probe must never raise.
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            response = await client.get(f"{worker_url}/health")
            return response.status_code == 200
    except Exception:
        return False
151
+
152
async def health_check_loop():
    """Background daemon: poll every worker, detect its version once,
    and push fresh stats to connected dashboard clients.

    Runs forever. Each cycle is wrapped in a try/except so a transient
    failure (e.g. during broadcast) cannot kill the daemon.
    """
    while True:
        try:
            for worker_url in WORKER_URLS:
                # Liveness probe.
                healthy = await check_worker_health(worker_url)
                worker_health[worker_url]["healthy"] = healthy
                worker_health[worker_url]["last_check"] = time.time()

                # One-shot version detection ("version" is None until detected).
                if worker_health[worker_url]["version"] is None:
                    version, models = await detect_worker_version(worker_url)
                    worker_health[worker_url]["version"] = version
                    worker_health[worker_url]["supports_models"] = models

                    status = "✅" if healthy else "❌"
                    print(f"{status} Worker: {worker_url.split('//')[1].split('.')[0]} | Version: {version} | Models: {len(models)}")

            await broadcast_stats()
        except Exception as e:
            # Keep the daemon alive across transient failures.
            print(f"⚠️ Health check cycle failed: {e}")
        await asyncio.sleep(HEALTH_CHECK_INTERVAL)
172
+
173
+ @app.on_event("startup")
174
+ async def startup_event():
175
+ asyncio.create_task(health_check_loop())
176
+
177
+ # ============================================================================
178
+ # Smart Worker Selection
179
+ # ============================================================================
180
+
181
def get_workers_for_model(model_name: Optional[str]) -> List[str]:
    """Return healthy workers able to serve *model_name*.

    v5 workers must list the model explicitly; v4 workers are always
    treated as compatible (they serve their built-in default model).
    When no model is requested -- or no compatible worker is found --
    every healthy worker is returned as a fallback.
    """
    healthy = get_healthy_workers()

    if not model_name:
        # No specific model requested: any healthy worker will do.
        return healthy

    def _can_serve(url: str) -> bool:
        info = worker_health[url]
        if info["version"] == "v4":
            # v4 workers ignore model selection but still work with the default.
            return True
        return info["version"] == "v5" and model_name in info["supports_models"]

    compatible = [url for url in healthy if _can_serve(url)]
    return compatible or healthy
203
+
204
def get_healthy_workers() -> List[str]:
    """Return the URLs of all workers currently marked healthy."""
    return [url for url, info in worker_health.items() if info["healthy"]]
206
+
207
def get_least_busy_worker(worker_list: Optional[List[str]] = None) -> Optional[str]:
    """Pick the worker with the fewest in-flight requests.

    Args:
        worker_list: Candidate worker URLs; defaults to all healthy workers.
            (Annotation fixed: the default is None, so the type is Optional.)

    Returns:
        The least-loaded worker URL, or None if there are no candidates.
    """
    workers = worker_list if worker_list is not None else get_healthy_workers()
    if not workers:
        return None
    return min(workers, key=lambda url: worker_health[url]["active_requests"])
212
+
213
def select_distributed_workers(model_name: Optional[str] = None) -> tuple:
    """Split model-compatible workers into generator and decoder pools.

    The least-busy worker becomes the single generator; the next (up to
    four) least-busy workers become decoders.

    Args:
        model_name: Optional model name used to filter compatible workers.

    Returns:
        (generators, decoders) as lists of worker URLs. With exactly one
        compatible worker: ([worker], []). With none: ([], []).
    """
    compatible = get_workers_for_model(model_name)

    if not compatible:
        return [], []
    if len(compatible) == 1:
        return [compatible[0]], []

    # Rank by current load. A single slice replaces the old 2/3/4/5-way
    # branch: the decoder pool is always workers 1..4 after the generator.
    ranked = sorted(compatible, key=lambda url: worker_health[url]["active_requests"])
    return [ranked[0]], ranked[1:5]
233
 
234
  # ============================================================================
235
  # Load Management
 
244
  load = get_current_load()
245
  healthy_count = len(get_healthy_workers())
246
 
 
247
  if healthy_count >= 5:
248
  if load <= LIGHT_LOAD_THRESHOLD:
249
+ current_load_mode = "light"
250
+ elif load <= HEAVY_LOAD_THRESHOLD:
251
+ current_load_mode = "medium"
252
  else:
253
+ current_load_mode = "heavy"
254
  elif healthy_count >= 3:
255
  if load <= 2:
256
+ current_load_mode = "light"
257
  else:
258
+ current_load_mode = "heavy"
259
  else:
260
+ current_load_mode = "heavy"
261
 
262
  return current_load_mode, load
263
 
 
265
  request_timestamps.append(time.time())
266
  cluster_stats["total_requests"] += 1
267
 
268
+ # ============================================================================
269
+ # Dashboard & WebSocket
270
+ # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  async def broadcast_stats():
 
273
  if not active_connections:
274
  return
275
 
 
282
  "load": load,
283
  "workers": [
284
  {
285
+ "url": url.split("//")[1].split(".")[0],
286
  "healthy": status["healthy"],
287
  "active": status["active_requests"],
288
  "total": status["total_requests"],
289
  "tokens": status["total_tokens"],
290
  "latency": round(status["avg_latency"], 2),
291
+ "role": status["role"],
292
+ "version": status["version"] or "detecting...",
293
+ "models": len(status["supports_models"])
294
  }
295
  for url, status in worker_health.items()
296
  ],
 
303
  }
304
  }
305
 
 
306
  disconnected = set()
307
  for ws in active_connections:
308
  try:
 
310
  except:
311
  disconnected.add(ws)
312
 
 
313
  active_connections.difference_update(disconnected)
314
 
315
+ @app.websocket("/ws")
316
+ async def websocket_endpoint(websocket: WebSocket):
317
+ await websocket.accept()
318
+ active_connections.add(websocket)
319
+
320
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  await broadcast_stats()
322
+ while True:
323
+ await websocket.receive_text()
324
+ except:
325
+ pass
326
+ finally:
327
+ active_connections.discard(websocket)
328
 
329
  # ============================================================================
330
+ # Distributed Generation
331
  # ============================================================================
332
 
333
  async def distributed_generation(
 
336
  request_data: dict,
337
  endpoint: str = "generate"
338
  ):
339
+ """Distributed compute with v4/v5 compatibility"""
 
 
 
 
340
 
341
  if not generators or not decoders:
342
  return
 
344
  token_queue = asyncio.Queue(maxsize=50)
345
  text_queue = asyncio.Queue(maxsize=50)
346
 
 
347
  for gen_url in generators:
348
  worker_health[gen_url]["role"] = "generator"
349
  for dec_url in decoders:
350
  worker_health[dec_url]["role"] = "decoder"
351
 
352
  async def generate_tokens():
353
+ gen_url = generators[0]
 
354
  try:
355
  worker_health[gen_url]["active_requests"] += 1
356
  request_data_tokens = {**request_data, "return_token_ids": True}
 
368
  if "token_id" in data:
369
  await token_queue.put(data["token_id"])
370
  elif "done" in data:
 
371
  for _ in decoders:
372
  await token_queue.put(None)
373
  break
 
382
  worker_health[gen_url]["role"] = "idle"
383
 
384
  async def decode_tokens(decoder_url: str, decoder_id: int):
 
385
  try:
386
  worker_health[decoder_url]["active_requests"] += 1
387
  batch = []
388
+ batch_size = 2
389
 
390
  while True:
391
  try:
392
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
393
 
394
  if token_id is None:
 
395
  if batch:
396
  async with httpx.AsyncClient(timeout=10.0) as client:
397
  response = await client.post(
 
407
 
408
  batch.append(token_id)
409
 
 
410
  if len(batch) >= batch_size:
411
  async with httpx.AsyncClient(timeout=10.0) as client:
412
  response = await client.post(
 
429
  worker_health[decoder_url]["active_requests"] -= 1
430
  worker_health[decoder_url]["role"] = "idle"
431
 
 
432
  gen_task = asyncio.create_task(generate_tokens())
 
 
433
  decoder_tasks = [
434
  asyncio.create_task(decode_tokens(dec_url, i))
435
  for i, dec_url in enumerate(decoders)
436
  ]
437
 
 
438
  accumulated_text = ""
439
  decoders_done = 0
440
  total_decoders = len(decoders)
 
480
  worker_health[worker_url]["role"] = "idle"
481
 
482
  # ============================================================================
483
+ # API Endpoints
484
  # ============================================================================
485
 
486
  @app.get("/", response_class=HTMLResponse)
487
  async def dashboard():
488
+ """Real-time dashboard with version info"""
489
  return """
490
  <!DOCTYPE html>
491
  <html>
492
  <head>
493
+ <title>SAM-Z-1 Cluster Control v5.0</title>
494
  <style>
495
+ * { margin: 0; padding: 0; box-sizing: border-box; }
 
 
 
 
 
496
  body {
497
  font-family: 'Courier New', monospace;
498
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
 
501
  overflow-x: hidden;
502
  overflow-y: auto;
503
  }
 
504
  .container {
505
  padding: 20px;
506
  max-width: 1400px;
507
  margin: 0 auto;
508
  padding-bottom: 40px;
509
  }
 
510
  .header {
511
  text-align: center;
512
  margin-bottom: 30px;
 
516
  border-radius: 10px;
517
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
518
  }
 
519
  .header h1 {
520
  font-size: 2.5em;
521
  text-transform: uppercase;
 
523
  text-shadow: 0 0 10px #00ff88;
524
  animation: glow 2s ease-in-out infinite alternate;
525
  }
 
526
  @keyframes glow {
527
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
528
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
529
  }
530
+ .version-badge {
531
+ display: inline-block;
532
+ padding: 3px 10px;
533
+ border-radius: 12px;
534
+ font-size: 10px;
535
+ margin-left: 8px;
536
+ font-weight: bold;
537
+ }
538
+ .version-v5 {
539
+ background: rgba(0, 255, 136, 0.2);
540
+ border: 1px solid #00ff88;
541
+ color: #00ff88;
542
+ }
543
+ .version-v4 {
544
+ background: rgba(255, 165, 0, 0.2);
545
+ border: 1px solid #ffa500;
546
+ color: #ffa500;
547
+ }
548
  .status-bar {
549
  display: flex;
550
  gap: 20px;
551
  margin-bottom: 30px;
552
  }
 
553
  .stat-card {
554
  flex: 1;
555
  background: rgba(0, 255, 136, 0.05);
 
559
  position: relative;
560
  overflow: hidden;
561
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  .stat-label {
563
  font-size: 0.8em;
564
  opacity: 0.7;
565
  text-transform: uppercase;
566
  }
 
567
  .stat-value {
568
  font-size: 2em;
569
  font-weight: bold;
570
  margin-top: 5px;
571
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  .workers-grid {
573
  display: grid;
574
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
575
  gap: 20px;
576
  margin-bottom: 30px;
577
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  .worker-card {
579
  background: rgba(10, 14, 39, 0.8);
580
  border: 2px solid #00ff88;
 
583
  position: relative;
584
  transition: all 0.3s;
585
  }
 
586
  .worker-card:hover {
587
  transform: translateY(-5px);
588
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
589
  }
 
590
  .worker-card.offline {
591
  border-color: #ff4444;
592
  opacity: 0.6;
593
  }
 
594
  .worker-header {
595
  display: flex;
596
  justify-content: space-between;
597
  align-items: center;
598
  margin-bottom: 15px;
599
  }
 
600
  .worker-name {
601
  font-size: 1.2em;
602
  font-weight: bold;
603
  }
 
604
  .status-dot {
605
  width: 12px;
606
  height: 12px;
607
  border-radius: 50%;
608
  animation: pulse 2s infinite;
609
  }
 
610
  .status-dot.online {
611
  background: #00ff88;
612
  box-shadow: 0 0 10px #00ff88;
613
  }
 
614
  .status-dot.offline {
615
  background: #ff4444;
616
  box-shadow: 0 0 10px #ff4444;
617
  }
 
618
  @keyframes pulse {
619
  0%, 100% { opacity: 1; }
620
  50% { opacity: 0.5; }
621
  }
 
622
  .worker-stats {
623
  display: grid;
624
  grid-template-columns: repeat(2, 1fr);
625
  gap: 10px;
626
  margin-top: 15px;
627
  }
 
628
  .worker-stat {
629
  background: rgba(0, 255, 136, 0.05);
630
  padding: 10px;
631
  border-radius: 5px;
632
  }
 
633
  .worker-stat-label {
634
  font-size: 0.7em;
635
  opacity: 0.7;
636
  }
 
637
  .worker-stat-value {
638
  font-size: 1.3em;
639
  font-weight: bold;
640
  margin-top: 3px;
641
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  .timestamp {
643
  text-align: center;
644
  margin-top: 20px;
645
  opacity: 0.5;
646
  font-size: 0.9em;
647
  }
648
+ @media (max-width: 768px) {
649
+ .workers-grid { grid-template-columns: 1fr; }
650
+ .status-bar { flex-direction: column; }
651
+ }
652
  </style>
653
  </head>
654
  <body>
655
  <div class="container">
656
  <div class="header">
657
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
658
+ <div>DISTRIBUTED COMPUTE SYSTEM v5.0 • AUTO VERSION DETECTION</div>
659
  </div>
660
 
661
  <div class="status-bar">
662
  <div class="stat-card">
663
  <div class="stat-label">Load Mode</div>
664
  <div class="stat-value" id="mode">--</div>
 
665
  </div>
666
  <div class="stat-card">
667
  <div class="stat-label">Current Load</div>
668
  <div class="stat-value" id="load">0</div>
 
669
  </div>
670
  <div class="stat-card">
671
  <div class="stat-label">Total Requests</div>
 
677
  </div>
678
  </div>
679
 
680
+ <div class="workers-grid" id="workers"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
 
682
  <div class="timestamp" id="timestamp">Last update: --</div>
683
  </div>
684
 
685
  <script>
 
686
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
687
  let ws;
 
688
 
689
  function connectWebSocket() {
690
  try {
691
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
692
 
693
+ ws.onopen = () => console.log('✅ WebSocket connected');
694
+ ws.onmessage = (event) => updateDashboard(JSON.parse(event.data));
695
+ ws.onerror = () => console.error('❌ WebSocket error');
696
+ ws.onclose = () => setTimeout(connectWebSocket, 3000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  } catch (e) {
698
+ console.error('Failed to connect WebSocket');
 
 
699
  }
700
  }
701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  connectWebSocket();
703
 
704
  function updateDashboard(data) {
 
705
  document.getElementById('mode').textContent = data.mode.toUpperCase();
 
 
 
 
 
706
  document.getElementById('load').textContent = data.load;
707
  document.getElementById('total-req').textContent = data.cluster.total_requests;
708
  document.getElementById('rps').textContent = data.cluster.rps;
 
 
 
709
 
 
710
  const workersDiv = document.getElementById('workers');
 
 
 
711
  workersDiv.innerHTML = data.workers.map(worker => `
712
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
713
  <div class="worker-header">
714
+ <div>
715
+ <div class="worker-name">${worker.url}</div>
716
+ <span class="version-badge version-${worker.version}">${worker.version.toUpperCase()}</span>
717
+ ${worker.models > 0 ? `<span style="font-size:0.8em;opacity:0.7;margin-left:5px">${worker.models} models</span>` : ''}
718
+ </div>
719
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
720
  </div>
 
721
  <div class="worker-stats">
722
  <div class="worker-stat">
723
  <div class="worker-stat-label">Active</div>
 
732
  <div class="worker-stat-value">${worker.tokens}</div>
733
  </div>
734
  <div class="worker-stat">
735
+ <div class="worker-stat-label">Role</div>
736
+ <div class="worker-stat-value" style="font-size:1em;">${worker.role}</div>
737
  </div>
738
  </div>
 
 
 
739
  </div>
740
  `).join('');
741
 
 
 
742
  document.getElementById('timestamp').textContent =
743
+ `Last update: ${new Date().toLocaleTimeString()}`;
 
 
 
 
 
 
 
744
  }
745
  </script>
746
  </body>
747
  </html>
748
  """
749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  @app.get("/api/status")
751
  async def api_status():
 
752
  mode, load = update_load_mode()
753
  healthy_count = len(get_healthy_workers())
754
 
755
+ # Count v4 vs v5 workers
756
+ v4_count = sum(1 for w in worker_health.values() if w["version"] == "v4")
757
+ v5_count = sum(1 for w in worker_health.values() if w["version"] == "v5")
758
+
759
  return {
760
  "name": "SAM-Z-1 Distributed Cluster",
761
+ "version": "5.0.0",
762
  "mode": mode,
763
  "current_load": load,
764
  "workers": len(WORKER_URLS),
765
  "healthy_workers": healthy_count,
766
+ "v4_workers": v4_count,
767
+ "v5_workers": v5_count,
768
+ "features": [
769
+ "distributed_compute",
770
+ "smart_load_balancing",
771
+ "auto_version_detection",
772
+ "multi_model_support",
773
+ "real_time_dashboard"
774
+ ]
775
  }
776
 
777
  @app.get("/health")
 
782
  "workers_healthy": healthy_count
783
  }
784
 
785
+ @app.get("/models")
786
+ async def list_models():
787
+ """List all available models across all workers"""
788
+ all_models = set()
789
+ for url, status in worker_health.items():
790
+ if status["healthy"] and status["version"] == "v5":
791
+ all_models.update(status["supports_models"])
792
+
793
+ return {
794
+ "models": sorted(list(all_models)),
795
+ "default": "SAM-X-1-Nano" if "SAM-X-1-Nano" in all_models else None
796
+ }
797
+
798
  @app.post("/v1/generate")
799
  async def generate(request: GenerateRequest):
800
+ """Generate text with automatic model routing"""
801
  track_request()
802
  mode, load = update_load_mode()
803
 
804
+ # Get compatible workers
805
+ compatible = get_workers_for_model(request.model)
806
+
807
+ if not compatible:
808
  cluster_stats["failed_requests"] += 1
809
+ raise HTTPException(
810
+ status_code=503,
811
+ detail=f"No workers available for model: {request.model or 'default'}"
812
+ )
813
 
814
  request_data = {
815
  "prompt": request.prompt,
 
821
  "stream": True
822
  }
823
 
824
+ # Add model parameter for v5 workers
825
+ if request.model:
826
+ request_data["model"] = request.model
827
+
828
+ print(f"🎯 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
829
 
830
  try:
831
+ if mode == "light" and len(compatible) >= 2:
832
+ generators, decoders = select_distributed_workers(request.model)
 
833
  if decoders:
834
  cluster_stats["successful_requests"] += 1
835
  return StreamingResponse(
 
837
  media_type="text/event-stream"
838
  )
839
 
840
+ worker = get_least_busy_worker(compatible)
 
841
  cluster_stats["successful_requests"] += 1
842
  return StreamingResponse(
843
  heavy_load_generation(worker, request_data, "generate"),
 
849
 
850
  @app.post("/v1/chat")
851
  async def chat(request: ChatRequest):
852
+ """Chat with automatic model routing"""
853
  track_request()
854
  mode, load = update_load_mode()
855
 
856
+ # Get compatible workers
857
+ compatible = get_workers_for_model(request.model)
858
+
859
+ if not compatible:
860
  cluster_stats["failed_requests"] += 1
861
+ raise HTTPException(
862
+ status_code=503,
863
+ detail=f"No workers available for model: {request.model or 'default'}"
864
+ )
865
 
866
  request_data = {
867
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
 
873
  "stream": True
874
  }
875
 
876
+ # Add model parameter for v5 workers
877
+ if request.model:
878
+ request_data["model"] = request.model
879
+
880
+ print(f"💬 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
881
 
882
  try:
883
+ if mode == "light" and len(compatible) >= 2:
884
+ generators, decoders = select_distributed_workers(request.model)
 
885
  if decoders:
886
  cluster_stats["successful_requests"] += 1
887
  return StreamingResponse(
 
889
  media_type="text/event-stream"
890
  )
891
 
892
+ worker = get_least_busy_worker(compatible)
 
893
  cluster_stats["successful_requests"] += 1
894
  return StreamingResponse(
895
  heavy_load_generation(worker, request_data, "chat"),