Bc-AI commited on
Commit
7f764e5
·
verified ·
1 Parent(s): 74ffe1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +424 -286
app.py CHANGED
@@ -1,9 +1,7 @@
1
  """
2
- SAM-Z-1 Distributed Compute Cluster Head Node v5.0
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
5
- - Auto-detects worker version (v4 vs v5)
6
- - Supports 4 new models with backward compatibility
7
  """
8
 
9
  from fastapi import FastAPI, HTTPException, WebSocket
@@ -17,7 +15,7 @@ from typing import List, Optional, Dict
17
  from collections import deque
18
  import random
19
 
20
- app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
21
 
22
  # ============================================================================
23
  # Configuration
@@ -26,27 +24,15 @@ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
26
  WORKER_URLS = [
27
  "https://bc-ai-worker-2.hf.space",
28
  "https://bc-ai-worker-sam-z-api.hf.space",
29
- "https://bc-ai-worker-sam-z-api.hf.space",
30
- "https://bc-ai-worker-3.hf.space",
31
- "https://bc-ai-worker-4.hf.space",
32
- "https://bc-ai-worker-5.hf.space"
33
  ]
34
 
35
- HEALTH_CHECK_INTERVAL = 5
36
  LOAD_CHECK_WINDOW = 10
37
 
38
  LIGHT_LOAD_THRESHOLD = 2
39
  HEAVY_LOAD_THRESHOLD = 5
40
 
41
- # New models added in v5
42
- NEW_MODELS = [
43
- "SAM-X-1-Large",
44
- "SAM-X-1-Fast",
45
- "SAM-X-1-Mini",
46
- "SAM-X-1-Nano"
47
- ]
48
-
49
- # Worker state with version detection
50
  worker_health = {
51
  url: {
52
  "healthy": True,
@@ -55,14 +41,12 @@ worker_health = {
55
  "total_requests": 0,
56
  "total_tokens": 0,
57
  "avg_latency": 0,
58
- "role": "idle",
59
- "version": None, # Will be auto-detected: "v4" or "v5"
60
- "supports_models": [] # Models this worker supports
61
  } for url in WORKER_URLS
62
  }
63
 
64
  request_timestamps = deque(maxlen=100)
65
- current_load_mode = "light"
66
  cluster_stats = {
67
  "total_requests": 0,
68
  "successful_requests": 0,
@@ -70,6 +54,7 @@ cluster_stats = {
70
  "uptime_start": time.time()
71
  }
72
 
 
73
  active_connections = set()
74
 
75
  # ============================================================================
@@ -84,7 +69,6 @@ class GenerateRequest(BaseModel):
84
  top_p: float = 0.9
85
  repetition_penalty: float = 1.1
86
  stream: bool = True
87
- model: Optional[str] = None # NEW: Model selection
88
 
89
  class ChatMessage(BaseModel):
90
  role: str
@@ -98,138 +82,6 @@ class ChatRequest(BaseModel):
98
  top_p: float = 0.9
99
  repetition_penalty: float = 1.1
100
  stream: bool = True
101
- model: Optional[str] = None # NEW: Model selection
102
-
103
- # ============================================================================
104
- # Worker Version Detection
105
- # ============================================================================
106
-
107
- async def detect_worker_version(worker_url: str) -> tuple:
108
- """
109
- Detect worker version and supported models
110
- Returns: (version: str, supported_models: List[str])
111
- """
112
- try:
113
- async with httpx.AsyncClient(timeout=10.0) as client:
114
- # Try to get worker info endpoint (v5 feature)
115
- try:
116
- response = await client.get(f"{worker_url}/info")
117
- if response.status_code == 200:
118
- data = response.json()
119
- version = data.get("version", "v5")
120
- models = data.get("models", NEW_MODELS)
121
- return version, models
122
- except:
123
- pass
124
-
125
- # Try to get models list (v5 feature)
126
- try:
127
- response = await client.get(f"{worker_url}/models")
128
- if response.status_code == 200:
129
- data = response.json()
130
- models = data.get("models", [])
131
- if models:
132
- return "v5", models
133
- except:
134
- pass
135
-
136
- # Fallback: worker is v4 (no model selection)
137
- return "v4", []
138
-
139
- except Exception as e:
140
- print(f"⚠️ Version detection failed for {worker_url}: {e}")
141
- return "v4", []
142
-
143
- async def check_worker_health(worker_url: str) -> bool:
144
- """Check if worker is healthy"""
145
- try:
146
- async with httpx.AsyncClient(timeout=5.0) as client:
147
- response = await client.get(f"{worker_url}/health")
148
- return response.status_code == 200
149
- except:
150
- return False
151
-
152
- async def health_check_loop():
153
- """Health check with version detection"""
154
- while True:
155
- for worker_url in WORKER_URLS:
156
- # Check health
157
- healthy = await check_worker_health(worker_url)
158
- worker_health[worker_url]["healthy"] = healthy
159
- worker_health[worker_url]["last_check"] = time.time()
160
-
161
- # Detect version if not yet detected
162
- if worker_health[worker_url]["version"] is None:
163
- version, models = await detect_worker_version(worker_url)
164
- worker_health[worker_url]["version"] = version
165
- worker_health[worker_url]["supports_models"] = models
166
-
167
- status = "✅" if healthy else "❌"
168
- print(f"{status} Worker: {worker_url.split('//')[1].split('.')[0]} | Version: {version} | Models: {len(models)}")
169
-
170
- await broadcast_stats()
171
- await asyncio.sleep(HEALTH_CHECK_INTERVAL)
172
-
173
- @app.on_event("startup")
174
- async def startup_event():
175
- asyncio.create_task(health_check_loop())
176
-
177
- # ============================================================================
178
- # Smart Worker Selection
179
- # ============================================================================
180
-
181
- def get_workers_for_model(model_name: Optional[str]) -> List[str]:
182
- """Get workers that support the requested model"""
183
- healthy = get_healthy_workers()
184
-
185
- if not model_name:
186
- # No specific model requested, use any healthy worker
187
- return healthy
188
-
189
- # Filter workers by model support
190
- compatible = []
191
- for url in healthy:
192
- version = worker_health[url]["version"]
193
- models = worker_health[url]["supports_models"]
194
-
195
- if version == "v5" and model_name in models:
196
- # v5 worker with explicit model support
197
- compatible.append(url)
198
- elif version == "v4":
199
- # v4 workers don't support model selection but work with default
200
- compatible.append(url)
201
-
202
- return compatible if compatible else healthy
203
-
204
- def get_healthy_workers() -> List[str]:
205
- return [url for url, status in worker_health.items() if status["healthy"]]
206
-
207
- def get_least_busy_worker(worker_list: List[str] = None) -> Optional[str]:
208
- workers = worker_list if worker_list is not None else get_healthy_workers()
209
- if not workers:
210
- return None
211
- return min(workers, key=lambda url: worker_health[url]["active_requests"])
212
-
213
- def select_distributed_workers(model_name: Optional[str] = None) -> tuple:
214
- """
215
- Select workers for distributed compute with model compatibility
216
- Returns: (generators: List[str], decoders: List[str])
217
- """
218
- compatible = get_workers_for_model(model_name)
219
-
220
- if len(compatible) < 2:
221
- return ([compatible[0]], []) if len(compatible) == 1 else ([], [])
222
-
223
- sorted_workers = sorted(compatible, key=lambda url: worker_health[url]["active_requests"])
224
-
225
- if len(compatible) >= 5:
226
- return ([sorted_workers[0]], sorted_workers[1:5])
227
- elif len(compatible) == 4:
228
- return ([sorted_workers[0]], sorted_workers[1:4])
229
- elif len(compatible) == 3:
230
- return ([sorted_workers[0]], sorted_workers[1:3])
231
- else:
232
- return ([sorted_workers[0]], [sorted_workers[1]])
233
 
234
  # ============================================================================
235
  # Load Management
@@ -244,20 +96,21 @@ def update_load_mode():
244
  load = get_current_load()
245
  healthy_count = len(get_healthy_workers())
246
 
 
247
  if healthy_count >= 5:
248
  if load <= LIGHT_LOAD_THRESHOLD:
249
- current_load_mode = "light"
250
- elif load <= HEAVY_LOAD_THRESHOLD:
251
- current_load_mode = "medium"
252
  else:
253
- current_load_mode = "heavy"
254
  elif healthy_count >= 3:
255
  if load <= 2:
256
- current_load_mode = "light"
257
  else:
258
- current_load_mode = "heavy"
259
  else:
260
- current_load_mode = "heavy"
261
 
262
  return current_load_mode, load
263
 
@@ -265,11 +118,42 @@ def track_request():
265
  request_timestamps.append(time.time())
266
  cluster_stats["total_requests"] += 1
267
 
268
- # ============================================================================
269
- # Dashboard & WebSocket
270
- # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  async def broadcast_stats():
 
273
  if not active_connections:
274
  return
275
 
@@ -282,15 +166,13 @@ async def broadcast_stats():
282
  "load": load,
283
  "workers": [
284
  {
285
- "url": url.split("//")[1].split(".")[0],
286
  "healthy": status["healthy"],
287
  "active": status["active_requests"],
288
  "total": status["total_requests"],
289
  "tokens": status["total_tokens"],
290
  "latency": round(status["avg_latency"], 2),
291
- "role": status["role"],
292
- "version": status["version"] or "detecting...",
293
- "models": len(status["supports_models"])
294
  }
295
  for url, status in worker_health.items()
296
  ],
@@ -303,6 +185,7 @@ async def broadcast_stats():
303
  }
304
  }
305
 
 
306
  disconnected = set()
307
  for ws in active_connections:
308
  try:
@@ -310,24 +193,36 @@ async def broadcast_stats():
310
  except:
311
  disconnected.add(ws)
312
 
 
313
  active_connections.difference_update(disconnected)
314
 
315
- @app.websocket("/ws")
316
- async def websocket_endpoint(websocket: WebSocket):
317
- await websocket.accept()
318
- active_connections.add(websocket)
319
-
320
  try:
321
- await broadcast_stats()
322
- while True:
323
- await websocket.receive_text()
324
  except:
325
- pass
326
- finally:
327
- active_connections.discard(websocket)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  # ============================================================================
330
- # Distributed Generation
331
  # ============================================================================
332
 
333
  async def distributed_generation(
@@ -336,7 +231,11 @@ async def distributed_generation(
336
  request_data: dict,
337
  endpoint: str = "generate"
338
  ):
339
- """Distributed compute with v4/v5 compatibility"""
 
 
 
 
340
 
341
  if not generators or not decoders:
342
  return
@@ -344,13 +243,15 @@ async def distributed_generation(
344
  token_queue = asyncio.Queue(maxsize=50)
345
  text_queue = asyncio.Queue(maxsize=50)
346
 
 
347
  for gen_url in generators:
348
  worker_health[gen_url]["role"] = "generator"
349
  for dec_url in decoders:
350
  worker_health[dec_url]["role"] = "decoder"
351
 
352
  async def generate_tokens():
353
- gen_url = generators[0]
 
354
  try:
355
  worker_health[gen_url]["active_requests"] += 1
356
  request_data_tokens = {**request_data, "return_token_ids": True}
@@ -368,6 +269,7 @@ async def distributed_generation(
368
  if "token_id" in data:
369
  await token_queue.put(data["token_id"])
370
  elif "done" in data:
 
371
  for _ in decoders:
372
  await token_queue.put(None)
373
  break
@@ -382,16 +284,18 @@ async def distributed_generation(
382
  worker_health[gen_url]["role"] = "idle"
383
 
384
  async def decode_tokens(decoder_url: str, decoder_id: int):
 
385
  try:
386
  worker_health[decoder_url]["active_requests"] += 1
387
  batch = []
388
- batch_size = 2
389
 
390
  while True:
391
  try:
392
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
393
 
394
  if token_id is None:
 
395
  if batch:
396
  async with httpx.AsyncClient(timeout=10.0) as client:
397
  response = await client.post(
@@ -407,6 +311,7 @@ async def distributed_generation(
407
 
408
  batch.append(token_id)
409
 
 
410
  if len(batch) >= batch_size:
411
  async with httpx.AsyncClient(timeout=10.0) as client:
412
  response = await client.post(
@@ -429,12 +334,16 @@ async def distributed_generation(
429
  worker_health[decoder_url]["active_requests"] -= 1
430
  worker_health[decoder_url]["role"] = "idle"
431
 
 
432
  gen_task = asyncio.create_task(generate_tokens())
 
 
433
  decoder_tasks = [
434
  asyncio.create_task(decode_tokens(dec_url, i))
435
  for i, dec_url in enumerate(decoders)
436
  ]
437
 
 
438
  accumulated_text = ""
439
  decoders_done = 0
440
  total_decoders = len(decoders)
@@ -480,19 +389,24 @@ async def heavy_load_generation(worker_url: str, request_data: dict, endpoint: s
480
  worker_health[worker_url]["role"] = "idle"
481
 
482
  # ============================================================================
483
- # API Endpoints
484
  # ============================================================================
485
 
486
  @app.get("/", response_class=HTMLResponse)
487
  async def dashboard():
488
- """Real-time dashboard with version info"""
489
  return """
490
  <!DOCTYPE html>
491
  <html>
492
  <head>
493
- <title>SAM-Z-1 Cluster Control v5.0</title>
494
  <style>
495
- * { margin: 0; padding: 0; box-sizing: border-box; }
 
 
 
 
 
496
  body {
497
  font-family: 'Courier New', monospace;
498
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
@@ -501,12 +415,14 @@ async def dashboard():
501
  overflow-x: hidden;
502
  overflow-y: auto;
503
  }
 
504
  .container {
505
  padding: 20px;
506
  max-width: 1400px;
507
  margin: 0 auto;
508
  padding-bottom: 40px;
509
  }
 
510
  .header {
511
  text-align: center;
512
  margin-bottom: 30px;
@@ -516,6 +432,7 @@ async def dashboard():
516
  border-radius: 10px;
517
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
518
  }
 
519
  .header h1 {
520
  font-size: 2.5em;
521
  text-transform: uppercase;
@@ -523,33 +440,18 @@ async def dashboard():
523
  text-shadow: 0 0 10px #00ff88;
524
  animation: glow 2s ease-in-out infinite alternate;
525
  }
 
526
  @keyframes glow {
527
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
528
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
529
  }
530
- .version-badge {
531
- display: inline-block;
532
- padding: 3px 10px;
533
- border-radius: 12px;
534
- font-size: 10px;
535
- margin-left: 8px;
536
- font-weight: bold;
537
- }
538
- .version-v5 {
539
- background: rgba(0, 255, 136, 0.2);
540
- border: 1px solid #00ff88;
541
- color: #00ff88;
542
- }
543
- .version-v4 {
544
- background: rgba(255, 165, 0, 0.2);
545
- border: 1px solid #ffa500;
546
- color: #ffa500;
547
- }
548
  .status-bar {
549
  display: flex;
550
  gap: 20px;
551
  margin-bottom: 30px;
552
  }
 
553
  .stat-card {
554
  flex: 1;
555
  background: rgba(0, 255, 136, 0.05);
@@ -559,22 +461,83 @@ async def dashboard():
559
  position: relative;
560
  overflow: hidden;
561
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  .stat-label {
563
  font-size: 0.8em;
564
  opacity: 0.7;
565
  text-transform: uppercase;
566
  }
 
567
  .stat-value {
568
  font-size: 2em;
569
  font-weight: bold;
570
  margin-top: 5px;
571
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  .workers-grid {
573
  display: grid;
574
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
575
  gap: 20px;
576
  margin-bottom: 30px;
577
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  .worker-card {
579
  background: rgba(10, 14, 39, 0.8);
580
  border: 2px solid #00ff88;
@@ -583,89 +546,166 @@ async def dashboard():
583
  position: relative;
584
  transition: all 0.3s;
585
  }
 
586
  .worker-card:hover {
587
  transform: translateY(-5px);
588
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
589
  }
 
590
  .worker-card.offline {
591
  border-color: #ff4444;
592
  opacity: 0.6;
593
  }
 
594
  .worker-header {
595
  display: flex;
596
  justify-content: space-between;
597
  align-items: center;
598
  margin-bottom: 15px;
599
  }
 
600
  .worker-name {
601
  font-size: 1.2em;
602
  font-weight: bold;
603
  }
 
604
  .status-dot {
605
  width: 12px;
606
  height: 12px;
607
  border-radius: 50%;
608
  animation: pulse 2s infinite;
609
  }
 
610
  .status-dot.online {
611
  background: #00ff88;
612
  box-shadow: 0 0 10px #00ff88;
613
  }
 
614
  .status-dot.offline {
615
  background: #ff4444;
616
  box-shadow: 0 0 10px #ff4444;
617
  }
 
618
  @keyframes pulse {
619
  0%, 100% { opacity: 1; }
620
  50% { opacity: 0.5; }
621
  }
 
622
  .worker-stats {
623
  display: grid;
624
  grid-template-columns: repeat(2, 1fr);
625
  gap: 10px;
626
  margin-top: 15px;
627
  }
 
628
  .worker-stat {
629
  background: rgba(0, 255, 136, 0.05);
630
  padding: 10px;
631
  border-radius: 5px;
632
  }
 
633
  .worker-stat-label {
634
  font-size: 0.7em;
635
  opacity: 0.7;
636
  }
 
637
  .worker-stat-value {
638
  font-size: 1.3em;
639
  font-weight: bold;
640
  margin-top: 3px;
641
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  .timestamp {
643
  text-align: center;
644
  margin-top: 20px;
645
  opacity: 0.5;
646
  font-size: 0.9em;
647
  }
648
- @media (max-width: 768px) {
649
- .workers-grid { grid-template-columns: 1fr; }
650
- .status-bar { flex-direction: column; }
651
- }
652
  </style>
653
  </head>
654
  <body>
655
  <div class="container">
656
  <div class="header">
657
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
658
- <div>DISTRIBUTED COMPUTE SYSTEM v5.0 • AUTO VERSION DETECTION</div>
659
  </div>
660
 
661
  <div class="status-bar">
662
  <div class="stat-card">
663
  <div class="stat-label">Load Mode</div>
664
  <div class="stat-value" id="mode">--</div>
 
665
  </div>
666
  <div class="stat-card">
667
  <div class="stat-label">Current Load</div>
668
  <div class="stat-value" id="load">0</div>
 
669
  </div>
670
  <div class="stat-card">
671
  <div class="stat-label">Total Requests</div>
@@ -677,47 +717,149 @@ async def dashboard():
677
  </div>
678
  </div>
679
 
680
- <div class="workers-grid" id="workers"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
 
682
  <div class="timestamp" id="timestamp">Last update: --</div>
683
  </div>
684
 
685
  <script>
 
686
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
687
  let ws;
 
688
 
689
  function connectWebSocket() {
690
  try {
691
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
692
 
693
- ws.onopen = () => console.log('✅ WebSocket connected');
694
- ws.onmessage = (event) => updateDashboard(JSON.parse(event.data));
695
- ws.onerror = () => console.error('❌ WebSocket error');
696
- ws.onclose = () => setTimeout(connectWebSocket, 3000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  } catch (e) {
698
- console.error('Failed to connect WebSocket');
 
 
699
  }
700
  }
701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  connectWebSocket();
703
 
704
  function updateDashboard(data) {
 
705
  document.getElementById('mode').textContent = data.mode.toUpperCase();
 
 
 
 
 
706
  document.getElementById('load').textContent = data.load;
707
  document.getElementById('total-req').textContent = data.cluster.total_requests;
708
  document.getElementById('rps').textContent = data.cluster.rps;
 
 
 
709
 
 
710
  const workersDiv = document.getElementById('workers');
 
 
 
711
  workersDiv.innerHTML = data.workers.map(worker => `
712
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
713
  <div class="worker-header">
714
- <div>
715
- <div class="worker-name">${worker.url}</div>
716
- <span class="version-badge version-${worker.version}">${worker.version.toUpperCase()}</span>
717
- ${worker.models > 0 ? `<span style="font-size:0.8em;opacity:0.7;margin-left:5px">${worker.models} models</span>` : ''}
718
- </div>
719
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
720
  </div>
 
721
  <div class="worker-stats">
722
  <div class="worker-stat">
723
  <div class="worker-stat-label">Active</div>
@@ -732,46 +874,69 @@ async def dashboard():
732
  <div class="worker-stat-value">${worker.tokens}</div>
733
  </div>
734
  <div class="worker-stat">
735
- <div class="worker-stat-label">Role</div>
736
- <div class="worker-stat-value" style="font-size:1em;">${worker.role}</div>
737
  </div>
738
  </div>
 
 
 
739
  </div>
740
  `).join('');
741
 
 
 
742
  document.getElementById('timestamp').textContent =
743
- `Last update: ${new Date().toLocaleTimeString()}`;
 
 
 
 
 
 
 
744
  }
745
  </script>
746
  </body>
747
  </html>
748
  """
749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  @app.get("/api/status")
751
  async def api_status():
 
752
  mode, load = update_load_mode()
753
  healthy_count = len(get_healthy_workers())
754
 
755
- # Count v4 vs v5 workers
756
- v4_count = sum(1 for w in worker_health.values() if w["version"] == "v4")
757
- v5_count = sum(1 for w in worker_health.values() if w["version"] == "v5")
758
-
759
  return {
760
  "name": "SAM-Z-1 Distributed Cluster",
761
- "version": "5.0.0",
762
  "mode": mode,
763
  "current_load": load,
764
  "workers": len(WORKER_URLS),
765
  "healthy_workers": healthy_count,
766
- "v4_workers": v4_count,
767
- "v5_workers": v5_count,
768
- "features": [
769
- "distributed_compute",
770
- "smart_load_balancing",
771
- "auto_version_detection",
772
- "multi_model_support",
773
- "real_time_dashboard"
774
- ]
775
  }
776
 
777
  @app.get("/health")
@@ -782,34 +947,16 @@ async def health():
782
  "workers_healthy": healthy_count
783
  }
784
 
785
- @app.get("/models")
786
- async def list_models():
787
- """List all available models across all workers"""
788
- all_models = set()
789
- for url, status in worker_health.items():
790
- if status["healthy"] and status["version"] == "v5":
791
- all_models.update(status["supports_models"])
792
-
793
- return {
794
- "models": sorted(list(all_models)),
795
- "default": "SAM-X-1-Nano" if "SAM-X-1-Nano" in all_models else None
796
- }
797
-
798
  @app.post("/v1/generate")
799
  async def generate(request: GenerateRequest):
800
- """Generate text with automatic model routing"""
801
  track_request()
802
  mode, load = update_load_mode()
803
 
804
- # Get compatible workers
805
- compatible = get_workers_for_model(request.model)
806
-
807
- if not compatible:
808
  cluster_stats["failed_requests"] += 1
809
- raise HTTPException(
810
- status_code=503,
811
- detail=f"No workers available for model: {request.model or 'default'}"
812
- )
813
 
814
  request_data = {
815
  "prompt": request.prompt,
@@ -821,15 +968,12 @@ async def generate(request: GenerateRequest):
821
  "stream": True
822
  }
823
 
824
- # Add model parameter for v5 workers
825
- if request.model:
826
- request_data["model"] = request.model
827
-
828
- print(f"🎯 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
829
 
830
  try:
831
- if mode == "light" and len(compatible) >= 2:
832
- generators, decoders = select_distributed_workers(request.model)
 
833
  if decoders:
834
  cluster_stats["successful_requests"] += 1
835
  return StreamingResponse(
@@ -837,7 +981,8 @@ async def generate(request: GenerateRequest):
837
  media_type="text/event-stream"
838
  )
839
 
840
- worker = get_least_busy_worker(compatible)
 
841
  cluster_stats["successful_requests"] += 1
842
  return StreamingResponse(
843
  heavy_load_generation(worker, request_data, "generate"),
@@ -849,19 +994,14 @@ async def generate(request: GenerateRequest):
849
 
850
  @app.post("/v1/chat")
851
  async def chat(request: ChatRequest):
852
- """Chat with automatic model routing"""
853
  track_request()
854
  mode, load = update_load_mode()
855
 
856
- # Get compatible workers
857
- compatible = get_workers_for_model(request.model)
858
-
859
- if not compatible:
860
  cluster_stats["failed_requests"] += 1
861
- raise HTTPException(
862
- status_code=503,
863
- detail=f"No workers available for model: {request.model or 'default'}"
864
- )
865
 
866
  request_data = {
867
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
@@ -873,15 +1013,12 @@ async def chat(request: ChatRequest):
873
  "stream": True
874
  }
875
 
876
- # Add model parameter for v5 workers
877
- if request.model:
878
- request_data["model"] = request.model
879
-
880
- print(f"💬 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
881
 
882
  try:
883
- if mode == "light" and len(compatible) >= 2:
884
- generators, decoders = select_distributed_workers(request.model)
 
885
  if decoders:
886
  cluster_stats["successful_requests"] += 1
887
  return StreamingResponse(
@@ -889,7 +1026,8 @@ async def chat(request: ChatRequest):
889
  media_type="text/event-stream"
890
  )
891
 
892
- worker = get_least_busy_worker(compatible)
 
893
  cluster_stats["successful_requests"] += 1
894
  return StreamingResponse(
895
  heavy_load_generation(worker, request_data, "chat"),
 
1
  """
2
+ SAM-Z-1 Distributed Compute Cluster Head Node
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
 
 
5
  """
6
 
7
  from fastapi import FastAPI, HTTPException, WebSocket
 
15
  from collections import deque
16
  import random
17
 
18
+ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="4.0.0")
19
 
20
  # ============================================================================
21
  # Configuration
 
24
  WORKER_URLS = [
25
  "https://bc-ai-worker-2.hf.space",
26
  "https://bc-ai-worker-sam-z-api.hf.space",
 
 
 
 
27
  ]
28
 
29
+ HEALTH_CHECK_INTERVAL = 5 # faster checks for real-time dashboard
30
  LOAD_CHECK_WINDOW = 10
31
 
32
  LIGHT_LOAD_THRESHOLD = 2
33
  HEAVY_LOAD_THRESHOLD = 5
34
 
35
+ # Worker state
 
 
 
 
 
 
 
 
36
  worker_health = {
37
  url: {
38
  "healthy": True,
 
41
  "total_requests": 0,
42
  "total_tokens": 0,
43
  "avg_latency": 0,
44
+ "role": "idle" # "generator", "decoder", "full", "idle"
 
 
45
  } for url in WORKER_URLS
46
  }
47
 
48
  request_timestamps = deque(maxlen=100)
49
+ current_load_mode = "light" # "light", "medium", "heavy"
50
  cluster_stats = {
51
  "total_requests": 0,
52
  "successful_requests": 0,
 
54
  "uptime_start": time.time()
55
  }
56
 
57
+ # Active WebSocket connections for real-time updates
58
  active_connections = set()
59
 
60
  # ============================================================================
 
69
  top_p: float = 0.9
70
  repetition_penalty: float = 1.1
71
  stream: bool = True
 
72
 
73
  class ChatMessage(BaseModel):
74
  role: str
 
82
  top_p: float = 0.9
83
  repetition_penalty: float = 1.1
84
  stream: bool = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ============================================================================
87
  # Load Management
 
96
  load = get_current_load()
97
  healthy_count = len(get_healthy_workers())
98
 
99
+ # Adjust thresholds based on available workers
100
  if healthy_count >= 5:
101
  if load <= LIGHT_LOAD_THRESHOLD:
102
+ current_load_mode = "light" # 1 gen + 4 decoders
103
+ elif load <= MEDIUM_LOAD_THRESHOLD:
104
+ current_load_mode = "medium" # 2 gens + 3 decoders OR parallel requests
105
  else:
106
+ current_load_mode = "heavy" # all workers independent
107
  elif healthy_count >= 3:
108
  if load <= 2:
109
+ current_load_mode = "light" # 1 gen + 2 decoders
110
  else:
111
+ current_load_mode = "heavy" # distribute requests
112
  else:
113
+ current_load_mode = "heavy" # fallback to simple distribution
114
 
115
  return current_load_mode, load
116
 
 
118
  request_timestamps.append(time.time())
119
  cluster_stats["total_requests"] += 1
120
 
121
+ def get_healthy_workers() -> List[str]:
122
+ return [url for url, status in worker_health.items() if status["healthy"]]
123
+
124
+ def get_least_busy_worker() -> Optional[str]:
125
+ healthy = get_healthy_workers()
126
+ if not healthy:
127
+ return None
128
+ return min(healthy, key=lambda url: worker_health[url]["active_requests"])
129
+
130
+ def select_distributed_workers() -> tuple:
131
+ """
132
+ Select workers for distributed compute
133
+ Returns: (generators: List[str], decoders: List[str])
134
+ """
135
+ healthy = get_healthy_workers()
136
+ if len(healthy) < 2:
137
+ return ([healthy[0]], []) if len(healthy) == 1 else ([], [])
138
+
139
+ # Sort by least busy
140
+ sorted_workers = sorted(healthy, key=lambda url: worker_health[url]["active_requests"])
141
+
142
+ if len(healthy) >= 5:
143
+ # OPTIMAL: 1 generator, 4 decoders
144
+ return ([sorted_workers[0]], sorted_workers[1:5])
145
+ elif len(healthy) == 4:
146
+ # 1 generator, 3 decoders
147
+ return ([sorted_workers[0]], sorted_workers[1:4])
148
+ elif len(healthy) == 3:
149
+ # 1 generator, 2 decoders
150
+ return ([sorted_workers[0]], sorted_workers[1:3])
151
+ else:
152
+ # 1 generator, 1 decoder
153
+ return ([sorted_workers[0]], [sorted_workers[1]])
154
 
155
  async def broadcast_stats():
156
+ """Broadcast stats to all connected WebSocket clients"""
157
  if not active_connections:
158
  return
159
 
 
166
  "load": load,
167
  "workers": [
168
  {
169
+ "url": url.split("//")[1].split(".")[0], # shorter name
170
  "healthy": status["healthy"],
171
  "active": status["active_requests"],
172
  "total": status["total_requests"],
173
  "tokens": status["total_tokens"],
174
  "latency": round(status["avg_latency"], 2),
175
+ "role": status["role"]
 
 
176
  }
177
  for url, status in worker_health.items()
178
  ],
 
185
  }
186
  }
187
 
188
+ # Broadcast to all connections
189
  disconnected = set()
190
  for ws in active_connections:
191
  try:
 
193
  except:
194
  disconnected.add(ws)
195
 
196
+ # Remove disconnected
197
  active_connections.difference_update(disconnected)
198
 
199
+ async def check_worker_health(worker_url: str) -> bool:
 
 
 
 
200
  try:
201
+ async with httpx.AsyncClient(timeout=5.0) as client:
202
+ response = await client.get(f"{worker_url}/health")
203
+ return response.status_code == 200
204
  except:
205
+ return False
206
+
207
+ async def health_check_loop():
208
+ while True:
209
+ # Check all workers
210
+ for worker_url in WORKER_URLS:
211
+ healthy = await check_worker_health(worker_url)
212
+ worker_health[worker_url]["healthy"] = healthy
213
+ worker_health[worker_url]["last_check"] = time.time()
214
+
215
+ # Always broadcast stats to connected clients
216
+ await broadcast_stats()
217
+
218
+ await asyncio.sleep(HEALTH_CHECK_INTERVAL)
219
+
220
+ @app.on_event("startup")
221
+ async def startup_event():
222
+ asyncio.create_task(health_check_loop())
223
 
224
  # ============================================================================
225
+ # Distributed Compute Generation
226
  # ============================================================================
227
 
228
  async def distributed_generation(
 
231
  request_data: dict,
232
  endpoint: str = "generate"
233
  ):
234
+ """
235
+ DISTRIBUTED COMPUTE MODE
236
+ - Generator(s) produce token IDs
237
+ - Multiple decoders process in parallel (load balanced)
238
+ """
239
 
240
  if not generators or not decoders:
241
  return
 
243
  token_queue = asyncio.Queue(maxsize=50)
244
  text_queue = asyncio.Queue(maxsize=50)
245
 
246
+ # Mark roles
247
  for gen_url in generators:
248
  worker_health[gen_url]["role"] = "generator"
249
  for dec_url in decoders:
250
  worker_health[dec_url]["role"] = "decoder"
251
 
252
  async def generate_tokens():
253
+ """Generator worker(s)"""
254
+ gen_url = generators[0] # primary generator
255
  try:
256
  worker_health[gen_url]["active_requests"] += 1
257
  request_data_tokens = {**request_data, "return_token_ids": True}
 
269
  if "token_id" in data:
270
  await token_queue.put(data["token_id"])
271
  elif "done" in data:
272
+ # Send done signal for each decoder
273
  for _ in decoders:
274
  await token_queue.put(None)
275
  break
 
284
  worker_health[gen_url]["role"] = "idle"
285
 
286
  async def decode_tokens(decoder_url: str, decoder_id: int):
287
+ """Decoder worker - processes tokens from shared queue"""
288
  try:
289
  worker_health[decoder_url]["active_requests"] += 1
290
  batch = []
291
+ batch_size = 2 # smaller batches for faster streaming
292
 
293
  while True:
294
  try:
295
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
296
 
297
  if token_id is None:
298
+ # Decode remaining batch
299
  if batch:
300
  async with httpx.AsyncClient(timeout=10.0) as client:
301
  response = await client.post(
 
311
 
312
  batch.append(token_id)
313
 
314
+ # Decode when batch is full
315
  if len(batch) >= batch_size:
316
  async with httpx.AsyncClient(timeout=10.0) as client:
317
  response = await client.post(
 
334
  worker_health[decoder_url]["active_requests"] -= 1
335
  worker_health[decoder_url]["role"] = "idle"
336
 
337
+ # Start generator
338
  gen_task = asyncio.create_task(generate_tokens())
339
+
340
+ # Start all decoders
341
  decoder_tasks = [
342
  asyncio.create_task(decode_tokens(dec_url, i))
343
  for i, dec_url in enumerate(decoders)
344
  ]
345
 
346
+ # Stream results
347
  accumulated_text = ""
348
  decoders_done = 0
349
  total_decoders = len(decoders)
 
389
  worker_health[worker_url]["role"] = "idle"
390
 
391
  # ============================================================================
392
+ # Dashboard
393
  # ============================================================================
394
 
395
  @app.get("/", response_class=HTMLResponse)
396
  async def dashboard():
397
+ """Real-time futuristic dashboard"""
398
  return """
399
  <!DOCTYPE html>
400
  <html>
401
  <head>
402
+ <title>SAM-Z-1 Cluster Control</title>
403
  <style>
404
+ * {
405
+ margin: 0;
406
+ padding: 0;
407
+ box-sizing: border-box;
408
+ }
409
+
410
  body {
411
  font-family: 'Courier New', monospace;
412
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
 
415
  overflow-x: hidden;
416
  overflow-y: auto;
417
  }
418
+
419
  .container {
420
  padding: 20px;
421
  max-width: 1400px;
422
  margin: 0 auto;
423
  padding-bottom: 40px;
424
  }
425
+
426
  .header {
427
  text-align: center;
428
  margin-bottom: 30px;
 
432
  border-radius: 10px;
433
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
434
  }
435
+
436
  .header h1 {
437
  font-size: 2.5em;
438
  text-transform: uppercase;
 
440
  text-shadow: 0 0 10px #00ff88;
441
  animation: glow 2s ease-in-out infinite alternate;
442
  }
443
+
444
  @keyframes glow {
445
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
446
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
447
  }
448
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  .status-bar {
450
  display: flex;
451
  gap: 20px;
452
  margin-bottom: 30px;
453
  }
454
+
455
  .stat-card {
456
  flex: 1;
457
  background: rgba(0, 255, 136, 0.05);
 
461
  position: relative;
462
  overflow: hidden;
463
  }
464
+
465
+ .stat-card::before {
466
+ content: '';
467
+ position: absolute;
468
+ top: 0;
469
+ left: -100%;
470
+ width: 100%;
471
+ height: 100%;
472
+ background: linear-gradient(90deg, transparent, rgba(0, 255, 136, 0.2), transparent);
473
+ animation: scan 3s infinite;
474
+ }
475
+
476
+ @keyframes scan {
477
+ 0% { left: -100%; }
478
+ 100% { left: 100%; }
479
+ }
480
+
481
  .stat-label {
482
  font-size: 0.8em;
483
  opacity: 0.7;
484
  text-transform: uppercase;
485
  }
486
+
487
  .stat-value {
488
  font-size: 2em;
489
  font-weight: bold;
490
  margin-top: 5px;
491
  }
492
+
493
+ .mode-badge {
494
+ display: inline-block;
495
+ padding: 5px 15px;
496
+ border-radius: 20px;
497
+ font-size: 0.9em;
498
+ font-weight: bold;
499
+ text-transform: uppercase;
500
+ margin-top: 10px;
501
+ }
502
+
503
+ .mode-light {
504
+ background: rgba(0, 255, 136, 0.2);
505
+ border: 1px solid #00ff88;
506
+ color: #00ff88;
507
+ }
508
+
509
+ .mode-heavy {
510
+ background: rgba(255, 68, 68, 0.2);
511
+ border: 1px solid #ff4444;
512
+ color: #ff4444;
513
+ }
514
+
515
  .workers-grid {
516
  display: grid;
517
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
518
  gap: 20px;
519
  margin-bottom: 30px;
520
  }
521
+
522
+ @media (max-width: 768px) {
523
+ .workers-grid {
524
+ grid-template-columns: 1fr;
525
+ }
526
+
527
+ .status-bar {
528
+ flex-direction: column;
529
+ }
530
+
531
+ .info-grid {
532
+ grid-template-columns: repeat(2, 1fr);
533
+ }
534
+
535
+ .header h1 {
536
+ font-size: 1.5em;
537
+ letter-spacing: 2px;
538
+ }
539
+ }
540
+
541
  .worker-card {
542
  background: rgba(10, 14, 39, 0.8);
543
  border: 2px solid #00ff88;
 
546
  position: relative;
547
  transition: all 0.3s;
548
  }
549
+
550
  .worker-card:hover {
551
  transform: translateY(-5px);
552
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
553
  }
554
+
555
  .worker-card.offline {
556
  border-color: #ff4444;
557
  opacity: 0.6;
558
  }
559
+
560
  .worker-header {
561
  display: flex;
562
  justify-content: space-between;
563
  align-items: center;
564
  margin-bottom: 15px;
565
  }
566
+
567
  .worker-name {
568
  font-size: 1.2em;
569
  font-weight: bold;
570
  }
571
+
572
  .status-dot {
573
  width: 12px;
574
  height: 12px;
575
  border-radius: 50%;
576
  animation: pulse 2s infinite;
577
  }
578
+
579
  .status-dot.online {
580
  background: #00ff88;
581
  box-shadow: 0 0 10px #00ff88;
582
  }
583
+
584
  .status-dot.offline {
585
  background: #ff4444;
586
  box-shadow: 0 0 10px #ff4444;
587
  }
588
+
589
  @keyframes pulse {
590
  0%, 100% { opacity: 1; }
591
  50% { opacity: 0.5; }
592
  }
593
+
594
  .worker-stats {
595
  display: grid;
596
  grid-template-columns: repeat(2, 1fr);
597
  gap: 10px;
598
  margin-top: 15px;
599
  }
600
+
601
  .worker-stat {
602
  background: rgba(0, 255, 136, 0.05);
603
  padding: 10px;
604
  border-radius: 5px;
605
  }
606
+
607
  .worker-stat-label {
608
  font-size: 0.7em;
609
  opacity: 0.7;
610
  }
611
+
612
  .worker-stat-value {
613
  font-size: 1.3em;
614
  font-weight: bold;
615
  margin-top: 3px;
616
  }
617
+
618
+ .role-badge {
619
+ display: inline-block;
620
+ padding: 3px 10px;
621
+ border-radius: 12px;
622
+ font-size: 0.75em;
623
+ margin-top: 10px;
624
+ font-weight: bold;
625
+ }
626
+
627
+ .role-generator {
628
+ background: rgba(255, 165, 0, 0.2);
629
+ border: 1px solid #ffa500;
630
+ color: #ffa500;
631
+ }
632
+
633
+ .role-decoder {
634
+ background: rgba(0, 191, 255, 0.2);
635
+ border: 1px solid #00bfff;
636
+ color: #00bfff;
637
+ }
638
+
639
+ .role-full {
640
+ background: rgba(138, 43, 226, 0.2);
641
+ border: 1px solid #8a2be2;
642
+ color: #8a2be2;
643
+ }
644
+
645
+ .role-idle {
646
+ background: rgba(128, 128, 128, 0.2);
647
+ border: 1px solid #808080;
648
+ color: #808080;
649
+ }
650
+
651
+ .progress-bar {
652
+ width: 100%;
653
+ height: 4px;
654
+ background: rgba(0, 255, 136, 0.1);
655
+ border-radius: 2px;
656
+ margin-top: 10px;
657
+ overflow: hidden;
658
+ }
659
+
660
+ .progress-fill {
661
+ height: 100%;
662
+ background: linear-gradient(90deg, #00ff88, #00ffff);
663
+ transition: width 0.3s;
664
+ box-shadow: 0 0 10px #00ff88;
665
+ }
666
+
667
+ .cluster-info {
668
+ background: rgba(0, 255, 136, 0.05);
669
+ border: 1px solid #00ff88;
670
+ border-radius: 8px;
671
+ padding: 20px;
672
+ }
673
+
674
+ .info-grid {
675
+ display: grid;
676
+ grid-template-columns: repeat(4, 1fr);
677
+ gap: 20px;
678
+ }
679
+
680
+ .info-item {
681
+ text-align: center;
682
+ }
683
+
684
  .timestamp {
685
  text-align: center;
686
  margin-top: 20px;
687
  opacity: 0.5;
688
  font-size: 0.9em;
689
  }
 
 
 
 
690
  </style>
691
  </head>
692
  <body>
693
  <div class="container">
694
  <div class="header">
695
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
696
+ <div>DISTRIBUTED COMPUTE SYSTEM v4.0</div>
697
  </div>
698
 
699
  <div class="status-bar">
700
  <div class="stat-card">
701
  <div class="stat-label">Load Mode</div>
702
  <div class="stat-value" id="mode">--</div>
703
+ <div class="mode-badge" id="mode-badge">INITIALIZING</div>
704
  </div>
705
  <div class="stat-card">
706
  <div class="stat-label">Current Load</div>
707
  <div class="stat-value" id="load">0</div>
708
+ <div class="stat-label">requests / 10s</div>
709
  </div>
710
  <div class="stat-card">
711
  <div class="stat-label">Total Requests</div>
 
717
  </div>
718
  </div>
719
 
720
+ <div class="workers-grid" id="workers">
721
+ <!-- Workers populated by JS -->
722
+ </div>
723
+
724
+ <div class="cluster-info">
725
+ <div class="stat-label" style="margin-bottom: 15px;">CLUSTER STATISTICS</div>
726
+ <div class="info-grid">
727
+ <div class="info-item">
728
+ <div class="stat-label">Successful</div>
729
+ <div class="stat-value" style="font-size: 1.5em;" id="success">0</div>
730
+ </div>
731
+ <div class="info-item">
732
+ <div class="stat-label">Failed</div>
733
+ <div class="stat-value" style="font-size: 1.5em;" id="failed">0</div>
734
+ </div>
735
+ <div class="info-item">
736
+ <div class="stat-label">Uptime</div>
737
+ <div class="stat-value" style="font-size: 1.5em;" id="uptime">0s</div>
738
+ </div>
739
+ <div class="info-item">
740
+ <div class="stat-label">Healthy Workers</div>
741
+ <div class="stat-value" style="font-size: 1.5em;" id="healthy">0</div>
742
+ </div>
743
+ </div>
744
+ </div>
745
 
746
  <div class="timestamp" id="timestamp">Last update: --</div>
747
  </div>
748
 
749
  <script>
750
+ // Use wss:// for HTTPS, ws:// for HTTP
751
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
752
  let ws;
753
+ let usePolling = false;
754
 
755
  function connectWebSocket() {
756
  try {
757
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
758
 
759
+ ws.onopen = () => {
760
+ console.log('✅ WebSocket connected');
761
+ usePolling = false;
762
+ };
763
+
764
+ ws.onmessage = (event) => {
765
+ const data = JSON.parse(event.data);
766
+ updateDashboard(data);
767
+ };
768
+
769
+ ws.onerror = (error) => {
770
+ console.error('❌ WebSocket error, switching to polling');
771
+ usePolling = true;
772
+ startPolling();
773
+ };
774
+
775
+ ws.onclose = () => {
776
+ console.log('🔌 WebSocket disconnected');
777
+ if (!usePolling) {
778
+ setTimeout(connectWebSocket, 3000);
779
+ }
780
+ };
781
  } catch (e) {
782
+ console.error('Failed to connect WebSocket, using polling');
783
+ usePolling = true;
784
+ startPolling();
785
  }
786
  }
787
 
788
+ async function pollStats() {
789
+ if (!usePolling) return;
790
+
791
+ try {
792
+ const response = await fetch('/api/status');
793
+ const data = await response.json();
794
+
795
+ // Fetch worker stats too
796
+ const workersRes = await fetch('/workers');
797
+ const workersData = await workersRes.json();
798
+
799
+ // Format data like WebSocket
800
+ const formattedData = {
801
+ timestamp: Date.now() / 1000,
802
+ mode: data.mode,
803
+ load: data.current_load,
804
+ workers: workersData.workers.map(w => ({
805
+ url: w.url.split("//")[1].split(".")[0],
806
+ healthy: w.healthy,
807
+ active: w.active_requests || 0,
808
+ total: 0,
809
+ tokens: 0,
810
+ latency: 0,
811
+ role: "idle"
812
+ })),
813
+ cluster: {
814
+ total_requests: 0,
815
+ successful: 0,
816
+ failed: 0,
817
+ uptime: 0,
818
+ rps: 0
819
+ }
820
+ };
821
+
822
+ updateDashboard(formattedData);
823
+ } catch (e) {
824
+ console.error('Polling error:', e);
825
+ }
826
+ }
827
+
828
+ function startPolling() {
829
+ pollStats();
830
+ setInterval(pollStats, 1000);
831
+ }
832
+
833
+ // Try WebSocket first
834
  connectWebSocket();
835
 
836
  function updateDashboard(data) {
837
+ // Mode
838
  document.getElementById('mode').textContent = data.mode.toUpperCase();
839
+ const modeBadge = document.getElementById('mode-badge');
840
+ modeBadge.textContent = `${data.mode.toUpperCase()} MODE`;
841
+ modeBadge.className = `mode-badge mode-${data.mode}`;
842
+
843
+ // Stats
844
  document.getElementById('load').textContent = data.load;
845
  document.getElementById('total-req').textContent = data.cluster.total_requests;
846
  document.getElementById('rps').textContent = data.cluster.rps;
847
+ document.getElementById('success').textContent = data.cluster.successful;
848
+ document.getElementById('failed').textContent = data.cluster.failed;
849
+ document.getElementById('uptime').textContent = formatUptime(data.cluster.uptime);
850
 
851
+ // Workers
852
  const workersDiv = document.getElementById('workers');
853
+ const healthyCount = data.workers.filter(w => w.healthy).length;
854
+ document.getElementById('healthy').textContent = `${healthyCount}/${data.workers.length}`;
855
+
856
  workersDiv.innerHTML = data.workers.map(worker => `
857
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
858
  <div class="worker-header">
859
+ <div class="worker-name">${worker.url}</div>
 
 
 
 
860
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
861
  </div>
862
+ <div class="role-badge role-${worker.role}">${worker.role.toUpperCase()}</div>
863
  <div class="worker-stats">
864
  <div class="worker-stat">
865
  <div class="worker-stat-label">Active</div>
 
874
  <div class="worker-stat-value">${worker.tokens}</div>
875
  </div>
876
  <div class="worker-stat">
877
+ <div class="worker-stat-label">Latency</div>
878
+ <div class="worker-stat-value">${worker.latency}ms</div>
879
  </div>
880
  </div>
881
+ <div class="progress-bar">
882
+ <div class="progress-fill" style="width: ${Math.min(worker.active * 33, 100)}%"></div>
883
+ </div>
884
  </div>
885
  `).join('');
886
 
887
+ // Timestamp
888
+ const now = new Date();
889
  document.getElementById('timestamp').textContent =
890
+ `Last update: ${now.toLocaleTimeString()}`;
891
+ }
892
+
893
+ function formatUptime(seconds) {
894
+ const h = Math.floor(seconds / 3600);
895
+ const m = Math.floor((seconds % 3600) / 60);
896
+ const s = Math.floor(seconds % 60);
897
+ return `${h}h ${m}m ${s}s`;
898
  }
899
  </script>
900
  </body>
901
  </html>
902
  """
903
 
904
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Push real-time cluster stats to one dashboard client.

    Accepts the socket, registers it in active_connections (so the
    periodic broadcast_stats() from the health-check loop reaches it),
    sends an immediate snapshot, then blocks reading client messages
    merely to keep the connection open. The socket is always
    deregistered on exit, whatever the disconnect reason.
    """
    await websocket.accept()
    active_connections.add(websocket)

    try:
        # Send initial data so the dashboard isn't blank until the next
        # periodic broadcast.
        await broadcast_stats()

        # Keep the connection alive; client messages are ignored.
        while True:
            await websocket.receive_text()
    except Exception:
        # Expected on client disconnect (WebSocketDisconnect). The
        # original bare `except:` also swallowed asyncio.CancelledError
        # (a BaseException since Python 3.8), which breaks clean task
        # cancellation on shutdown — so catch Exception only.
        pass
    finally:
        active_connections.discard(websocket)
921
+
922
+ # ============================================================================
923
+ # API Endpoints
924
+ # ============================================================================
925
+
926
  @app.get("/api/status")
927
  async def api_status():
928
+ """JSON API for status"""
929
  mode, load = update_load_mode()
930
  healthy_count = len(get_healthy_workers())
931
 
 
 
 
 
932
  return {
933
  "name": "SAM-Z-1 Distributed Cluster",
934
+ "version": "4.0.0",
935
  "mode": mode,
936
  "current_load": load,
937
  "workers": len(WORKER_URLS),
938
  "healthy_workers": healthy_count,
939
+ "features": ["distributed_compute", "smart_load_balancing", "real_time_dashboard"]
 
 
 
 
 
 
 
 
940
  }
941
 
942
  @app.get("/health")
 
947
  "workers_healthy": healthy_count
948
  }
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  @app.post("/v1/generate")
951
  async def generate(request: GenerateRequest):
952
+ """Generate text with distributed compute"""
953
  track_request()
954
  mode, load = update_load_mode()
955
 
956
+ healthy = get_healthy_workers()
957
+ if not healthy:
 
 
958
  cluster_stats["failed_requests"] += 1
959
+ raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
960
 
961
  request_data = {
962
  "prompt": request.prompt,
 
968
  "stream": True
969
  }
970
 
971
+ print(f"🎯 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
972
 
973
  try:
974
+ if mode == "light" and len(healthy) >= 2:
975
+ # DISTRIBUTED MODE - 1 gen + multiple decoders
976
+ generators, decoders = select_distributed_workers()
977
  if decoders:
978
  cluster_stats["successful_requests"] += 1
979
  return StreamingResponse(
 
981
  media_type="text/event-stream"
982
  )
983
 
984
+ # HEAVY/FALLBACK - single worker
985
+ worker = get_least_busy_worker()
986
  cluster_stats["successful_requests"] += 1
987
  return StreamingResponse(
988
  heavy_load_generation(worker, request_data, "generate"),
 
994
 
995
  @app.post("/v1/chat")
996
  async def chat(request: ChatRequest):
997
+ """Chat with distributed compute"""
998
  track_request()
999
  mode, load = update_load_mode()
1000
 
1001
+ healthy = get_healthy_workers()
1002
+ if not healthy:
 
 
1003
  cluster_stats["failed_requests"] += 1
1004
+ raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
1005
 
1006
  request_data = {
1007
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
 
1013
  "stream": True
1014
  }
1015
 
1016
+ print(f"💬 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
1017
 
1018
  try:
1019
+ if mode == "light" and len(healthy) >= 2:
1020
+ # DISTRIBUTED MODE - 1 gen + multiple decoders
1021
+ generators, decoders = select_distributed_workers()
1022
  if decoders:
1023
  cluster_stats["successful_requests"] += 1
1024
  return StreamingResponse(
 
1026
  media_type="text/event-stream"
1027
  )
1028
 
1029
+ # HEAVY/FALLBACK - single worker
1030
+ worker = get_least_busy_worker()
1031
  cluster_stats["successful_requests"] += 1
1032
  return StreamingResponse(
1033
  heavy_load_generation(worker, request_data, "chat"),