Bc-AI commited on
Commit
7f764e5
·
verified ·
1 Parent(s): 74ffe1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +424 -286
app.py CHANGED
@@ -1,9 +1,7 @@
1
  """
2
- SAM-Z-1 Distributed Compute Cluster Head Node v5.0
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
5
- - Auto-detects worker version (v4 vs v5)
6
- - Supports 4 new models with backward compatibility
7
  """
8
 
9
  from fastapi import FastAPI, HTTPException, WebSocket
@@ -17,7 +15,7 @@ from typing import List, Optional, Dict
17
  from collections import deque
18
  import random
19
 
20
- app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
21
 
22
  # ============================================================================
23
  # Configuration
@@ -26,27 +24,15 @@ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
26
  WORKER_URLS = [
27
  "https://bc-ai-worker-2.hf.space",
28
  "https://bc-ai-worker-sam-z-api.hf.space",
29
- "https://bc-ai-worker-sam-z-api.hf.space",
30
- "https://bc-ai-worker-3.hf.space",
31
- "https://bc-ai-worker-4.hf.space",
32
- "https://bc-ai-worker-5.hf.space"
33
  ]
34
 
35
- HEALTH_CHECK_INTERVAL = 5
36
  LOAD_CHECK_WINDOW = 10
37
 
38
  LIGHT_LOAD_THRESHOLD = 2
39
  HEAVY_LOAD_THRESHOLD = 5
40
 
41
- # New models added in v5
42
- NEW_MODELS = [
43
- "SAM-X-1-Large",
44
- "SAM-X-1-Fast",
45
- "SAM-X-1-Mini",
46
- "SAM-X-1-Nano"
47
- ]
48
-
49
- # Worker state with version detection
50
  worker_health = {
51
  url: {
52
  "healthy": True,
@@ -55,14 +41,12 @@ worker_health = {
55
  "total_requests": 0,
56
  "total_tokens": 0,
57
  "avg_latency": 0,
58
- "role": "idle",
59
- "version": None, # Will be auto-detected: "v4" or "v5"
60
- "supports_models": [] # Models this worker supports
61
  } for url in WORKER_URLS
62
  }
63
 
64
  request_timestamps = deque(maxlen=100)
65
- current_load_mode = "light"
66
  cluster_stats = {
67
  "total_requests": 0,
68
  "successful_requests": 0,
@@ -70,6 +54,7 @@ cluster_stats = {
70
  "uptime_start": time.time()
71
  }
72
 
 
73
  active_connections = set()
74
 
75
  # ============================================================================
@@ -84,7 +69,6 @@ class GenerateRequest(BaseModel):
84
  top_p: float = 0.9
85
  repetition_penalty: float = 1.1
86
  stream: bool = True
87
- model: Optional[str] = None # NEW: Model selection
88
 
89
  class ChatMessage(BaseModel):
90
  role: str
@@ -98,138 +82,6 @@ class ChatRequest(BaseModel):
98
  top_p: float = 0.9
99
  repetition_penalty: float = 1.1
100
  stream: bool = True
101
- model: Optional[str] = None # NEW: Model selection
102
-
103
- # ============================================================================
104
- # Worker Version Detection
105
- # ============================================================================
106
-
107
- async def detect_worker_version(worker_url: str) -> tuple:
108
- """
109
- Detect worker version and supported models
110
- Returns: (version: str, supported_models: List[str])
111
- """
112
- try:
113
- async with httpx.AsyncClient(timeout=10.0) as client:
114
- # Try to get worker info endpoint (v5 feature)
115
- try:
116
- response = await client.get(f"{worker_url}/info")
117
- if response.status_code == 200:
118
- data = response.json()
119
- version = data.get("version", "v5")
120
- models = data.get("models", NEW_MODELS)
121
- return version, models
122
- except:
123
- pass
124
-
125
- # Try to get models list (v5 feature)
126
- try:
127
- response = await client.get(f"{worker_url}/models")
128
- if response.status_code == 200:
129
- data = response.json()
130
- models = data.get("models", [])
131
- if models:
132
- return "v5", models
133
- except:
134
- pass
135
-
136
- # Fallback: worker is v4 (no model selection)
137
- return "v4", []
138
-
139
- except Exception as e:
140
- print(f"⚠️ Version detection failed for {worker_url}: {e}")
141
- return "v4", []
142
-
143
- async def check_worker_health(worker_url: str) -> bool:
144
- """Check if worker is healthy"""
145
- try:
146
- async with httpx.AsyncClient(timeout=5.0) as client:
147
- response = await client.get(f"{worker_url}/health")
148
- return response.status_code == 200
149
- except:
150
- return False
151
-
152
- async def health_check_loop():
153
- """Health check with version detection"""
154
- while True:
155
- for worker_url in WORKER_URLS:
156
- # Check health
157
- healthy = await check_worker_health(worker_url)
158
- worker_health[worker_url]["healthy"] = healthy
159
- worker_health[worker_url]["last_check"] = time.time()
160
-
161
- # Detect version if not yet detected
162
- if worker_health[worker_url]["version"] is None:
163
- version, models = await detect_worker_version(worker_url)
164
- worker_health[worker_url]["version"] = version
165
- worker_health[worker_url]["supports_models"] = models
166
-
167
- status = "✅" if healthy else "❌"
168
- print(f"{status} Worker: {worker_url.split('//')[1].split('.')[0]} | Version: {version} | Models: {len(models)}")
169
-
170
- await broadcast_stats()
171
- await asyncio.sleep(HEALTH_CHECK_INTERVAL)
172
-
173
- @app.on_event("startup")
174
- async def startup_event():
175
- asyncio.create_task(health_check_loop())
176
-
177
- # ============================================================================
178
- # Smart Worker Selection
179
- # ============================================================================
180
-
181
- def get_workers_for_model(model_name: Optional[str]) -> List[str]:
182
- """Get workers that support the requested model"""
183
- healthy = get_healthy_workers()
184
-
185
- if not model_name:
186
- # No specific model requested, use any healthy worker
187
- return healthy
188
-
189
- # Filter workers by model support
190
- compatible = []
191
- for url in healthy:
192
- version = worker_health[url]["version"]
193
- models = worker_health[url]["supports_models"]
194
-
195
- if version == "v5" and model_name in models:
196
- # v5 worker with explicit model support
197
- compatible.append(url)
198
- elif version == "v4":
199
- # v4 workers don't support model selection but work with default
200
- compatible.append(url)
201
-
202
- return compatible if compatible else healthy
203
-
204
- def get_healthy_workers() -> List[str]:
205
- return [url for url, status in worker_health.items() if status["healthy"]]
206
-
207
- def get_least_busy_worker(worker_list: List[str] = None) -> Optional[str]:
208
- workers = worker_list if worker_list is not None else get_healthy_workers()
209
- if not workers:
210
- return None
211
- return min(workers, key=lambda url: worker_health[url]["active_requests"])
212
-
213
- def select_distributed_workers(model_name: Optional[str] = None) -> tuple:
214
- """
215
- Select workers for distributed compute with model compatibility
216
- Returns: (generators: List[str], decoders: List[str])
217
- """
218
- compatible = get_workers_for_model(model_name)
219
-
220
- if len(compatible) < 2:
221
- return ([compatible[0]], []) if len(compatible) == 1 else ([], [])
222
-
223
- sorted_workers = sorted(compatible, key=lambda url: worker_health[url]["active_requests"])
224
-
225
- if len(compatible) >= 5:
226
- return ([sorted_workers[0]], sorted_workers[1:5])
227
- elif len(compatible) == 4:
228
- return ([sorted_workers[0]], sorted_workers[1:4])
229
- elif len(compatible) == 3:
230
- return ([sorted_workers[0]], sorted_workers[1:3])
231
- else:
232
- return ([sorted_workers[0]], [sorted_workers[1]])
233
 
234
  # ============================================================================
235
  # Load Management
@@ -244,20 +96,21 @@ def update_load_mode():
244
  load = get_current_load()
245
  healthy_count = len(get_healthy_workers())
246
 
 
247
  if healthy_count >= 5:
248
  if load <= LIGHT_LOAD_THRESHOLD:
249
- current_load_mode = "light"
250
- elif load <= HEAVY_LOAD_THRESHOLD:
251
- current_load_mode = "medium"
252
  else:
253
- current_load_mode = "heavy"
254
  elif healthy_count >= 3:
255
  if load <= 2:
256
- current_load_mode = "light"
257
  else:
258
- current_load_mode = "heavy"
259
  else:
260
- current_load_mode = "heavy"
261
 
262
  return current_load_mode, load
263
 
@@ -265,11 +118,42 @@ def track_request():
265
  request_timestamps.append(time.time())
266
  cluster_stats["total_requests"] += 1
267
 
268
- # ============================================================================
269
- # Dashboard & WebSocket
270
- # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  async def broadcast_stats():
 
273
  if not active_connections:
274
  return
275
 
@@ -282,15 +166,13 @@ async def broadcast_stats():
282
  "load": load,
283
  "workers": [
284
  {
285
- "url": url.split("//")[1].split(".")[0],
286
  "healthy": status["healthy"],
287
  "active": status["active_requests"],
288
  "total": status["total_requests"],
289
  "tokens": status["total_tokens"],
290
  "latency": round(status["avg_latency"], 2),
291
- "role": status["role"],
292
- "version": status["version"] or "detecting...",
293
- "models": len(status["supports_models"])
294
  }
295
  for url, status in worker_health.items()
296
  ],
@@ -303,6 +185,7 @@ async def broadcast_stats():
303
  }
304
  }
305
 
 
306
  disconnected = set()
307
  for ws in active_connections:
308
  try:
@@ -310,24 +193,36 @@ async def broadcast_stats():
310
  except:
311
  disconnected.add(ws)
312
 
 
313
  active_connections.difference_update(disconnected)
314
 
315
- @app.websocket("/ws")
316
- async def websocket_endpoint(websocket: WebSocket):
317
- await websocket.accept()
318
- active_connections.add(websocket)
319
-
320
  try:
321
- await broadcast_stats()
322
- while True:
323
- await websocket.receive_text()
324
  except:
325
- pass
326
- finally:
327
- active_connections.discard(websocket)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
  # ============================================================================
330
- # Distributed Generation
331
  # ============================================================================
332
 
333
  async def distributed_generation(
@@ -336,7 +231,11 @@ async def distributed_generation(
336
  request_data: dict,
337
  endpoint: str = "generate"
338
  ):
339
- """Distributed compute with v4/v5 compatibility"""
 
 
 
 
340
 
341
  if not generators or not decoders:
342
  return
@@ -344,13 +243,15 @@ async def distributed_generation(
344
  token_queue = asyncio.Queue(maxsize=50)
345
  text_queue = asyncio.Queue(maxsize=50)
346
 
 
347
  for gen_url in generators:
348
  worker_health[gen_url]["role"] = "generator"
349
  for dec_url in decoders:
350
  worker_health[dec_url]["role"] = "decoder"
351
 
352
  async def generate_tokens():
353
- gen_url = generators[0]
 
354
  try:
355
  worker_health[gen_url]["active_requests"] += 1
356
  request_data_tokens = {**request_data, "return_token_ids": True}
@@ -368,6 +269,7 @@ async def distributed_generation(
368
  if "token_id" in data:
369
  await token_queue.put(data["token_id"])
370
  elif "done" in data:
 
371
  for _ in decoders:
372
  await token_queue.put(None)
373
  break
@@ -382,16 +284,18 @@ async def distributed_generation(
382
  worker_health[gen_url]["role"] = "idle"
383
 
384
  async def decode_tokens(decoder_url: str, decoder_id: int):
 
385
  try:
386
  worker_health[decoder_url]["active_requests"] += 1
387
  batch = []
388
- batch_size = 2
389
 
390
  while True:
391
  try:
392
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
393
 
394
  if token_id is None:
 
395
  if batch:
396
  async with httpx.AsyncClient(timeout=10.0) as client:
397
  response = await client.post(
@@ -407,6 +311,7 @@ async def distributed_generation(
407
 
408
  batch.append(token_id)
409
 
 
410
  if len(batch) >= batch_size:
411
  async with httpx.AsyncClient(timeout=10.0) as client:
412
  response = await client.post(
@@ -429,12 +334,16 @@ async def distributed_generation(
429
  worker_health[decoder_url]["active_requests"] -= 1
430
  worker_health[decoder_url]["role"] = "idle"
431
 
 
432
  gen_task = asyncio.create_task(generate_tokens())
 
 
433
  decoder_tasks = [
434
  asyncio.create_task(decode_tokens(dec_url, i))
435
  for i, dec_url in enumerate(decoders)
436
  ]
437
 
 
438
  accumulated_text = ""
439
  decoders_done = 0
440
  total_decoders = len(decoders)
@@ -480,19 +389,24 @@ async def heavy_load_generation(worker_url: str, request_data: dict, endpoint: s
480
  worker_health[worker_url]["role"] = "idle"
481
 
482
  # ============================================================================
483
- # API Endpoints
484
  # ============================================================================
485
 
486
  @app.get("/", response_class=HTMLResponse)
487
  async def dashboard():
488
- """Real-time dashboard with version info"""
489
  return """
490
  <!DOCTYPE html>
491
  <html>
492
  <head>
493
- <title>SAM-Z-1 Cluster Control v5.0</title>
494
  <style>
495
- * { margin: 0; padding: 0; box-sizing: border-box; }
 
 
 
 
 
496
  body {
497
  font-family: 'Courier New', monospace;
498
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
@@ -501,12 +415,14 @@ async def dashboard():
501
  overflow-x: hidden;
502
  overflow-y: auto;
503
  }
 
504
  .container {
505
  padding: 20px;
506
  max-width: 1400px;
507
  margin: 0 auto;
508
  padding-bottom: 40px;
509
  }
 
510
  .header {
511
  text-align: center;
512
  margin-bottom: 30px;
@@ -516,6 +432,7 @@ async def dashboard():
516
  border-radius: 10px;
517
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
518
  }
 
519
  .header h1 {
520
  font-size: 2.5em;
521
  text-transform: uppercase;
@@ -523,33 +440,18 @@ async def dashboard():
523
  text-shadow: 0 0 10px #00ff88;
524
  animation: glow 2s ease-in-out infinite alternate;
525
  }
 
526
  @keyframes glow {
527
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
528
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
529
  }
530
- .version-badge {
531
- display: inline-block;
532
- padding: 3px 10px;
533
- border-radius: 12px;
534
- font-size: 10px;
535
- margin-left: 8px;
536
- font-weight: bold;
537
- }
538
- .version-v5 {
539
- background: rgba(0, 255, 136, 0.2);
540
- border: 1px solid #00ff88;
541
- color: #00ff88;
542
- }
543
- .version-v4 {
544
- background: rgba(255, 165, 0, 0.2);
545
- border: 1px solid #ffa500;
546
- color: #ffa500;
547
- }
548
  .status-bar {
549
  display: flex;
550
  gap: 20px;
551
  margin-bottom: 30px;
552
  }
 
553
  .stat-card {
554
  flex: 1;
555
  background: rgba(0, 255, 136, 0.05);
@@ -559,22 +461,83 @@ async def dashboard():
559
  position: relative;
560
  overflow: hidden;
561
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  .stat-label {
563
  font-size: 0.8em;
564
  opacity: 0.7;
565
  text-transform: uppercase;
566
  }
 
567
  .stat-value {
568
  font-size: 2em;
569
  font-weight: bold;
570
  margin-top: 5px;
571
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  .workers-grid {
573
  display: grid;
574
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
575
  gap: 20px;
576
  margin-bottom: 30px;
577
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  .worker-card {
579
  background: rgba(10, 14, 39, 0.8);
580
  border: 2px solid #00ff88;
@@ -583,89 +546,166 @@ async def dashboard():
583
  position: relative;
584
  transition: all 0.3s;
585
  }
 
586
  .worker-card:hover {
587
  transform: translateY(-5px);
588
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
589
  }
 
590
  .worker-card.offline {
591
  border-color: #ff4444;
592
  opacity: 0.6;
593
  }
 
594
  .worker-header {
595
  display: flex;
596
  justify-content: space-between;
597
  align-items: center;
598
  margin-bottom: 15px;
599
  }
 
600
  .worker-name {
601
  font-size: 1.2em;
602
  font-weight: bold;
603
  }
 
604
  .status-dot {
605
  width: 12px;
606
  height: 12px;
607
  border-radius: 50%;
608
  animation: pulse 2s infinite;
609
  }
 
610
  .status-dot.online {
611
  background: #00ff88;
612
  box-shadow: 0 0 10px #00ff88;
613
  }
 
614
  .status-dot.offline {
615
  background: #ff4444;
616
  box-shadow: 0 0 10px #ff4444;
617
  }
 
618
  @keyframes pulse {
619
  0%, 100% { opacity: 1; }
620
  50% { opacity: 0.5; }
621
  }
 
622
  .worker-stats {
623
  display: grid;
624
  grid-template-columns: repeat(2, 1fr);
625
  gap: 10px;
626
  margin-top: 15px;
627
  }
 
628
  .worker-stat {
629
  background: rgba(0, 255, 136, 0.05);
630
  padding: 10px;
631
  border-radius: 5px;
632
  }
 
633
  .worker-stat-label {
634
  font-size: 0.7em;
635
  opacity: 0.7;
636
  }
 
637
  .worker-stat-value {
638
  font-size: 1.3em;
639
  font-weight: bold;
640
  margin-top: 3px;
641
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  .timestamp {
643
  text-align: center;
644
  margin-top: 20px;
645
  opacity: 0.5;
646
  font-size: 0.9em;
647
  }
648
- @media (max-width: 768px) {
649
- .workers-grid { grid-template-columns: 1fr; }
650
- .status-bar { flex-direction: column; }
651
- }
652
  </style>
653
  </head>
654
  <body>
655
  <div class="container">
656
  <div class="header">
657
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
658
- <div>DISTRIBUTED COMPUTE SYSTEM v5.0 • AUTO VERSION DETECTION</div>
659
  </div>
660
 
661
  <div class="status-bar">
662
  <div class="stat-card">
663
  <div class="stat-label">Load Mode</div>
664
  <div class="stat-value" id="mode">--</div>
 
665
  </div>
666
  <div class="stat-card">
667
  <div class="stat-label">Current Load</div>
668
  <div class="stat-value" id="load">0</div>
 
669
  </div>
670
  <div class="stat-card">
671
  <div class="stat-label">Total Requests</div>
@@ -677,47 +717,149 @@ async def dashboard():
677
  </div>
678
  </div>
679
 
680
- <div class="workers-grid" id="workers"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
 
682
  <div class="timestamp" id="timestamp">Last update: --</div>
683
  </div>
684
 
685
  <script>
 
686
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
687
  let ws;
 
688
 
689
  function connectWebSocket() {
690
  try {
691
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
692
 
693
- ws.onopen = () => console.log('✅ WebSocket connected');
694
- ws.onmessage = (event) => updateDashboard(JSON.parse(event.data));
695
- ws.onerror = () => console.error('❌ WebSocket error');
696
- ws.onclose = () => setTimeout(connectWebSocket, 3000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  } catch (e) {
698
- console.error('Failed to connect WebSocket');
 
 
699
  }
700
  }
701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  connectWebSocket();
703
 
704
  function updateDashboard(data) {
 
705
  document.getElementById('mode').textContent = data.mode.toUpperCase();
 
 
 
 
 
706
  document.getElementById('load').textContent = data.load;
707
  document.getElementById('total-req').textContent = data.cluster.total_requests;
708
  document.getElementById('rps').textContent = data.cluster.rps;
 
 
 
709
 
 
710
  const workersDiv = document.getElementById('workers');
 
 
 
711
  workersDiv.innerHTML = data.workers.map(worker => `
712
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
713
  <div class="worker-header">
714
- <div>
715
- <div class="worker-name">${worker.url}</div>
716
- <span class="version-badge version-${worker.version}">${worker.version.toUpperCase()}</span>
717
- ${worker.models > 0 ? `<span style="font-size:0.8em;opacity:0.7;margin-left:5px">${worker.models} models</span>` : ''}
718
- </div>
719
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
720
  </div>
 
721
  <div class="worker-stats">
722
  <div class="worker-stat">
723
  <div class="worker-stat-label">Active</div>
@@ -732,46 +874,69 @@ async def dashboard():
732
  <div class="worker-stat-value">${worker.tokens}</div>
733
  </div>
734
  <div class="worker-stat">
735
- <div class="worker-stat-label">Role</div>
736
- <div class="worker-stat-value" style="font-size:1em;">${worker.role}</div>
737
  </div>
738
  </div>
 
 
 
739
  </div>
740
  `).join('');
741
 
 
 
742
  document.getElementById('timestamp').textContent =
743
- `Last update: ${new Date().toLocaleTimeString()}`;
 
 
 
 
 
 
 
744
  }
745
  </script>
746
  </body>
747
  </html>
748
  """
749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  @app.get("/api/status")
751
  async def api_status():
 
752
  mode, load = update_load_mode()
753
  healthy_count = len(get_healthy_workers())
754
 
755
- # Count v4 vs v5 workers
756
- v4_count = sum(1 for w in worker_health.values() if w["version"] == "v4")
757
- v5_count = sum(1 for w in worker_health.values() if w["version"] == "v5")
758
-
759
  return {
760
  "name": "SAM-Z-1 Distributed Cluster",
761
- "version": "5.0.0",
762
  "mode": mode,
763
  "current_load": load,
764
  "workers": len(WORKER_URLS),
765
  "healthy_workers": healthy_count,
766
- "v4_workers": v4_count,
767
- "v5_workers": v5_count,
768
- "features": [
769
- "distributed_compute",
770
- "smart_load_balancing",
771
- "auto_version_detection",
772
- "multi_model_support",
773
- "real_time_dashboard"
774
- ]
775
  }
776
 
777
  @app.get("/health")
@@ -782,34 +947,16 @@ async def health():
782
  "workers_healthy": healthy_count
783
  }
784
 
785
- @app.get("/models")
786
- async def list_models():
787
- """List all available models across all workers"""
788
- all_models = set()
789
- for url, status in worker_health.items():
790
- if status["healthy"] and status["version"] == "v5":
791
- all_models.update(status["supports_models"])
792
-
793
- return {
794
- "models": sorted(list(all_models)),
795
- "default": "SAM-X-1-Nano" if "SAM-X-1-Nano" in all_models else None
796
- }
797
-
798
  @app.post("/v1/generate")
799
  async def generate(request: GenerateRequest):
800
- """Generate text with automatic model routing"""
801
  track_request()
802
  mode, load = update_load_mode()
803
 
804
- # Get compatible workers
805
- compatible = get_workers_for_model(request.model)
806
-
807
- if not compatible:
808
  cluster_stats["failed_requests"] += 1
809
- raise HTTPException(
810
- status_code=503,
811
- detail=f"No workers available for model: {request.model or 'default'}"
812
- )
813
 
814
  request_data = {
815
  "prompt": request.prompt,
@@ -821,15 +968,12 @@ async def generate(request: GenerateRequest):
821
  "stream": True
822
  }
823
 
824
- # Add model parameter for v5 workers
825
- if request.model:
826
- request_data["model"] = request.model
827
-
828
- print(f"🎯 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
829
 
830
  try:
831
- if mode == "light" and len(compatible) >= 2:
832
- generators, decoders = select_distributed_workers(request.model)
 
833
  if decoders:
834
  cluster_stats["successful_requests"] += 1
835
  return StreamingResponse(
@@ -837,7 +981,8 @@ async def generate(request: GenerateRequest):
837
  media_type="text/event-stream"
838
  )
839
 
840
- worker = get_least_busy_worker(compatible)
 
841
  cluster_stats["successful_requests"] += 1
842
  return StreamingResponse(
843
  heavy_load_generation(worker, request_data, "generate"),
@@ -849,19 +994,14 @@ async def generate(request: GenerateRequest):
849
 
850
  @app.post("/v1/chat")
851
  async def chat(request: ChatRequest):
852
- """Chat with automatic model routing"""
853
  track_request()
854
  mode, load = update_load_mode()
855
 
856
- # Get compatible workers
857
- compatible = get_workers_for_model(request.model)
858
-
859
- if not compatible:
860
  cluster_stats["failed_requests"] += 1
861
- raise HTTPException(
862
- status_code=503,
863
- detail=f"No workers available for model: {request.model or 'default'}"
864
- )
865
 
866
  request_data = {
867
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
@@ -873,15 +1013,12 @@ async def chat(request: ChatRequest):
873
  "stream": True
874
  }
875
 
876
- # Add model parameter for v5 workers
877
- if request.model:
878
- request_data["model"] = request.model
879
-
880
- print(f"💬 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
881
 
882
  try:
883
- if mode == "light" and len(compatible) >= 2:
884
- generators, decoders = select_distributed_workers(request.model)
 
885
  if decoders:
886
  cluster_stats["successful_requests"] += 1
887
  return StreamingResponse(
@@ -889,7 +1026,8 @@ async def chat(request: ChatRequest):
889
  media_type="text/event-stream"
890
  )
891
 
892
- worker = get_least_busy_worker(compatible)
 
893
  cluster_stats["successful_requests"] += 1
894
  return StreamingResponse(
895
  heavy_load_generation(worker, request_data, "chat"),
 
1
  """
2
+ SAM-Z-1 Distributed Compute Cluster Head Node
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
 
 
5
  """
6
 
7
  from fastapi import FastAPI, HTTPException, WebSocket
 
15
  from collections import deque
16
  import random
17
 
18
+ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="4.0.0")
19
 
20
  # ============================================================================
21
  # Configuration
 
24
  WORKER_URLS = [
25
  "https://bc-ai-worker-2.hf.space",
26
  "https://bc-ai-worker-sam-z-api.hf.space",
 
 
 
 
27
  ]
28
 
29
+ HEALTH_CHECK_INTERVAL = 5 # faster checks for real-time dashboard
30
  LOAD_CHECK_WINDOW = 10
31
 
32
  LIGHT_LOAD_THRESHOLD = 2
33
  HEAVY_LOAD_THRESHOLD = 5
34
 
35
+ # Worker state
 
 
 
 
 
 
 
 
36
  worker_health = {
37
  url: {
38
  "healthy": True,
 
41
  "total_requests": 0,
42
  "total_tokens": 0,
43
  "avg_latency": 0,
44
+ "role": "idle" # "generator", "decoder", "full", "idle"
 
 
45
  } for url in WORKER_URLS
46
  }
47
 
48
  request_timestamps = deque(maxlen=100)
49
+ current_load_mode = "light" # "light", "medium", "heavy"
50
  cluster_stats = {
51
  "total_requests": 0,
52
  "successful_requests": 0,
 
54
  "uptime_start": time.time()
55
  }
56
 
57
+ # Active WebSocket connections for real-time updates
58
  active_connections = set()
59
 
60
  # ============================================================================
 
69
  top_p: float = 0.9
70
  repetition_penalty: float = 1.1
71
  stream: bool = True
 
72
 
73
  class ChatMessage(BaseModel):
74
  role: str
 
82
  top_p: float = 0.9
83
  repetition_penalty: float = 1.1
84
  stream: bool = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ============================================================================
87
  # Load Management
 
96
  load = get_current_load()
97
  healthy_count = len(get_healthy_workers())
98
 
99
+ # Adjust thresholds based on available workers
100
  if healthy_count >= 5:
101
  if load <= LIGHT_LOAD_THRESHOLD:
102
+ current_load_mode = "light" # 1 gen + 4 decoders
103
+ elif load <= MEDIUM_LOAD_THRESHOLD:
104
+ current_load_mode = "medium" # 2 gens + 3 decoders OR parallel requests
105
  else:
106
+ current_load_mode = "heavy" # all workers independent
107
  elif healthy_count >= 3:
108
  if load <= 2:
109
+ current_load_mode = "light" # 1 gen + 2 decoders
110
  else:
111
+ current_load_mode = "heavy" # distribute requests
112
  else:
113
+ current_load_mode = "heavy" # fallback to simple distribution
114
 
115
  return current_load_mode, load
116
 
 
118
  request_timestamps.append(time.time())
119
  cluster_stats["total_requests"] += 1
120
 
121
+ def get_healthy_workers() -> List[str]:
122
+ return [url for url, status in worker_health.items() if status["healthy"]]
123
+
124
+ def get_least_busy_worker() -> Optional[str]:
125
+ healthy = get_healthy_workers()
126
+ if not healthy:
127
+ return None
128
+ return min(healthy, key=lambda url: worker_health[url]["active_requests"])
129
+
130
+ def select_distributed_workers() -> tuple:
131
+ """
132
+ Select workers for distributed compute
133
+ Returns: (generators: List[str], decoders: List[str])
134
+ """
135
+ healthy = get_healthy_workers()
136
+ if len(healthy) < 2:
137
+ return ([healthy[0]], []) if len(healthy) == 1 else ([], [])
138
+
139
+ # Sort by least busy
140
+ sorted_workers = sorted(healthy, key=lambda url: worker_health[url]["active_requests"])
141
+
142
+ if len(healthy) >= 5:
143
+ # OPTIMAL: 1 generator, 4 decoders
144
+ return ([sorted_workers[0]], sorted_workers[1:5])
145
+ elif len(healthy) == 4:
146
+ # 1 generator, 3 decoders
147
+ return ([sorted_workers[0]], sorted_workers[1:4])
148
+ elif len(healthy) == 3:
149
+ # 1 generator, 2 decoders
150
+ return ([sorted_workers[0]], sorted_workers[1:3])
151
+ else:
152
+ # 1 generator, 1 decoder
153
+ return ([sorted_workers[0]], [sorted_workers[1]])
154
 
155
  async def broadcast_stats():
156
+ """Broadcast stats to all connected WebSocket clients"""
157
  if not active_connections:
158
  return
159
 
 
166
  "load": load,
167
  "workers": [
168
  {
169
+ "url": url.split("//")[1].split(".")[0], # shorter name
170
  "healthy": status["healthy"],
171
  "active": status["active_requests"],
172
  "total": status["total_requests"],
173
  "tokens": status["total_tokens"],
174
  "latency": round(status["avg_latency"], 2),
175
+ "role": status["role"]
 
 
176
  }
177
  for url, status in worker_health.items()
178
  ],
 
185
  }
186
  }
187
 
188
+ # Broadcast to all connections
189
  disconnected = set()
190
  for ws in active_connections:
191
  try:
 
193
  except:
194
  disconnected.add(ws)
195
 
196
+ # Remove disconnected
197
  active_connections.difference_update(disconnected)
198
 
199
+ async def check_worker_health(worker_url: str) -> bool:
 
 
 
 
200
  try:
201
+ async with httpx.AsyncClient(timeout=5.0) as client:
202
+ response = await client.get(f"{worker_url}/health")
203
+ return response.status_code == 200
204
  except:
205
+ return False
206
+
207
+ async def health_check_loop():
208
+ while True:
209
+ # Check all workers
210
+ for worker_url in WORKER_URLS:
211
+ healthy = await check_worker_health(worker_url)
212
+ worker_health[worker_url]["healthy"] = healthy
213
+ worker_health[worker_url]["last_check"] = time.time()
214
+
215
+ # Always broadcast stats to connected clients
216
+ await broadcast_stats()
217
+
218
+ await asyncio.sleep(HEALTH_CHECK_INTERVAL)
219
+
220
+ @app.on_event("startup")
221
+ async def startup_event():
222
+ asyncio.create_task(health_check_loop())
223
 
224
  # ============================================================================
225
+ # Distributed Compute Generation
226
  # ============================================================================
227
 
228
  async def distributed_generation(
 
231
  request_data: dict,
232
  endpoint: str = "generate"
233
  ):
234
+ """
235
+ DISTRIBUTED COMPUTE MODE
236
+ - Generator(s) produce token IDs
237
+ - Multiple decoders process in parallel (load balanced)
238
+ """
239
 
240
  if not generators or not decoders:
241
  return
 
243
  token_queue = asyncio.Queue(maxsize=50)
244
  text_queue = asyncio.Queue(maxsize=50)
245
 
246
+ # Mark roles
247
  for gen_url in generators:
248
  worker_health[gen_url]["role"] = "generator"
249
  for dec_url in decoders:
250
  worker_health[dec_url]["role"] = "decoder"
251
 
252
  async def generate_tokens():
253
+ """Generator worker(s)"""
254
+ gen_url = generators[0] # primary generator
255
  try:
256
  worker_health[gen_url]["active_requests"] += 1
257
  request_data_tokens = {**request_data, "return_token_ids": True}
 
269
  if "token_id" in data:
270
  await token_queue.put(data["token_id"])
271
  elif "done" in data:
272
+ # Send done signal for each decoder
273
  for _ in decoders:
274
  await token_queue.put(None)
275
  break
 
284
  worker_health[gen_url]["role"] = "idle"
285
 
286
  async def decode_tokens(decoder_url: str, decoder_id: int):
287
+ """Decoder worker - processes tokens from shared queue"""
288
  try:
289
  worker_health[decoder_url]["active_requests"] += 1
290
  batch = []
291
+ batch_size = 2 # smaller batches for faster streaming
292
 
293
  while True:
294
  try:
295
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
296
 
297
  if token_id is None:
298
+ # Decode remaining batch
299
  if batch:
300
  async with httpx.AsyncClient(timeout=10.0) as client:
301
  response = await client.post(
 
311
 
312
  batch.append(token_id)
313
 
314
+ # Decode when batch is full
315
  if len(batch) >= batch_size:
316
  async with httpx.AsyncClient(timeout=10.0) as client:
317
  response = await client.post(
 
334
  worker_health[decoder_url]["active_requests"] -= 1
335
  worker_health[decoder_url]["role"] = "idle"
336
 
337
+ # Start generator
338
  gen_task = asyncio.create_task(generate_tokens())
339
+
340
+ # Start all decoders
341
  decoder_tasks = [
342
  asyncio.create_task(decode_tokens(dec_url, i))
343
  for i, dec_url in enumerate(decoders)
344
  ]
345
 
346
+ # Stream results
347
  accumulated_text = ""
348
  decoders_done = 0
349
  total_decoders = len(decoders)
 
389
  worker_health[worker_url]["role"] = "idle"
390
 
391
  # ============================================================================
392
+ # Dashboard
393
  # ============================================================================
394
 
395
  @app.get("/", response_class=HTMLResponse)
396
  async def dashboard():
397
+ """Real-time futuristic dashboard"""
398
  return """
399
  <!DOCTYPE html>
400
  <html>
401
  <head>
402
+ <title>SAM-Z-1 Cluster Control</title>
403
  <style>
404
+ * {
405
+ margin: 0;
406
+ padding: 0;
407
+ box-sizing: border-box;
408
+ }
409
+
410
  body {
411
  font-family: 'Courier New', monospace;
412
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
 
415
  overflow-x: hidden;
416
  overflow-y: auto;
417
  }
418
+
419
  .container {
420
  padding: 20px;
421
  max-width: 1400px;
422
  margin: 0 auto;
423
  padding-bottom: 40px;
424
  }
425
+
426
  .header {
427
  text-align: center;
428
  margin-bottom: 30px;
 
432
  border-radius: 10px;
433
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
434
  }
435
+
436
  .header h1 {
437
  font-size: 2.5em;
438
  text-transform: uppercase;
 
440
  text-shadow: 0 0 10px #00ff88;
441
  animation: glow 2s ease-in-out infinite alternate;
442
  }
443
+
444
  @keyframes glow {
445
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
446
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
447
  }
448
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  .status-bar {
450
  display: flex;
451
  gap: 20px;
452
  margin-bottom: 30px;
453
  }
454
+
455
  .stat-card {
456
  flex: 1;
457
  background: rgba(0, 255, 136, 0.05);
 
461
  position: relative;
462
  overflow: hidden;
463
  }
464
+
465
+ .stat-card::before {
466
+ content: '';
467
+ position: absolute;
468
+ top: 0;
469
+ left: -100%;
470
+ width: 100%;
471
+ height: 100%;
472
+ background: linear-gradient(90deg, transparent, rgba(0, 255, 136, 0.2), transparent);
473
+ animation: scan 3s infinite;
474
+ }
475
+
476
+ @keyframes scan {
477
+ 0% { left: -100%; }
478
+ 100% { left: 100%; }
479
+ }
480
+
481
  .stat-label {
482
  font-size: 0.8em;
483
  opacity: 0.7;
484
  text-transform: uppercase;
485
  }
486
+
487
  .stat-value {
488
  font-size: 2em;
489
  font-weight: bold;
490
  margin-top: 5px;
491
  }
492
+
493
+ .mode-badge {
494
+ display: inline-block;
495
+ padding: 5px 15px;
496
+ border-radius: 20px;
497
+ font-size: 0.9em;
498
+ font-weight: bold;
499
+ text-transform: uppercase;
500
+ margin-top: 10px;
501
+ }
502
+
503
+ .mode-light {
504
+ background: rgba(0, 255, 136, 0.2);
505
+ border: 1px solid #00ff88;
506
+ color: #00ff88;
507
+ }
508
+
509
+ .mode-heavy {
510
+ background: rgba(255, 68, 68, 0.2);
511
+ border: 1px solid #ff4444;
512
+ color: #ff4444;
513
+ }
514
+
515
  .workers-grid {
516
  display: grid;
517
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
518
  gap: 20px;
519
  margin-bottom: 30px;
520
  }
521
+
522
+ @media (max-width: 768px) {
523
+ .workers-grid {
524
+ grid-template-columns: 1fr;
525
+ }
526
+
527
+ .status-bar {
528
+ flex-direction: column;
529
+ }
530
+
531
+ .info-grid {
532
+ grid-template-columns: repeat(2, 1fr);
533
+ }
534
+
535
+ .header h1 {
536
+ font-size: 1.5em;
537
+ letter-spacing: 2px;
538
+ }
539
+ }
540
+
541
  .worker-card {
542
  background: rgba(10, 14, 39, 0.8);
543
  border: 2px solid #00ff88;
 
546
  position: relative;
547
  transition: all 0.3s;
548
  }
549
+
550
  .worker-card:hover {
551
  transform: translateY(-5px);
552
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
553
  }
554
+
555
  .worker-card.offline {
556
  border-color: #ff4444;
557
  opacity: 0.6;
558
  }
559
+
560
  .worker-header {
561
  display: flex;
562
  justify-content: space-between;
563
  align-items: center;
564
  margin-bottom: 15px;
565
  }
566
+
567
  .worker-name {
568
  font-size: 1.2em;
569
  font-weight: bold;
570
  }
571
+
572
  .status-dot {
573
  width: 12px;
574
  height: 12px;
575
  border-radius: 50%;
576
  animation: pulse 2s infinite;
577
  }
578
+
579
  .status-dot.online {
580
  background: #00ff88;
581
  box-shadow: 0 0 10px #00ff88;
582
  }
583
+
584
  .status-dot.offline {
585
  background: #ff4444;
586
  box-shadow: 0 0 10px #ff4444;
587
  }
588
+
589
  @keyframes pulse {
590
  0%, 100% { opacity: 1; }
591
  50% { opacity: 0.5; }
592
  }
593
+
594
  .worker-stats {
595
  display: grid;
596
  grid-template-columns: repeat(2, 1fr);
597
  gap: 10px;
598
  margin-top: 15px;
599
  }
600
+
601
  .worker-stat {
602
  background: rgba(0, 255, 136, 0.05);
603
  padding: 10px;
604
  border-radius: 5px;
605
  }
606
+
607
  .worker-stat-label {
608
  font-size: 0.7em;
609
  opacity: 0.7;
610
  }
611
+
612
  .worker-stat-value {
613
  font-size: 1.3em;
614
  font-weight: bold;
615
  margin-top: 3px;
616
  }
617
+
618
+ .role-badge {
619
+ display: inline-block;
620
+ padding: 3px 10px;
621
+ border-radius: 12px;
622
+ font-size: 0.75em;
623
+ margin-top: 10px;
624
+ font-weight: bold;
625
+ }
626
+
627
+ .role-generator {
628
+ background: rgba(255, 165, 0, 0.2);
629
+ border: 1px solid #ffa500;
630
+ color: #ffa500;
631
+ }
632
+
633
+ .role-decoder {
634
+ background: rgba(0, 191, 255, 0.2);
635
+ border: 1px solid #00bfff;
636
+ color: #00bfff;
637
+ }
638
+
639
+ .role-full {
640
+ background: rgba(138, 43, 226, 0.2);
641
+ border: 1px solid #8a2be2;
642
+ color: #8a2be2;
643
+ }
644
+
645
+ .role-idle {
646
+ background: rgba(128, 128, 128, 0.2);
647
+ border: 1px solid #808080;
648
+ color: #808080;
649
+ }
650
+
651
+ .progress-bar {
652
+ width: 100%;
653
+ height: 4px;
654
+ background: rgba(0, 255, 136, 0.1);
655
+ border-radius: 2px;
656
+ margin-top: 10px;
657
+ overflow: hidden;
658
+ }
659
+
660
+ .progress-fill {
661
+ height: 100%;
662
+ background: linear-gradient(90deg, #00ff88, #00ffff);
663
+ transition: width 0.3s;
664
+ box-shadow: 0 0 10px #00ff88;
665
+ }
666
+
667
+ .cluster-info {
668
+ background: rgba(0, 255, 136, 0.05);
669
+ border: 1px solid #00ff88;
670
+ border-radius: 8px;
671
+ padding: 20px;
672
+ }
673
+
674
+ .info-grid {
675
+ display: grid;
676
+ grid-template-columns: repeat(4, 1fr);
677
+ gap: 20px;
678
+ }
679
+
680
+ .info-item {
681
+ text-align: center;
682
+ }
683
+
684
  .timestamp {
685
  text-align: center;
686
  margin-top: 20px;
687
  opacity: 0.5;
688
  font-size: 0.9em;
689
  }
 
 
 
 
690
  </style>
691
  </head>
692
  <body>
693
  <div class="container">
694
  <div class="header">
695
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
696
+ <div>DISTRIBUTED COMPUTE SYSTEM v4.0</div>
697
  </div>
698
 
699
  <div class="status-bar">
700
  <div class="stat-card">
701
  <div class="stat-label">Load Mode</div>
702
  <div class="stat-value" id="mode">--</div>
703
+ <div class="mode-badge" id="mode-badge">INITIALIZING</div>
704
  </div>
705
  <div class="stat-card">
706
  <div class="stat-label">Current Load</div>
707
  <div class="stat-value" id="load">0</div>
708
+ <div class="stat-label">requests / 10s</div>
709
  </div>
710
  <div class="stat-card">
711
  <div class="stat-label">Total Requests</div>
 
717
  </div>
718
  </div>
719
 
720
+ <div class="workers-grid" id="workers">
721
+ <!-- Workers populated by JS -->
722
+ </div>
723
+
724
+ <div class="cluster-info">
725
+ <div class="stat-label" style="margin-bottom: 15px;">CLUSTER STATISTICS</div>
726
+ <div class="info-grid">
727
+ <div class="info-item">
728
+ <div class="stat-label">Successful</div>
729
+ <div class="stat-value" style="font-size: 1.5em;" id="success">0</div>
730
+ </div>
731
+ <div class="info-item">
732
+ <div class="stat-label">Failed</div>
733
+ <div class="stat-value" style="font-size: 1.5em;" id="failed">0</div>
734
+ </div>
735
+ <div class="info-item">
736
+ <div class="stat-label">Uptime</div>
737
+ <div class="stat-value" style="font-size: 1.5em;" id="uptime">0s</div>
738
+ </div>
739
+ <div class="info-item">
740
+ <div class="stat-label">Healthy Workers</div>
741
+ <div class="stat-value" style="font-size: 1.5em;" id="healthy">0</div>
742
+ </div>
743
+ </div>
744
+ </div>
745
 
746
  <div class="timestamp" id="timestamp">Last update: --</div>
747
  </div>
748
 
749
  <script>
750
+ // Use wss:// for HTTPS, ws:// for HTTP
751
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
752
  let ws;
753
+ let usePolling = false;
754
 
755
  function connectWebSocket() {
756
  try {
757
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
758
 
759
+ ws.onopen = () => {
760
+ console.log('✅ WebSocket connected');
761
+ usePolling = false;
762
+ };
763
+
764
+ ws.onmessage = (event) => {
765
+ const data = JSON.parse(event.data);
766
+ updateDashboard(data);
767
+ };
768
+
769
+ ws.onerror = (error) => {
770
+ console.error('❌ WebSocket error, switching to polling');
771
+ usePolling = true;
772
+ startPolling();
773
+ };
774
+
775
+ ws.onclose = () => {
776
+ console.log('🔌 WebSocket disconnected');
777
+ if (!usePolling) {
778
+ setTimeout(connectWebSocket, 3000);
779
+ }
780
+ };
781
  } catch (e) {
782
+ console.error('Failed to connect WebSocket, using polling');
783
+ usePolling = true;
784
+ startPolling();
785
  }
786
  }
787
 
788
+ async function pollStats() {
789
+ if (!usePolling) return;
790
+
791
+ try {
792
+ const response = await fetch('/api/status');
793
+ const data = await response.json();
794
+
795
+ // Fetch worker stats too
796
+ const workersRes = await fetch('/workers');
797
+ const workersData = await workersRes.json();
798
+
799
+ // Format data like WebSocket
800
+ const formattedData = {
801
+ timestamp: Date.now() / 1000,
802
+ mode: data.mode,
803
+ load: data.current_load,
804
+ workers: workersData.workers.map(w => ({
805
+ url: w.url.split("//")[1].split(".")[0],
806
+ healthy: w.healthy,
807
+ active: w.active_requests || 0,
808
+ total: 0,
809
+ tokens: 0,
810
+ latency: 0,
811
+ role: "idle"
812
+ })),
813
+ cluster: {
814
+ total_requests: 0,
815
+ successful: 0,
816
+ failed: 0,
817
+ uptime: 0,
818
+ rps: 0
819
+ }
820
+ };
821
+
822
+ updateDashboard(formattedData);
823
+ } catch (e) {
824
+ console.error('Polling error:', e);
825
+ }
826
+ }
827
+
828
+ function startPolling() {
829
+ pollStats();
830
+ setInterval(pollStats, 1000);
831
+ }
832
+
833
+ // Try WebSocket first
834
  connectWebSocket();
835
 
836
  function updateDashboard(data) {
837
+ // Mode
838
  document.getElementById('mode').textContent = data.mode.toUpperCase();
839
+ const modeBadge = document.getElementById('mode-badge');
840
+ modeBadge.textContent = `${data.mode.toUpperCase()} MODE`;
841
+ modeBadge.className = `mode-badge mode-${data.mode}`;
842
+
843
+ // Stats
844
  document.getElementById('load').textContent = data.load;
845
  document.getElementById('total-req').textContent = data.cluster.total_requests;
846
  document.getElementById('rps').textContent = data.cluster.rps;
847
+ document.getElementById('success').textContent = data.cluster.successful;
848
+ document.getElementById('failed').textContent = data.cluster.failed;
849
+ document.getElementById('uptime').textContent = formatUptime(data.cluster.uptime);
850
 
851
+ // Workers
852
  const workersDiv = document.getElementById('workers');
853
+ const healthyCount = data.workers.filter(w => w.healthy).length;
854
+ document.getElementById('healthy').textContent = `${healthyCount}/${data.workers.length}`;
855
+
856
  workersDiv.innerHTML = data.workers.map(worker => `
857
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
858
  <div class="worker-header">
859
+ <div class="worker-name">${worker.url}</div>
 
 
 
 
860
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
861
  </div>
862
+ <div class="role-badge role-${worker.role}">${worker.role.toUpperCase()}</div>
863
  <div class="worker-stats">
864
  <div class="worker-stat">
865
  <div class="worker-stat-label">Active</div>
 
874
  <div class="worker-stat-value">${worker.tokens}</div>
875
  </div>
876
  <div class="worker-stat">
877
+ <div class="worker-stat-label">Latency</div>
878
+ <div class="worker-stat-value">${worker.latency}ms</div>
879
  </div>
880
  </div>
881
+ <div class="progress-bar">
882
+ <div class="progress-fill" style="width: ${Math.min(worker.active * 33, 100)}%"></div>
883
+ </div>
884
  </div>
885
  `).join('');
886
 
887
+ // Timestamp
888
+ const now = new Date();
889
  document.getElementById('timestamp').textContent =
890
+ `Last update: ${now.toLocaleTimeString()}`;
891
+ }
892
+
893
+ function formatUptime(seconds) {
894
+ const h = Math.floor(seconds / 3600);
895
+ const m = Math.floor((seconds % 3600) / 60);
896
+ const s = Math.floor(seconds % 60);
897
+ return `${h}h ${m}m ${s}s`;
898
  }
899
  </script>
900
  </body>
901
  </html>
902
  """
903
 
904
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Push real-time cluster stats to one dashboard client.

    Accepts the socket, registers it in active_connections (so the
    periodic broadcast_stats() from the health-check loop reaches it),
    sends an immediate snapshot, then blocks reading client messages
    merely to keep the connection open. The socket is always
    deregistered on exit, whatever the disconnect reason.
    """
    await websocket.accept()
    active_connections.add(websocket)

    try:
        # Send initial data so the dashboard isn't blank until the next
        # periodic broadcast.
        await broadcast_stats()

        # Keep the connection alive; client messages are ignored.
        while True:
            await websocket.receive_text()
    except Exception:
        # Expected on client disconnect (WebSocketDisconnect). The
        # original bare `except:` also swallowed asyncio.CancelledError
        # (a BaseException since Python 3.8), which breaks clean task
        # cancellation on shutdown — so catch Exception only.
        pass
    finally:
        active_connections.discard(websocket)
921
+
922
+ # ============================================================================
923
+ # API Endpoints
924
+ # ============================================================================
925
+
926
  @app.get("/api/status")
927
  async def api_status():
928
+ """JSON API for status"""
929
  mode, load = update_load_mode()
930
  healthy_count = len(get_healthy_workers())
931
 
 
 
 
 
932
  return {
933
  "name": "SAM-Z-1 Distributed Cluster",
934
+ "version": "4.0.0",
935
  "mode": mode,
936
  "current_load": load,
937
  "workers": len(WORKER_URLS),
938
  "healthy_workers": healthy_count,
939
+ "features": ["distributed_compute", "smart_load_balancing", "real_time_dashboard"]
 
 
 
 
 
 
 
 
940
  }
941
 
942
  @app.get("/health")
 
947
  "workers_healthy": healthy_count
948
  }
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  @app.post("/v1/generate")
951
  async def generate(request: GenerateRequest):
952
+ """Generate text with distributed compute"""
953
  track_request()
954
  mode, load = update_load_mode()
955
 
956
+ healthy = get_healthy_workers()
957
+ if not healthy:
 
 
958
  cluster_stats["failed_requests"] += 1
959
+ raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
960
 
961
  request_data = {
962
  "prompt": request.prompt,
 
968
  "stream": True
969
  }
970
 
971
+ print(f"🎯 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
972
 
973
  try:
974
+ if mode == "light" and len(healthy) >= 2:
975
+ # DISTRIBUTED MODE - 1 gen + multiple decoders
976
+ generators, decoders = select_distributed_workers()
977
  if decoders:
978
  cluster_stats["successful_requests"] += 1
979
  return StreamingResponse(
 
981
  media_type="text/event-stream"
982
  )
983
 
984
+ # HEAVY/FALLBACK - single worker
985
+ worker = get_least_busy_worker()
986
  cluster_stats["successful_requests"] += 1
987
  return StreamingResponse(
988
  heavy_load_generation(worker, request_data, "generate"),
 
994
 
995
  @app.post("/v1/chat")
996
  async def chat(request: ChatRequest):
997
+ """Chat with distributed compute"""
998
  track_request()
999
  mode, load = update_load_mode()
1000
 
1001
+ healthy = get_healthy_workers()
1002
+ if not healthy:
 
 
1003
  cluster_stats["failed_requests"] += 1
1004
+ raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
1005
 
1006
  request_data = {
1007
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
 
1013
  "stream": True
1014
  }
1015
 
1016
+ print(f"💬 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
1017
 
1018
  try:
1019
+ if mode == "light" and len(healthy) >= 2:
1020
+ # DISTRIBUTED MODE - 1 gen + multiple decoders
1021
+ generators, decoders = select_distributed_workers()
1022
  if decoders:
1023
  cluster_stats["successful_requests"] += 1
1024
  return StreamingResponse(
 
1026
  media_type="text/event-stream"
1027
  )
1028
 
1029
+ # HEAVY/FALLBACK - single worker
1030
+ worker = get_least_busy_worker()
1031
  cluster_stats["successful_requests"] += 1
1032
  return StreamingResponse(
1033
  heavy_load_generation(worker, request_data, "chat"),