Bc-AI commited on
Commit
74ffe1c
·
verified ·
1 Parent(s): f937aaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -424
app.py CHANGED
@@ -1,7 +1,9 @@
1
  """
2
- SAM-Z-1 Distributed Compute Cluster Head Node
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
 
 
5
  """
6
 
7
  from fastapi import FastAPI, HTTPException, WebSocket
@@ -15,7 +17,7 @@ from typing import List, Optional, Dict
15
  from collections import deque
16
  import random
17
 
18
- app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="4.0.0")
19
 
20
  # ============================================================================
21
  # Configuration
@@ -30,13 +32,21 @@ WORKER_URLS = [
30
  "https://bc-ai-worker-5.hf.space"
31
  ]
32
 
33
- HEALTH_CHECK_INTERVAL = 5 # faster checks for real-time dashboard
34
  LOAD_CHECK_WINDOW = 10
35
 
36
  LIGHT_LOAD_THRESHOLD = 2
37
  HEAVY_LOAD_THRESHOLD = 5
38
 
39
- # Worker state
 
 
 
 
 
 
 
 
40
  worker_health = {
41
  url: {
42
  "healthy": True,
@@ -45,12 +55,14 @@ worker_health = {
45
  "total_requests": 0,
46
  "total_tokens": 0,
47
  "avg_latency": 0,
48
- "role": "idle" # "generator", "decoder", "full", "idle"
 
 
49
  } for url in WORKER_URLS
50
  }
51
 
52
  request_timestamps = deque(maxlen=100)
53
- current_load_mode = "light" # "light", "medium", "heavy"
54
  cluster_stats = {
55
  "total_requests": 0,
56
  "successful_requests": 0,
@@ -58,7 +70,6 @@ cluster_stats = {
58
  "uptime_start": time.time()
59
  }
60
 
61
- # Active WebSocket connections for real-time updates
62
  active_connections = set()
63
 
64
  # ============================================================================
@@ -73,6 +84,7 @@ class GenerateRequest(BaseModel):
73
  top_p: float = 0.9
74
  repetition_penalty: float = 1.1
75
  stream: bool = True
 
76
 
77
  class ChatMessage(BaseModel):
78
  role: str
@@ -86,6 +98,138 @@ class ChatRequest(BaseModel):
86
  top_p: float = 0.9
87
  repetition_penalty: float = 1.1
88
  stream: bool = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # ============================================================================
91
  # Load Management
@@ -100,21 +244,20 @@ def update_load_mode():
100
  load = get_current_load()
101
  healthy_count = len(get_healthy_workers())
102
 
103
- # Adjust thresholds based on available workers
104
  if healthy_count >= 5:
105
  if load <= LIGHT_LOAD_THRESHOLD:
106
- current_load_mode = "light" # 1 gen + 4 decoders
107
- elif load <= MEDIUM_LOAD_THRESHOLD:
108
- current_load_mode = "medium" # 2 gens + 3 decoders OR parallel requests
109
  else:
110
- current_load_mode = "heavy" # all workers independent
111
  elif healthy_count >= 3:
112
  if load <= 2:
113
- current_load_mode = "light" # 1 gen + 2 decoders
114
  else:
115
- current_load_mode = "heavy" # distribute requests
116
  else:
117
- current_load_mode = "heavy" # fallback to simple distribution
118
 
119
  return current_load_mode, load
120
 
@@ -122,42 +265,11 @@ def track_request():
122
  request_timestamps.append(time.time())
123
  cluster_stats["total_requests"] += 1
124
 
125
- def get_healthy_workers() -> List[str]:
126
- return [url for url, status in worker_health.items() if status["healthy"]]
127
-
128
- def get_least_busy_worker() -> Optional[str]:
129
- healthy = get_healthy_workers()
130
- if not healthy:
131
- return None
132
- return min(healthy, key=lambda url: worker_health[url]["active_requests"])
133
-
134
- def select_distributed_workers() -> tuple:
135
- """
136
- Select workers for distributed compute
137
- Returns: (generators: List[str], decoders: List[str])
138
- """
139
- healthy = get_healthy_workers()
140
- if len(healthy) < 2:
141
- return ([healthy[0]], []) if len(healthy) == 1 else ([], [])
142
-
143
- # Sort by least busy
144
- sorted_workers = sorted(healthy, key=lambda url: worker_health[url]["active_requests"])
145
-
146
- if len(healthy) >= 5:
147
- # OPTIMAL: 1 generator, 4 decoders
148
- return ([sorted_workers[0]], sorted_workers[1:5])
149
- elif len(healthy) == 4:
150
- # 1 generator, 3 decoders
151
- return ([sorted_workers[0]], sorted_workers[1:4])
152
- elif len(healthy) == 3:
153
- # 1 generator, 2 decoders
154
- return ([sorted_workers[0]], sorted_workers[1:3])
155
- else:
156
- # 1 generator, 1 decoder
157
- return ([sorted_workers[0]], [sorted_workers[1]])
158
 
159
  async def broadcast_stats():
160
- """Broadcast stats to all connected WebSocket clients"""
161
  if not active_connections:
162
  return
163
 
@@ -170,13 +282,15 @@ async def broadcast_stats():
170
  "load": load,
171
  "workers": [
172
  {
173
- "url": url.split("//")[1].split(".")[0], # shorter name
174
  "healthy": status["healthy"],
175
  "active": status["active_requests"],
176
  "total": status["total_requests"],
177
  "tokens": status["total_tokens"],
178
  "latency": round(status["avg_latency"], 2),
179
- "role": status["role"]
 
 
180
  }
181
  for url, status in worker_health.items()
182
  ],
@@ -189,7 +303,6 @@ async def broadcast_stats():
189
  }
190
  }
191
 
192
- # Broadcast to all connections
193
  disconnected = set()
194
  for ws in active_connections:
195
  try:
@@ -197,36 +310,24 @@ async def broadcast_stats():
197
  except:
198
  disconnected.add(ws)
199
 
200
- # Remove disconnected
201
  active_connections.difference_update(disconnected)
202
 
203
- async def check_worker_health(worker_url: str) -> bool:
 
 
 
 
204
  try:
205
- async with httpx.AsyncClient(timeout=5.0) as client:
206
- response = await client.get(f"{worker_url}/health")
207
- return response.status_code == 200
208
- except:
209
- return False
210
-
211
- async def health_check_loop():
212
- while True:
213
- # Check all workers
214
- for worker_url in WORKER_URLS:
215
- healthy = await check_worker_health(worker_url)
216
- worker_health[worker_url]["healthy"] = healthy
217
- worker_health[worker_url]["last_check"] = time.time()
218
-
219
- # Always broadcast stats to connected clients
220
  await broadcast_stats()
221
-
222
- await asyncio.sleep(HEALTH_CHECK_INTERVAL)
223
-
224
- @app.on_event("startup")
225
- async def startup_event():
226
- asyncio.create_task(health_check_loop())
227
 
228
  # ============================================================================
229
- # Distributed Compute Generation
230
  # ============================================================================
231
 
232
  async def distributed_generation(
@@ -235,11 +336,7 @@ async def distributed_generation(
235
  request_data: dict,
236
  endpoint: str = "generate"
237
  ):
238
- """
239
- DISTRIBUTED COMPUTE MODE
240
- - Generator(s) produce token IDs
241
- - Multiple decoders process in parallel (load balanced)
242
- """
243
 
244
  if not generators or not decoders:
245
  return
@@ -247,15 +344,13 @@ async def distributed_generation(
247
  token_queue = asyncio.Queue(maxsize=50)
248
  text_queue = asyncio.Queue(maxsize=50)
249
 
250
- # Mark roles
251
  for gen_url in generators:
252
  worker_health[gen_url]["role"] = "generator"
253
  for dec_url in decoders:
254
  worker_health[dec_url]["role"] = "decoder"
255
 
256
  async def generate_tokens():
257
- """Generator worker(s)"""
258
- gen_url = generators[0] # primary generator
259
  try:
260
  worker_health[gen_url]["active_requests"] += 1
261
  request_data_tokens = {**request_data, "return_token_ids": True}
@@ -273,7 +368,6 @@ async def distributed_generation(
273
  if "token_id" in data:
274
  await token_queue.put(data["token_id"])
275
  elif "done" in data:
276
- # Send done signal for each decoder
277
  for _ in decoders:
278
  await token_queue.put(None)
279
  break
@@ -288,18 +382,16 @@ async def distributed_generation(
288
  worker_health[gen_url]["role"] = "idle"
289
 
290
  async def decode_tokens(decoder_url: str, decoder_id: int):
291
- """Decoder worker - processes tokens from shared queue"""
292
  try:
293
  worker_health[decoder_url]["active_requests"] += 1
294
  batch = []
295
- batch_size = 2 # smaller batches for faster streaming
296
 
297
  while True:
298
  try:
299
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
300
 
301
  if token_id is None:
302
- # Decode remaining batch
303
  if batch:
304
  async with httpx.AsyncClient(timeout=10.0) as client:
305
  response = await client.post(
@@ -315,7 +407,6 @@ async def distributed_generation(
315
 
316
  batch.append(token_id)
317
 
318
- # Decode when batch is full
319
  if len(batch) >= batch_size:
320
  async with httpx.AsyncClient(timeout=10.0) as client:
321
  response = await client.post(
@@ -338,16 +429,12 @@ async def distributed_generation(
338
  worker_health[decoder_url]["active_requests"] -= 1
339
  worker_health[decoder_url]["role"] = "idle"
340
 
341
- # Start generator
342
  gen_task = asyncio.create_task(generate_tokens())
343
-
344
- # Start all decoders
345
  decoder_tasks = [
346
  asyncio.create_task(decode_tokens(dec_url, i))
347
  for i, dec_url in enumerate(decoders)
348
  ]
349
 
350
- # Stream results
351
  accumulated_text = ""
352
  decoders_done = 0
353
  total_decoders = len(decoders)
@@ -393,24 +480,19 @@ async def heavy_load_generation(worker_url: str, request_data: dict, endpoint: s
393
  worker_health[worker_url]["role"] = "idle"
394
 
395
  # ============================================================================
396
- # Dashboard
397
  # ============================================================================
398
 
399
  @app.get("/", response_class=HTMLResponse)
400
  async def dashboard():
401
- """Real-time futuristic dashboard"""
402
  return """
403
  <!DOCTYPE html>
404
  <html>
405
  <head>
406
- <title>SAM-Z-1 Cluster Control</title>
407
  <style>
408
- * {
409
- margin: 0;
410
- padding: 0;
411
- box-sizing: border-box;
412
- }
413
-
414
  body {
415
  font-family: 'Courier New', monospace;
416
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
@@ -419,14 +501,12 @@ async def dashboard():
419
  overflow-x: hidden;
420
  overflow-y: auto;
421
  }
422
-
423
  .container {
424
  padding: 20px;
425
  max-width: 1400px;
426
  margin: 0 auto;
427
  padding-bottom: 40px;
428
  }
429
-
430
  .header {
431
  text-align: center;
432
  margin-bottom: 30px;
@@ -436,7 +516,6 @@ async def dashboard():
436
  border-radius: 10px;
437
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
438
  }
439
-
440
  .header h1 {
441
  font-size: 2.5em;
442
  text-transform: uppercase;
@@ -444,18 +523,33 @@ async def dashboard():
444
  text-shadow: 0 0 10px #00ff88;
445
  animation: glow 2s ease-in-out infinite alternate;
446
  }
447
-
448
  @keyframes glow {
449
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
450
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
451
  }
452
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  .status-bar {
454
  display: flex;
455
  gap: 20px;
456
  margin-bottom: 30px;
457
  }
458
-
459
  .stat-card {
460
  flex: 1;
461
  background: rgba(0, 255, 136, 0.05);
@@ -465,83 +559,22 @@ async def dashboard():
465
  position: relative;
466
  overflow: hidden;
467
  }
468
-
469
- .stat-card::before {
470
- content: '';
471
- position: absolute;
472
- top: 0;
473
- left: -100%;
474
- width: 100%;
475
- height: 100%;
476
- background: linear-gradient(90deg, transparent, rgba(0, 255, 136, 0.2), transparent);
477
- animation: scan 3s infinite;
478
- }
479
-
480
- @keyframes scan {
481
- 0% { left: -100%; }
482
- 100% { left: 100%; }
483
- }
484
-
485
  .stat-label {
486
  font-size: 0.8em;
487
  opacity: 0.7;
488
  text-transform: uppercase;
489
  }
490
-
491
  .stat-value {
492
  font-size: 2em;
493
  font-weight: bold;
494
  margin-top: 5px;
495
  }
496
-
497
- .mode-badge {
498
- display: inline-block;
499
- padding: 5px 15px;
500
- border-radius: 20px;
501
- font-size: 0.9em;
502
- font-weight: bold;
503
- text-transform: uppercase;
504
- margin-top: 10px;
505
- }
506
-
507
- .mode-light {
508
- background: rgba(0, 255, 136, 0.2);
509
- border: 1px solid #00ff88;
510
- color: #00ff88;
511
- }
512
-
513
- .mode-heavy {
514
- background: rgba(255, 68, 68, 0.2);
515
- border: 1px solid #ff4444;
516
- color: #ff4444;
517
- }
518
-
519
  .workers-grid {
520
  display: grid;
521
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
522
  gap: 20px;
523
  margin-bottom: 30px;
524
  }
525
-
526
- @media (max-width: 768px) {
527
- .workers-grid {
528
- grid-template-columns: 1fr;
529
- }
530
-
531
- .status-bar {
532
- flex-direction: column;
533
- }
534
-
535
- .info-grid {
536
- grid-template-columns: repeat(2, 1fr);
537
- }
538
-
539
- .header h1 {
540
- font-size: 1.5em;
541
- letter-spacing: 2px;
542
- }
543
- }
544
-
545
  .worker-card {
546
  background: rgba(10, 14, 39, 0.8);
547
  border: 2px solid #00ff88;
@@ -550,166 +583,89 @@ async def dashboard():
550
  position: relative;
551
  transition: all 0.3s;
552
  }
553
-
554
  .worker-card:hover {
555
  transform: translateY(-5px);
556
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
557
  }
558
-
559
  .worker-card.offline {
560
  border-color: #ff4444;
561
  opacity: 0.6;
562
  }
563
-
564
  .worker-header {
565
  display: flex;
566
  justify-content: space-between;
567
  align-items: center;
568
  margin-bottom: 15px;
569
  }
570
-
571
  .worker-name {
572
  font-size: 1.2em;
573
  font-weight: bold;
574
  }
575
-
576
  .status-dot {
577
  width: 12px;
578
  height: 12px;
579
  border-radius: 50%;
580
  animation: pulse 2s infinite;
581
  }
582
-
583
  .status-dot.online {
584
  background: #00ff88;
585
  box-shadow: 0 0 10px #00ff88;
586
  }
587
-
588
  .status-dot.offline {
589
  background: #ff4444;
590
  box-shadow: 0 0 10px #ff4444;
591
  }
592
-
593
  @keyframes pulse {
594
  0%, 100% { opacity: 1; }
595
  50% { opacity: 0.5; }
596
  }
597
-
598
  .worker-stats {
599
  display: grid;
600
  grid-template-columns: repeat(2, 1fr);
601
  gap: 10px;
602
  margin-top: 15px;
603
  }
604
-
605
  .worker-stat {
606
  background: rgba(0, 255, 136, 0.05);
607
  padding: 10px;
608
  border-radius: 5px;
609
  }
610
-
611
  .worker-stat-label {
612
  font-size: 0.7em;
613
  opacity: 0.7;
614
  }
615
-
616
  .worker-stat-value {
617
  font-size: 1.3em;
618
  font-weight: bold;
619
  margin-top: 3px;
620
  }
621
-
622
- .role-badge {
623
- display: inline-block;
624
- padding: 3px 10px;
625
- border-radius: 12px;
626
- font-size: 0.75em;
627
- margin-top: 10px;
628
- font-weight: bold;
629
- }
630
-
631
- .role-generator {
632
- background: rgba(255, 165, 0, 0.2);
633
- border: 1px solid #ffa500;
634
- color: #ffa500;
635
- }
636
-
637
- .role-decoder {
638
- background: rgba(0, 191, 255, 0.2);
639
- border: 1px solid #00bfff;
640
- color: #00bfff;
641
- }
642
-
643
- .role-full {
644
- background: rgba(138, 43, 226, 0.2);
645
- border: 1px solid #8a2be2;
646
- color: #8a2be2;
647
- }
648
-
649
- .role-idle {
650
- background: rgba(128, 128, 128, 0.2);
651
- border: 1px solid #808080;
652
- color: #808080;
653
- }
654
-
655
- .progress-bar {
656
- width: 100%;
657
- height: 4px;
658
- background: rgba(0, 255, 136, 0.1);
659
- border-radius: 2px;
660
- margin-top: 10px;
661
- overflow: hidden;
662
- }
663
-
664
- .progress-fill {
665
- height: 100%;
666
- background: linear-gradient(90deg, #00ff88, #00ffff);
667
- transition: width 0.3s;
668
- box-shadow: 0 0 10px #00ff88;
669
- }
670
-
671
- .cluster-info {
672
- background: rgba(0, 255, 136, 0.05);
673
- border: 1px solid #00ff88;
674
- border-radius: 8px;
675
- padding: 20px;
676
- }
677
-
678
- .info-grid {
679
- display: grid;
680
- grid-template-columns: repeat(4, 1fr);
681
- gap: 20px;
682
- }
683
-
684
- .info-item {
685
- text-align: center;
686
- }
687
-
688
  .timestamp {
689
  text-align: center;
690
  margin-top: 20px;
691
  opacity: 0.5;
692
  font-size: 0.9em;
693
  }
 
 
 
 
694
  </style>
695
  </head>
696
  <body>
697
  <div class="container">
698
  <div class="header">
699
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
700
- <div>DISTRIBUTED COMPUTE SYSTEM v4.0</div>
701
  </div>
702
 
703
  <div class="status-bar">
704
  <div class="stat-card">
705
  <div class="stat-label">Load Mode</div>
706
  <div class="stat-value" id="mode">--</div>
707
- <div class="mode-badge" id="mode-badge">INITIALIZING</div>
708
  </div>
709
  <div class="stat-card">
710
  <div class="stat-label">Current Load</div>
711
  <div class="stat-value" id="load">0</div>
712
- <div class="stat-label">requests / 10s</div>
713
  </div>
714
  <div class="stat-card">
715
  <div class="stat-label">Total Requests</div>
@@ -721,149 +677,47 @@ async def dashboard():
721
  </div>
722
  </div>
723
 
724
- <div class="workers-grid" id="workers">
725
- <!-- Workers populated by JS -->
726
- </div>
727
-
728
- <div class="cluster-info">
729
- <div class="stat-label" style="margin-bottom: 15px;">CLUSTER STATISTICS</div>
730
- <div class="info-grid">
731
- <div class="info-item">
732
- <div class="stat-label">Successful</div>
733
- <div class="stat-value" style="font-size: 1.5em;" id="success">0</div>
734
- </div>
735
- <div class="info-item">
736
- <div class="stat-label">Failed</div>
737
- <div class="stat-value" style="font-size: 1.5em;" id="failed">0</div>
738
- </div>
739
- <div class="info-item">
740
- <div class="stat-label">Uptime</div>
741
- <div class="stat-value" style="font-size: 1.5em;" id="uptime">0s</div>
742
- </div>
743
- <div class="info-item">
744
- <div class="stat-label">Healthy Workers</div>
745
- <div class="stat-value" style="font-size: 1.5em;" id="healthy">0</div>
746
- </div>
747
- </div>
748
- </div>
749
 
750
  <div class="timestamp" id="timestamp">Last update: --</div>
751
  </div>
752
 
753
  <script>
754
- // Use wss:// for HTTPS, ws:// for HTTP
755
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
756
  let ws;
757
- let usePolling = false;
758
 
759
  function connectWebSocket() {
760
  try {
761
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
762
 
763
- ws.onopen = () => {
764
- console.log('✅ WebSocket connected');
765
- usePolling = false;
766
- };
767
-
768
- ws.onmessage = (event) => {
769
- const data = JSON.parse(event.data);
770
- updateDashboard(data);
771
- };
772
-
773
- ws.onerror = (error) => {
774
- console.error('❌ WebSocket error, switching to polling');
775
- usePolling = true;
776
- startPolling();
777
- };
778
-
779
- ws.onclose = () => {
780
- console.log('🔌 WebSocket disconnected');
781
- if (!usePolling) {
782
- setTimeout(connectWebSocket, 3000);
783
- }
784
- };
785
  } catch (e) {
786
- console.error('Failed to connect WebSocket, using polling');
787
- usePolling = true;
788
- startPolling();
789
  }
790
  }
791
 
792
- async function pollStats() {
793
- if (!usePolling) return;
794
-
795
- try {
796
- const response = await fetch('/api/status');
797
- const data = await response.json();
798
-
799
- // Fetch worker stats too
800
- const workersRes = await fetch('/workers');
801
- const workersData = await workersRes.json();
802
-
803
- // Format data like WebSocket
804
- const formattedData = {
805
- timestamp: Date.now() / 1000,
806
- mode: data.mode,
807
- load: data.current_load,
808
- workers: workersData.workers.map(w => ({
809
- url: w.url.split("//")[1].split(".")[0],
810
- healthy: w.healthy,
811
- active: w.active_requests || 0,
812
- total: 0,
813
- tokens: 0,
814
- latency: 0,
815
- role: "idle"
816
- })),
817
- cluster: {
818
- total_requests: 0,
819
- successful: 0,
820
- failed: 0,
821
- uptime: 0,
822
- rps: 0
823
- }
824
- };
825
-
826
- updateDashboard(formattedData);
827
- } catch (e) {
828
- console.error('Polling error:', e);
829
- }
830
- }
831
-
832
- function startPolling() {
833
- pollStats();
834
- setInterval(pollStats, 1000);
835
- }
836
-
837
- // Try WebSocket first
838
  connectWebSocket();
839
 
840
  function updateDashboard(data) {
841
- // Mode
842
  document.getElementById('mode').textContent = data.mode.toUpperCase();
843
- const modeBadge = document.getElementById('mode-badge');
844
- modeBadge.textContent = `${data.mode.toUpperCase()} MODE`;
845
- modeBadge.className = `mode-badge mode-${data.mode}`;
846
-
847
- // Stats
848
  document.getElementById('load').textContent = data.load;
849
  document.getElementById('total-req').textContent = data.cluster.total_requests;
850
  document.getElementById('rps').textContent = data.cluster.rps;
851
- document.getElementById('success').textContent = data.cluster.successful;
852
- document.getElementById('failed').textContent = data.cluster.failed;
853
- document.getElementById('uptime').textContent = formatUptime(data.cluster.uptime);
854
 
855
- // Workers
856
  const workersDiv = document.getElementById('workers');
857
- const healthyCount = data.workers.filter(w => w.healthy).length;
858
- document.getElementById('healthy').textContent = `${healthyCount}/${data.workers.length}`;
859
-
860
  workersDiv.innerHTML = data.workers.map(worker => `
861
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
862
  <div class="worker-header">
863
- <div class="worker-name">${worker.url}</div>
 
 
 
 
864
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
865
  </div>
866
- <div class="role-badge role-${worker.role}">${worker.role.toUpperCase()}</div>
867
  <div class="worker-stats">
868
  <div class="worker-stat">
869
  <div class="worker-stat-label">Active</div>
@@ -878,69 +732,46 @@ async def dashboard():
878
  <div class="worker-stat-value">${worker.tokens}</div>
879
  </div>
880
  <div class="worker-stat">
881
- <div class="worker-stat-label">Latency</div>
882
- <div class="worker-stat-value">${worker.latency}ms</div>
883
  </div>
884
  </div>
885
- <div class="progress-bar">
886
- <div class="progress-fill" style="width: ${Math.min(worker.active * 33, 100)}%"></div>
887
- </div>
888
  </div>
889
  `).join('');
890
 
891
- // Timestamp
892
- const now = new Date();
893
  document.getElementById('timestamp').textContent =
894
- `Last update: ${now.toLocaleTimeString()}`;
895
- }
896
-
897
- function formatUptime(seconds) {
898
- const h = Math.floor(seconds / 3600);
899
- const m = Math.floor((seconds % 3600) / 60);
900
- const s = Math.floor(seconds % 60);
901
- return `${h}h ${m}m ${s}s`;
902
  }
903
  </script>
904
  </body>
905
  </html>
906
  """
907
 
908
- @app.websocket("/ws")
909
- async def websocket_endpoint(websocket: WebSocket):
910
- """WebSocket for real-time dashboard updates"""
911
- await websocket.accept()
912
- active_connections.add(websocket)
913
-
914
- try:
915
- # Send initial data
916
- await broadcast_stats()
917
-
918
- # Keep connection alive
919
- while True:
920
- await websocket.receive_text()
921
- except:
922
- pass
923
- finally:
924
- active_connections.discard(websocket)
925
-
926
- # ============================================================================
927
- # API Endpoints
928
- # ============================================================================
929
-
930
  @app.get("/api/status")
931
  async def api_status():
932
- """JSON API for status"""
933
  mode, load = update_load_mode()
934
  healthy_count = len(get_healthy_workers())
935
 
 
 
 
 
936
  return {
937
  "name": "SAM-Z-1 Distributed Cluster",
938
- "version": "4.0.0",
939
  "mode": mode,
940
  "current_load": load,
941
  "workers": len(WORKER_URLS),
942
  "healthy_workers": healthy_count,
943
- "features": ["distributed_compute", "smart_load_balancing", "real_time_dashboard"]
 
 
 
 
 
 
 
 
944
  }
945
 
946
  @app.get("/health")
@@ -951,16 +782,34 @@ async def health():
951
  "workers_healthy": healthy_count
952
  }
953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  @app.post("/v1/generate")
955
  async def generate(request: GenerateRequest):
956
- """Generate text with distributed compute"""
957
  track_request()
958
  mode, load = update_load_mode()
959
 
960
- healthy = get_healthy_workers()
961
- if not healthy:
 
 
962
  cluster_stats["failed_requests"] += 1
963
- raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
964
 
965
  request_data = {
966
  "prompt": request.prompt,
@@ -972,12 +821,15 @@ async def generate(request: GenerateRequest):
972
  "stream": True
973
  }
974
 
975
- print(f"🎯 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
976
 
977
  try:
978
- if mode == "light" and len(healthy) >= 2:
979
- # DISTRIBUTED MODE - 1 gen + multiple decoders
980
- generators, decoders = select_distributed_workers()
981
  if decoders:
982
  cluster_stats["successful_requests"] += 1
983
  return StreamingResponse(
@@ -985,8 +837,7 @@ async def generate(request: GenerateRequest):
985
  media_type="text/event-stream"
986
  )
987
 
988
- # HEAVY/FALLBACK - single worker
989
- worker = get_least_busy_worker()
990
  cluster_stats["successful_requests"] += 1
991
  return StreamingResponse(
992
  heavy_load_generation(worker, request_data, "generate"),
@@ -998,14 +849,19 @@ async def generate(request: GenerateRequest):
998
 
999
  @app.post("/v1/chat")
1000
  async def chat(request: ChatRequest):
1001
- """Chat with distributed compute"""
1002
  track_request()
1003
  mode, load = update_load_mode()
1004
 
1005
- healthy = get_healthy_workers()
1006
- if not healthy:
 
 
1007
  cluster_stats["failed_requests"] += 1
1008
- raise HTTPException(status_code=503, detail="No healthy workers")
 
 
 
1009
 
1010
  request_data = {
1011
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
@@ -1017,12 +873,15 @@ async def chat(request: ChatRequest):
1017
  "stream": True
1018
  }
1019
 
1020
- print(f"💬 {mode.upper()} | Load: {load} | Workers: {len(healthy)}")
 
 
 
 
1021
 
1022
  try:
1023
- if mode == "light" and len(healthy) >= 2:
1024
- # DISTRIBUTED MODE - 1 gen + multiple decoders
1025
- generators, decoders = select_distributed_workers()
1026
  if decoders:
1027
  cluster_stats["successful_requests"] += 1
1028
  return StreamingResponse(
@@ -1030,8 +889,7 @@ async def chat(request: ChatRequest):
1030
  media_type="text/event-stream"
1031
  )
1032
 
1033
- # HEAVY/FALLBACK - single worker
1034
- worker = get_least_busy_worker()
1035
  cluster_stats["successful_requests"] += 1
1036
  return StreamingResponse(
1037
  heavy_load_generation(worker, request_data, "chat"),
 
1
  """
2
+ SAM-Z-1 Distributed Compute Cluster Head Node v5.0
3
  - Smart load balancing with distributed compute
4
  - Real-time status dashboard
5
+ - Auto-detects worker version (v4 vs v5)
6
+ - Supports 4 new models with backward compatibility
7
  """
8
 
9
  from fastapi import FastAPI, HTTPException, WebSocket
 
17
  from collections import deque
18
  import random
19
 
20
+ app = FastAPI(title="SAM-Z-1 Distributed Cluster", version="5.0.0")
21
 
22
  # ============================================================================
23
  # Configuration
 
32
  "https://bc-ai-worker-5.hf.space"
33
  ]
34
 
35
+ HEALTH_CHECK_INTERVAL = 5
36
  LOAD_CHECK_WINDOW = 10
37
 
38
  LIGHT_LOAD_THRESHOLD = 2
39
  HEAVY_LOAD_THRESHOLD = 5
40
 
41
+ # New models added in v5
42
+ NEW_MODELS = [
43
+ "SAM-X-1-Large",
44
+ "SAM-X-1-Fast",
45
+ "SAM-X-1-Mini",
46
+ "SAM-X-1-Nano"
47
+ ]
48
+
49
+ # Worker state with version detection
50
  worker_health = {
51
  url: {
52
  "healthy": True,
 
55
  "total_requests": 0,
56
  "total_tokens": 0,
57
  "avg_latency": 0,
58
+ "role": "idle",
59
+ "version": None, # Will be auto-detected: "v4" or "v5"
60
+ "supports_models": [] # Models this worker supports
61
  } for url in WORKER_URLS
62
  }
63
 
64
  request_timestamps = deque(maxlen=100)
65
+ current_load_mode = "light"
66
  cluster_stats = {
67
  "total_requests": 0,
68
  "successful_requests": 0,
 
70
  "uptime_start": time.time()
71
  }
72
 
 
73
  active_connections = set()
74
 
75
  # ============================================================================
 
84
  top_p: float = 0.9
85
  repetition_penalty: float = 1.1
86
  stream: bool = True
87
+ model: Optional[str] = None # NEW: Model selection
88
 
89
  class ChatMessage(BaseModel):
90
  role: str
 
98
  top_p: float = 0.9
99
  repetition_penalty: float = 1.1
100
  stream: bool = True
101
+ model: Optional[str] = None # NEW: Model selection
102
+
103
+ # ============================================================================
104
+ # Worker Version Detection
105
+ # ============================================================================
106
+
107
async def detect_worker_version(worker_url: str) -> tuple:
    """Probe a worker to determine its protocol version and model list.

    Best-effort strategy (this function never raises):
      1. GET /info   -> v5 workers report {"version": ..., "models": [...]}.
      2. GET /models -> older v5 builds expose only the model list.
      3. Otherwise   -> assume a v4 worker (no model-selection support).

    Args:
        worker_url: Base URL of the worker (no trailing slash).

    Returns:
        (version, supported_models) where version is "v4" or "v5" and
        supported_models is a (possibly empty) list of model names.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Preferred probe: dedicated /info endpoint (v5 feature).
            try:
                response = await client.get(f"{worker_url}/info")
                if response.status_code == 200:
                    data = response.json()
                    version = data.get("version", "v5")
                    models = data.get("models", NEW_MODELS)
                    return version, models
            except Exception:
                # /info missing or malformed -- fall through to /models.
                pass

            # Fallback probe: bare /models listing (v5 feature).
            try:
                response = await client.get(f"{worker_url}/models")
                if response.status_code == 200:
                    data = response.json()
                    models = data.get("models", [])
                    if models:
                        return "v5", models
            except Exception:
                pass

            # Neither endpoint exists: worker is v4 (no model selection).
            return "v4", []

    except Exception as e:
        print(f"⚠️ Version detection failed for {worker_url}: {e}")
        return "v4", []
142
+
143
async def check_worker_health(worker_url: str) -> bool:
    """Return True if the worker's /health endpoint answers HTTP 200.

    Any failure (timeout, connection refused, DNS error, ...) counts as
    unhealthy; this probe must never raise.
    """
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            response = await client.get(f"{worker_url}/health")
            return response.status_code == 200
    except Exception:
        return False
151
+
152
async def health_check_loop():
    """Background daemon: poll every worker, detect its version once,
    and push fresh stats to connected dashboard clients.

    Runs forever. Each cycle is wrapped in a try/except so a transient
    failure (e.g. during broadcast) cannot kill the daemon.
    """
    while True:
        try:
            for worker_url in WORKER_URLS:
                # Liveness probe.
                healthy = await check_worker_health(worker_url)
                worker_health[worker_url]["healthy"] = healthy
                worker_health[worker_url]["last_check"] = time.time()

                # One-shot version detection ("version" is None until detected).
                if worker_health[worker_url]["version"] is None:
                    version, models = await detect_worker_version(worker_url)
                    worker_health[worker_url]["version"] = version
                    worker_health[worker_url]["supports_models"] = models

                    status = "✅" if healthy else "❌"
                    print(f"{status} Worker: {worker_url.split('//')[1].split('.')[0]} | Version: {version} | Models: {len(models)}")

            await broadcast_stats()
        except Exception as e:
            # Keep the daemon alive across transient failures.
            print(f"⚠️ Health check cycle failed: {e}")
        await asyncio.sleep(HEALTH_CHECK_INTERVAL)
172
+
173
+ @app.on_event("startup")
174
+ async def startup_event():
175
+ asyncio.create_task(health_check_loop())
176
+
177
+ # ============================================================================
178
+ # Smart Worker Selection
179
+ # ============================================================================
180
+
181
def get_workers_for_model(model_name: Optional[str]) -> List[str]:
    """Return healthy workers able to serve *model_name*.

    v5 workers must list the model explicitly; v4 workers are always
    treated as compatible (they serve their built-in default model).
    When no model is requested -- or no compatible worker is found --
    every healthy worker is returned as a fallback.
    """
    healthy = get_healthy_workers()

    if not model_name:
        # No specific model requested: any healthy worker will do.
        return healthy

    def _can_serve(url: str) -> bool:
        info = worker_health[url]
        if info["version"] == "v4":
            # v4 workers ignore model selection but still work with the default.
            return True
        return info["version"] == "v5" and model_name in info["supports_models"]

    compatible = [url for url in healthy if _can_serve(url)]
    return compatible or healthy
203
+
204
def get_healthy_workers() -> List[str]:
    """Return the URLs of all workers currently marked healthy."""
    return [url for url, info in worker_health.items() if info["healthy"]]
206
+
207
def get_least_busy_worker(worker_list: Optional[List[str]] = None) -> Optional[str]:
    """Pick the worker with the fewest in-flight requests.

    Args:
        worker_list: Candidate worker URLs; defaults to all healthy workers.
            (Annotation fixed: the default is None, so the type is Optional.)

    Returns:
        The least-loaded worker URL, or None if there are no candidates.
    """
    workers = worker_list if worker_list is not None else get_healthy_workers()
    if not workers:
        return None
    return min(workers, key=lambda url: worker_health[url]["active_requests"])
212
+
213
def select_distributed_workers(model_name: Optional[str] = None) -> tuple:
    """Split model-compatible workers into generator and decoder pools.

    The least-busy worker becomes the single generator; the next (up to
    four) least-busy workers become decoders.

    Args:
        model_name: Optional model name used to filter compatible workers.

    Returns:
        (generators, decoders) as lists of worker URLs. With exactly one
        compatible worker: ([worker], []). With none: ([], []).
    """
    compatible = get_workers_for_model(model_name)

    if not compatible:
        return [], []
    if len(compatible) == 1:
        return [compatible[0]], []

    # Rank by current load. A single slice replaces the old 2/3/4/5-way
    # branch: the decoder pool is always workers 1..4 after the generator.
    ranked = sorted(compatible, key=lambda url: worker_health[url]["active_requests"])
    return [ranked[0]], ranked[1:5]
233
 
234
  # ============================================================================
235
  # Load Management
 
244
  load = get_current_load()
245
  healthy_count = len(get_healthy_workers())
246
 
 
247
  if healthy_count >= 5:
248
  if load <= LIGHT_LOAD_THRESHOLD:
249
+ current_load_mode = "light"
250
+ elif load <= HEAVY_LOAD_THRESHOLD:
251
+ current_load_mode = "medium"
252
  else:
253
+ current_load_mode = "heavy"
254
  elif healthy_count >= 3:
255
  if load <= 2:
256
+ current_load_mode = "light"
257
  else:
258
+ current_load_mode = "heavy"
259
  else:
260
+ current_load_mode = "heavy"
261
 
262
  return current_load_mode, load
263
 
 
265
  request_timestamps.append(time.time())
266
  cluster_stats["total_requests"] += 1
267
 
268
+ # ============================================================================
269
+ # Dashboard & WebSocket
270
+ # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
  async def broadcast_stats():
 
273
  if not active_connections:
274
  return
275
 
 
282
  "load": load,
283
  "workers": [
284
  {
285
+ "url": url.split("//")[1].split(".")[0],
286
  "healthy": status["healthy"],
287
  "active": status["active_requests"],
288
  "total": status["total_requests"],
289
  "tokens": status["total_tokens"],
290
  "latency": round(status["avg_latency"], 2),
291
+ "role": status["role"],
292
+ "version": status["version"] or "detecting...",
293
+ "models": len(status["supports_models"])
294
  }
295
  for url, status in worker_health.items()
296
  ],
 
303
  }
304
  }
305
 
 
306
  disconnected = set()
307
  for ws in active_connections:
308
  try:
 
310
  except:
311
  disconnected.add(ws)
312
 
 
313
  active_connections.difference_update(disconnected)
314
 
315
+ @app.websocket("/ws")
316
+ async def websocket_endpoint(websocket: WebSocket):
317
+ await websocket.accept()
318
+ active_connections.add(websocket)
319
+
320
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  await broadcast_stats()
322
+ while True:
323
+ await websocket.receive_text()
324
+ except:
325
+ pass
326
+ finally:
327
+ active_connections.discard(websocket)
328
 
329
  # ============================================================================
330
+ # Distributed Generation
331
  # ============================================================================
332
 
333
  async def distributed_generation(
 
336
  request_data: dict,
337
  endpoint: str = "generate"
338
  ):
339
+ """Distributed compute with v4/v5 compatibility"""
 
 
 
 
340
 
341
  if not generators or not decoders:
342
  return
 
344
  token_queue = asyncio.Queue(maxsize=50)
345
  text_queue = asyncio.Queue(maxsize=50)
346
 
 
347
  for gen_url in generators:
348
  worker_health[gen_url]["role"] = "generator"
349
  for dec_url in decoders:
350
  worker_health[dec_url]["role"] = "decoder"
351
 
352
  async def generate_tokens():
353
+ gen_url = generators[0]
 
354
  try:
355
  worker_health[gen_url]["active_requests"] += 1
356
  request_data_tokens = {**request_data, "return_token_ids": True}
 
368
  if "token_id" in data:
369
  await token_queue.put(data["token_id"])
370
  elif "done" in data:
 
371
  for _ in decoders:
372
  await token_queue.put(None)
373
  break
 
382
  worker_health[gen_url]["role"] = "idle"
383
 
384
  async def decode_tokens(decoder_url: str, decoder_id: int):
 
385
  try:
386
  worker_health[decoder_url]["active_requests"] += 1
387
  batch = []
388
+ batch_size = 2
389
 
390
  while True:
391
  try:
392
  token_id = await asyncio.wait_for(token_queue.get(), timeout=2.0)
393
 
394
  if token_id is None:
 
395
  if batch:
396
  async with httpx.AsyncClient(timeout=10.0) as client:
397
  response = await client.post(
 
407
 
408
  batch.append(token_id)
409
 
 
410
  if len(batch) >= batch_size:
411
  async with httpx.AsyncClient(timeout=10.0) as client:
412
  response = await client.post(
 
429
  worker_health[decoder_url]["active_requests"] -= 1
430
  worker_health[decoder_url]["role"] = "idle"
431
 
 
432
  gen_task = asyncio.create_task(generate_tokens())
 
 
433
  decoder_tasks = [
434
  asyncio.create_task(decode_tokens(dec_url, i))
435
  for i, dec_url in enumerate(decoders)
436
  ]
437
 
 
438
  accumulated_text = ""
439
  decoders_done = 0
440
  total_decoders = len(decoders)
 
480
  worker_health[worker_url]["role"] = "idle"
481
 
482
  # ============================================================================
483
+ # API Endpoints
484
  # ============================================================================
485
 
486
  @app.get("/", response_class=HTMLResponse)
487
  async def dashboard():
488
+ """Real-time dashboard with version info"""
489
  return """
490
  <!DOCTYPE html>
491
  <html>
492
  <head>
493
+ <title>SAM-Z-1 Cluster Control v5.0</title>
494
  <style>
495
+ * { margin: 0; padding: 0; box-sizing: border-box; }
 
 
 
 
 
496
  body {
497
  font-family: 'Courier New', monospace;
498
  background: linear-gradient(135deg, #0a0e27 0%, #1a1f3a 100%);
 
501
  overflow-x: hidden;
502
  overflow-y: auto;
503
  }
 
504
  .container {
505
  padding: 20px;
506
  max-width: 1400px;
507
  margin: 0 auto;
508
  padding-bottom: 40px;
509
  }
 
510
  .header {
511
  text-align: center;
512
  margin-bottom: 30px;
 
516
  border-radius: 10px;
517
  box-shadow: 0 0 20px rgba(0, 255, 136, 0.3);
518
  }
 
519
  .header h1 {
520
  font-size: 2.5em;
521
  text-transform: uppercase;
 
523
  text-shadow: 0 0 10px #00ff88;
524
  animation: glow 2s ease-in-out infinite alternate;
525
  }
 
526
  @keyframes glow {
527
  from { text-shadow: 0 0 10px #00ff88, 0 0 20px #00ff88; }
528
  to { text-shadow: 0 0 20px #00ff88, 0 0 30px #00ff88, 0 0 40px #00ff88; }
529
  }
530
+ .version-badge {
531
+ display: inline-block;
532
+ padding: 3px 10px;
533
+ border-radius: 12px;
534
+ font-size: 10px;
535
+ margin-left: 8px;
536
+ font-weight: bold;
537
+ }
538
+ .version-v5 {
539
+ background: rgba(0, 255, 136, 0.2);
540
+ border: 1px solid #00ff88;
541
+ color: #00ff88;
542
+ }
543
+ .version-v4 {
544
+ background: rgba(255, 165, 0, 0.2);
545
+ border: 1px solid #ffa500;
546
+ color: #ffa500;
547
+ }
548
  .status-bar {
549
  display: flex;
550
  gap: 20px;
551
  margin-bottom: 30px;
552
  }
 
553
  .stat-card {
554
  flex: 1;
555
  background: rgba(0, 255, 136, 0.05);
 
559
  position: relative;
560
  overflow: hidden;
561
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  .stat-label {
563
  font-size: 0.8em;
564
  opacity: 0.7;
565
  text-transform: uppercase;
566
  }
 
567
  .stat-value {
568
  font-size: 2em;
569
  font-weight: bold;
570
  margin-top: 5px;
571
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  .workers-grid {
573
  display: grid;
574
  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
575
  gap: 20px;
576
  margin-bottom: 30px;
577
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  .worker-card {
579
  background: rgba(10, 14, 39, 0.8);
580
  border: 2px solid #00ff88;
 
583
  position: relative;
584
  transition: all 0.3s;
585
  }
 
586
  .worker-card:hover {
587
  transform: translateY(-5px);
588
  box-shadow: 0 5px 30px rgba(0, 255, 136, 0.4);
589
  }
 
590
  .worker-card.offline {
591
  border-color: #ff4444;
592
  opacity: 0.6;
593
  }
 
594
  .worker-header {
595
  display: flex;
596
  justify-content: space-between;
597
  align-items: center;
598
  margin-bottom: 15px;
599
  }
 
600
  .worker-name {
601
  font-size: 1.2em;
602
  font-weight: bold;
603
  }
 
604
  .status-dot {
605
  width: 12px;
606
  height: 12px;
607
  border-radius: 50%;
608
  animation: pulse 2s infinite;
609
  }
 
610
  .status-dot.online {
611
  background: #00ff88;
612
  box-shadow: 0 0 10px #00ff88;
613
  }
 
614
  .status-dot.offline {
615
  background: #ff4444;
616
  box-shadow: 0 0 10px #ff4444;
617
  }
 
618
  @keyframes pulse {
619
  0%, 100% { opacity: 1; }
620
  50% { opacity: 0.5; }
621
  }
 
622
  .worker-stats {
623
  display: grid;
624
  grid-template-columns: repeat(2, 1fr);
625
  gap: 10px;
626
  margin-top: 15px;
627
  }
 
628
  .worker-stat {
629
  background: rgba(0, 255, 136, 0.05);
630
  padding: 10px;
631
  border-radius: 5px;
632
  }
 
633
  .worker-stat-label {
634
  font-size: 0.7em;
635
  opacity: 0.7;
636
  }
 
637
  .worker-stat-value {
638
  font-size: 1.3em;
639
  font-weight: bold;
640
  margin-top: 3px;
641
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  .timestamp {
643
  text-align: center;
644
  margin-top: 20px;
645
  opacity: 0.5;
646
  font-size: 0.9em;
647
  }
648
+ @media (max-width: 768px) {
649
+ .workers-grid { grid-template-columns: 1fr; }
650
+ .status-bar { flex-direction: column; }
651
+ }
652
  </style>
653
  </head>
654
  <body>
655
  <div class="container">
656
  <div class="header">
657
  <h1>⚡ SAM-Z-1 CLUSTER ⚡</h1>
658
+ <div>DISTRIBUTED COMPUTE SYSTEM v5.0 • AUTO VERSION DETECTION</div>
659
  </div>
660
 
661
  <div class="status-bar">
662
  <div class="stat-card">
663
  <div class="stat-label">Load Mode</div>
664
  <div class="stat-value" id="mode">--</div>
 
665
  </div>
666
  <div class="stat-card">
667
  <div class="stat-label">Current Load</div>
668
  <div class="stat-value" id="load">0</div>
 
669
  </div>
670
  <div class="stat-card">
671
  <div class="stat-label">Total Requests</div>
 
677
  </div>
678
  </div>
679
 
680
+ <div class="workers-grid" id="workers"></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
 
682
  <div class="timestamp" id="timestamp">Last update: --</div>
683
  </div>
684
 
685
  <script>
 
686
  const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
687
  let ws;
 
688
 
689
  function connectWebSocket() {
690
  try {
691
  ws = new WebSocket(`${protocol}//${window.location.host}/ws`);
692
 
693
+ ws.onopen = () => console.log('✅ WebSocket connected');
694
+ ws.onmessage = (event) => updateDashboard(JSON.parse(event.data));
695
+ ws.onerror = () => console.error('❌ WebSocket error');
696
+ ws.onclose = () => setTimeout(connectWebSocket, 3000);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
  } catch (e) {
698
+ console.error('Failed to connect WebSocket');
 
 
699
  }
700
  }
701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  connectWebSocket();
703
 
704
  function updateDashboard(data) {
 
705
  document.getElementById('mode').textContent = data.mode.toUpperCase();
 
 
 
 
 
706
  document.getElementById('load').textContent = data.load;
707
  document.getElementById('total-req').textContent = data.cluster.total_requests;
708
  document.getElementById('rps').textContent = data.cluster.rps;
 
 
 
709
 
 
710
  const workersDiv = document.getElementById('workers');
 
 
 
711
  workersDiv.innerHTML = data.workers.map(worker => `
712
  <div class="worker-card ${worker.healthy ? '' : 'offline'}">
713
  <div class="worker-header">
714
+ <div>
715
+ <div class="worker-name">${worker.url}</div>
716
+ <span class="version-badge version-${worker.version}">${worker.version.toUpperCase()}</span>
717
+ ${worker.models > 0 ? `<span style="font-size:0.8em;opacity:0.7;margin-left:5px">${worker.models} models</span>` : ''}
718
+ </div>
719
  <div class="status-dot ${worker.healthy ? 'online' : 'offline'}"></div>
720
  </div>
 
721
  <div class="worker-stats">
722
  <div class="worker-stat">
723
  <div class="worker-stat-label">Active</div>
 
732
  <div class="worker-stat-value">${worker.tokens}</div>
733
  </div>
734
  <div class="worker-stat">
735
+ <div class="worker-stat-label">Role</div>
736
+ <div class="worker-stat-value" style="font-size:1em;">${worker.role}</div>
737
  </div>
738
  </div>
 
 
 
739
  </div>
740
  `).join('');
741
 
 
 
742
  document.getElementById('timestamp').textContent =
743
+ `Last update: ${new Date().toLocaleTimeString()}`;
 
 
 
 
 
 
 
744
  }
745
  </script>
746
  </body>
747
  </html>
748
  """
749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  @app.get("/api/status")
751
  async def api_status():
 
752
  mode, load = update_load_mode()
753
  healthy_count = len(get_healthy_workers())
754
 
755
+ # Count v4 vs v5 workers
756
+ v4_count = sum(1 for w in worker_health.values() if w["version"] == "v4")
757
+ v5_count = sum(1 for w in worker_health.values() if w["version"] == "v5")
758
+
759
  return {
760
  "name": "SAM-Z-1 Distributed Cluster",
761
+ "version": "5.0.0",
762
  "mode": mode,
763
  "current_load": load,
764
  "workers": len(WORKER_URLS),
765
  "healthy_workers": healthy_count,
766
+ "v4_workers": v4_count,
767
+ "v5_workers": v5_count,
768
+ "features": [
769
+ "distributed_compute",
770
+ "smart_load_balancing",
771
+ "auto_version_detection",
772
+ "multi_model_support",
773
+ "real_time_dashboard"
774
+ ]
775
  }
776
 
777
  @app.get("/health")
 
782
  "workers_healthy": healthy_count
783
  }
784
 
785
+ @app.get("/models")
786
+ async def list_models():
787
+ """List all available models across all workers"""
788
+ all_models = set()
789
+ for url, status in worker_health.items():
790
+ if status["healthy"] and status["version"] == "v5":
791
+ all_models.update(status["supports_models"])
792
+
793
+ return {
794
+ "models": sorted(list(all_models)),
795
+ "default": "SAM-X-1-Nano" if "SAM-X-1-Nano" in all_models else None
796
+ }
797
+
798
  @app.post("/v1/generate")
799
  async def generate(request: GenerateRequest):
800
+ """Generate text with automatic model routing"""
801
  track_request()
802
  mode, load = update_load_mode()
803
 
804
+ # Get compatible workers
805
+ compatible = get_workers_for_model(request.model)
806
+
807
+ if not compatible:
808
  cluster_stats["failed_requests"] += 1
809
+ raise HTTPException(
810
+ status_code=503,
811
+ detail=f"No workers available for model: {request.model or 'default'}"
812
+ )
813
 
814
  request_data = {
815
  "prompt": request.prompt,
 
821
  "stream": True
822
  }
823
 
824
+ # Add model parameter for v5 workers
825
+ if request.model:
826
+ request_data["model"] = request.model
827
+
828
+ print(f"🎯 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
829
 
830
  try:
831
+ if mode == "light" and len(compatible) >= 2:
832
+ generators, decoders = select_distributed_workers(request.model)
 
833
  if decoders:
834
  cluster_stats["successful_requests"] += 1
835
  return StreamingResponse(
 
837
  media_type="text/event-stream"
838
  )
839
 
840
+ worker = get_least_busy_worker(compatible)
 
841
  cluster_stats["successful_requests"] += 1
842
  return StreamingResponse(
843
  heavy_load_generation(worker, request_data, "generate"),
 
849
 
850
  @app.post("/v1/chat")
851
  async def chat(request: ChatRequest):
852
+ """Chat with automatic model routing"""
853
  track_request()
854
  mode, load = update_load_mode()
855
 
856
+ # Get compatible workers
857
+ compatible = get_workers_for_model(request.model)
858
+
859
+ if not compatible:
860
  cluster_stats["failed_requests"] += 1
861
+ raise HTTPException(
862
+ status_code=503,
863
+ detail=f"No workers available for model: {request.model or 'default'}"
864
+ )
865
 
866
  request_data = {
867
  "messages": [{"role": m.role, "content": m.content} for m in request.messages],
 
873
  "stream": True
874
  }
875
 
876
+ # Add model parameter for v5 workers
877
+ if request.model:
878
+ request_data["model"] = request.model
879
+
880
+ print(f"💬 {mode.upper()} | Load: {load} | Model: {request.model or 'default'} | Workers: {len(compatible)}")
881
 
882
  try:
883
+ if mode == "light" and len(compatible) >= 2:
884
+ generators, decoders = select_distributed_workers(request.model)
 
885
  if decoders:
886
  cluster_stats["successful_requests"] += 1
887
  return StreamingResponse(
 
889
  media_type="text/event-stream"
890
  )
891
 
892
+ worker = get_least_busy_worker(compatible)
 
893
  cluster_stats["successful_requests"] += 1
894
  return StreamingResponse(
895
  heavy_load_generation(worker, request_data, "chat"),