yangtb24 committed on
Commit
9a61cfe
·
verified ·
1 Parent(s): 6c2683b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -349
app.py CHANGED
@@ -1,8 +1,9 @@
1
- from flask import Flask, Response, render_template_string
2
  import requests
3
  import json
4
- from threading import Lock
5
  import time
 
6
 
7
  app = Flask(__name__)
8
 
@@ -225,177 +226,108 @@ htmlTemplate = f"""
225
  <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js" integrity="sha512-yFjZbTYRCJodnuyGlsKamNE/LlEaEA/3uWCGാരി7eIq7jWqVl3J8jL/kof/tfu9Xqzh/y/VM5sJd/tq5iEew==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
226
 
227
  <script>
228
- const username = 'yangtb24';
229
-
230
- async function fetchInstances() {{
231
- try {{
232
- const response = await fetch(`https://huggingface.co/api/spaces?author=${{username}}`);
233
- const userInstances = await response.json();
234
- return userInstances.map(instance => ({{
235
- id: instance.id.split('/')[1],
236
- owner: username
237
- }}));
238
- }} catch (error) {{
239
- console.error("获取实例列表失败:", error);
240
- return [];
241
- }}
242
- }}
243
-
244
- class MetricsManager {{
245
- constructor() {{
246
- this.eventSources = new Map();
247
- this.servers = new Map();
248
- this.instanceOwners = new Map();
249
- this.spaceIds = new Map();
250
- }}
251
-
252
- async connect(instanceId, username) {{
253
- if (this.eventSources.has(instanceId)) return;
254
-
255
- try {{
256
- const eventSource = new EventSource(
257
- `https://api.hf.space/v1/${{username}}/${{instanceId}}/live-metrics/sse`
258
- );
259
-
260
- this.spaceIds.set(instanceId, instanceId);
261
- this.instanceOwners.set(instanceId, username);
262
-
263
- eventSource.addEventListener("metric", (event) => {{
264
- try {{
265
- const data = JSON.parse(event.data);
266
- updateServerCard(data, instanceId);
267
- }} catch (error) {{
268
- console.error(`解析数据失败 (${{instanceId}}):`, error);
269
- }}
270
- }});
271
-
272
- eventSource.onerror = (error) => {{
273
- console.error(`EventSource 错误 (${{instanceId}}):`, error);
274
- eventSource.close();
275
- }};
276
-
277
- this.eventSources.set(instanceId, eventSource);
278
- }} catch (error) {{
279
- console.error(`连接失败 (${{username}}/${{instanceId}}):`, error);
280
- }}
281
- }}
282
-
283
- disconnectAll() {{
284
- this.eventSources.forEach(es => es.close());
285
- this.eventSources.clear();
286
- }}
287
- }}
288
-
289
- const metricsManager = new MetricsManager();
290
- const servers = new Map();
291
-
292
- async function initialize() {{
293
- const instances = await fetchInstances();
294
- instances.forEach(instance => {{
295
- metricsManager.connect(instance.id, instance.owner);
296
- }});
297
- }}
298
-
299
- initialize();
300
 
301
  function updateServerCard(data, spaceId) {{
302
- const serverId = data.replica;
303
- const serverElement = document.getElementById(`server-${{serverId}}`);
304
- const owner = metricsManager.instanceOwners.get(spaceId);
305
-
306
- if (!serverElement) {{
307
- const card = document.createElement('div');
308
- card.id = `server-${{serverId}}`;
309
- card.className = 'server-card';
310
- card.innerHTML = `
311
- <div class="server-header">
312
- <div class="server-name">
313
- <div class="status-dot status-online"></div>
314
- <svg class="server-flag" width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
315
- <path d="M21 3H3C1.9 3 1 3.9 1 5v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-1 5H4V6h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2z"/>
316
- </svg>
317
- <div>${{serverId}} (${{owner}}/${{spaceId}})</div>
318
- </div>
319
- </div>
320
- <div class="metric-grid">
321
- <div class="metric-item">
322
- <div class="metric-label">CPU</div>
323
- <div class="progress-bar-container">
324
- <div class="cpu-progress-bar"></div>
325
  </div>
326
- <div class="metric-value cpu-usage">0%</div>
327
- </div>
328
- <div class="metric-item">
329
- <div class="metric-label">内存</div>
330
- <div class="progress-bar-container">
331
- <div class="memory-progress-bar"></div>
 
 
332
  </div>
333
- <div class="metric-value memory-usage">0%</div>
334
- </div>
335
- <div class="metric-item">
336
- <div class="metric-label">上传</div>
337
- <div class="metric-value upload">0 KB/s</div>
338
- </div>
339
- <div class="metric-item">
340
- <div class="metric-label">下载</div>
341
- <div class="metric-value download">0 KB/s</div>
342
- </div>
343
- </div>
344
- `;
345
- document.getElementById('servers').appendChild(card);
346
- }}
347
-
348
- const card = document.getElementById(`server-${{serverId}}`);
349
- const cpuUsage = data.cpu_usage_pct;
350
- const memoryUsage = (data.memory_used_bytes / data.memory_total_bytes) * 100;
351
- const uploadBps = data.tx_bps;
352
- const downloadBps = data.rx_bps;
353
 
354
- card.querySelector('.cpu-usage').textContent = `${{cpuUsage.toFixed(2)}}%`;
355
- card.querySelector('.cpu-progress-bar').style.width = `${{cpuUsage}}%`;
 
 
 
356
 
357
- card.querySelector('.memory-usage').textContent = `${{memoryUsage.toFixed(2)}}%`;
358
- card.querySelector('.memory-progress-bar').style.width = `${{memoryUsage}}%`;
359
 
360
- card.querySelector('.upload').textContent = `${{formatBytes(uploadBps)}}/s`;
 
361
 
362
- card.querySelector('.download').textContent = `${{formatBytes(downloadBps)}}/s`;
 
363
 
364
- servers.set(serverId, Date.now());
365
- updateSummary();
366
  }}
367
 
368
  function updateSummary() {{
369
- const now = Date.now();
370
- let online = 0;
371
- let offline = 0;
372
- let totalUpload = 0;
373
- let totalDownload = 0;
374
-
375
- servers.forEach((lastSeen, serverId) => {{
376
- const isOnline = (now - lastSeen) < 10000;
377
- const serverCard = document.getElementById(`server-${{serverId}}`);
 
 
 
 
378
  if (serverCard) {{
379
  const statusDot = serverCard.querySelector('.status-dot');
380
  statusDot.className = `status-dot status-${{isOnline ? 'online' : 'offline'}}`;
381
 
382
  if (isOnline) {{
383
- const uploadText = serverCard.querySelector('.upload').textContent;
384
- const downloadText = serverCard.querySelector('.download').textContent;
385
- totalUpload += parseFloat(uploadText) || 0;
386
- totalDownload += parseFloat(downloadText) || 0;
387
  }}
388
  }}
389
  isOnline ? online++ : offline++;
390
- }});
 
391
 
392
- document.getElementById('totalServers').textContent = servers.size;
393
  document.getElementById('onlineServers').textContent = online;
394
  document.getElementById('offlineServers').textContent = offline;
395
- document.getElementById('totalUpload').textContent = `${{formatBytes(totalUpload)}}/s`;
396
- document.getElementById('totalDownload').textContent = `${{formatBytes(totalDownload)}}/s`;
397
  }}
398
 
 
399
  function formatBytes(bytes) {{
400
  if (bytes === 0) return '0 B';
401
  const k = 1024;
@@ -404,214 +336,159 @@ htmlTemplate = f"""
404
  return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
405
  }}
406
 
407
- setInterval(updateSummary, 2000);
408
-
409
- setInterval(async () => {{
410
- metricsManager.disconnectAll();
411
- await initialize();
412
- }}, 300000);
413
 
 
414
  </script>
415
  </body>
416
  </html>
417
  """
418
 
419
-
420
  class MetricsManager:
421
- def __init__(self):
422
- self.event_sources = {}
423
- self.servers = {}
424
- self.instance_owners = {}
425
- self.space_ids = {}
426
- self.lock = Lock() # Use a lock for thread safety
427
-
428
- def connect(self, instance_id, username):
429
- with self.lock:
430
- if instance_id in self.event_sources:
431
- return
432
-
433
- self.space_ids[instance_id] = instance_id
434
- self.instance_owners[instance_id] = username
435
- # In a real deployment, you'd manage the EventSource connections more carefully.
436
- # For simplicity, we're not actually creating them here.
437
- self.event_sources[instance_id] = True # Simulate connection
438
-
439
-
440
- def disconnect_all(self):
441
- with self.lock:
442
- # In a production environment, you would properly close the EventSource connections here.
443
- self.event_sources.clear()
444
- self.servers.clear() # Also clear server data on disconnect
445
- self.instance_owners.clear()
446
- self.space_ids.clear()
447
-
448
-
449
- metrics_manager = MetricsManager()
450
-
451
- def fetch_instances(username):
452
- try:
453
- response = requests.get(f"https://huggingface.co/api/spaces?author={username}")
454
- response.raise_for_status() # Raise an exception for bad status codes
455
- user_instances = response.json()
456
- return [{"id": instance["id"].split("/")[1], "owner": username} for instance in user_instances]
457
- except requests.exceptions.RequestException as e:
458
- print(f"Error fetching instances: {e}")
459
- return []
460
-
461
- def format_bytes(bytes_value):
462
- if bytes_value == 0:
463
- return "0 B"
464
- k = 1024
465
- sizes = ["B", "KB", "MB", "GB", "TB"]
466
- i = 0
467
- while bytes_value >= k and i < len(sizes) -1 :
468
- bytes_value /= k
469
- i += 1
470
- return f"{bytes_value:.2f} {sizes[i]}"
471
-
472
-
473
- def generate_server_card(data, space_id):
474
- server_id = data['replica']
475
- owner = metrics_manager.instance_owners.get(space_id, "Unknown")
476
-
477
- cpu_usage = data.get('cpu_usage_pct', 0)
478
- memory_used = data.get('memory_used_bytes', 1) # Default to 1 to avoid division by zero
479
- memory_total = data.get('memory_total_bytes', 1)
480
- memory_usage = (memory_used / memory_total) * 100 if memory_total else 0
481
- upload_bps = data.get('tx_bps', 0)
482
- download_bps = data.get('rx_bps', 0)
483
-
484
- with metrics_manager.lock:
485
- metrics_manager.servers[server_id] = time.time()
486
-
487
- return f"""
488
- <div class="server-card" id="server-{server_id}">
489
- <div class="server-header">
490
- <div class="server-name">
491
- <div class="status-dot status-online"></div>
492
- <svg class="server-flag" width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
493
- <path d="M21 3H3C1.9 3 1 3.9 1 5v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-1 5H4V6h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2z"/>
494
- </svg>
495
- <div>{server_id} ({owner}/{space_id})</div>
496
- </div>
497
- </div>
498
- <div class="metric-grid">
499
- <div class="metric-item">
500
- <div class="metric-label">CPU</div>
501
- <div class="progress-bar-container">
502
- <div class="cpu-progress-bar" style="width:{cpu_usage:.2f}%"></div>
503
- </div>
504
- <div class="metric-value cpu-usage">{cpu_usage:.2f}%</div>
505
- </div>
506
- <div class="metric-item">
507
- <div class="metric-label">内存</div>
508
- <div class="progress-bar-container">
509
- <div class="memory-progress-bar" style="width:{memory_usage:.2f}%"></div>
510
- </div>
511
- <div class="metric-value memory-usage">{memory_usage:.2f}%</div>
512
- </div>
513
- <div class="metric-item">
514
- <div class="metric-label">上传</div>
515
- <div class="metric-value upload">{format_bytes(upload_bps)}/s</div>
516
- </div>
517
- <div class="metric-item">
518
- <div class="metric-label">下载</div>
519
- <div class="metric-value download">{format_bytes(download_bps)}/s</div>
520
- </div>
521
- </div>
522
- </div>
523
- """
524
-
525
- def update_summary():
526
- now = time.time()
527
- online = 0
528
- offline = 0
529
- total_upload = 0
530
- total_download = 0
531
-
532
- with metrics_manager.lock:
533
- for server_id, last_seen in metrics_manager.servers.items():
534
- is_online = (now - last_seen) < 10
535
- if is_online:
536
- online += 1
537
- else:
538
- offline += 1
539
-
540
- return online, offline, total_upload, total_download
541
-
542
-
543
- @app.route("/")
 
 
 
 
 
 
 
 
 
 
544
  def home():
545
- username = 'yangtb24'
546
- instances = fetch_instances(username)
547
- for instance in instances:
548
- metrics_manager.connect(instance['id'], instance['owner'])
549
-
550
- server_cards_html = ""
551
- # We're not getting live updates, so just create static cards
552
- for instance in instances:
553
- # Simulate some data. In a real application, this would come from the EventSource.
554
- simulated_data = {
555
- 'replica': f"{instance['id']}-replica",
556
- 'cpu_usage_pct': 10.5, # Example value
557
- 'memory_used_bytes': 1024 * 1024 * 200, # 200MB
558
- 'memory_total_bytes': 1024 * 1024 * 1024, # 1GB
559
- 'tx_bps': 50000,
560
- 'rx_bps': 25000,
561
- }
562
- server_cards_html += generate_server_card(simulated_data, instance['id'])
563
-
564
- online, offline, total_upload, total_download = update_summary()
565
- total_servers = len(metrics_manager.servers)
566
-
567
- # Inject dynamic data into the HTML template
568
- rendered_html = htmlTemplate.replace(
569
- '<!-- 服务器卡片将在这里动态生成 -->', server_cards_html
570
- ).replace(
571
- '<div>总实例数: <span id="totalServers">0</span></div>',
572
- f'<div>总实例数: <span id="totalServers">{total_servers}</span></div>'
573
- ).replace(
574
- '<div>在线实例: <span id="onlineServers">0</span></div>',
575
- f'<div>在线实例: <span id="onlineServers">{online}</span></div>'
576
- ).replace(
577
- '<div>离线实例: <span id="offlineServers">0</span></div>',
578
- f'<div>离线实例: <span id="offlineServers">{offline}</span></div>'
579
- ).replace(
580
- '<div>总上传: <span id="totalUpload">0 B/s</span></div>',
581
- f'<div>总上传: <span id="totalUpload">{format_bytes(total_upload)}/s</span></div>'
582
- ).replace(
583
- '<div>总下载: <span id="totalDownload">0 B/s</span></div>',
584
- f'<div>总下载: <span id="totalDownload">{format_bytes(total_download)}/s</span></div>'
585
- )
586
-
587
-
588
- return render_template_string(rendered_html)
589
-
590
-
591
- @app.route('/metrics/<username>/<instance_id>')
592
- def metrics(username, instance_id):
593
- # This route is where you would handle real-time metrics updates using Server-Sent Events.
594
- # This is a simplified example and does not include the actual SSE implementation.
595
- def generate():
596
- while True:
597
- # Simulate receiving data
598
- data = {
599
- 'replica': f"{instance_id}-replica", # Correctly format replica ID
600
- 'cpu_usage_pct': 25.5,
601
- 'memory_used_bytes': 536870912, # 512MB
602
- 'memory_total_bytes': 1073741824, # 1GB
603
- 'tx_bps': 120000,
604
- 'rx_bps': 60000,
605
- }
606
-
607
- # Format the data as an SSE event
608
- yield f"data: {json.dumps(data)}\n\n"
609
- time.sleep(2) # Send updates every 2 seconds
610
-
611
- return Response(generate(), mimetype='text/event-stream')
612
-
613
-
614
- if __name__ == "__main__":
615
- app.run(debug=True, port=7860, host="0.0.0.0") # Make the server externally visible
616
 
 
 
617
 
 
1
+ from flask import Flask, render_template_string
2
  import requests
3
  import json
4
+ from threading import Thread, Lock
5
  import time
6
+ from datetime import datetime, timedelta
7
 
8
  app = Flask(__name__)
9
 
 
226
  <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js" integrity="sha512-yFjZbTYRCJodnuyGlsKamNE/LlEaEA/3uWCGാരി7eIq7jWqVl3J8jL/kof/tfu9Xqzh/y/VM5sJd/tq5iEew==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
227
 
228
  <script>
229
+ const serversData = {{ servers_data|tojson }};
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  function updateServerCard(data, spaceId) {{
232
+ const serverId = data.replica;
233
+ const serverElement = document.getElementById(`server-${serverId}`);
234
+ const owner = data.owner;
235
+
236
+ if (!serverElement) {{
237
+ const card = document.createElement('div');
238
+ card.id = `server-${serverId}`;
239
+ card.className = 'server-card';
240
+ card.innerHTML = `
241
+ <div class="server-header">
242
+ <div class="server-name">
243
+ <div class="status-dot status-online"></div>
244
+ <svg class="server-flag" width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
245
+ <path d="M21 3H3C1.9 3 1 3.9 1 5v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-1 5H4V6h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2zm1 4H3c-1.1 0-2 .9-2 2v3c0 1.1.9 2 2 2h18c1.1 0 2-.9 2-2v-3c0-1.1-.9-2-2-2zm-1 5H4v-2h16v2z"/>
246
+ </svg>
247
+ <div>${{serverId}} (${{owner}}/${{spaceId}})</div>
 
 
 
 
 
 
 
248
  </div>
249
+ </div>
250
+ <div class="metric-grid">
251
+ <div class="metric-item">
252
+ <div class="metric-label">CPU</div>
253
+ <div class="progress-bar-container">
254
+ <div class="cpu-progress-bar"></div>
255
+ </div>
256
+ <div class="metric-value cpu-usage">0%</div>
257
  </div>
258
+ <div class="metric-item">
259
+ <div class="metric-label">内存</div>
260
+ <div class="progress-bar-container">
261
+ <div class="memory-progress-bar"></div>
262
+ </div>
263
+ <div class="metric-value memory-usage">0%</div>
264
+ </div>
265
+ <div class="metric-item">
266
+ <div class="metric-label">上传</div>
267
+ <div class="metric-value upload">0 KB/s</div>
268
+ </div>
269
+ <div class="metric-item">
270
+ <div class="metric-label">下载</div>
271
+ <div class="metric-value download">0 KB/s</div>
272
+ </div>
273
+ </div>
274
+ `;
275
+ document.getElementById('servers').appendChild(card);
276
+ }}
 
277
 
278
+ const card = document.getElementById(`server-${serverId}`);
279
+ const cpuUsage = data.cpu_usage_pct;
280
+ const memoryUsage = (data.memory_used_bytes / data.memory_total_bytes) * 100;
281
+ const uploadBps = data.tx_bps;
282
+ const downloadBps = data.rx_bps;
283
 
284
+ card.querySelector('.cpu-usage').textContent = `${cpuUsage.toFixed(2)}%`;
285
+ card.querySelector('.cpu-progress-bar').style.width = `${cpuUsage}%`;
286
 
287
+ card.querySelector('.memory-usage').textContent = `${memoryUsage.toFixed(2)}%`;
288
+ card.querySelector('.memory-progress-bar').style.width = `${memoryUsage}%`;
289
 
290
+ card.querySelector('.upload').textContent = `${formatBytes(uploadBps)}/s`;
291
+ card.querySelector('.download').textContent = `${formatBytes(downloadBps)}/s`;
292
 
293
+ updateSummary();
 
294
  }}
295
 
296
  function updateSummary() {{
297
+ let online = 0;
298
+ let offline = 0;
299
+ let totalUpload = 0;
300
+ let totalDownload = 0;
301
+
302
+ for (const serverId in serversData) {{
303
+ const serverData = serversData[serverId];
304
+
305
+ if (!serverData) continue; // Skip if serverData is null
306
+
307
+ const isOnline = serverData.status === 'online';
308
+ const serverCard = document.getElementById(`server-${serverId}`);
309
+
310
  if (serverCard) {{
311
  const statusDot = serverCard.querySelector('.status-dot');
312
  statusDot.className = `status-dot status-${{isOnline ? 'online' : 'offline'}}`;
313
 
314
  if (isOnline) {{
315
+ totalUpload += serverData.tx_bps;
316
+ totalDownload += serverData.rx_bps;
 
 
317
  }}
318
  }}
319
  isOnline ? online++ : offline++;
320
+ }}
321
+
322
 
323
+ document.getElementById('totalServers').textContent = Object.keys(serversData).length;
324
  document.getElementById('onlineServers').textContent = online;
325
  document.getElementById('offlineServers').textContent = offline;
326
+ document.getElementById('totalUpload').textContent = `${formatBytes(totalUpload)}/s`;
327
+ document.getElementById('totalDownload').textContent = `${formatBytes(totalDownload)}/s`;
328
  }}
329
 
330
+
331
  function formatBytes(bytes) {{
332
  if (bytes === 0) return '0 B';
333
  const k = 1024;
 
336
  return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
337
  }}
338
 
339
+ // Initial update with existing data
340
+ for (const spaceId in serversData) {{
341
+ for(const replicaId in serversData[spaceId]){{
342
+ updateServerCard(serversData[spaceId][replicaId], spaceId);
343
+ }}
344
+ }}
345
 
346
+ // No need for setInterval here; data is updated via Flask
347
  </script>
348
  </body>
349
  </html>
350
  """
351
 
 
class MetricsManager:
    """Polls live metrics for every Hugging Face Space owned by one user.

    State is shared between the background polling thread (see ``start``)
    and whoever reads ``servers_data``; every access to the shared
    dictionaries must happen while holding ``data_lock``.

    Attributes:
        username: Hugging Face account whose Spaces are monitored.
        servers_data: ``{space_id: {replica_id: metrics_dict}}``.
        data_lock: non-reentrant lock guarding the shared dictionaries.
        last_fetch_time: ``{space_id: time.monotonic() of the last sample}``.
        instance_ids: Space ids seen in the most recent listings.
    """

    # Seconds without a fresh sample after which a Space is flagged offline.
    STALE_AFTER_SECONDS = 10
    # Connect/read timeout (seconds) applied to every outgoing HTTP request.
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self, username):
        self.username = username
        self.servers_data = {}  # Store all server data
        self.data_lock = Lock()  # Lock for thread-safe updates
        self.last_fetch_time = {}  # Track last fetch time for each instance
        self.instance_ids = set()  # Store instance ids

    def fetch_instances(self):
        """Return ``[{'id': ..., 'owner': ...}]`` for the user's Spaces.

        Spaces that disappeared since the previous call are dropped from the
        stored state. Any network or parsing problem is logged and results
        in an empty list, so the polling loop keeps running.
        """
        try:
            response = requests.get(
                f"https://huggingface.co/api/spaces?author={self.username}",
                timeout=self.REQUEST_TIMEOUT_SECONDS,  # never hang the poller
            )
            response.raise_for_status()  # Raise an exception for bad status codes
            user_instances = response.json()
            new_instance_ids = {instance['id'].split('/')[1] for instance in user_instances}
        except requests.RequestException as e:
            print(f"Error fetching instances: {e}")
            return []
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError also covers json.JSONDecodeError.
            print(f"Error parsing instance data: {e}")
            return []

        # Forget Spaces that are no longer listed for this user.
        for instance_id in self.instance_ids - new_instance_ids:
            self.remove_instance(instance_id)
            print(f"Instance removed: {instance_id}")

        self.instance_ids.update(new_instance_ids)
        # Sorted so Spaces are polled in a stable order.
        return [{'id': instance_id, 'owner': self.username} for instance_id in sorted(new_instance_ids)]

    def remove_instance(self, instance_id):
        """Remove an instance and all data stored for it."""
        with self.data_lock:
            self.servers_data.pop(instance_id, None)
            self.last_fetch_time.pop(instance_id, None)
            self.instance_ids.discard(instance_id)

    @staticmethod
    def _iter_metric_payloads(lines):
        """Yield the ``data`` payload of every ``metric`` event in an SSE stream.

        ``lines`` is an iterable of ``bytes`` or ``str`` lines. In the SSE
        wire format the ``event:`` and ``data:`` fields arrive on separate
        lines and a blank line terminates the event, so the current event
        name has to be remembered across lines.
        """
        event_name = None
        for raw_line in lines:
            if not raw_line:
                event_name = None  # blank line: end of the current event
                continue
            if isinstance(raw_line, bytes):
                line = raw_line.decode('utf-8', errors='replace')
            else:
                line = raw_line
            field, _, value = line.partition(':')
            if value.startswith(' '):
                value = value[1:]  # SSE strips exactly one leading space
            if field == 'event':
                event_name = value
            elif field == 'data' and event_name == 'metric':
                yield value

    def fetch_metrics(self, instance_id, owner):
        """Fetch one metrics sample for a single instance.

        The live-metrics endpoint is an endless SSE stream; the method
        returns after the first successfully parsed ``metric`` event so the
        polling loop can move on to the next Space.

        NOTE(review): a Space with several replicas contributes only the
        replica whose sample arrives first on each poll.
        """
        url = f"https://api.hf.space/v1/{owner}/{instance_id}/live-metrics/sse"
        try:
            # The context manager closes the streamed connection on exit.
            with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT_SECONDS) as response:
                response.raise_for_status()
                for payload in self._iter_metric_payloads(response.iter_lines()):
                    try:
                        data = json.loads(payload)
                    except json.JSONDecodeError as e:
                        print(f"Error parsing metric data for {instance_id}: {e}, Line: {payload}")
                        continue  # Continue to the next event
                    self.update_server_data(data, instance_id, owner)
                    return
        except requests.exceptions.RequestException as e:
            print(f"Error fetching metrics for {instance_id}: {e}")
            self.mark_offline(instance_id)  # Mark as offline
        except Exception as e:
            # Last-resort guard so one bad Space cannot kill the polling thread.
            print(f"An unexpected error occurred for {instance_id}: {e}")
            self.mark_offline(instance_id)

    def update_server_data(self, data, space_id, owner):
        """Store the latest sample of one replica in a thread-safe manner.

        Raises:
            KeyError: if ``data`` lacks one of the expected metric fields.
        """
        replica_id = data['replica']
        snapshot = {
            'replica': replica_id,
            'owner': owner,
            'cpu_usage_pct': data['cpu_usage_pct'],
            'memory_used_bytes': data['memory_used_bytes'],
            'memory_total_bytes': data['memory_total_bytes'],
            'tx_bps': data['tx_bps'],
            'rx_bps': data['rx_bps'],
            'status': 'online',  # Mark as online when data is received
        }
        with self.data_lock:
            self.servers_data.setdefault(space_id, {})[replica_id] = snapshot
            # Monotonic clock: immune to wall-clock adjustments.
            self.last_fetch_time[space_id] = time.monotonic()

    def _mark_offline_locked(self, instance_id):
        """Flag every replica of ``instance_id`` offline; caller holds ``data_lock``."""
        for replica in self.servers_data.get(instance_id, {}).values():
            if replica:  # Skip empty/None placeholders
                replica['status'] = 'offline'

    def mark_offline(self, instance_id):
        """Mark an instance as offline."""
        with self.data_lock:
            self._mark_offline_locked(instance_id)

    def check_timeouts(self):
        """Mark instances offline that have not produced a sample recently."""
        now = time.monotonic()
        with self.data_lock:
            for instance_id, last_seen in self.last_fetch_time.items():
                if now - last_seen > self.STALE_AFTER_SECONDS:
                    # data_lock is not reentrant, so use the lock-free helper
                    # instead of mark_offline() to avoid a self-deadlock.
                    self._mark_offline_locked(instance_id)

    def run_fetch(self):
        """Fetch metrics for all instances in an endless loop."""
        while True:
            instances = self.fetch_instances()
            for instance in instances:
                self.fetch_metrics(instance['id'], instance['owner'])
            self.check_timeouts()
            time.sleep(2)  # Fetch every 2 seconds

    def start(self):
        """Start the data fetching in a separate daemon thread."""
        # Daemon thread: allow the program to exit even while it is running.
        thread = Thread(target=self.run_fetch, daemon=True)
        thread.start()
479
+
480
+ # --- Flask App Setup ---
481
+ username = 'yangtb24' # Replace with your Hugging Face username
482
+ metrics_manager = MetricsManager(username)
483
+ metrics_manager.start() # Start the data collection
484
+
485
@app.route('/')
def home():
    """Render the dashboard from a snapshot of the collected metrics."""
    with metrics_manager.data_lock:
        # Copy two levels deep: dict.copy() alone would still share the
        # per-replica dicts with the polling thread, which may mutate them
        # while the template is being rendered.
        servers_data_copy = {
            space_id: {
                replica_id: dict(metrics or {})
                for replica_id, metrics in replicas.items()
            }
            for space_id, replicas in metrics_manager.servers_data.items()
        }
    # Render outside the lock so a slow render never stalls the poller.
    return render_template_string(htmlTemplate, servers_data=servers_data_copy)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
 
492
if __name__ == '__main__':
    # debug stays off: the server listens on every interface, and Flask's
    # debug mode would expose the interactive debugger to anyone who can
    # reach it. Its reloader would also import this module twice, starting
    # a second metrics polling thread.
    app.run(debug=False, host='0.0.0.0', port=7860)
494