jkottu Claude Opus 4.5 committed on
Commit a18ef33 · 1 Parent(s): 54ccf76

Implement full GPU/Rank monitoring dashboard


Features:
- GPU/Rank Status: Per-GPU memory, utilization, temperature, power, TP rank
- Inference Metrics: tokens/sec, batch size, KV cache, TTFT, request queues
- System Metrics: CPU usage, RAM usage
- Test Inference: Send prompts and measure latency
- Auto-refresh every 3 seconds
- Demo mode for HF Spaces (simulated GPU data)
- Real metrics when running locally with vLLM + GPUs

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
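The Test Inference path is a wall-clock timer around a single chat completion call. A minimal standalone sketch of that flow, assuming `huggingface_hub` is installed and an `HF_TOKEN` is available (the prompt and printout here are illustrative, not part of the committed app):

```python
# Probe the same HF Inference API path the dashboard's Test Inference tab uses.
import os
import time

from huggingface_hub import InferenceClient

client = InferenceClient(token=os.getenv("HF_TOKEN") or None)

start = time.time()
response = client.chat_completion(
    messages=[{"role": "user", "content": "Explain KV cache in one sentence."}],
    model="mistralai/Mistral-7B-Instruct-v0.2",  # default model in app.py
    max_tokens=100,
)
latency_ms = (time.time() - start) * 1000  # the latency figure the dashboard reports

print(f"latency: {latency_ms:.1f} ms")
print(response.choices[0].message.content)
```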

Files changed (2)
  1. app.py +489 -293
  2. requirements.txt +2 -0
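On the local path, the inference metrics in this change come from vLLM's Prometheus endpoint rather than the HF API. A minimal sketch of that scrape, assuming a vLLM OpenAI-compatible server on localhost:8000 (the gauge names below are the ones app.py reads):

```python
# Fetch vLLM's Prometheus metrics and pick out the gauges the dashboard displays.
import requests

resp = requests.get("http://localhost:8000/metrics", timeout=5)
metrics = {}
for line in resp.text.splitlines():
    if not line or line.startswith("#") or " " not in line:
        continue  # skip comments, blanks, and malformed lines
    name_part, value = line.rsplit(" ", 1)
    try:
        # label sets are collapsed to the last value seen, same as app.py's parser
        metrics[name_part.split("{")[0]] = float(value)
    except ValueError:
        pass

print("running requests:", metrics.get("vllm:num_requests_running", 0))
print("waiting requests:", metrics.get("vllm:num_requests_waiting", 0))
print("KV cache %:", metrics.get("vllm:gpu_cache_usage_perc", 0) * 100)
```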
app.py CHANGED
@@ -1,12 +1,16 @@
  """
- LLM Inference Dashboard - Works on HF Spaces and locally with vLLM
  """

  import time
  import logging
  import os
  import requests
  from datetime import datetime

  import gradio as gr
  import pandas as pd
@@ -14,37 +18,170 @@ import pandas as pd
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Detect if running on HuggingFace Spaces
  IS_HF_SPACE = os.getenv("SPACE_ID") is not None

- # vLLM server configuration (for local use)
  VLLM_HOST = os.getenv("VLLM_HOST", "localhost")
  VLLM_PORT = os.getenv("VLLM_PORT", "8000")
  VLLM_URL = f"http://{VLLM_HOST}:{VLLM_PORT}"

- # HuggingFace Inference API (for HF Spaces)
- HF_TOKEN = os.getenv("HF_TOKEN", "")
- HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2" # Popular free model
-
- # Initialize HF client if on Spaces
  hf_client = None
  if IS_HF_SPACE:
      try:
          from huggingface_hub import InferenceClient
-         if HF_TOKEN:
-             hf_client = InferenceClient(token=HF_TOKEN)
-         else:
-             hf_client = InferenceClient()
      except ImportError:
-         logger.warning("huggingface_hub not installed")

  START_TIME = time.time()
- METRICS_HISTORY = {"tokens_per_sec": [], "latency": [], "timestamps": []}
  TOTAL_REQUESTS = 0
  TOTAL_TOKENS = 0


- def check_vllm_connection():
      """Check if vLLM server is running."""
      if IS_HF_SPACE:
          return False
@@ -55,61 +192,65 @@ def check_vllm_connection():
55
  return False
56
 
57
 
58
- def get_vllm_metrics():
59
  """Fetch metrics from vLLM Prometheus endpoint."""
60
  try:
61
  resp = requests.get(f"{VLLM_URL}/metrics", timeout=5)
62
  if resp.status_code == 200:
63
- return parse_prometheus(resp.text)
64
- except Exception as e:
65
- logger.debug(f"Metrics fetch failed: {e}")
66
  return None
67
 
68
 
69
- def parse_prometheus(text):
70
- """Parse Prometheus metrics text."""
71
- metrics = {}
72
- for line in text.strip().split("\n"):
73
- if line.startswith("#") or not line.strip():
74
- continue
75
- try:
76
- if " " in line:
77
- name_part, value = line.rsplit(" ", 1)
78
- name = name_part.split("{")[0]
79
- metrics[name] = float(value)
80
- except:
81
- pass
82
- return metrics
84
 
85
- def get_model_info():
86
- """Get model info."""
87
- if IS_HF_SPACE:
88
- return HF_MODEL
89
- try:
90
- resp = requests.get(f"{VLLM_URL}/v1/models", timeout=5)
91
- if resp.status_code == 200:
92
- data = resp.json()
93
- if data.get("data"):
94
- return data["data"][0].get("id", "Unknown")
95
- except:
96
- pass
97
- return "Not connected"
98
 
 
 
 
99
 
100
- def send_hf_inference(prompt, max_tokens=100):
101
- """Send inference request to HuggingFace API."""
102
- global TOTAL_REQUESTS, TOTAL_TOKENS
103
 
104
  if hf_client is None:
105
- return {"success": False, "error": "HuggingFace client not initialized. Check huggingface_hub installation."}
106
 
107
  try:
108
  start = time.time()
109
 
110
- # Use chat_completion for conversational models
111
  messages = [{"role": "user", "content": prompt}]
112
-
113
  response = hf_client.chat_completion(
114
  messages=messages,
115
  model=HF_MODEL,
@@ -117,15 +258,14 @@ def send_hf_inference(prompt, max_tokens=100):
117
  )
118
 
119
  latency = (time.time() - start) * 1000
120
-
121
  output = response.choices[0].message.content
122
 
123
- # Get token counts
124
  prompt_tokens = len(prompt) // 4
125
  completion_tokens = len(output) // 4
126
 
127
  TOTAL_REQUESTS += 1
128
  TOTAL_TOKENS += completion_tokens
 
129
 
130
  return {
131
  "success": True,
@@ -135,17 +275,12 @@ def send_hf_inference(prompt, max_tokens=100):
135
  "completion_tokens": completion_tokens,
136
  }
137
  except Exception as e:
138
- error_msg = str(e)
139
- if "401" in error_msg:
140
- return {"success": False, "error": "Invalid HF_TOKEN. Add it in Space Settings > Secrets."}
141
- elif "503" in error_msg or "loading" in error_msg.lower():
142
- return {"success": False, "error": "Model is loading, please wait 20-30 seconds and try again..."}
143
- return {"success": False, "error": f"Error: {error_msg}"}
144
 
145
 
146
- def send_vllm_prompt(prompt, max_tokens=100):
147
- """Send a test prompt to vLLM and measure latency."""
148
- global TOTAL_REQUESTS, TOTAL_TOKENS
149
 
150
  try:
151
  start = time.time()
@@ -168,6 +303,7 @@ def send_vllm_prompt(prompt, max_tokens=100):
168
 
169
  TOTAL_REQUESTS += 1
170
  TOTAL_TOKENS += usage.get("completion_tokens", 0)
 
171
 
172
  return {
173
  "success": True,
@@ -181,329 +317,389 @@ def send_vllm_prompt(prompt, max_tokens=100):
181
  return {"success": False, "error": "Unknown error"}
182
 
183
 
184
- def refresh_metrics():
185
- """Refresh all metrics."""
186
- global METRICS_HISTORY
187
-
188
- elapsed = time.time() - START_TIME
189
- now = datetime.now().strftime("%H:%M:%S")
190
 
191
  if IS_HF_SPACE:
192
- # HF Spaces mode - show simulated/tracked metrics
193
- tokens_per_sec = TOTAL_TOKENS / elapsed if elapsed > 0 else 0
194
-
195
- METRICS_HISTORY["tokens_per_sec"].append(round(tokens_per_sec, 1))
196
- METRICS_HISTORY["timestamps"].append(now)
197
-
198
- # Keep last 20 points
199
- if len(METRICS_HISTORY["tokens_per_sec"]) > 20:
200
- METRICS_HISTORY["tokens_per_sec"] = METRICS_HISTORY["tokens_per_sec"][-20:]
201
- METRICS_HISTORY["timestamps"] = METRICS_HISTORY["timestamps"][-20:]
202
-
203
- history_df = pd.DataFrame({
204
- "Time": METRICS_HISTORY["timestamps"],
205
- "Tokens/s": METRICS_HISTORY["tokens_per_sec"],
206
- })
207
 
 
208
  return (
209
- "HF Inference API",
210
- HF_MODEL,
211
- round(tokens_per_sec, 1),
212
- TOTAL_REQUESTS,
213
- 0, # No queue on HF API
214
- 0, # No KV cache info
215
- 0,
216
- TOTAL_TOKENS,
217
- history_df,
218
  )
219
 
220
- # Local vLLM mode
221
- connected = check_vllm_connection()
222
 
223
- if not connected:
224
- return (
225
- "Disconnected",
226
- "Start vLLM server first",
227
- 0, 0, 0, 0, 0, 0,
228
- pd.DataFrame({"Time": [], "Tokens/s": []}),
229
- )
230
 
231
- model = get_model_info()
232
- metrics = get_vllm_metrics()
 
 
 
233
 
234
- if metrics:
235
- running = metrics.get("vllm:num_requests_running", 0)
236
- waiting = metrics.get("vllm:num_requests_waiting", 0)
237
- gpu_cache = metrics.get("vllm:gpu_cache_usage_perc", 0) * 100
238
- prompt_tokens = metrics.get("vllm:prompt_tokens_total", 0)
239
- gen_tokens = metrics.get("vllm:generation_tokens_total", 0)
240
 
241
- tokens_per_sec = gen_tokens / elapsed if elapsed > 0 else 0
 
 
 
 
242
 
243
- METRICS_HISTORY["tokens_per_sec"].append(tokens_per_sec)
244
- METRICS_HISTORY["timestamps"].append(now)
 
 
 
 
 
 
245
 
246
- if len(METRICS_HISTORY["tokens_per_sec"]) > 20:
247
- METRICS_HISTORY["tokens_per_sec"] = METRICS_HISTORY["tokens_per_sec"][-20:]
248
- METRICS_HISTORY["timestamps"] = METRICS_HISTORY["timestamps"][-20:]
249
 
250
- history_df = pd.DataFrame({
251
- "Time": METRICS_HISTORY["timestamps"],
252
- "Tokens/s": [round(t, 1) for t in METRICS_HISTORY["tokens_per_sec"]],
253
- })
254
 
255
- return (
256
- "Connected",
257
- model,
258
- round(tokens_per_sec, 1),
259
- int(running),
260
- int(waiting),
261
- round(gpu_cache, 1),
262
- int(prompt_tokens),
263
- int(gen_tokens),
264
- history_df,
265
- )
266
 
267
  return (
268
- "Connected (no metrics)",
269
  model,
270
- 0, 0, 0, 0, 0, 0,
271
- pd.DataFrame({"Time": [], "Tokens/s": []}),
 
 
 
 
 
 
 
 
272
  )
273
 
274
 
275
- def run_inference(prompt, max_tokens):
276
- """Run inference and return results."""
277
- if not prompt.strip():
278
- return "Please enter a prompt", "", 0, 0, 0
 
 
 
 
279
 
280
- # Choose backend based on environment
281
- if IS_HF_SPACE:
282
- result = send_hf_inference(prompt, int(max_tokens))
283
- else:
284
- result = send_vllm_prompt(prompt, int(max_tokens))
285
 
286
- if result["success"]:
287
- # Update metrics history with latency
288
- METRICS_HISTORY["latency"].append(result["latency_ms"])
289
- if len(METRICS_HISTORY["latency"]) > 20:
290
- METRICS_HISTORY["latency"] = METRICS_HISTORY["latency"][-20:]
291
 
292
- return (
293
- "Success",
294
- result["output"],
295
- round(result["latency_ms"], 1),
296
- result["prompt_tokens"],
297
- result["completion_tokens"],
298
- )
299
- else:
300
- return (
301
- f"Error: {result.get('error', 'Unknown')}",
302
- "",
303
- 0, 0, 0,
304
- )
305
 
 
 
306
 
307
- # Build the dashboard
308
- with gr.Blocks(title="LLM Inference Dashboard") as demo:
309
- gr.Markdown("# LLM Inference Dashboard")
 
 
 
310
 
311
- if IS_HF_SPACE:
312
- gr.Markdown("*Running on HuggingFace Spaces with HF Inference API*")
313
- else:
314
- gr.Markdown("*Real-time monitoring for vLLM inference*")
 
 
 
 
 
 
 
315
 
316
- with gr.Row():
317
- status = gr.Textbox(value="Checking...", label="Status", interactive=False)
318
- model_name = gr.Textbox(value="", label="Model", interactive=False)
319
- refresh_btn = gr.Button("Refresh Metrics", variant="primary")
320
 
321
- with gr.Tabs():
322
- # Tab 1: Test Inference
323
- with gr.Tab("Test Inference"):
324
- gr.Markdown("### Send a prompt to the model")
 
325
 
326
- if IS_HF_SPACE:
327
- gr.Markdown("*Using HuggingFace Inference API (free, may have cold start delay)*")
 
 
 
328
 
329
  with gr.Row():
330
- prompt_input = gr.Textbox(
331
- label="Prompt",
332
- placeholder="Enter your prompt here...",
333
- lines=3,
334
- value="Explain quantum computing in simple terms."
335
- )
336
- max_tokens_input = gr.Slider(
337
- minimum=10, maximum=500, value=100,
338
- label="Max Tokens"
339
- )
340
 
341
- send_btn = gr.Button("Send Prompt", variant="primary")
 
 
 
342
 
343
  with gr.Row():
344
- inference_status = gr.Textbox(label="Status", interactive=False)
345
- latency_output = gr.Number(label="Latency (ms)", interactive=False)
 
 
346
 
347
  with gr.Row():
348
- prompt_tokens_out = gr.Number(label="Prompt Tokens", interactive=False)
349
- completion_tokens_out = gr.Number(label="Completion Tokens", interactive=False)
350
 
351
- response_output = gr.Textbox(label="Response", lines=10, interactive=False)
 
 
 
352
 
353
- send_btn.click(
354
- fn=run_inference,
355
- inputs=[prompt_input, max_tokens_input],
356
- outputs=[inference_status, response_output, latency_output, prompt_tokens_out, completion_tokens_out],
 
 
357
  )
358
 
359
- # Tab 2: Live Metrics
360
- with gr.Tab("Live Metrics"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  if IS_HF_SPACE:
362
- gr.Markdown("### Session Metrics (HF Inference API)")
363
- gr.Markdown("*Note: Full vLLM metrics available when running locally*")
364
- else:
365
- gr.Markdown("### vLLM Server Metrics")
366
 
367
  with gr.Row():
368
- tokens_per_sec = gr.Number(label="Avg Tokens/sec", value=0, interactive=False)
369
- requests_running = gr.Number(label="Total Requests" if IS_HF_SPACE else "Running Requests", value=0, interactive=False)
370
- requests_waiting = gr.Number(label="Waiting Requests", value=0, interactive=False, visible=not IS_HF_SPACE)
371
- kv_cache_usage = gr.Number(label="KV Cache %", value=0, interactive=False, visible=not IS_HF_SPACE)
 
 
 
 
 
372
 
373
  with gr.Row():
374
- total_prompt_tokens = gr.Number(label="Total Prompt Tokens", value=0, interactive=False, visible=not IS_HF_SPACE)
375
- total_gen_tokens = gr.Number(label="Total Generated Tokens", value=0, interactive=False)
376
 
377
- metrics_history = gr.Dataframe(
378
- value=pd.DataFrame({"Time": [], "Tokens/s": []}),
379
- label="Metrics History",
380
- interactive=False,
 
 
 
 
 
 
 
381
  )
382
 
383
- # Tab 3: Setup Guide
384
- with gr.Tab("Setup Guide"):
 
 
385
  if IS_HF_SPACE:
386
  gr.Markdown("""
387
  ### Running on HuggingFace Spaces
388
 
389
- This dashboard uses the **HuggingFace Inference API** to run inference.
390
-
391
- **Setup Required (one-time):**
392
- 1. Go to your Space Settings (⚙️ icon)
393
- 2. Click "Variables and secrets"
394
- 3. Add a new secret: `HF_TOKEN` = your HuggingFace token
395
- 4. Get your token from: https://huggingface.co/settings/tokens
396
-
397
- **How to use:**
398
- 1. Go to the "Test Inference" tab
399
- 2. Enter a prompt and click "Send Prompt"
400
- 3. First request may take 20-30 seconds (model cold start)
401
- 4. Subsequent requests will be faster
402
 
403
- **Current Model:** `mistralai/Mistral-7B-Instruct-v0.2`
 
 
 
404
 
405
  ---
406
 
407
- ### For Full vLLM Metrics (Local Setup)
408
 
409
- To get full vLLM metrics (KV cache, batch size, GPU utilization), run locally:
410
-
411
- **Step 1: Clone and install**
412
  ```bash
 
413
  git clone https://huggingface.co/spaces/jkottu/llm-inference-dashboard
414
  cd llm-inference-dashboard
 
 
415
  pip install -r requirements.txt
416
- pip install vllm
417
- ```
418
 
419
- **Step 2: Start vLLM server**
420
- ```bash
421
  python -m vllm.entrypoints.openai.api_server \\
422
  --model Qwen/Qwen2.5-0.5B-Instruct \\
 
423
  --port 8000
 
 
 
424
  ```
425
 
426
- **Step 3: Run dashboard**
427
  ```bash
428
- python app.py
 
 
 
429
  ```
430
  """)
431
  else:
432
  gr.Markdown("""
433
- ### Quick Start Guide
434
-
435
- **Step 1: Install vLLM**
436
- ```bash
437
- pip install vllm
438
- ```
439
 
440
- **Step 2: Start vLLM server** (choose one model)
441
 
442
- **Option A - Tiny (0.5B, ~2GB VRAM):**
443
  ```bash
 
444
  python -m vllm.entrypoints.openai.api_server \\
445
  --model Qwen/Qwen2.5-0.5B-Instruct \\
446
  --port 8000
447
- ```
448
-
449
- **Option B - Small (1.5B, ~4GB VRAM):**
450
- ```bash
451
- python -m vllm.entrypoints.openai.api_server \\
452
- --model Qwen/Qwen2.5-1.5B-Instruct \\
453
- --port 8000
454
- ```
455
 
456
- **Option C - Medium (3B, ~8GB VRAM):**
457
- ```bash
458
  python -m vllm.entrypoints.openai.api_server \\
459
- --model Qwen/Qwen2.5-3B-Instruct \\
 
460
  --port 8000
461
  ```
462
 
463
- **Step 3: Run Dashboard**
464
  ```bash
465
  python app.py
466
  ```
467
 
468
- **Step 4: Test**
469
- 1. Go to "Test Inference" tab
470
- 2. Enter a prompt
471
- 3. Click "Send Prompt"
472
- 4. Watch metrics update in "Live Metrics" tab
473
-
474
- ---
475
- *Dashboard expects vLLM at http://localhost:8000*
476
  """)
477
 
478
- # Connect refresh button to metrics
479
- refresh_btn.click(
480
- fn=refresh_metrics,
481
- outputs=[
482
- status, model_name,
483
- tokens_per_sec, requests_running, requests_waiting, kv_cache_usage,
484
- total_prompt_tokens, total_gen_tokens, metrics_history,
485
- ],
 
486
  )
487
 
488
- # Auto-refresh every 5 seconds
489
- timer = gr.Timer(5)
490
  timer.tick(
491
- fn=refresh_metrics,
492
- outputs=[
493
- status, model_name,
494
- tokens_per_sec, requests_running, requests_waiting, kv_cache_usage,
495
- total_prompt_tokens, total_gen_tokens, metrics_history,
496
- ],
497
  )
498
 
 
 
 
 
 
499
 
500
- if __name__ == "__main__":
501
- if IS_HF_SPACE:
502
- logger.info("Running on HuggingFace Spaces with HF Inference API")
503
- else:
504
- logger.info(f"Dashboard connecting to vLLM at {VLLM_URL}")
505
 
506
- demo.launch(
507
- server_name="0.0.0.0",
508
- server_port=7860,
509
- )
 
1
  """
2
+ LLM Inference Dashboard - Full GPU/Rank Monitoring
3
+ Works on HF Spaces (demo mode) and locally with real vLLM + GPUs
4
  """
5
 
6
  import time
7
  import logging
8
  import os
9
+ import random
10
  import requests
11
  from datetime import datetime
12
+ from dataclasses import dataclass
13
+ from typing import List, Dict, Optional
14
 
15
  import gradio as gr
16
  import pandas as pd
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
+ # Environment detection
22
  IS_HF_SPACE = os.getenv("SPACE_ID") is not None
23
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
24
 
25
+ # vLLM configuration
26
  VLLM_HOST = os.getenv("VLLM_HOST", "localhost")
27
  VLLM_PORT = os.getenv("VLLM_PORT", "8000")
28
  VLLM_URL = f"http://{VLLM_HOST}:{VLLM_PORT}"
29
 
30
+ # HF Inference
31
+ HF_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
32
+
33
+ # Try to import GPU libraries
34
+ try:
35
+ import pynvml
36
+ pynvml.nvmlInit()
37
+ HAS_NVML = True
38
+ GPU_COUNT = pynvml.nvmlDeviceGetCount()
39
+ logger.info(f"NVML initialized: {GPU_COUNT} GPUs detected")
40
+ except:
41
+ HAS_NVML = False
42
+ GPU_COUNT = 0
43
+ logger.info("NVML not available - using demo GPU data")
44
+
45
+ # Try to import HF client
46
  hf_client = None
47
  if IS_HF_SPACE:
48
  try:
49
  from huggingface_hub import InferenceClient
50
+ hf_client = InferenceClient(token=HF_TOKEN) if HF_TOKEN else InferenceClient()
 
 
 
51
  except ImportError:
52
+ pass
53
 
54
+ # Global state
55
  START_TIME = time.time()
56
+ METRICS_HISTORY = {
57
+ "timestamps": [],
58
+ "tokens_per_sec": [],
59
+ "gpu_memory": [],
60
+ "gpu_util": [],
61
+ "batch_size": [],
62
+ "kv_cache": [],
63
+ }
64
  TOTAL_REQUESTS = 0
65
  TOTAL_TOKENS = 0
66
+ LAST_INFERENCE_LATENCY = 0
67
+
68
+
69
+ # =============================================================================
70
+ # GPU Metrics Collection
71
+ # =============================================================================
72
+
73
+ @dataclass
74
+ class GPUStats:
75
+ gpu_id: int
76
+ name: str
77
+ memory_used_gb: float
78
+ memory_total_gb: float
79
+ memory_percent: float
80
+ utilization: float
81
+ temperature: int
82
+ power_watts: float
83
+ tp_rank: int # Tensor parallel rank
84
+
85
+
86
+ def get_real_gpu_stats() -> List[GPUStats]:
87
+ """Get real GPU stats via pynvml."""
88
+ stats = []
89
+ for i in range(GPU_COUNT):
90
+ try:
91
+ handle = pynvml.nvmlDeviceGetHandleByIndex(i)
92
+ name = pynvml.nvmlDeviceGetName(handle)
93
+ if isinstance(name, bytes):
94
+ name = name.decode('utf-8')
95
+
96
+ mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
97
+ util = pynvml.nvmlDeviceGetUtilizationRates(handle)
98
+ temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
99
+
100
+ try:
101
+ power = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000 # mW to W
102
+ except:
103
+ power = 0
104
+
105
+ stats.append(GPUStats(
106
+ gpu_id=i,
107
+ name=name,
108
+ memory_used_gb=mem.used / 1e9,
109
+ memory_total_gb=mem.total / 1e9,
110
+ memory_percent=(mem.used / mem.total) * 100,
111
+ utilization=util.gpu,
112
+ temperature=temp,
113
+ power_watts=power,
114
+ tp_rank=i, # Assume TP rank = GPU ID
115
+ ))
116
+ except Exception as e:
117
+ logger.error(f"Error getting GPU {i} stats: {e}")
118
+ return stats
119
+
120
+
121
+ def get_demo_gpu_stats() -> List[GPUStats]:
122
+ """Generate realistic demo GPU stats."""
123
+ elapsed = time.time() - START_TIME
124
+ base_util = 45 + 30 * abs((elapsed % 20) - 10) / 10
125
+ base_memory = 18.5 + random.uniform(-0.5, 0.5)
126
+
127
+ # Simulate 4 GPUs for tensor parallel
128
+ stats = []
129
+ for i in range(4):
130
+ util_variance = random.uniform(-8, 8)
131
+ mem_variance = random.uniform(-0.3, 0.3)
132
+
133
+ stats.append(GPUStats(
134
+ gpu_id=i,
135
+ name="NVIDIA A100-SXM4-40GB",
136
+ memory_used_gb=base_memory + mem_variance + i * 0.2,
137
+ memory_total_gb=40.0,
138
+ memory_percent=(base_memory + mem_variance + i * 0.2) / 40.0 * 100,
139
+ utilization=min(100, max(0, base_util + util_variance + i * 2)),
140
+ temperature=int(52 + base_util * 0.15 + i * 2),
141
+ power_watts=180 + base_util * 1.5 + random.uniform(-10, 10),
142
+ tp_rank=i,
143
+ ))
144
+ return stats
145
+
146
+
147
+ def get_gpu_stats() -> List[GPUStats]:
148
+ """Get GPU stats - real or demo."""
149
+ if HAS_NVML and GPU_COUNT > 0:
150
+ return get_real_gpu_stats()
151
+ return get_demo_gpu_stats()
152
+
153
+
154
+ # =============================================================================
155
+ # System Metrics
156
+ # =============================================================================
157
+
158
+ def get_system_metrics() -> Dict:
159
+ """Get system-level metrics."""
160
+ try:
161
+ import psutil
162
+ cpu_percent = psutil.cpu_percent(interval=0.1)
163
+ memory = psutil.virtual_memory()
164
+ return {
165
+ "cpu_percent": cpu_percent,
166
+ "ram_used_gb": memory.used / 1e9,
167
+ "ram_total_gb": memory.total / 1e9,
168
+ "ram_percent": memory.percent,
169
+ }
170
+ except ImportError:
171
+ # Demo data
172
+ return {
173
+ "cpu_percent": 35 + random.uniform(-10, 10),
174
+ "ram_used_gb": 48 + random.uniform(-5, 5),
175
+ "ram_total_gb": 128,
176
+ "ram_percent": 38 + random.uniform(-5, 5),
177
+ }
178
 
179
 
180
+ # =============================================================================
181
+ # vLLM Metrics
182
+ # =============================================================================
183
+
184
+ def check_vllm_connection() -> bool:
185
  """Check if vLLM server is running."""
186
  if IS_HF_SPACE:
187
  return False
 
192
  return False
193
 
194
 
195
+ def get_vllm_metrics() -> Optional[Dict]:
196
  """Fetch metrics from vLLM Prometheus endpoint."""
197
  try:
198
  resp = requests.get(f"{VLLM_URL}/metrics", timeout=5)
199
  if resp.status_code == 200:
200
+ metrics = {}
201
+ for line in resp.text.strip().split("\n"):
202
+ if line.startswith("#") or not line.strip():
203
+ continue
204
+ try:
205
+ if " " in line:
206
+ name_part, value = line.rsplit(" ", 1)
207
+ name = name_part.split("{")[0]
208
+ metrics[name] = float(value)
209
+ except:
210
+ pass
211
+ return metrics
212
+ except:
213
+ pass
214
  return None
215
 
216
 
217
+ def get_demo_inference_metrics() -> Dict:
218
+ """Generate demo inference metrics."""
219
+ elapsed = time.time() - START_TIME
220
+ load_factor = 0.5 + 0.3 * abs((elapsed % 30) - 15) / 15
221
 
222
+ batch_size = int(4 + 8 * load_factor + random.randint(-1, 1))
223
+ tokens_per_sec = 45 * load_factor + random.uniform(-5, 5)
224
+ kv_cache = 35 + batch_size * 4 + random.uniform(-3, 3)
225
+
226
+ return {
227
+ "tokens_per_sec": round(tokens_per_sec, 1),
228
+ "batch_size": batch_size,
229
+ "kv_cache_percent": round(min(95, kv_cache), 1),
230
+ "running_requests": batch_size,
231
+ "waiting_requests": int(max(0, (load_factor - 0.6) * 15)),
232
+ "ttft_ms": round(80 + (1 - load_factor) * 40 + random.uniform(-10, 20), 1),
233
+ "tpot_ms": round(22 + random.uniform(-2, 3), 1),
234
+ "prompt_tokens": TOTAL_TOKENS // 3,
235
+ "generation_tokens": TOTAL_TOKENS,
236
+ }
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
+ # =============================================================================
240
+ # Inference Functions
241
+ # =============================================================================
242
 
243
+ def send_hf_inference(prompt: str, max_tokens: int = 100) -> Dict:
244
+ """Send inference via HuggingFace API."""
245
+ global TOTAL_REQUESTS, TOTAL_TOKENS, LAST_INFERENCE_LATENCY
246
 
247
  if hf_client is None:
248
+ return {"success": False, "error": "HF client not initialized. Add HF_TOKEN in Space secrets."}
249
 
250
  try:
251
  start = time.time()
252
 
 
253
  messages = [{"role": "user", "content": prompt}]
 
254
  response = hf_client.chat_completion(
255
  messages=messages,
256
  model=HF_MODEL,
 
258
  )
259
 
260
  latency = (time.time() - start) * 1000
 
261
  output = response.choices[0].message.content
262
 
 
263
  prompt_tokens = len(prompt) // 4
264
  completion_tokens = len(output) // 4
265
 
266
  TOTAL_REQUESTS += 1
267
  TOTAL_TOKENS += completion_tokens
268
+ LAST_INFERENCE_LATENCY = latency
269
 
270
  return {
271
  "success": True,
 
275
  "completion_tokens": completion_tokens,
276
  }
277
  except Exception as e:
278
+ return {"success": False, "error": str(e)}
 
 
 
 
 
279
 
280
 
281
+ def send_vllm_inference(prompt: str, max_tokens: int = 100) -> Dict:
282
+ """Send inference via vLLM."""
283
+ global TOTAL_REQUESTS, TOTAL_TOKENS, LAST_INFERENCE_LATENCY
284
 
285
  try:
286
  start = time.time()
 
303
 
304
  TOTAL_REQUESTS += 1
305
  TOTAL_TOKENS += usage.get("completion_tokens", 0)
306
+ LAST_INFERENCE_LATENCY = latency
307
 
308
  return {
309
  "success": True,
 
317
  return {"success": False, "error": "Unknown error"}
318
 
319
 
320
+ def run_inference(prompt: str, max_tokens: int) -> tuple:
321
+ """Run inference and return results."""
322
+ if not prompt.strip():
323
+ return "Please enter a prompt", "", 0, 0, 0
 
 
324
 
325
  if IS_HF_SPACE:
326
+ result = send_hf_inference(prompt, int(max_tokens))
327
+ else:
328
+ result = send_vllm_inference(prompt, int(max_tokens))
 
 
329
 
330
+ if result["success"]:
331
  return (
332
+ "Success",
333
+ result["output"],
334
+ round(result["latency_ms"], 1),
335
+ result["prompt_tokens"],
336
+ result["completion_tokens"],
 
 
 
 
337
  )
338
+ return (f"Error: {result.get('error', 'Unknown')}", "", 0, 0, 0)
339
+
340
+
341
+ # =============================================================================
342
+ # Dashboard Refresh Functions
343
+ # =============================================================================
344
+
345
+ def refresh_gpu_panel():
346
+ """Refresh GPU panel data."""
347
+ stats = get_gpu_stats()
348
+
349
+ # Build GPU table
350
+ gpu_data = []
351
+ for s in stats:
352
+ gpu_data.append({
353
+ "GPU": f"GPU {s.gpu_id}",
354
+ "Name": s.name[:25],
355
+ "Memory": f"{s.memory_used_gb:.1f} / {s.memory_total_gb:.0f} GB",
356
+ "Mem %": f"{s.memory_percent:.1f}%",
357
+ "Util %": f"{s.utilization:.0f}%",
358
+ "Temp": f"{s.temperature}ยฐC",
359
+ "Power": f"{s.power_watts:.0f}W",
360
+ "TP Rank": str(s.tp_rank),
361
+ })
362
 
363
+ gpu_df = pd.DataFrame(gpu_data)
 
364
 
365
+ # Calculate totals
366
+ total_mem_used = sum(s.memory_used_gb for s in stats)
367
+ total_mem = sum(s.memory_total_gb for s in stats)
368
+ avg_util = sum(s.utilization for s in stats) / len(stats) if stats else 0
369
+ avg_temp = sum(s.temperature for s in stats) / len(stats) if stats else 0
370
+ total_power = sum(s.power_watts for s in stats)
 
371
 
372
+ # Update history
373
+ now = datetime.now().strftime("%H:%M:%S")
374
+ METRICS_HISTORY["timestamps"].append(now)
375
+ METRICS_HISTORY["gpu_memory"].append(round(total_mem_used, 1))
376
+ METRICS_HISTORY["gpu_util"].append(round(avg_util, 1))
377
 
378
+ # Keep last 30 points
379
+ for key in METRICS_HISTORY:
380
+ if len(METRICS_HISTORY[key]) > 30:
381
+ METRICS_HISTORY[key] = METRICS_HISTORY[key][-30:]
 
 
382
 
383
+ # Memory history chart data
384
+ mem_df = pd.DataFrame({
385
+ "Time": METRICS_HISTORY["timestamps"],
386
+ "GPU Memory (GB)": METRICS_HISTORY["gpu_memory"],
387
+ })
388
 
389
+ return (
390
+ gpu_df,
391
+ f"{total_mem_used:.1f} / {total_mem:.0f} GB",
392
+ f"{avg_util:.1f}%",
393
+ f"{avg_temp:.0f}ยฐC",
394
+ f"{total_power:.0f}W",
395
+ mem_df,
396
+ )
397
 
 
 
 
398
 
399
+ def refresh_inference_panel():
400
+ """Refresh inference metrics panel."""
401
+ if IS_HF_SPACE:
402
+ metrics = get_demo_inference_metrics()
403
+ status = "HF Inference API (Demo Metrics)"
404
+ model = HF_MODEL
405
+ elif check_vllm_connection():
406
+ vllm_metrics = get_vllm_metrics()
407
+ if vllm_metrics:
408
+ elapsed = time.time() - START_TIME
409
+ gen_tokens = vllm_metrics.get("vllm:generation_tokens_total", 0)
410
+ metrics = {
411
+ "tokens_per_sec": round(gen_tokens / elapsed, 1) if elapsed > 0 else 0,
412
+ "batch_size": int(vllm_metrics.get("vllm:num_requests_running", 0)),
413
+ "kv_cache_percent": round(vllm_metrics.get("vllm:gpu_cache_usage_perc", 0) * 100, 1),
414
+ "running_requests": int(vllm_metrics.get("vllm:num_requests_running", 0)),
415
+ "waiting_requests": int(vllm_metrics.get("vllm:num_requests_waiting", 0)),
416
+ "ttft_ms": 0,
417
+ "tpot_ms": 0,
418
+ "prompt_tokens": int(vllm_metrics.get("vllm:prompt_tokens_total", 0)),
419
+ "generation_tokens": int(gen_tokens),
420
+ }
421
+ status = "Connected to vLLM"
422
+ model = "vLLM Model"
423
+ else:
424
+ metrics = get_demo_inference_metrics()
425
+ status = "Connected (no metrics)"
426
+ model = "Unknown"
427
+ else:
428
+ metrics = get_demo_inference_metrics()
429
+ status = "Disconnected - Using Demo Data"
430
+ model = "Demo Mode"
431
 
432
+ # Update history
433
+ METRICS_HISTORY["tokens_per_sec"].append(metrics["tokens_per_sec"])
434
+ METRICS_HISTORY["batch_size"].append(metrics["batch_size"])
435
+ METRICS_HISTORY["kv_cache"].append(metrics["kv_cache_percent"])
436
+
437
+ # Throughput chart
438
+ throughput_df = pd.DataFrame({
439
+ "Time": METRICS_HISTORY["timestamps"][-len(METRICS_HISTORY["tokens_per_sec"]):],
440
+ "Tokens/sec": METRICS_HISTORY["tokens_per_sec"],
441
+ })
 
442
 
443
  return (
444
+ status,
445
  model,
446
+ metrics["tokens_per_sec"],
447
+ metrics["batch_size"],
448
+ metrics["kv_cache_percent"],
449
+ metrics["running_requests"],
450
+ metrics["waiting_requests"],
451
+ metrics["ttft_ms"],
452
+ LAST_INFERENCE_LATENCY,
453
+ metrics["prompt_tokens"],
454
+ metrics["generation_tokens"],
455
+ throughput_df,
456
  )
457
 
458
 
459
+ def refresh_system_panel():
460
+ """Refresh system metrics panel."""
461
+ sys = get_system_metrics()
462
+ return (
463
+ f"{sys['cpu_percent']:.1f}%",
464
+ f"{sys['ram_used_gb']:.1f} / {sys['ram_total_gb']:.0f} GB",
465
+ f"{sys['ram_percent']:.1f}%",
466
+ )
467
 
 
 
 
 
 
468
 
469
+ # =============================================================================
470
+ # Build Gradio Dashboard
471
+ # =============================================================================
 
 
472
 
473
+ with gr.Blocks(title="LLM Inference Dashboard", theme=gr.themes.Soft()) as demo:
474
+ gr.Markdown("# ๐Ÿš€ LLM Inference Dashboard")
 
 
 
 
 
 
 
 
 
 
 
475
 
476
+ mode_text = "HF Spaces (Demo Mode)" if IS_HF_SPACE else "Local Mode"
477
+ gr.Markdown(f"*Real-time GPU/Rank monitoring and inference metrics* | **Mode:** {mode_text}")
478
 
479
+ with gr.Tabs():
480
+ # =================================================================
481
+ # Tab 1: GPU / Rank Status
482
+ # =================================================================
483
+ with gr.Tab("๐ŸŽฎ GPU / Rank Status"):
484
+ gr.Markdown("### Per-GPU Metrics & Tensor Parallel Rank Mapping")
485
 
486
+ with gr.Row():
487
+ total_gpu_mem = gr.Textbox(label="Total GPU Memory", value="...", interactive=False)
488
+ avg_gpu_util = gr.Textbox(label="Avg GPU Util", value="...", interactive=False)
489
+ avg_gpu_temp = gr.Textbox(label="Avg Temperature", value="...", interactive=False)
490
+ total_power = gr.Textbox(label="Total Power", value="...", interactive=False)
491
+
492
+ gpu_table = gr.Dataframe(
493
+ headers=["GPU", "Name", "Memory", "Mem %", "Util %", "Temp", "Power", "TP Rank"],
494
+ label="GPU Status per Rank",
495
+ interactive=False,
496
+ )
497
 
498
+ gpu_mem_chart = gr.Dataframe(
499
+ label="GPU Memory History",
500
+ interactive=False,
501
+ )
502
 
503
+ gpu_refresh_btn = gr.Button("🔄 Refresh GPU Stats", variant="primary")
504
+ gpu_refresh_btn.click(
505
+ fn=refresh_gpu_panel,
506
+ outputs=[gpu_table, total_gpu_mem, avg_gpu_util, avg_gpu_temp, total_power, gpu_mem_chart],
507
+ )
508
 
509
+ # =================================================================
510
+ # Tab 2: Inference Metrics
511
+ # =================================================================
512
+ with gr.Tab("๐Ÿ“Š Inference Metrics"):
513
+ gr.Markdown("### Real-time Inference Performance")
514
 
515
  with gr.Row():
516
+ inf_status = gr.Textbox(label="Status", value="...", interactive=False)
517
+ inf_model = gr.Textbox(label="Model", value="...", interactive=False)
 
 
 
 
 
 
 
 
518
 
519
+ with gr.Row():
520
+ tokens_sec = gr.Number(label="Tokens/sec", value=0, interactive=False)
521
+ batch_size = gr.Number(label="Batch Size", value=0, interactive=False)
522
+ kv_cache = gr.Number(label="KV Cache %", value=0, interactive=False)
523
 
524
  with gr.Row():
525
+ running_req = gr.Number(label="Running Requests", value=0, interactive=False)
526
+ waiting_req = gr.Number(label="Waiting Requests", value=0, interactive=False)
527
+ ttft = gr.Number(label="TTFT (ms)", value=0, interactive=False)
528
+ last_latency = gr.Number(label="Last Latency (ms)", value=0, interactive=False)
529
 
530
  with gr.Row():
531
+ prompt_tokens = gr.Number(label="Total Prompt Tokens", value=0, interactive=False)
532
+ gen_tokens = gr.Number(label="Total Gen Tokens", value=0, interactive=False)
533
 
534
+ throughput_chart = gr.Dataframe(
535
+ label="Throughput History",
536
+ interactive=False,
537
+ )
538
 
539
+ inf_refresh_btn = gr.Button("🔄 Refresh Inference Metrics", variant="primary")
540
+ inf_refresh_btn.click(
541
+ fn=refresh_inference_panel,
542
+ outputs=[inf_status, inf_model, tokens_sec, batch_size, kv_cache,
543
+ running_req, waiting_req, ttft, last_latency, prompt_tokens,
544
+ gen_tokens, throughput_chart],
545
  )
546
 
547
+ # =================================================================
548
+ # Tab 3: System Metrics
549
+ # =================================================================
550
+ with gr.Tab("๐Ÿ’ป System Metrics"):
551
+ gr.Markdown("### Host System Resources")
552
+
553
+ with gr.Row():
554
+ cpu_usage = gr.Textbox(label="CPU Usage", value="...", interactive=False)
555
+ ram_usage = gr.Textbox(label="RAM Usage", value="...", interactive=False)
556
+ ram_percent = gr.Textbox(label="RAM %", value="...", interactive=False)
557
+
558
+ sys_refresh_btn = gr.Button("🔄 Refresh System Metrics", variant="primary")
559
+ sys_refresh_btn.click(
560
+ fn=refresh_system_panel,
561
+ outputs=[cpu_usage, ram_usage, ram_percent],
562
+ )
563
+
564
+ # =================================================================
565
+ # Tab 4: Test Inference
566
+ # =================================================================
567
+ with gr.Tab("๐Ÿงช Test Inference"):
568
+ gr.Markdown("### Send Prompts to Model")
569
+
570
  if IS_HF_SPACE:
571
+ gr.Markdown(f"*Using HuggingFace Inference API: `{HF_MODEL}`*")
 
 
 
572
 
573
  with gr.Row():
574
+ prompt_input = gr.Textbox(
575
+ label="Prompt",
576
+ placeholder="Enter your prompt...",
577
+ lines=3,
578
+ value="Explain how GPU memory affects LLM inference performance.",
579
+ )
580
+ max_tokens_slider = gr.Slider(10, 500, value=100, label="Max Tokens")
581
+
582
+ send_btn = gr.Button("๐Ÿš€ Send Prompt", variant="primary")
583
 
584
  with gr.Row():
585
+ inf_result_status = gr.Textbox(label="Status", interactive=False)
586
+ inf_result_latency = gr.Number(label="Latency (ms)", interactive=False)
587
 
588
+ with gr.Row():
589
+ inf_prompt_tokens = gr.Number(label="Prompt Tokens", interactive=False)
590
+ inf_comp_tokens = gr.Number(label="Completion Tokens", interactive=False)
591
+
592
+ response_output = gr.Textbox(label="Response", lines=10, interactive=False)
593
+
594
+ send_btn.click(
595
+ fn=run_inference,
596
+ inputs=[prompt_input, max_tokens_slider],
597
+ outputs=[inf_result_status, response_output, inf_result_latency,
598
+ inf_prompt_tokens, inf_comp_tokens],
599
  )
600
 
601
+ # =================================================================
602
+ # Tab 5: Setup Guide
603
+ # =================================================================
604
+ with gr.Tab("๐Ÿ“– Setup Guide"):
605
  if IS_HF_SPACE:
606
  gr.Markdown("""
607
  ### Running on HuggingFace Spaces
608
 
609
+ **Current Mode:** Demo GPU data + HuggingFace Inference API
 
610
 
611
+ **To enable inference:**
612
+ 1. Go to Space Settings → Variables and secrets
613
+ 2. Add secret: `HF_TOKEN` = your token from https://huggingface.co/settings/tokens
614
+ 3. Restart the Space
615
 
616
  ---
617
 
618
+ ### For Real GPU Metrics (Local Setup)
619
 
 
 
 
620
  ```bash
621
+ # Clone the repo
622
  git clone https://huggingface.co/spaces/jkottu/llm-inference-dashboard
623
  cd llm-inference-dashboard
624
+
625
+ # Install dependencies
626
  pip install -r requirements.txt
 
 
627
 
628
+ # Start vLLM (pick a model based on your GPU)
 
629
  python -m vllm.entrypoints.openai.api_server \\
630
  --model Qwen/Qwen2.5-0.5B-Instruct \\
631
+ --tensor-parallel-size 1 \\
632
  --port 8000
633
+
634
+ # Run dashboard
635
+ python app.py
636
  ```
637
 
638
+ **For Multi-GPU (Tensor Parallel):**
639
  ```bash
640
+ python -m vllm.entrypoints.openai.api_server \\
641
+ --model meta-llama/Llama-2-7b-chat-hf \\
642
+ --tensor-parallel-size 4 \\
643
+ --port 8000
644
  ```
645
  """)
646
  else:
647
  gr.Markdown("""
648
+ ### Local Setup Guide
 
 
 
 
 
649
 
650
+ **Step 1: Start vLLM Server**
651
 
 
652
  ```bash
653
+ # Single GPU (small model)
654
  python -m vllm.entrypoints.openai.api_server \\
655
  --model Qwen/Qwen2.5-0.5B-Instruct \\
656
  --port 8000
 
 
 
 
 
 
 
 
657
 
658
+ # Multi-GPU with Tensor Parallelism
 
659
  python -m vllm.entrypoints.openai.api_server \\
660
+ --model meta-llama/Llama-2-13b-chat-hf \\
661
+ --tensor-parallel-size 4 \\
662
  --port 8000
663
  ```
664
 
665
+ **Step 2: Run Dashboard**
666
  ```bash
667
  python app.py
668
  ```
669
 
670
+ **Step 3: Monitor**
671
+ - GPU tab shows per-rank memory, utilization, temperature
672
+ - Inference tab shows throughput, batch size, KV cache
673
+ - System tab shows CPU/RAM usage
 
 
 
 
674
  """)
675
 
676
+ # =================================================================
677
+ # Auto-refresh timer
678
+ # =================================================================
679
+ timer = gr.Timer(3)
680
+
681
+ # GPU panel refresh
682
+ timer.tick(
683
+ fn=refresh_gpu_panel,
684
+ outputs=[gpu_table, total_gpu_mem, avg_gpu_util, avg_gpu_temp, total_power, gpu_mem_chart],
685
  )
686
 
687
+ # Inference panel refresh
 
688
  timer.tick(
689
+ fn=refresh_inference_panel,
690
+ outputs=[inf_status, inf_model, tokens_sec, batch_size, kv_cache,
691
+ running_req, waiting_req, ttft, last_latency, prompt_tokens,
692
+ gen_tokens, throughput_chart],
 
 
693
  )
694
 
695
+ # System panel refresh
696
+ timer.tick(
697
+ fn=refresh_system_panel,
698
+ outputs=[cpu_usage, ram_usage, ram_percent],
699
+ )
700
 
 
 
 
 
 
701
 
702
+ if __name__ == "__main__":
703
+ logger.info(f"Starting dashboard - Mode: {'HF Spaces' if IS_HF_SPACE else 'Local'}")
704
+ logger.info(f"GPUs detected: {GPU_COUNT if HAS_NVML else 'None (demo mode)'}")
705
+ demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,3 +1,5 @@
  gradio>=5.0.0
  pandas>=2.0.0
  huggingface_hub>=0.20.0
+ psutil>=5.9.0
+ requests>=2.28.0
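The real-GPU path in app.py imports pynvml inside a try/except and falls back to demo data when it is unavailable, so requirements.txt stays CPU-only for Spaces. A minimal local sanity check, assuming the `nvidia-ml-py` package (which provides the `pynvml` module) and NVIDIA drivers are installed:

```python
# Confirm the dashboard's real-GPU branch (HAS_NVML) would activate on this machine.
import pynvml

pynvml.nvmlInit()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    name = pynvml.nvmlDeviceGetName(handle)
    if isinstance(name, bytes):  # older bindings return bytes
        name = name.decode("utf-8")
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU {i}: {name}, {mem.used / 1e9:.1f} / {mem.total / 1e9:.0f} GB used")
pynvml.nvmlShutdown()
```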