PyxiLabs committed on
Commit
401c156
·
verified ·
1 Parent(s): 96769eb

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +51 -0
  2. index (2).html +901 -0
  3. requirements (2).txt +15 -0
  4. server.py +367 -0
Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ╔══════════════════════════════════════════════════════════════╗
# ║ Granite 4.0 ONNX Inference Server                            ║
# ║ Model: onnx-community/granite-4.0-h-350m-ONNX                ║
# ║ Runtime: ONNX Runtime CPU · FastAPI · Beautiful UI           ║
# ╚══════════════════════════════════════════════════════════════╝

FROM python:3.11-slim

# ── System dependencies ───────────────────────────────────────────────────────
# --no-install-recommends keeps the layer small; the apt lists are removed in
# the same layer so they never land in the image. curl is required at runtime
# by the HEALTHCHECK below — do not drop it.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# ── Create non-root user (HuggingFace Spaces requirement) ─────────────────────
RUN useradd -m -u 1000 user
USER user

# NOTE: TRANSFORMERS_CACHE is deprecated in recent transformers releases in
# favor of HF_HOME; it is kept here for backward compatibility with the
# pinned transformers version.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    HF_HOME=/home/user/.cache/huggingface \
    TRANSFORMERS_CACHE=/home/user/.cache/huggingface \
    # Prevents OMP issues on CPU
    OMP_NUM_THREADS=4 \
    MKL_NUM_THREADS=4

WORKDIR /app

# ── Install Python dependencies ───────────────────────────────────────────────
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# ── Copy application files ────────────────────────────────────────────────────
# NOTE(review): the repository uploads are named "requirements (2).txt" and
# "index (2).html" — confirm they are renamed to requirements.txt and
# static/index.html before build, or these COPY steps will fail.
COPY --chown=user server.py .
COPY --chown=user static/ ./static/

# ── Expose port (HF Spaces uses 7860) ────────────────────────────────────────
EXPOSE 7860

# ── Health check ─────────────────────────────────────────────────────────────
# Long start-period: first boot downloads the ONNX model before /health is up.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# ── Launch server ─────────────────────────────────────────────────────────────
# Single worker: the model is held in process memory; more workers would
# load it once per process.
CMD ["uvicorn", "server:app", \
     "--host", "0.0.0.0", \
     "--port", "7860", \
     "--workers", "1", \
     "--log-level", "info"]
index (2).html ADDED
@@ -0,0 +1,901 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Granite 4.0 · ONNX Inference</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com" />
8
+ <link href="https://fonts.googleapis.com/css2?family=Space+Mono:ital,wght@0,400;0,700;1,400&family=Syne:wght@400;600;700;800&display=swap" rel="stylesheet" />
9
+ <style>
10
+ :root {
11
+ --bg: #060810;
12
+ --surface: #0d1117;
13
+ --surface2: #131922;
14
+ --border: #1e2d3d;
15
+ --accent: #00d4ff;
16
+ --accent2: #7c3aed;
17
+ --accent3: #10b981;
18
+ --warn: #f59e0b;
19
+ --danger: #ef4444;
20
+ --text: #e2e8f0;
21
+ --muted: #4a5568;
22
+ --dim: #718096;
23
+ }
24
+
25
+ * { margin: 0; padding: 0; box-sizing: border-box; }
26
+
27
+ body {
28
+ background: var(--bg);
29
+ color: var(--text);
30
+ font-family: 'Space Mono', monospace;
31
+ min-height: 100vh;
32
+ overflow-x: hidden;
33
+ }
34
+
35
+ /* ── Animated background ── */
36
+ body::before {
37
+ content: '';
38
+ position: fixed;
39
+ inset: 0;
40
+ background:
41
+ radial-gradient(ellipse 80% 60% at 10% 20%, rgba(0,212,255,0.04) 0%, transparent 60%),
42
+ radial-gradient(ellipse 60% 80% at 90% 80%, rgba(124,58,237,0.04) 0%, transparent 60%);
43
+ pointer-events: none;
44
+ z-index: 0;
45
+ }
46
+
47
+ /* ── Grid lines ── */
48
+ body::after {
49
+ content: '';
50
+ position: fixed;
51
+ inset: 0;
52
+ background-image:
53
+ linear-gradient(rgba(0,212,255,0.03) 1px, transparent 1px),
54
+ linear-gradient(90deg, rgba(0,212,255,0.03) 1px, transparent 1px);
55
+ background-size: 40px 40px;
56
+ pointer-events: none;
57
+ z-index: 0;
58
+ }
59
+
60
+ .app {
61
+ position: relative;
62
+ z-index: 1;
63
+ display: grid;
64
+ grid-template-rows: auto 1fr;
65
+ min-height: 100vh;
66
+ max-width: 1400px;
67
+ margin: 0 auto;
68
+ padding: 0 20px;
69
+ }
70
+
71
+ /* ── Header ── */
72
+ header {
73
+ padding: 24px 0 20px;
74
+ border-bottom: 1px solid var(--border);
75
+ display: flex;
76
+ align-items: center;
77
+ justify-content: space-between;
78
+ gap: 16px;
79
+ flex-wrap: wrap;
80
+ }
81
+
82
+ .logo-block {
83
+ display: flex;
84
+ align-items: center;
85
+ gap: 14px;
86
+ }
87
+
88
+ .logo-icon {
89
+ width: 42px;
90
+ height: 42px;
91
+ border: 1px solid var(--accent);
92
+ border-radius: 8px;
93
+ display: flex;
94
+ align-items: center;
95
+ justify-content: center;
96
+ font-family: 'Syne', sans-serif;
97
+ font-weight: 800;
98
+ font-size: 18px;
99
+ color: var(--accent);
100
+ box-shadow: 0 0 20px rgba(0,212,255,0.2), inset 0 0 20px rgba(0,212,255,0.05);
101
+ animation: pulse-border 3s ease-in-out infinite;
102
+ }
103
+
104
+ @keyframes pulse-border {
105
+ 0%, 100% { box-shadow: 0 0 20px rgba(0,212,255,0.2), inset 0 0 20px rgba(0,212,255,0.05); }
106
+ 50% { box-shadow: 0 0 30px rgba(0,212,255,0.4), inset 0 0 20px rgba(0,212,255,0.1); }
107
+ }
108
+
109
+ .logo-text {
110
+ font-family: 'Syne', sans-serif;
111
+ }
112
+ .logo-text h1 {
113
+ font-size: 20px;
114
+ font-weight: 800;
115
+ letter-spacing: -0.5px;
116
+ color: var(--text);
117
+ }
118
+ .logo-text p {
119
+ font-size: 11px;
120
+ color: var(--dim);
121
+ margin-top: 2px;
122
+ }
123
+
124
+ #status-badge {
125
+ display: flex;
126
+ align-items: center;
127
+ gap: 8px;
128
+ font-size: 12px;
129
+ padding: 6px 14px;
130
+ border-radius: 999px;
131
+ border: 1px solid var(--border);
132
+ background: var(--surface);
133
+ transition: all 0.3s;
134
+ }
135
+ #status-dot {
136
+ width: 8px; height: 8px;
137
+ border-radius: 50%;
138
+ background: var(--warn);
139
+ animation: blink 1s infinite;
140
+ }
141
+ @keyframes blink { 0%,100%{opacity:1} 50%{opacity:0.3} }
142
+ #status-badge.ready { border-color: var(--accent3); }
143
+ #status-badge.ready #status-dot { background: var(--accent3); animation: none; }
144
+ #status-badge.error { border-color: var(--danger); }
145
+ #status-badge.error #status-dot { background: var(--danger); animation: none; }
146
+
147
+ /* ── Main layout ── */
148
+ main {
149
+ display: grid;
150
+ grid-template-columns: 1fr 340px;
151
+ grid-template-rows: 1fr;
152
+ gap: 20px;
153
+ padding: 20px 0 20px;
154
+ height: calc(100vh - 100px);
155
+ }
156
+
157
+ /* ── Chat panel ── */
158
+ .chat-panel {
159
+ display: flex;
160
+ flex-direction: column;
161
+ gap: 16px;
162
+ min-height: 0;
163
+ }
164
+
165
+ .messages-container {
166
+ flex: 1;
167
+ overflow-y: auto;
168
+ display: flex;
169
+ flex-direction: column;
170
+ gap: 12px;
171
+ padding-right: 6px;
172
+ scroll-behavior: smooth;
173
+ }
174
+
175
+ .messages-container::-webkit-scrollbar { width: 4px; }
176
+ .messages-container::-webkit-scrollbar-track { background: transparent; }
177
+ .messages-container::-webkit-scrollbar-thumb { background: var(--border); border-radius: 2px; }
178
+
179
+ .message {
180
+ display: flex;
181
+ gap: 12px;
182
+ animation: fade-in 0.3s ease;
183
+ }
184
+ @keyframes fade-in { from { opacity:0; transform:translateY(8px); } to { opacity:1; transform:none; } }
185
+
186
+ .message.user { flex-direction: row-reverse; }
187
+
188
+ .avatar {
189
+ width: 32px; height: 32px;
190
+ border-radius: 8px;
191
+ display: flex;
192
+ align-items: center;
193
+ justify-content: center;
194
+ font-size: 13px;
195
+ font-weight: 700;
196
+ flex-shrink: 0;
197
+ font-family: 'Syne', sans-serif;
198
+ }
199
+ .message.user .avatar { background: var(--accent2); color: white; }
200
+ .message.assistant .avatar {
201
+ background: linear-gradient(135deg, rgba(0,212,255,0.2), rgba(0,212,255,0.05));
202
+ border: 1px solid rgba(0,212,255,0.3);
203
+ color: var(--accent);
204
+ }
205
+
206
+ .bubble {
207
+ max-width: 75%;
208
+ padding: 12px 16px;
209
+ border-radius: 12px;
210
+ font-size: 13px;
211
+ line-height: 1.7;
212
+ }
213
+ .message.user .bubble {
214
+ background: rgba(124,58,237,0.15);
215
+ border: 1px solid rgba(124,58,237,0.3);
216
+ color: var(--text);
217
+ border-top-right-radius: 2px;
218
+ }
219
+ .message.assistant .bubble {
220
+ background: var(--surface);
221
+ border: 1px solid var(--border);
222
+ color: var(--text);
223
+ border-top-left-radius: 2px;
224
+ }
225
+
226
+ .bubble-meta {
227
+ margin-top: 6px;
228
+ font-size: 10px;
229
+ color: var(--muted);
230
+ display: flex;
231
+ gap: 10px;
232
+ }
233
+ .bubble-meta span { display: flex; align-items: center; gap: 3px; }
234
+
235
+ .typing-indicator {
236
+ display: flex;
237
+ gap: 5px;
238
+ padding: 4px 0;
239
+ align-items: center;
240
+ }
241
+ .typing-indicator span {
242
+ width: 6px; height: 6px;
243
+ background: var(--accent);
244
+ border-radius: 50%;
245
+ animation: bounce 1.2s infinite;
246
+ }
247
+ .typing-indicator span:nth-child(2) { animation-delay: 0.2s; }
248
+ .typing-indicator span:nth-child(3) { animation-delay: 0.4s; }
249
+ @keyframes bounce { 0%,80%,100%{transform:scale(0.8);opacity:0.5} 40%{transform:scale(1.2);opacity:1} }
250
+
251
+ /* ── Input area ── */
252
+ .input-area {
253
+ display: flex;
254
+ gap: 10px;
255
+ align-items: flex-end;
256
+ }
257
+
258
+ textarea {
259
+ flex: 1;
260
+ background: var(--surface);
261
+ border: 1px solid var(--border);
262
+ border-radius: 10px;
263
+ color: var(--text);
264
+ font-family: 'Space Mono', monospace;
265
+ font-size: 13px;
266
+ padding: 12px 14px;
267
+ resize: none;
268
+ min-height: 46px;
269
+ max-height: 120px;
270
+ outline: none;
271
+ transition: border-color 0.2s;
272
+ line-height: 1.5;
273
+ }
274
+ textarea:focus { border-color: var(--accent); }
275
+ textarea::placeholder { color: var(--muted); }
276
+
277
+ .send-btn {
278
+ width: 46px; height: 46px;
279
+ border-radius: 10px;
280
+ border: 1px solid var(--accent);
281
+ background: rgba(0,212,255,0.1);
282
+ color: var(--accent);
283
+ cursor: pointer;
284
+ display: flex;
285
+ align-items: center;
286
+ justify-content: center;
287
+ transition: all 0.2s;
288
+ flex-shrink: 0;
289
+ }
290
+ .send-btn:hover { background: rgba(0,212,255,0.2); box-shadow: 0 0 15px rgba(0,212,255,0.3); }
291
+ .send-btn:disabled { opacity: 0.4; cursor: not-allowed; }
292
+ .send-btn svg { width: 18px; height: 18px; }
293
+
294
+ .input-hint {
295
+ font-size: 10px;
296
+ color: var(--muted);
297
+ margin-top: 4px;
298
+ padding-left: 2px;
299
+ }
300
+
301
+ /* ── Right sidebar ── */
302
+ .sidebar {
303
+ display: flex;
304
+ flex-direction: column;
305
+ gap: 14px;
306
+ overflow-y: auto;
307
+ }
308
+
309
+ .sidebar::-webkit-scrollbar { width: 4px; }
310
+ .sidebar::-webkit-scrollbar-thumb { background: var(--border); border-radius: 2px; }
311
+
312
+ .card {
313
+ background: var(--surface);
314
+ border: 1px solid var(--border);
315
+ border-radius: 12px;
316
+ padding: 16px;
317
+ }
318
+
319
+ .card-title {
320
+ font-family: 'Syne', sans-serif;
321
+ font-size: 11px;
322
+ font-weight: 700;
323
+ letter-spacing: 1.5px;
324
+ text-transform: uppercase;
325
+ color: var(--dim);
326
+ margin-bottom: 14px;
327
+ display: flex;
328
+ align-items: center;
329
+ gap: 6px;
330
+ }
331
+ .card-title::before {
332
+ content: '';
333
+ display: block;
334
+ width: 3px;
335
+ height: 12px;
336
+ background: var(--accent);
337
+ border-radius: 2px;
338
+ }
339
+
340
+ /* ── Metric rows ── */
341
+ .metric-row {
342
+ display: flex;
343
+ justify-content: space-between;
344
+ align-items: center;
345
+ padding: 8px 0;
346
+ border-bottom: 1px solid rgba(30,45,61,0.5);
347
+ font-size: 12px;
348
+ }
349
+ .metric-row:last-child { border-bottom: none; padding-bottom: 0; }
350
+ .metric-label { color: var(--dim); }
351
+ .metric-value {
352
+ font-weight: 700;
353
+ font-size: 14px;
354
+ color: var(--text);
355
+ }
356
+ .metric-value.accent { color: var(--accent); }
357
+ .metric-value.green { color: var(--accent3); }
358
+ .metric-value.warn { color: var(--warn); }
359
+
360
+ /* ── Big TPS display ── */
361
+ .tps-display {
362
+ text-align: center;
363
+ padding: 16px 0 8px;
364
+ }
365
+ .tps-number {
366
+ font-family: 'Syne', sans-serif;
367
+ font-size: 48px;
368
+ font-weight: 800;
369
+ color: var(--accent);
370
+ line-height: 1;
371
+ text-shadow: 0 0 30px rgba(0,212,255,0.5);
372
+ transition: all 0.3s;
373
+ }
374
+ .tps-label {
375
+ font-size: 11px;
376
+ color: var(--dim);
377
+ letter-spacing: 2px;
378
+ text-transform: uppercase;
379
+ margin-top: 4px;
380
+ }
381
+
382
+ /* ── Mini sparkline ── */
383
+ .sparkline-wrap {
384
+ margin-top: 12px;
385
+ height: 40px;
386
+ position: relative;
387
+ }
388
+ canvas#sparkline {
389
+ width: 100%;
390
+ height: 100%;
391
+ }
392
+
393
+ /* ── Model info ── */
394
+ .model-tag {
395
+ display: inline-flex;
396
+ align-items: center;
397
+ gap: 6px;
398
+ background: rgba(0,212,255,0.08);
399
+ border: 1px solid rgba(0,212,255,0.2);
400
+ border-radius: 6px;
401
+ padding: 5px 10px;
402
+ font-size: 11px;
403
+ color: var(--accent);
404
+ word-break: break-all;
405
+ line-height: 1.4;
406
+ margin-top: 2px;
407
+ }
408
+
409
+ /* ── Settings sliders ── */
410
+ .slider-row {
411
+ padding: 8px 0;
412
+ }
413
+ .slider-label {
414
+ display: flex;
415
+ justify-content: space-between;
416
+ font-size: 11px;
417
+ color: var(--dim);
418
+ margin-bottom: 6px;
419
+ }
420
+ .slider-label span:last-child { color: var(--text); font-weight: 700; }
421
+ input[type="range"] {
422
+ width: 100%;
423
+ accent-color: var(--accent);
424
+ cursor: pointer;
425
+ }
426
+
427
+ /* ── Loading overlay ── */
428
+ #loading-overlay {
429
+ position: fixed;
430
+ inset: 0;
431
+ background: rgba(6,8,16,0.9);
432
+ z-index: 100;
433
+ display: flex;
434
+ flex-direction: column;
435
+ align-items: center;
436
+ justify-content: center;
437
+ gap: 20px;
438
+ backdrop-filter: blur(8px);
439
+ }
440
+ #loading-overlay.hidden { display: none; }
441
+
442
+ .loading-logo {
443
+ font-family: 'Syne', sans-serif;
444
+ font-size: 32px;
445
+ font-weight: 800;
446
+ color: var(--accent);
447
+ text-shadow: 0 0 40px rgba(0,212,255,0.5);
448
+ }
449
+ .loading-spinner {
450
+ width: 48px; height: 48px;
451
+ border: 2px solid var(--border);
452
+ border-top-color: var(--accent);
453
+ border-radius: 50%;
454
+ animation: spin 0.8s linear infinite;
455
+ }
456
+ @keyframes spin { to { transform: rotate(360deg); } }
457
+ .loading-text { font-size: 13px; color: var(--dim); }
458
+
459
+ .welcome-msg {
460
+ text-align: center;
461
+ padding: 40px 20px;
462
+ color: var(--muted);
463
+ }
464
+ .welcome-msg h2 {
465
+ font-family: 'Syne', sans-serif;
466
+ font-size: 20px;
467
+ font-weight: 700;
468
+ color: var(--dim);
469
+ margin-bottom: 8px;
470
+ }
471
+ .welcome-msg p { font-size: 12px; line-height: 1.8; }
472
+
473
+ @media (max-width: 900px) {
474
+ main { grid-template-columns: 1fr; grid-template-rows: 1fr auto; }
475
+ .sidebar { display: grid; grid-template-columns: 1fr 1fr; }
476
+ }
477
+ </style>
478
+ </head>
479
+ <body>
480
+
481
+ <div id="loading-overlay">
482
+ <div class="loading-logo">GRANITE</div>
483
+ <div class="loading-spinner"></div>
484
+ <div class="loading-text" id="loading-msg">Loading model — this may take a minute...</div>
485
+ </div>
486
+
487
+ <div class="app">
488
+ <header>
489
+ <div class="logo-block">
490
+ <div class="logo-icon">G4</div>
491
+ <div class="logo-text">
492
+ <h1>Granite 4.0 · ONNX</h1>
493
+ <p>granite-4.0-h-350m · CPU Inference Server</p>
494
+ </div>
495
+ </div>
496
+ <div id="status-badge">
497
+ <div id="status-dot"></div>
498
+ <span id="status-text">Initializing...</span>
499
+ </div>
500
+ </header>
501
+
502
+ <main>
503
+ <!-- ── Chat ── -->
504
+ <div class="chat-panel">
505
+ <div class="messages-container" id="messages">
506
+ <div class="welcome-msg">
507
+ <h2>Ready to chat</h2>
508
+ <p>IBM Granite 4.0 Hybrid · 350M params<br/>Running on ONNX Runtime · CPU</p>
509
+ </div>
510
+ </div>
511
+
512
+ <div>
513
+ <div class="input-area">
514
+ <textarea
515
+ id="user-input"
516
+ placeholder="Send a message... (Shift+Enter for newline)"
517
+ rows="1"
518
+ ></textarea>
519
+ <button class="send-btn" id="send-btn" title="Send">
520
+ <svg fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
521
+ <path stroke-linecap="round" stroke-linejoin="round" d="M5 12h14M12 5l7 7-7 7"/>
522
+ </svg>
523
+ </button>
524
+ </div>
525
+ <div class="input-hint">Enter to send · Shift+Enter for newline · streaming enabled</div>
526
+ </div>
527
+ </div>
528
+
529
+ <!-- ── Sidebar ── -->
530
+ <div class="sidebar">
531
+
532
+ <!-- TPS card -->
533
+ <div class="card">
534
+ <div class="card-title">Live Performance</div>
535
+ <div class="tps-display">
536
+ <div class="tps-number" id="tps-big">—</div>
537
+ <div class="tps-label">tokens / second</div>
538
+ </div>
539
+ <div class="sparkline-wrap">
540
+ <canvas id="sparkline"></canvas>
541
+ </div>
542
+ </div>
543
+
544
+ <!-- Server metrics -->
545
+ <div class="card">
546
+ <div class="card-title">Server Metrics</div>
547
+ <div class="metric-row">
548
+ <span class="metric-label">Uptime</span>
549
+ <span class="metric-value" id="m-uptime">—</span>
550
+ </div>
551
+ <div class="metric-row">
552
+ <span class="metric-label">Total Requests</span>
553
+ <span class="metric-value accent" id="m-requests">0</span>
554
+ </div>
555
+ <div class="metric-row">
556
+ <span class="metric-label">Active</span>
557
+ <span class="metric-value green" id="m-active">0</span>
558
+ </div>
559
+ <div class="metric-row">
560
+ <span class="metric-label">Tokens Generated</span>
561
+ <span class="metric-value" id="m-tokens">0</span>
562
+ </div>
563
+ <div class="metric-row">
564
+ <span class="metric-label">Avg Latency</span>
565
+ <span class="metric-value warn" id="m-latency">—</span>
566
+ </div>
567
+ <div class="metric-row">
568
+ <span class="metric-label">Errors</span>
569
+ <span class="metric-value" id="m-errors">0</span>
570
+ </div>
571
+ </div>
572
+
573
+ <!-- Settings -->
574
+ <div class="card">
575
+ <div class="card-title">Generation Settings</div>
576
+ <div class="slider-row">
577
+ <div class="slider-label">
578
+ <span>Max Tokens</span>
579
+ <span id="val-max-tokens">256</span>
580
+ </div>
581
+ <input type="range" id="max-tokens" min="64" max="1024" step="64" value="256" />
582
+ </div>
583
+ <div class="slider-row">
584
+ <div class="slider-label">
585
+ <span>Temperature</span>
586
+ <span id="val-temp">0.7</span>
587
+ </div>
588
+ <input type="range" id="temperature" min="0.1" max="2.0" step="0.1" value="0.7" />
589
+ </div>
590
+ </div>
591
+
592
+ <!-- Model info -->
593
+ <div class="card">
594
+ <div class="card-title">Model Info</div>
595
+ <div class="metric-row">
596
+ <span class="metric-label">Format</span>
597
+ <span class="metric-value green">ONNX Q4</span>
598
+ </div>
599
+ <div class="metric-row">
600
+ <span class="metric-label">Params</span>
601
+ <span class="metric-value">350M</span>
602
+ </div>
603
+ <div class="metric-row">
604
+ <span class="metric-label">Architecture</span>
605
+ <span class="metric-value">Hybrid MoE</span>
606
+ </div>
607
+ <div class="metric-row">
608
+ <span class="metric-label">Device</span>
609
+ <span class="metric-value accent">CPU</span>
610
+ </div>
611
+ <div style="margin-top:10px">
612
+ <div class="model-tag">onnx-community/granite-4.0-h-350m-ONNX</div>
613
+ </div>
614
+ </div>
615
+
616
+ </div>
617
+ </main>
618
+ </div>
619
+
620
+ <script>
621
+ // ── State ─────────────────────────────────────────────────────────────────
622
+ const conversationHistory = [];
623
+ let isGenerating = false;
624
+ const tpsHistory = [];
625
+
626
+ // ── DOM refs ──────────────────────────────────────────────────────────────
627
+ const messagesEl = document.getElementById('messages');
628
+ const inputEl = document.getElementById('user-input');
629
+ const sendBtn = document.getElementById('send-btn');
630
+ const loadingOverlay = document.getElementById('loading-overlay');
631
+ const loadingMsg = document.getElementById('loading-msg');
632
+ const statusBadge = document.getElementById('status-badge');
633
+ const statusText = document.getElementById('status-text');
634
+ const statusDot = document.getElementById('status-dot');
635
+ const tpsBig = document.getElementById('tps-big');
636
+
637
+ // ── Metrics polling ───────────────────────────────────────────────────────
638
// Fetch /metrics and mirror the payload into the status badge, the metric
// cards, the big TPS readout and the sparkline. Called on an interval.
async function pollMetrics() {
  try {
    const res = await fetch('/metrics');
    const stats = await res.json();

    // Status badge: ready > loading > error.
    if (stats.model_loaded) {
      statusBadge.className = 'ready';
      statusText.textContent = 'Model Ready';
      loadingOverlay.classList.add('hidden');
    } else if (stats.model_loading) {
      statusText.textContent = 'Loading model...';
      loadingMsg.textContent = `Downloading & loading ONNX model — uptime ${formatUptime(stats.uptime_seconds)}`;
    } else {
      statusBadge.className = 'error';
      statusText.textContent = 'Error';
    }

    // Metric cards.
    const set = (id, value) => { document.getElementById(id).textContent = value; };
    set('m-uptime', formatUptime(stats.uptime_seconds));
    set('m-requests', stats.total_requests.toLocaleString());
    set('m-active', stats.active_requests);
    set('m-tokens', stats.total_tokens_generated.toLocaleString());
    set('m-latency', stats.average_latency_ms > 0 ? `${stats.average_latency_ms.toFixed(0)}ms` : '—');
    set('m-errors', stats.errors);

    // Headline tokens/second.
    const tps = stats.last_tokens_per_second;
    tpsBig.textContent = tps > 0 ? tps.toFixed(1) : '—';

    if (stats.tps_history && stats.tps_history.length > 0) {
      drawSparkline(stats.tps_history);
    }
  } catch (e) {
    // Server not reachable yet (still booting) — keep polling silently.
  }
}
677
+
678
// Format a second count as a compact human string: "2h 5m", "3m 12s", "47s".
function formatUptime(totalSeconds) {
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = Math.floor(totalSeconds % 60);

  if (hours > 0) return `${hours}h ${minutes}m`;
  if (minutes > 0) return `${minutes}m ${seconds}s`;
  return `${seconds}s`;
}
686
+
687
+ // ── Sparkline ─────────────────────────────────────────────────────────────
688
// Render the TPS history as a gradient-filled line chart inside #sparkline,
// scaled for high-DPI displays.
function drawSparkline(points) {
  const canvas = document.getElementById('sparkline');
  const ctx = canvas.getContext('2d');
  const dpr = window.devicePixelRatio || 1;
  const width = canvas.offsetWidth;
  const height = canvas.offsetHeight;

  // Resizing the backing store clears the canvas and resets the transform,
  // so the dpr scale below does not accumulate across redraws.
  canvas.width = width * dpr;
  canvas.height = height * dpr;
  ctx.scale(dpr, dpr);
  ctx.clearRect(0, 0, width, height);

  if (points.length < 2) return;

  const peak = Math.max(...points, 1);          // guard: never divide by 0
  const stepX = width / (points.length - 1);
  const yFor = (v) => height - (v / peak) * height * 0.9 - 2;

  // Filled area under the curve.
  const grad = ctx.createLinearGradient(0, 0, 0, height);
  grad.addColorStop(0, 'rgba(0,212,255,0.3)');
  grad.addColorStop(1, 'rgba(0,212,255,0)');

  ctx.beginPath();
  points.forEach((v, i) => {
    if (i === 0) {
      ctx.moveTo(0, yFor(v));
    } else {
      ctx.lineTo(i * stepX, yFor(v));
    }
  });
  ctx.lineTo(width, height);
  ctx.lineTo(0, height);
  ctx.closePath();
  ctx.fillStyle = grad;
  ctx.fill();

  // The line itself.
  ctx.beginPath();
  points.forEach((v, i) => {
    if (i === 0) {
      ctx.moveTo(0, yFor(v));
    } else {
      ctx.lineTo(i * stepX, yFor(v));
    }
  });
  ctx.strokeStyle = '#00d4ff';
  ctx.lineWidth = 2;
  ctx.stroke();
}
732
+
733
+ // ── Auto-resize textarea ──────────────────────────────────────────────────
734
// Grow the textarea with its content, capped at 120px.
inputEl.addEventListener('input', () => {
  inputEl.style.height = 'auto';
  inputEl.style.height = `${Math.min(inputEl.scrollHeight, 120)}px`;
});

// ── Settings sliders ──────────────────────────────────────────────────────
document.getElementById('max-tokens').addEventListener('input', (e) => {
  document.getElementById('val-max-tokens').textContent = e.target.value;
});
document.getElementById('temperature').addEventListener('input', (e) => {
  document.getElementById('val-temp').textContent = parseFloat(e.target.value).toFixed(1);
});

// ── Keyboard handler ──────────────────────────────────────────────────────
// Enter sends; Shift+Enter inserts a newline. Sends are suppressed while
// a generation is already in flight.
inputEl.addEventListener('keydown', (e) => {
  if (e.key === 'Enter' && !e.shiftKey) {
    e.preventDefault();
    if (!isGenerating) sendMessage();
  }
});
sendBtn.addEventListener('click', () => {
  if (!isGenerating) sendMessage();
});
755
+
756
+ // ── Chat functions ────────────────────────────────────────────────────────
757
// Append a chat bubble for `role` ('user' | 'assistant') containing
// `content`. `meta`, when given, is an HTML string rendered as a footer row.
// Returns { div, textNode } so streaming code can keep writing into it.
function appendMessage(role, content, meta) {
  // The placeholder welcome card disappears once real chat begins.
  const welcome = messagesEl.querySelector('.welcome-msg');
  if (welcome) welcome.remove();

  const avatar = document.createElement('div');
  avatar.className = 'avatar';
  avatar.textContent = role === 'user' ? 'U' : 'G4';

  const textNode = document.createElement('div');
  textNode.className = 'bubble-text';
  textNode.textContent = content;

  const bubble = document.createElement('div');
  bubble.className = 'bubble';
  bubble.appendChild(textNode);

  if (meta) {
    const metaDiv = document.createElement('div');
    metaDiv.className = 'bubble-meta';
    metaDiv.innerHTML = meta;
    bubble.appendChild(metaDiv);
  }

  const div = document.createElement('div');
  div.className = `message ${role}`;
  div.appendChild(avatar);
  div.appendChild(bubble);
  messagesEl.appendChild(div);
  messagesEl.scrollTop = messagesEl.scrollHeight;

  return { div, textNode };
}
791
+
792
// Show the three-dot "typing" bubble; returns the element so the caller can
// remove it once the first streamed token arrives.
function appendTyping() {
  const welcome = messagesEl.querySelector('.welcome-msg');
  if (welcome) welcome.remove();

  const avatar = document.createElement('div');
  avatar.className = 'avatar';
  avatar.textContent = 'G4';

  const bubble = document.createElement('div');
  bubble.className = 'bubble';
  bubble.innerHTML = `<div class="typing-indicator"><span></span><span></span><span></span></div>`;

  const div = document.createElement('div');
  div.className = 'message assistant';
  div.id = 'typing-msg';
  div.appendChild(avatar);
  div.appendChild(bubble);
  messagesEl.appendChild(div);
  messagesEl.scrollTop = messagesEl.scrollHeight;
  return div;
}
814
+
815
// Send the current textarea content to /chat/stream and stream the reply
// token-by-token into a new assistant bubble, then record perf metadata.
//
// Fixes over the original:
//  - response.ok is checked before reading the body;
//  - TextDecoder runs with {stream: true} so multi-byte UTF-8 characters
//    split across network chunks are not corrupted;
//  - an incomplete SSE line at a chunk boundary is buffered instead of
//    being parsed as if it were complete;
//  - "[DONE]" terminates the whole read loop, not just the inner for;
//  - generation state resets in `finally` so the UI cannot get stuck.
async function sendMessage() {
  const text = inputEl.value.trim();
  if (!text) return;

  const maxTokens = parseInt(document.getElementById('max-tokens').value);
  const temperature = parseFloat(document.getElementById('temperature').value);

  inputEl.value = '';
  inputEl.style.height = 'auto';
  isGenerating = true;
  sendBtn.disabled = true;

  appendMessage('user', text);
  conversationHistory.push({ role: 'user', content: text });

  const typingEl = appendTyping();
  const t0 = performance.now();

  try {
    const response = await fetch('/chat/stream', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        messages: conversationHistory,
        max_new_tokens: maxTokens,
        temperature,
        stream: true
      })
    });
    if (!response.ok) throw new Error(`HTTP ${response.status}`);

    typingEl.remove();
    const { div: msgDiv, textNode } = appendMessage('assistant', '');

    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffered = '';     // possibly-incomplete trailing SSE line
    let fullText = '';
    let tokenCount = 0;
    let finished = false;

    while (!finished) {
      const { done, value } = await reader.read();
      if (done) break;
      buffered += decoder.decode(value, { stream: true });
      const lines = buffered.split('\n');
      buffered = lines.pop();   // keep the unterminated tail for next chunk
      for (const line of lines) {
        if (!line.startsWith('data: ')) continue;
        const data = line.slice(6);
        if (data === '[DONE]') { finished = true; break; }
        fullText += data;
        tokenCount++;
        textNode.textContent = fullText;
        messagesEl.scrollTop = messagesEl.scrollHeight;
      }
    }

    const elapsed = (performance.now() - t0) / 1000;
    const tps = (tokenCount / elapsed).toFixed(1);
    tpsBig.textContent = tps;

    conversationHistory.push({ role: 'assistant', content: fullText });

    // Perf footer on the finished bubble.
    const metaDiv = document.createElement('div');
    metaDiv.className = 'bubble-meta';
    metaDiv.innerHTML = `
      <span>⚡ ${tps} t/s</span>
      <span>📝 ${tokenCount} tokens</span>
      <span>⏱ ${elapsed.toFixed(1)}s</span>
    `;
    msgDiv.querySelector('.bubble').appendChild(metaDiv);

  } catch (err) {
    // remove() on an already-detached node is a harmless no-op.
    typingEl.remove();
    appendMessage('assistant', `Error: ${err.message}`);
  } finally {
    isGenerating = false;
    sendBtn.disabled = false;
    inputEl.focus();
  }
}
895
+
896
+ // ── Boot ──────────────────────────────────────────────────────────────────
897
+ setInterval(pollMetrics, 2000);
898
+ pollMetrics();
899
+ </script>
900
+ </body>
901
+ </html>
requirements (2).txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Core inference ────────────────────────────────────────────
2
+ onnxruntime==1.20.1
3
+ numpy==1.26.4
4
+ transformers==4.47.0
5
+ huggingface_hub==0.26.5
6
+
7
+ # ── Web server ────────────────────────────────────────────────
8
+ fastapi==0.115.5
9
+ uvicorn[standard]==0.32.1
10
+ pydantic==2.10.1
11
+
12
+ # ── Utilities ─────────────────────────────────────────────────
13
+ accelerate==1.2.1
14
+ sentencepiece==0.2.0
15
+ protobuf==5.29.0
server.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ╔══════════════════════════════════════════════════════════════╗
3
+ ║ Granite 4.0 ONNX Inference Server ║
4
+ ║ Model: onnx-community/granite-4.0-h-350m-ONNX ║
5
+ ╚══════════════════════════════════════════════════════════════╝
6
+ """
7
+
8
import asyncio
import threading
import time
import uuid
from collections import deque
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Generator, List, Optional, Tuple

import numpy as np
import onnxruntime
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from huggingface_hub import snapshot_download
from pydantic import BaseModel
from transformers import AutoConfig, AutoTokenizer
25
+
26
# ── Global model state ────────────────────────────────────────────────────────
# Hugging Face repo id of the ONNX export served by this process.
MODEL_ID = "onnx-community/granite-4.0-h-350m-ONNX"
# Basename (without ".onnx") of the decoder graph inside the repo's onnx/ dir.
MODEL_FILENAME = "model_q4"  # use quantized for speed

# Populated once by load_model() at startup; None until loading completes.
decoder_session = None  # onnxruntime.InferenceSession for the decoder graph
tokenizer = None        # transformers tokenizer for MODEL_ID
config = None           # transformers config (layer_types, dims, eos id, ...)
33
+
34
# ── Metrics state ─────────────────────────────────────────────────────────────
# Mutable counters shared between request handlers and the loader thread.
# All reads/writes are expected to happen under `metrics_lock` below.
metrics = {
    "total_requests": 0,            # requests accepted by /chat and /chat/stream
    "active_requests": 0,           # requests currently in flight
    "total_tokens_generated": 0,    # completion tokens across all requests
    "total_prompt_tokens": 0,       # prompt tokens across all /chat requests
    "request_latencies": deque(maxlen=100),         # last 100 /chat latencies (ms)
    "tokens_per_second_history": deque(maxlen=50),  # last 50 per-request tps values
    "errors": 0,                    # load failures + request failures
    "start_time": time.time(),      # process start time, for uptime reporting
    "last_tps": 0.0,                # throughput of the most recent /chat request
    "model_loaded": False,          # set True once load_model() succeeds
    "model_loading": True,          # True while the startup load is in progress
}
# Serializes access to `metrics` (handlers run on multiple executor threads).
metrics_lock = threading.Lock()
49
+
50
+
51
# ── Pydantic models ───────────────────────────────────────────────────────────
class Message(BaseModel):
    """A single chat turn (OpenAI-style role/content pair)."""

    role: str     # e.g. "system", "user", or "assistant"
    content: str  # the message text
55
+
56
+
57
class ChatRequest(BaseModel):
    """Request body for /chat and /chat/stream."""

    messages: List[Message]    # full conversation history, oldest first
    max_new_tokens: int = 512  # upper bound on generated tokens
    # NOTE(review): temperature is accepted but never forwarded — decoding in
    # generate_tokens() is greedy argmax; confirm whether sampling is intended.
    temperature: float = 1.0
    # NOTE(review): stream is unused; clients choose behavior by endpoint.
    stream: bool = False
62
+
63
+
64
class ChatResponse(BaseModel):
    """Non-streaming /chat response with token accounting and timing."""

    id: str                   # short per-request id (first 8 chars of a uuid4)
    content: str              # full generated assistant text
    prompt_tokens: int        # tokens in the rendered prompt
    completion_tokens: int    # tokens generated
    total_tokens: int         # prompt_tokens + completion_tokens
    latency_ms: float         # end-to-end request latency in milliseconds
    tokens_per_second: float  # decode throughput for this request
72
+
73
+
74
# ── Model loader ──────────────────────────────────────────────────────────────
def load_model():
    """Download the ONNX snapshot and initialize the global decoder session,
    tokenizer, and config, then flip the metrics flags.

    Runs once at startup (off the event loop, via an executor). On failure the
    error counter is bumped and the exception is re-raised.
    """
    global decoder_session, tokenizer, config
    import os

    print(f"[INFO] Downloading model {MODEL_ID}...")

    try:
        # Skip weight formats we never load, including the other ONNX variants.
        skip_patterns = [
            "*.msgpack", "*.h5", "flax_model*",
            "model.onnx", "model_fp16.onnx", "model_q4f16.onnx",
        ]
        snapshot_dir = snapshot_download(MODEL_ID, ignore_patterns=skip_patterns)
        model_path = os.path.join(snapshot_dir, "onnx", f"{MODEL_FILENAME}.onnx")

        print(f"[INFO] Loading ONNX session from {model_path}...")
        opts = onnxruntime.SessionOptions()
        opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        opts.intra_op_num_threads = 4
        decoder_session = onnxruntime.InferenceSession(
            model_path,
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )

        print("[INFO] Loading tokenizer and config...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        config = AutoConfig.from_pretrained(MODEL_ID)

        with metrics_lock:
            metrics["model_loaded"] = True
            metrics["model_loading"] = False

        print("[INFO] ✅ Model loaded successfully!")

    except Exception as e:
        with metrics_lock:
            metrics["model_loading"] = False
            metrics["errors"] += 1
        print(f"[ERROR] Failed to load model: {e}")
        raise
115
+
116
+
117
# ── Cache initializer ─────────────────────────────────────────────────────────
def init_cache(batch_size: int, dtype=np.float32):
    """Build the empty per-layer state feeds for the first decode step.

    Attention layers receive zero-length key/value tensors; Mamba layers
    receive zeroed conv and SSM state tensors. All shapes come from the
    module-global model `config`.
    """
    attn_head_dim = config.hidden_size // config.num_attention_heads
    state_dim = config.mamba_d_state
    groups = config.mamba_n_groups
    # Channel count expected by the exported graph for the conv state.
    conv_channels = (config.mamba_expand * config.hidden_size) + (2 * groups * state_dim)

    feeds = {}
    for layer_idx, kind in enumerate(config.layer_types):
        if kind == "attention":
            kv_shape = [batch_size, config.num_key_value_heads, 0, attn_head_dim]
            feeds[f"past_key_values.{layer_idx}.key"] = np.zeros(kv_shape, dtype=dtype)
            feeds[f"past_key_values.{layer_idx}.value"] = np.zeros(kv_shape, dtype=dtype)
        elif kind == "mamba":
            feeds[f"past_conv.{layer_idx}"] = np.zeros(
                [batch_size, conv_channels, config.mamba_d_conv], dtype=dtype
            )
            feeds[f"past_ssm.{layer_idx}"] = np.zeros(
                [batch_size, config.mamba_n_heads, config.mamba_d_head, state_dim],
                dtype=dtype,
            )
    return feeds
141
+
142
+
143
# ── Core generation ───────────────────────────────────────────────────────────
def generate_tokens(
    input_ids: np.ndarray,
    attention_mask: np.ndarray,
    max_new_tokens: int = 512,
) -> Generator[Tuple[str, bool, float], None, None]:
    """Greedily decode up to ``max_new_tokens`` tokens, one ONNX step at a time.

    This is a *synchronous* generator — the previous ``AsyncGenerator``
    annotation was wrong, and the docstring understated the yield: each
    iteration yields ``(token_str, is_done, tokens_per_second)``, where
    ``is_done`` is True when the EOS token was produced. Decoding is pure
    argmax; no temperature/sampling is applied here.

    Args:
        input_ids: int64 token ids, shape [1, prompt_len].
        attention_mask: int64 mask, shape [1, prompt_len]; extended by one
            column per generated token.
        max_new_tokens: hard cap on the number of generated tokens.
    """
    dtype = np.float32
    cache = init_cache(batch_size=1, dtype=dtype)
    output_names = [o.name for o in decoder_session.get_outputs()]
    # config.eos_token_id may be a single id or a list; use the first entry.
    eos_token_id = config.eos_token_id if not isinstance(
        config.eos_token_id, list) else config.eos_token_id[0]

    t_start = time.perf_counter()

    for step in range(max_new_tokens):
        feed_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
        outputs = decoder_session.run(None, feed_dict | cache)
        named_outputs = dict(zip(output_names, outputs))

        # Greedy pick from the last position's logits.
        next_token = outputs[0][:, -1].argmax(-1, keepdims=True)
        attention_mask = np.concatenate(
            [attention_mask, np.ones_like(next_token, dtype=np.int64)], axis=-1
        )
        input_ids = next_token

        # Feed this step's "present" states back in as the next step's "past".
        for name in cache:
            new_name = name.replace("past_key_values", "present").replace("past_", "present_")
            cache[name] = named_outputs[new_name]

        token_id = int(next_token[0, 0])
        token_str = tokenizer.decode([token_id], skip_special_tokens=True)
        elapsed = time.perf_counter() - t_start
        tps = (step + 1) / elapsed if elapsed > 0 else 0

        is_done = token_id == eos_token_id
        yield token_str, is_done, tps

        if is_done:
            break
185
+
186
+
187
# ── Lifespan ──────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the model in a worker thread before serving; no teardown needed.

    Uses asyncio.get_running_loop() — get_event_loop() is deprecated inside a
    coroutine — so the blocking download/load does not stall the event loop.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, load_model)
    yield
193
+
194
+
195
# ── FastAPI app ───────────────────────────────────────────────────────────────
app = FastAPI(
    title="Granite 4.0 ONNX Server",
    description="High-performance inference server for granite-4.0-h-350m-ONNX",
    version="1.0.0",
    lifespan=lifespan,  # blocks serving until the model is loaded at startup
)

# Wide-open CORS so the bundled UI and any external client can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
209
+
210
+
211
# ── API Routes ────────────────────────────────────────────────────────────────
@app.get("/health")
def health():
    """Liveness/readiness probe: load state, model id, and uptime."""
    with metrics_lock:
        ready = metrics["model_loaded"]
        uptime = round(time.time() - metrics["start_time"], 1)
    return {
        "status": "ready" if ready else "loading",
        "model": MODEL_ID,
        "uptime_seconds": uptime,
    }
220
+
221
+
222
@app.get("/metrics")
def get_metrics():
    """Snapshot of the server counters, polled by the UI dashboard."""
    with metrics_lock:
        uptime = time.time() - metrics["start_time"]
        latencies = metrics["request_latencies"]
        avg_latency = sum(latencies) / len(latencies) if latencies else 0
        return {
            "uptime_seconds": round(uptime, 1),
            "total_requests": metrics["total_requests"],
            "active_requests": metrics["active_requests"],
            "total_tokens_generated": metrics["total_tokens_generated"],
            "total_prompt_tokens": metrics["total_prompt_tokens"],
            "average_latency_ms": round(avg_latency, 2),
            "last_tokens_per_second": round(metrics["last_tps"], 2),
            "tps_history": list(metrics["tokens_per_second_history"]),
            "errors": metrics["errors"],
            "model_loaded": metrics["model_loaded"],
            "model_loading": metrics["model_loading"],
            "requests_per_minute": round(metrics["total_requests"] / max(uptime / 60, 1), 2),
        }
244
+
245
+
246
@app.post("/chat", response_model=ChatResponse)
async def chat(req: ChatRequest):
    """Non-streaming chat completion.

    Renders the chat template, runs blocking greedy decoding in a worker
    thread, and returns the full text with token/latency stats. Raises 503
    while the model is still loading and 500 on generation failure.

    NOTE(review): req.temperature is accepted but not forwarded —
    generate_tokens() decodes greedily; confirm whether sampling is intended.
    """
    # Read the flag under the same lock that all writers use.
    with metrics_lock:
        loaded = metrics["model_loaded"]
    if not loaded:
        raise HTTPException(status_code=503, detail="Model still loading, please wait...")

    with metrics_lock:
        metrics["total_requests"] += 1
        metrics["active_requests"] += 1

    t0 = time.perf_counter()
    request_id = str(uuid.uuid4())[:8]

    try:
        messages = [{"role": m.role, "content": m.content} for m in req.messages]
        # get_running_loop(), not the deprecated get_event_loop(): we are
        # inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()

        # Tokenization can be slow for long histories; keep it off the loop.
        inputs = await loop.run_in_executor(
            None,
            lambda: tokenizer.apply_chat_template(
                messages, add_generation_prompt=True,
                tokenize=True, return_dict=True, return_tensors="np"
            )
        )

        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        prompt_tokens = int(input_ids.shape[1])

        full_text = ""
        final_tps = 0.0
        completion_tokens = 0

        def run_generation():
            # Drain the blocking token generator in a worker thread.
            nonlocal full_text, final_tps, completion_tokens
            for token_str, is_done, tps in generate_tokens(
                input_ids, attention_mask, req.max_new_tokens
            ):
                full_text += token_str
                completion_tokens += 1
                final_tps = tps
                if is_done:
                    break

        await loop.run_in_executor(None, run_generation)

        latency_ms = (time.perf_counter() - t0) * 1000

        with metrics_lock:
            metrics["active_requests"] -= 1
            metrics["total_tokens_generated"] += completion_tokens
            metrics["total_prompt_tokens"] += prompt_tokens
            metrics["request_latencies"].append(latency_ms)
            metrics["tokens_per_second_history"].append(round(final_tps, 2))
            metrics["last_tps"] = final_tps

        return ChatResponse(
            id=request_id,
            content=full_text,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
            latency_ms=round(latency_ms, 2),
            tokens_per_second=round(final_tps, 2),
        )

    except Exception as e:
        with metrics_lock:
            metrics["active_requests"] -= 1
            metrics["errors"] += 1
        # Chain the cause so the original traceback survives in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
316
+
317
+
318
@app.post("/chat/stream")
async def chat_stream(req: ChatRequest):
    """Streaming chat completion over Server-Sent Events.

    Each generated token is emitted as a ``data: <token>`` SSE frame, with a
    final ``data: [DONE]`` sentinel. Each decode step runs in a worker thread
    so the event loop stays responsive between frames.

    NOTE(review): tokens are interpolated into the frame unescaped — a token
    containing a newline would break the ``data:`` framing that the bundled
    UI parses line-by-line; confirm whether this tokenizer can emit such
    tokens before relying on the stream format.
    """
    # NOTE(review): this flag is read without metrics_lock, unlike every
    # writer; benign on CPython but inconsistent with the rest of the file.
    if not metrics["model_loaded"]:
        raise HTTPException(status_code=503, detail="Model still loading...")

    with metrics_lock:
        metrics["total_requests"] += 1
        metrics["active_requests"] += 1

    # Render the prompt with the model's chat template (numpy tensors for ORT).
    messages = [{"role": m.role, "content": m.content} for m in req.messages]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True,
        tokenize=True, return_dict=True, return_tensors="np"
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    async def event_stream():
        completion_tokens = 0
        try:
            loop = asyncio.get_event_loop()
            gen = generate_tokens(input_ids, attention_mask, req.max_new_tokens)

            def next_token():
                # next() with a default so exhaustion returns None instead of
                # raising StopIteration across the executor boundary.
                return next(gen, None)

            while True:
                # One blocking decode step per executor hop.
                result = await loop.run_in_executor(None, next_token)
                if result is None:
                    break
                token_str, is_done, tps = result
                completion_tokens += 1
                yield f"data: {token_str}\n\n"
                if is_done:
                    break

            yield f"data: [DONE]\n\n"
        finally:
            # Runs even if the client disconnects mid-stream.
            with metrics_lock:
                metrics["active_requests"] -= 1
                metrics["total_tokens_generated"] += completion_tokens

    return StreamingResponse(event_stream(), media_type="text/event-stream")
362
+
363
+
364
@app.get("/", response_class=HTMLResponse)
async def ui():
    """Serve the bundled single-page chat UI (path fixed by the image layout)."""
    with open("/app/static/index.html") as page:
        html = page.read()
    return html