Spaces:

enCoder
/

tiny-vllm

Running

App Files Files Community

enCoder committed 11 days ago

Commit

8fa0f9d

1 Parent(s): 4fa42d8

Wire HF Spaces deployment

Browse files

Files changed (9) hide show

.dockerignore +25 -0
.github/workflows/sync-huggingface.yml +53 -0
.gitignore +4 -1
Dockerfile +59 -0
README.md +59 -0
tiny_vllm/server.py +35 -4
web/app.js +15 -0
web/index.html +3 -1
web/style.css +20 -2

.dockerignore ADDED Viewed

	@@ -0,0 +1,25 @@

+# Paths excluded from the Docker build context.
+# Python bytecode and packaging artifacts.
+__pycache__/
+*.py[cod]
+*.egg-info/
+# Local virtual environments.
+.venv/
+venv/
+# VCS and CI metadata.
+.git/
+.github/
+# Tool caches.
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+# Directories the Dockerfile never copies into the image.
+tests/
+examples/
+scripts/
+# OS clutter and logs.
+.DS_Store
+*.log
+# Local caches (hf_cache/ is presumably a local model download cache — confirm).
+.cache/
+hf_cache/
+# Editor / assistant tooling directories.
+.vscode/
+.idea/
+.claude/
+.cursor/
+# Environment files and secrets must never enter the build context.
+.env
+.env.*
+secrets/

.github/workflows/sync-huggingface.yml ADDED Viewed

	@@ -0,0 +1,53 @@

+name: Sync to Hugging Face Space
+# Mirrors the repo to your Hugging Face Space on every push to main.
+# HF Spaces then rebuilds the Docker image and redeploys.
+#
+# One-time setup (see README "Hugging Face Space — live demo"):
+#   1. Create the Space at https://huggingface.co/new-space (SDK: Docker).
+#   2. Generate a write-access token at https://huggingface.co/settings/tokens.
+#   3. In the GitHub repo: Settings → Secrets and variables → Actions → add:
+#        HF_TOKEN          ← the token from step 2
+#        HF_USERNAME       ← your HF username (e.g. surajsharan)
+#        HF_SPACE_NAME     ← the Space name (e.g. tiny-vllm)
+#   4. Push to main, or trigger this workflow manually from the Actions tab.
+on:
+  push:
+    branches: [main]
+  workflow_dispatch:
+# Least privilege: the job only reads this repository; the push to the
+# Space authenticates with HF_TOKEN, not with the GitHub token.
+permissions:
+  contents: read
+# Serialize syncs so two pushes never race each other; a queued run is
+# allowed to finish rather than being cancelled.
+concurrency:
+  group: huggingface-sync
+  cancel-in-progress: false
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      # Full history (fetch-depth: 0) is needed so the push to the Space
+      # carries the complete branch; LFS objects are fetched as well.
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Push to HF Space
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_USERNAME: ${{ secrets.HF_USERNAME }}
+          HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
+        run: |
+          set -e
+          # Unconfigured forks should not fail CI: skip with a notice instead.
+          if [ -z "$HF_TOKEN" ] || [ -z "$HF_USERNAME" ] || [ -z "$HF_SPACE_NAME" ]; then
+            echo "::notice::HF secrets not configured (HF_TOKEN/HF_USERNAME/HF_SPACE_NAME)."
+            echo "::notice::Skipping HF Space sync. See README for setup."
+            exit 0
+          fi
+          git config user.email "actions@github.com"
+          git config user.name  "github-actions[bot]"
+          REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_SPACE_NAME}"
+          git remote add huggingface "$REMOTE"
+          # Force-push: the HF Space repo is a mirror of this branch.
+          # If you also commit on HF (e.g., README edits in the Space UI),
+          # those would be overwritten — keep edits in this repo.
+          git push --force huggingface HEAD:main
+          echo "::notice::Synced to https://huggingface.co/spaces/${HF_USERNAME}/${HF_SPACE_NAME}"

.gitignore CHANGED Viewed

@@ -16,4 +16,7 @@ hf_cache/
 .vscode/
 .idea/
 .claude/
-.cursor/

 .vscode/
 .idea/
 .claude/
+.cursor/
+.env
+.env.*
+secrets/

Dockerfile ADDED Viewed

	@@ -0,0 +1,59 @@

+# Dockerfile for the Hugging Face Spaces deployment.
+#
+# This image is small enough to fit comfortably in HF's free CPU tier
+# (16 GB RAM, 2 vCPU): CPU-only torch + pre-downloaded Qwen2.5-0.5B.
+#
+# HF Spaces convention: listen on port 7860, bound to 0.0.0.0.
+# HF Spaces runs the container as UID 1000, so this image creates that user
+# and pre-fetches the model as that user; a root-owned cache would not be
+# writable at runtime.
+FROM python:3.11-slim
+# Runtime configuration.  The TINY_VLLM_* values are also read by the
+# model pre-fetch step below, so build and runtime agree on the model.
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    HF_HOME=/tmp/.cache/huggingface \
+    TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers \
+    TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \
+    TINY_VLLM_DEVICE=cpu \
+    TINY_VLLM_DTYPE=float32
+WORKDIR /app
+# Minimal system deps: curl for healthcheck, ca-certs for HTTPS.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+      curl ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+# Install CPU-only PyTorch first (much smaller than the default GPU build).
+RUN pip install --upgrade pip \
+ && pip install torch --index-url https://download.pytorch.org/whl/cpu
+# Install the rest of our deps (skip torch — already done).
+COPY requirements.txt .
+RUN grep -v '^torch' requirements.txt > requirements.no-torch.txt \
+ && pip install -r requirements.no-torch.txt
+# Unprivileged runtime user.  Created after the root-only install steps so
+# packages land in the system site-packages; everything below runs as UID 1000.
+RUN useradd --create-home --uid 1000 user
+USER user
+# Pre-download the model so cold-start latency is just engine warmup.
+# Failing this step at build time is better than failing on first request.
+# Runs as `user`, so the cache under HF_HOME is owned by the runtime UID.
+RUN python -c "import os; m=os.environ['TINY_VLLM_MODEL']; \
+    from transformers import AutoTokenizer, AutoModelForCausalLM; \
+    AutoTokenizer.from_pretrained(m); \
+    AutoModelForCausalLM.from_pretrained(m); \
+    print(f'pre-fetched {m}')"
+# Now copy the source (placed AFTER the heavy deps so layer cache helps reruns).
+COPY --chown=user:user tiny_vllm/ ./tiny_vllm/
+COPY --chown=user:user web/ ./web/
+COPY --chown=user:user README.md LICENSE pyproject.toml ./
+EXPOSE 7860
+# Long start period: the engine has to load the model before it can serve.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=120s \
+  CMD curl -fsS http://localhost:7860/health || exit 1
+# Conservative resource settings — HF free CPU is small.
+CMD ["python", "-m", "tiny_vllm.server", \
+     "--host", "0.0.0.0", "--port", "7860", \
+     "--block-size", "16", "--num-blocks", "128", \
+     "--max-num-seqs", "4", "--max-num-batched-tokens", "256", \
+     "--max-model-len", "1024"]

README.md CHANGED Viewed

@@ -1,3 +1,14 @@
 # tiny_vllm
 A **minimal continuous-batching LLM engine** built to be read end-to-end.  It
@@ -76,6 +87,54 @@ pip install pytest
 python -m pytest tests/
 ```
 ## GitHub Pages demo (replay mode)
 The visualization can run as a **static page** on GitHub Pages with no

+---
+title: tiny_vllm
+emoji: 🪶
+colorFrom: gray
+colorTo: green
+sdk: docker
+app_port: 7860
+pinned: false
+short_description: Minimal continuous-batching LLM engine — paged KV, prefix caching, SSE
+---
 # tiny_vllm
 A **minimal continuous-batching LLM engine** built to be read end-to-end.  It
 python -m pytest tests/
 ```
+## Hugging Face Space — live demo
+For a *live* (not recorded) demo you can talk to from any browser, deploy this
+repo as a Docker-based Hugging Face Space.  HF's free CPU tier (16 GB RAM,
+2 vCPU) fits Qwen2.5-0.5B comfortably.
+**One-time setup:**
+1. **Create the Space.**  Go to [huggingface.co/new-space](https://huggingface.co/new-space):
+   - Owner: your HF username
+   - Space name: e.g. `tiny-vllm` (must match `HF_SPACE_NAME` below)
+   - SDK: **Docker**
+   - License: MIT
+2. **Generate a write-access token** at
+   [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → New
+   token → role **Write**.
+3. **Add three secrets** to this GitHub repo (Settings → Secrets and variables
+   → Actions → New repository secret):
+   - `HF_TOKEN` — the token from step 2
+   - `HF_USERNAME` — your HF username
+   - `HF_SPACE_NAME` — e.g. `tiny-vllm`
+On the next push to `main`, the `Sync to Hugging Face Space` workflow mirrors
+the repo to the Space.  HF then builds the Docker image (~3–5 min on first
+build because of the pre-fetched model) and the Space goes live at:
+```
+https://<lowercased-HF_USERNAME>-<HF_SPACE_NAME>.hf.space
+```
+(HF normalises subdomains to lowercase — `enCoder/tiny-vllm` becomes
+`encoder-tiny-vllm.hf.space`.)
+The GH Pages page links to this URL as a **"try live ↗"** pill in the
+topbar — update `data-hf-space` on `<body>` in `web/index.html` if your
+Space URL differs.
+**HF Spaces cost: free.**  Cold-start (after ~48 h of inactivity) takes ~30 s
+while the container wakes; subsequent requests are warm.
+**Files involved:**
+- `Dockerfile` — CPU-only torch, pre-downloads the model at build time.
+- `README.md` frontmatter — HF reads `sdk: docker`, `app_port: 7860`, etc.
+- `.github/workflows/sync-huggingface.yml` — mirrors GitHub → HF Spaces.
+- CORS is enabled on the server so the GH Pages frontend can call the HF
+  backend cross-origin (`?mode=live&backend=https://...hf.space` is a
+  potential future addition).
 ## GitHub Pages demo (replay mode)
 The visualization can run as a **static page** on GitHub Pages with no

tiny_vllm/server.py CHANGED Viewed

@@ -23,6 +23,7 @@ from pathlib import Path
 from typing import AsyncIterator, Optional
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field
@@ -70,8 +71,18 @@ def _sse(data: dict | str) -> bytes:
     return f"data: {data}\n\n".encode("utf-8")
-def build_app(config: EngineConfig) -> FastAPI:
     app = FastAPI(title="tiny_vllm", version="0.1.0")
     engine = LLMEngine(config)
     @app.on_event("startup")
@@ -114,6 +125,14 @@ def build_app(config: EngineConfig) -> FastAPI:
     # ---- introspection -------------------------------------------------
     @app.get("/engine/snapshot")
     async def snapshot() -> dict:
         return engine.snapshot()
@@ -300,8 +319,14 @@ def main() -> None:
     parser.add_argument("--record", default=None,
                         help="Append every engine event to this JSONL file "
                              "(e.g. web/events.jsonl) to power the static replay demo.")
-    parser.add_argument("--host", default="0.0.0.0")
-    parser.add_argument("--port", type=int, default=8000)
     args = parser.parse_args()
     cfg = EngineConfig(
@@ -317,8 +342,14 @@ def main() -> None:
         record_path=args.record,
     )
     import uvicorn
-    app = build_app(cfg)
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")

 from typing import AsyncIterator, Optional
 from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field
     return f"data: {data}\n\n".encode("utf-8")
+def build_app(config: EngineConfig, cors_allow_origins: Optional[list[str]] = None) -> FastAPI:
     app = FastAPI(title="tiny_vllm", version="0.1.0")
+    # CORS so the GH-Pages frontend (or any external page) can call this
+    # backend when it's deployed on HF Spaces.  Read-only inference, so the
+    # blast radius of opening this up is small.
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=cors_allow_origins or ["*"],
+        allow_credentials=False,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
     engine = LLMEngine(config)
     @app.on_event("startup")
     # ---- introspection -------------------------------------------------
+    @app.get("/health")
+    async def health() -> dict:
+        return {
+            "status": "ok" if engine.model_runner is not None else "starting",
+            "model": config.model,
+            "device": config.device,
+        }
     @app.get("/engine/snapshot")
     async def snapshot() -> dict:
         return engine.snapshot()
     parser.add_argument("--record", default=None,
                         help="Append every engine event to this JSONL file "
                              "(e.g. web/events.jsonl) to power the static replay demo.")
+    parser.add_argument("--host", default=os.environ.get("HOST", "0.0.0.0"))
+    parser.add_argument("--port", type=int,
+                        default=int(os.environ.get("PORT", "8000")))
+    parser.add_argument(
+        "--cors-origins", default=os.environ.get("TINY_VLLM_CORS_ORIGINS", "*"),
+        help="Comma-separated allowed origins for CORS (default '*' — fine "
+             "for the demo since this server is read-only inference).",
+    )
     args = parser.parse_args()
     cfg = EngineConfig(
         record_path=args.record,
     )
+    cors_origins: list[str] | None
+    if args.cors_origins.strip() in ("*", ""):
+        cors_origins = None  # defaults to ["*"]
+    else:
+        cors_origins = [o.strip() for o in args.cors_origins.split(",") if o.strip()]
     import uvicorn
+    app = build_app(cfg, cors_allow_origins=cors_origins)
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")

web/app.js CHANGED Viewed

@@ -467,6 +467,21 @@ if (ui.playPause) ui.playPause.addEventListener("click", () => {
 });
 if (ui.restart) ui.restart.addEventListener("click", () => state.replay?.start());
 // ---------- entry point ----------
 (function boot() {

 });
 if (ui.restart) ui.restart.addEventListener("click", () => state.replay?.start());
+// ---------- "Try live" link → Hugging Face Space ----------
+(function setupHFLink() {
+  const url = document.body.getAttribute("data-hf-space") || "";
+  // Don't advertise the live link if we're already on it (avoids
+  // showing "try live →" while on the live page).
+  const onHF = /\.hf\.space$/i.test(location.hostname) ||
+               /huggingface\.co$/i.test(location.hostname);
+  if (!url || onHF) return;
+  const top = document.getElementById("hf-live");
+  if (top) { top.href = url; top.style.display = ""; }
+  const rl = document.getElementById("rl-hf");
+  if (rl)  { rl.href  = url; rl.style.display  = ""; }
+})();
 // ---------- entry point ----------
 (function boot() {

web/index.html CHANGED Viewed

@@ -10,7 +10,7 @@
 <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap">
 <link rel="stylesheet" href="style.css">
 </head>
-<body>
 <nav class="topbar">
   <div class="crumb">
@@ -21,6 +21,7 @@
     <span>tiny_vllm</span>
   </div>
   <div class="right">
     <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github ↗</a>
     <a href="https://surajsharan.github.io/" title="Back to portfolio">← back</a>
   </div>
@@ -67,6 +68,7 @@ python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct
 # then open http://localhost:8000</code></pre>
     <div class="rl-foot">
       <button id="rl-copy" class="ghost">Copy command</button>
       <button id="rl-close" class="ghost">Dismiss</button>
     </div>
   </div>

 <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap">
 <link rel="stylesheet" href="style.css">
 </head>
+<body data-hf-space="https://encoder-tiny-vllm.hf.space">
 <nav class="topbar">
   <div class="crumb">
     <span>tiny_vllm</span>
   </div>
   <div class="right">
+    <a id="hf-live" href="#" target="_blank" rel="noopener" class="live-link" style="display:none">try live ↗</a>
     <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github ↗</a>
     <a href="https://surajsharan.github.io/" title="Back to portfolio">← back</a>
   </div>
 # then open http://localhost:8000</code></pre>
     <div class="rl-foot">
       <button id="rl-copy" class="ghost">Copy command</button>
+      <a id="rl-hf" class="rl-cta" href="#" target="_blank" rel="noopener" style="display:none">Or use the hosted live demo ↗</a>
       <button id="rl-close" class="ghost">Dismiss</button>
     </div>
   </div>

web/style.css CHANGED Viewed

@@ -65,9 +65,17 @@ body {
 .topbar a:hover { color: var(--accent); }
 .topbar .crumb { color: var(--muted); }
 .topbar .crumb .sep { margin: 0 6px; }
-.topbar .right { display: flex; gap: 16px; align-items: center; }
 .topbar .right a { color: var(--muted); }
 .topbar .right a:hover { color: var(--accent); }
 header.hero {
   padding: 28px 24px 20px;
@@ -231,7 +239,17 @@ textarea:disabled { opacity: 0.5; }
   color: var(--accent);
   overflow-x: auto;
 }
-.run-locally .rl-foot { display: flex; gap: 8px; }
 .replay-controls { margin-left: auto; display: flex; gap: 6px; align-items: center; }
 .replay-controls select {

 .topbar a:hover { color: var(--accent); }
 .topbar .crumb { color: var(--muted); }
 .topbar .crumb .sep { margin: 0 6px; }
+.topbar .right { display: flex; gap: 14px; align-items: center; }
 .topbar .right a { color: var(--muted); }
 .topbar .right a:hover { color: var(--accent); }
+/* "try live" link in the topbar: filled, fully rounded pill in the accent
+   colour so it stands out from the muted sibling links. */
+.topbar .right a.live-link {
+  color: var(--bg);
+  background: var(--accent);
+  padding: 3px 10px;
+  border-radius: 999px;
+  font-weight: 600;
+}
+/* Re-assert the text colour on hover so the generic
+   `.topbar .right a:hover` rule does not recolour the label. */
+.topbar .right a.live-link:hover { background: var(--accent-dim); color: var(--bg); }
 header.hero {
   padding: 28px 24px 20px;
   color: var(--accent);
   overflow-x: auto;
 }
+/* Footer row of the "run locally" panel; wraps when the items do not fit. */
+.run-locally .rl-foot { display: flex; gap: 8px; align-items: center; flex-wrap: wrap; }
+/* Hosted-demo call-to-action link, styled as a filled button. */
+.run-locally .rl-cta {
+  margin-left: auto; /* absorbs the free space on its left within the flex row */
+  color: var(--bg);
+  background: var(--accent);
+  padding: 6px 12px;
+  border-radius: var(--radius);
+  font-family: var(--mono); font-size: 12px; font-weight: 600;
+  text-decoration: none;
+}
+.run-locally .rl-cta:hover { background: var(--accent-dim); }
 .replay-controls { margin-left: auto; display: flex; gap: 6px; align-items: center; }
 .replay-controls select {