enCoder committed on
Commit
8fa0f9d
·
1 Parent(s): 4fa42d8

Wire HF Spaces deployment

Browse files
.dockerignore ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .venv/
5
+ venv/
6
+ .git/
7
+ .github/
8
+ .pytest_cache/
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+ tests/
12
+ examples/
13
+ scripts/
14
+ .DS_Store
15
+ *.log
16
+ .cache/
17
+ hf_cache/
18
+ .vscode/
19
+ .idea/
20
+ .claude/
21
+ .cursor/
22
+ .env
23
+ .env.*
24
+ secrets/
25
+
.github/workflows/sync-huggingface.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face Space
2
+
3
+ # Mirrors the repo to your Hugging Face Space on every push to main.
4
+ # HF Spaces then rebuilds the Docker image and redeploys.
5
+ #
6
+ # One-time setup (see README "Hugging Face Space — live demo"):
7
+ # 1. Create the Space at https://huggingface.co/new-space (SDK: Docker).
8
+ # 2. Generate a write-access token at https://huggingface.co/settings/tokens.
9
+ # 3. In the GitHub repo: Settings → Secrets and variables → Actions → add:
10
+ # HF_TOKEN ← the token from step 2
11
+ # HF_USERNAME ← your HF username (e.g. surajsharan)
12
+ # HF_SPACE_NAME ← the Space name (e.g. tiny-vllm)
13
+ # 4. Push to main, or trigger this workflow manually from the Actions tab.
14
+
15
+ on:
16
+ push:
17
+ branches: [main]
18
+ workflow_dispatch:
19
+
20
+ concurrency:
21
+ group: huggingface-sync
22
+ cancel-in-progress: false
23
+
24
+ jobs:
25
+ sync:
26
+ runs-on: ubuntu-latest
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ with:
30
+ fetch-depth: 0
31
+ lfs: true
32
+
33
+ - name: Push to HF Space
34
+ env:
35
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
36
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
37
+ HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
38
+ run: |
39
+ set -e
40
+ if [ -z "$HF_TOKEN" ] || [ -z "$HF_USERNAME" ] || [ -z "$HF_SPACE_NAME" ]; then
41
+ echo "::notice::HF secrets not configured (HF_TOKEN/HF_USERNAME/HF_SPACE_NAME)."
42
+ echo "::notice::Skipping HF Space sync. See README for setup."
43
+ exit 0
44
+ fi
45
+ git config user.email "actions@github.com"
46
+ git config user.name "github-actions[bot]"
47
+ REMOTE="https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_SPACE_NAME}"
48
+ git remote add huggingface "$REMOTE"
49
+ # Force-push: the HF Space repo is a mirror of this branch.
50
+ # If you also commit on HF (e.g., README edits in the Space UI),
51
+ # those would be overwritten — keep edits in this repo.
52
+ git push --force huggingface HEAD:main
53
+ echo "::notice::Synced to https://huggingface.co/spaces/${HF_USERNAME}/${HF_SPACE_NAME}"
.gitignore CHANGED
@@ -16,4 +16,7 @@ hf_cache/
16
  .vscode/
17
  .idea/
18
  .claude/
19
- .cursor/
 
 
 
 
16
  .vscode/
17
  .idea/
18
  .claude/
19
+ .cursor/
20
+ .env
21
+ .env.*
22
+ secrets/
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile for the Hugging Face Spaces deployment.
2
+ #
3
+ # This image is small enough to fit comfortably in HF's free CPU tier
4
+ # (16 GB RAM, 2 vCPU): CPU-only torch + pre-downloaded Qwen2.5-0.5B.
5
+ #
6
+ # HF Spaces convention: listen on port 7860, bound to 0.0.0.0.
7
+
8
+ FROM python:3.11-slim
9
+
10
+ ENV PYTHONUNBUFFERED=1 \
11
+ PYTHONDONTWRITEBYTECODE=1 \
12
+ PIP_NO_CACHE_DIR=1 \
13
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
14
+ HF_HOME=/tmp/.cache/huggingface \
15
+ TRANSFORMERS_CACHE=/tmp/.cache/huggingface/transformers \
16
+ TINY_VLLM_MODEL=Qwen/Qwen2.5-0.5B-Instruct \
17
+ TINY_VLLM_DEVICE=cpu \
18
+ TINY_VLLM_DTYPE=float32
19
+
20
+ WORKDIR /app
21
+
22
+ # Minimal system deps: curl for healthcheck, ca-certs for HTTPS.
23
+ RUN apt-get update && apt-get install -y --no-install-recommends \
24
+ curl ca-certificates \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # Install CPU-only PyTorch first (much smaller than the default GPU build).
28
+ RUN pip install --upgrade pip \
29
+ && pip install torch --index-url https://download.pytorch.org/whl/cpu
30
+
31
+ # Install the rest of our deps (skip torch — already done).
32
+ COPY requirements.txt .
33
+ RUN grep -v '^torch' requirements.txt > requirements.no-torch.txt \
34
+ && pip install -r requirements.no-torch.txt
35
+
36
+ # Pre-download the model so cold-start latency is just engine warmup.
37
+ # Failing this step at build time is better than failing on first request.
38
+ RUN python -c "import os; m=os.environ['TINY_VLLM_MODEL']; \
39
+ from transformers import AutoTokenizer, AutoModelForCausalLM; \
40
+ AutoTokenizer.from_pretrained(m); \
41
+ AutoModelForCausalLM.from_pretrained(m); \
42
+ print(f'pre-fetched {m}')"
43
+
44
+ # Now copy the source (placed AFTER the heavy deps so layer cache helps reruns).
45
+ COPY tiny_vllm/ ./tiny_vllm/
46
+ COPY web/ ./web/
47
+ COPY README.md LICENSE pyproject.toml ./
48
+
49
+ EXPOSE 7860
50
+
51
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=120s \
52
+ CMD curl -fsS http://localhost:7860/health || exit 1
53
+
54
+ # Conservative resource settings — HF free CPU is small.
55
+ CMD ["python", "-m", "tiny_vllm.server", \
56
+ "--host", "0.0.0.0", "--port", "7860", \
57
+ "--block-size", "16", "--num-blocks", "128", \
58
+ "--max-num-seqs", "4", "--max-num-batched-tokens", "256", \
59
+ "--max-model-len", "1024"]
README.md CHANGED
@@ -1,3 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
1
  # tiny_vllm
2
 
3
  A **minimal continuous-batching LLM engine** built to be read end-to-end. It
@@ -76,6 +87,54 @@ pip install pytest
76
  python -m pytest tests/
77
  ```
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  ## GitHub Pages demo (replay mode)
80
 
81
  The visualization can run as a **static page** on GitHub Pages with no
 
1
+ ---
2
+ title: tiny_vllm
3
+ emoji: 🪶
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ short_description: Tiny continuous-batching LLM — paged KV, prefix cache, SSE
10
+ ---
11
+
12
  # tiny_vllm
13
 
14
  A **minimal continuous-batching LLM engine** built to be read end-to-end. It
 
87
  python -m pytest tests/
88
  ```
89
 
90
+ ## Hugging Face Space — live demo
91
+
92
+ For a *live* (not recorded) demo you can talk to from any browser, deploy this
93
+ repo as a Docker-based Hugging Face Space. HF's free CPU tier (16 GB RAM,
94
+ 2 vCPU) fits Qwen2.5-0.5B comfortably.
95
+
96
+ **One-time setup:**
97
+
98
+ 1. **Create the Space.** Go to [huggingface.co/new-space](https://huggingface.co/new-space):
99
+ - Owner: your HF username
100
+ - Space name: e.g. `tiny-vllm` (must match `HF_SPACE_NAME` below)
101
+ - SDK: **Docker**
102
+ - License: MIT
103
+ 2. **Generate a write-access token** at
104
+ [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → New
105
+ token → role **Write**.
106
+ 3. **Add three secrets** to this GitHub repo (Settings → Secrets and variables
107
+ → Actions → New repository secret):
108
+ - `HF_TOKEN` — the token from step 2
109
+ - `HF_USERNAME` — your HF username
110
+ - `HF_SPACE_NAME` — e.g. `tiny-vllm`
111
+
112
+ On the next push to `main`, the `Sync to Hugging Face Space` workflow mirrors
113
+ the repo to the Space. HF then builds the Docker image (~3–5 min on first
114
+ build because of the pre-fetched model) and the Space goes live at:
115
+
116
+ ```
117
+ https://<lowercased-HF_USERNAME>-<HF_SPACE_NAME>.hf.space
118
+ ```
119
+
120
+ (HF normalises subdomains to lowercase — `enCoder/tiny-vllm` becomes
121
+ `encoder-tiny-vllm.hf.space`.)
122
+
123
+ The GH Pages page links to this URL as a **"try live ↗"** pill in the
124
+ topbar — update `data-hf-space` on `<body>` in `web/index.html` if your
125
+ Space URL differs.
126
+
127
+ **HF Spaces cost: free.** Cold-start (after ~48 h of inactivity) takes ~30 s
128
+ while the container wakes; subsequent requests are warm.
129
+
130
+ **Files involved:**
131
+ - `Dockerfile` — CPU-only torch, pre-downloads the model at build time.
132
+ - `README.md` frontmatter — HF reads `sdk: docker`, `app_port: 7860`, etc.
133
+ - `.github/workflows/sync-huggingface.yml` — mirrors GitHub → HF Spaces.
134
+ - CORS is enabled on the server so the GH Pages frontend can call the HF
135
+ backend cross-origin (`?mode=live&backend=https://...hf.space` is a
136
+ potential future addition).
137
+
138
  ## GitHub Pages demo (replay mode)
139
 
140
  The visualization can run as a **static page** on GitHub Pages with no
tiny_vllm/server.py CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
23
  from typing import AsyncIterator, Optional
24
 
25
  from fastapi import FastAPI, HTTPException, Request
 
26
  from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
27
  from fastapi.staticfiles import StaticFiles
28
  from pydantic import BaseModel, Field
@@ -70,8 +71,18 @@ def _sse(data: dict | str) -> bytes:
70
  return f"data: {data}\n\n".encode("utf-8")
71
 
72
 
73
- def build_app(config: EngineConfig) -> FastAPI:
74
  app = FastAPI(title="tiny_vllm", version="0.1.0")
 
 
 
 
 
 
 
 
 
 
75
  engine = LLMEngine(config)
76
 
77
  @app.on_event("startup")
@@ -114,6 +125,14 @@ def build_app(config: EngineConfig) -> FastAPI:
114
 
115
  # ---- introspection -------------------------------------------------
116
 
 
 
 
 
 
 
 
 
117
  @app.get("/engine/snapshot")
118
  async def snapshot() -> dict:
119
  return engine.snapshot()
@@ -300,8 +319,14 @@ def main() -> None:
300
  parser.add_argument("--record", default=None,
301
  help="Append every engine event to this JSONL file "
302
  "(e.g. web/events.jsonl) to power the static replay demo.")
303
- parser.add_argument("--host", default="0.0.0.0")
304
- parser.add_argument("--port", type=int, default=8000)
 
 
 
 
 
 
305
  args = parser.parse_args()
306
 
307
  cfg = EngineConfig(
@@ -317,8 +342,14 @@ def main() -> None:
317
  record_path=args.record,
318
  )
319
 
 
 
 
 
 
 
320
  import uvicorn
321
- app = build_app(cfg)
322
  uvicorn.run(app, host=args.host, port=args.port, log_level="info")
323
 
324
 
 
23
  from typing import AsyncIterator, Optional
24
 
25
  from fastapi import FastAPI, HTTPException, Request
26
+ from fastapi.middleware.cors import CORSMiddleware
27
  from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
28
  from fastapi.staticfiles import StaticFiles
29
  from pydantic import BaseModel, Field
 
71
  return f"data: {data}\n\n".encode("utf-8")
72
 
73
 
74
+ def build_app(config: EngineConfig, cors_allow_origins: Optional[list[str]] = None) -> FastAPI:
75
  app = FastAPI(title="tiny_vllm", version="0.1.0")
76
+ # CORS so the GH-Pages frontend (or any external page) can call this
77
+ # backend when it's deployed on HF Spaces. Read-only inference, so the
78
+ # blast radius of opening this up is small.
79
+ app.add_middleware(
80
+ CORSMiddleware,
81
+ allow_origins=cors_allow_origins or ["*"],
82
+ allow_credentials=False,
83
+ allow_methods=["*"],
84
+ allow_headers=["*"],
85
+ )
86
  engine = LLMEngine(config)
87
 
88
  @app.on_event("startup")
 
125
 
126
  # ---- introspection -------------------------------------------------
127
 
128
+ @app.get("/health")
129
+ async def health() -> dict:
130
+ return {
131
+ "status": "ok" if engine.model_runner is not None else "starting",
132
+ "model": config.model,
133
+ "device": config.device,
134
+ }
135
+
136
  @app.get("/engine/snapshot")
137
  async def snapshot() -> dict:
138
  return engine.snapshot()
 
319
  parser.add_argument("--record", default=None,
320
  help="Append every engine event to this JSONL file "
321
  "(e.g. web/events.jsonl) to power the static replay demo.")
322
+ parser.add_argument("--host", default=os.environ.get("HOST", "0.0.0.0"))
323
+ parser.add_argument("--port", type=int,
324
+ default=int(os.environ.get("PORT", "8000")))
325
+ parser.add_argument(
326
+ "--cors-origins", default=os.environ.get("TINY_VLLM_CORS_ORIGINS", "*"),
327
+ help="Comma-separated allowed origins for CORS (default '*' — fine "
328
+ "for the demo since this server is read-only inference).",
329
+ )
330
  args = parser.parse_args()
331
 
332
  cfg = EngineConfig(
 
342
  record_path=args.record,
343
  )
344
 
345
+ cors_origins: list[str] | None
346
+ if args.cors_origins.strip() in ("*", ""):
347
+ cors_origins = None # defaults to ["*"]
348
+ else:
349
+ cors_origins = [o.strip() for o in args.cors_origins.split(",") if o.strip()]
350
+
351
  import uvicorn
352
+ app = build_app(cfg, cors_allow_origins=cors_origins)
353
  uvicorn.run(app, host=args.host, port=args.port, log_level="info")
354
 
355
 
web/app.js CHANGED
@@ -467,6 +467,21 @@ if (ui.playPause) ui.playPause.addEventListener("click", () => {
467
  });
468
  if (ui.restart) ui.restart.addEventListener("click", () => state.replay?.start());
469
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
  // ---------- entry point ----------
471
 
472
  (function boot() {
 
467
  });
468
  if (ui.restart) ui.restart.addEventListener("click", () => state.replay?.start());
469
 
470
+ // ---------- "Try live" link → Hugging Face Space ----------
471
+
472
+ (function setupHFLink() {
473
+ const url = document.body.getAttribute("data-hf-space") || "";
474
+ // Don't advertise the live link if we're already on it (avoids
475
+ // showing "try live →" while on the live page).
476
+ const onHF = /\.hf\.space$/i.test(location.hostname) ||
477
+ /huggingface\.co$/i.test(location.hostname);
478
+ if (!url || onHF) return;
479
+ const top = document.getElementById("hf-live");
480
+ if (top) { top.href = url; top.style.display = ""; }
481
+ const rl = document.getElementById("rl-hf");
482
+ if (rl) { rl.href = url; rl.style.display = ""; }
483
+ })();
484
+
485
  // ---------- entry point ----------
486
 
487
  (function boot() {
web/index.html CHANGED
@@ -10,7 +10,7 @@
10
  <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap">
11
  <link rel="stylesheet" href="style.css">
12
  </head>
13
- <body>
14
 
15
  <nav class="topbar">
16
  <div class="crumb">
@@ -21,6 +21,7 @@
21
  <span>tiny_vllm</span>
22
  </div>
23
  <div class="right">
 
24
  <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github ↗</a>
25
  <a href="https://surajsharan.github.io/" title="Back to portfolio">← back</a>
26
  </div>
@@ -67,6 +68,7 @@ python -m tiny_vllm.server --model Qwen/Qwen2.5-0.5B-Instruct
67
  # then open http://localhost:8000</code></pre>
68
  <div class="rl-foot">
69
  <button id="rl-copy" class="ghost">Copy command</button>
 
70
  <button id="rl-close" class="ghost">Dismiss</button>
71
  </div>
72
  </div>
 
10
  <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500;600&display=swap">
11
  <link rel="stylesheet" href="style.css">
12
  </head>
13
+ <body data-hf-space="https://encoder-tiny-vllm.hf.space">
14
 
15
  <nav class="topbar">
16
  <div class="crumb">
 
21
  <span>tiny_vllm</span>
22
  </div>
23
  <div class="right">
24
+ <a id="hf-live" href="#" target="_blank" rel="noopener" class="live-link" style="display:none">try live ↗</a>
25
  <a href="https://github.com/surajsharan/tiny_vLLM" target="_blank" rel="noopener">github ↗</a>
26
  <a href="https://surajsharan.github.io/" title="Back to portfolio">← back</a>
27
  </div>
 
68
  # then open http://localhost:8000</code></pre>
69
  <div class="rl-foot">
70
  <button id="rl-copy" class="ghost">Copy command</button>
71
+ <a id="rl-hf" class="rl-cta" href="#" target="_blank" rel="noopener" style="display:none">Or use the hosted live demo ↗</a>
72
  <button id="rl-close" class="ghost">Dismiss</button>
73
  </div>
74
  </div>
web/style.css CHANGED
@@ -65,9 +65,17 @@ body {
65
  .topbar a:hover { color: var(--accent); }
66
  .topbar .crumb { color: var(--muted); }
67
  .topbar .crumb .sep { margin: 0 6px; }
68
- .topbar .right { display: flex; gap: 16px; align-items: center; }
69
  .topbar .right a { color: var(--muted); }
70
  .topbar .right a:hover { color: var(--accent); }
 
 
 
 
 
 
 
 
71
 
72
  header.hero {
73
  padding: 28px 24px 20px;
@@ -231,7 +239,17 @@ textarea:disabled { opacity: 0.5; }
231
  color: var(--accent);
232
  overflow-x: auto;
233
  }
234
- .run-locally .rl-foot { display: flex; gap: 8px; }
 
 
 
 
 
 
 
 
 
 
235
 
236
  .replay-controls { margin-left: auto; display: flex; gap: 6px; align-items: center; }
237
  .replay-controls select {
 
65
  .topbar a:hover { color: var(--accent); }
66
  .topbar .crumb { color: var(--muted); }
67
  .topbar .crumb .sep { margin: 0 6px; }
68
+ .topbar .right { display: flex; gap: 14px; align-items: center; }
69
  .topbar .right a { color: var(--muted); }
70
  .topbar .right a:hover { color: var(--accent); }
71
+ .topbar .right a.live-link {
72
+ color: var(--bg);
73
+ background: var(--accent);
74
+ padding: 3px 10px;
75
+ border-radius: 999px;
76
+ font-weight: 600;
77
+ }
78
+ .topbar .right a.live-link:hover { background: var(--accent-dim); color: var(--bg); }
79
 
80
  header.hero {
81
  padding: 28px 24px 20px;
 
239
  color: var(--accent);
240
  overflow-x: auto;
241
  }
242
+ .run-locally .rl-foot { display: flex; gap: 8px; align-items: center; flex-wrap: wrap; }
243
+ .run-locally .rl-cta {
244
+ margin-left: auto;
245
+ color: var(--bg);
246
+ background: var(--accent);
247
+ padding: 6px 12px;
248
+ border-radius: var(--radius);
249
+ font-family: var(--mono); font-size: 12px; font-weight: 600;
250
+ text-decoration: none;
251
+ }
252
+ .run-locally .rl-cta:hover { background: var(--accent-dim); }
253
 
254
  .replay-controls { margin-left: auto; display: flex; gap: 6px; align-items: center; }
255
  .replay-controls select {