tusarway committed on
Commit
91e4928
·
verified ·
1 Parent(s): cca38ad
Files changed (1) hide show
  1. app.py +77 -29
app.py CHANGED
@@ -12,7 +12,7 @@ Endpoints
12
  POST /v1/messages → Anthropic-compatible ← Claude Code uses this
13
  """
14
 
15
- import os, json, time, uuid, asyncio, threading
16
  from contextlib import asynccontextmanager
17
  from typing import Optional, List, Union, Any, Dict
18
 
@@ -28,48 +28,101 @@ MODEL_FILE = os.getenv("MODEL_FILE", "gemma-4-26B-A4B-it-UD-IQ3_XXS.gguf")
28
  MODEL_DIR = "/app/models"
29
  MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}"
30
  SPACE_URL = os.getenv("SPACE_URL", "")
 
31
 
32
- # Context 4096 keeps KV cache ≤2 GB — safe with 11.2 GB model on 16 GB RAM
33
  N_CTX = int(os.getenv("N_CTX", "4096"))
34
  N_THREADS = int(os.getenv("N_THREADS", "2"))
35
 
36
- # Coding-optimised defaults (OP's settings from reddit thread)
37
  DEFAULT_TEMP = float(os.getenv("DEFAULT_TEMP", "0.3"))
38
  DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9"))
39
  DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1"))
40
  DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20"))
41
 
 
 
 
42
  MODEL_ALIAS = "gemma-4-26b"
43
  llm = None
44
 
45
- # ── Model download + load ─────────────────────────────────────────────────────
46
  def download_model():
47
- from huggingface_hub import hf_hub_download
48
  os.makedirs(MODEL_DIR, exist_ok=True)
49
- if not os.path.exists(MODEL_PATH):
50
- print(f"[model] Downloading {MODEL_FILE} (~11.2 GB)...")
51
- hf_hub_download(
52
- repo_id=MODEL_REPO,
53
- filename=MODEL_FILE,
54
- local_dir=MODEL_DIR,
55
- )
56
- print("[model] Download complete.")
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def load_model():
59
  global llm
60
  from llama_cpp import Llama
61
  download_model()
62
- print("[model] Loading Gemma 4 26B IQ3_XXS into RAM...")
63
  llm = Llama(
64
  model_path = MODEL_PATH,
65
  n_ctx = N_CTX,
66
  n_threads = N_THREADS,
67
  n_batch = 512,
68
- n_gpu_layers = 0, # HF free tier is CPU-only
69
  verbose = False,
70
- chat_format = None, # auto-detect from GGUF metadata (Gemma 4 template)
71
  )
72
- print(f"[model] Gemma 4 26B ready — ctx={N_CTX}, threads={N_THREADS}")
73
 
74
  # ── Self-ping ─────────────────────────────────────────────────────────────────
75
  async def self_ping_loop():
@@ -79,9 +132,9 @@ async def self_ping_loop():
79
  try:
80
  async with httpx.AsyncClient(timeout=15) as c:
81
  r = await c.get(f"{SPACE_URL}/health")
82
- print(f"[ping] {r.status_code}")
83
  except Exception as e:
84
- print(f"[ping] failed: {e}")
85
 
86
  # ── App ───────────────────────────────────────────────────────────────────────
87
  @asynccontextmanager
@@ -230,7 +283,7 @@ async def anthropic_messages(req: AnthropicRequest):
230
  messages = msgs,
231
  temperature = req.temperature,
232
  top_p = req.top_p,
233
- min_p = DEFAULT_MIN_P, # always apply min_p for coding accuracy
234
  top_k = req.top_k,
235
  max_tokens = req.max_tokens,
236
  stop = req.stop_sequences,
@@ -312,14 +365,12 @@ footer{margin-top:2.5rem;font-size:.75rem;color:#374151;text-align:center;line-h
312
  <body>
313
  <h1>Gemma 4 26B A4B</h1>
314
  <p class="tagline">Coding-tuned · Anthropic &amp; OpenAI compatible · HuggingFace Spaces</p>
315
-
316
  <div class="badges">
317
  <span class="badge"><span class="dot"></span>{{ST}}</span>
318
  <span class="badge" style="color:#9ca3af">IQ3_XXS · 11.2 GB</span>
319
  <span class="badge" style="color:#9ca3af">ctx 4096 · 2 vCPU · 16 GB RAM</span>
320
  <span class="badge" style="color:#9ca3af">temp 0.3 · top-k 20 · min-p 0.1</span>
321
  </div>
322
-
323
  <div class="cards">
324
  <div class="card">
325
  <div class="card-title">Claude Code setup</div>
@@ -349,20 +400,18 @@ r = client.chat.completions.create(
349
  -d '{
350
  "model": "gemma-4-26b",
351
  "messages": [
352
- {"role":"user",
353
- "content":"hello"}
354
  ]
355
  }'</pre>
356
  </div>
357
  </div>
358
-
359
  <div class="tip">
360
- <strong>First boot:</strong> The model (~11.2 GB) downloads from HuggingFace on first start — allow 5–10 min.
 
361
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">/health</code> returns
362
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">model_loaded: false</code>
363
- until ready. Subsequent restarts load from disk in ~60 s. Self-pings every 25 min to prevent sleep.
364
  </div>
365
-
366
  <table class="ep-table">
367
  <thead><tr><th>Method</th><th>Path</th><th>Notes</th></tr></thead>
368
  <tbody>
@@ -372,7 +421,6 @@ r = client.chat.completions.create(
372
  <tr><td><span class="method post">POST</span></td><td class="path">/v1/messages</td><td class="note">Anthropic-compatible · used by Claude Code</td></tr>
373
  </tbody>
374
  </table>
375
-
376
  <footer>
377
  Gemma 4 26B A4B · unsloth UD-IQ3_XXS · llama-cpp-python + OpenBLAS<br>
378
  Self-pings /health every 25 min · April 2026
 
12
  POST /v1/messages → Anthropic-compatible ← Claude Code uses this
13
  """
14
 
15
+ import os, sys, json, time, uuid, asyncio, threading, requests
16
  from contextlib import asynccontextmanager
17
  from typing import Optional, List, Union, Any, Dict
18
 
 
28
  MODEL_DIR = "/app/models"
29
  MODEL_PATH = f"{MODEL_DIR}/{MODEL_FILE}"
30
  SPACE_URL = os.getenv("SPACE_URL", "")
31
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
32
 
 
33
  N_CTX = int(os.getenv("N_CTX", "4096"))
34
  N_THREADS = int(os.getenv("N_THREADS", "2"))
35
 
 
36
  DEFAULT_TEMP = float(os.getenv("DEFAULT_TEMP", "0.3"))
37
  DEFAULT_TOP_P = float(os.getenv("DEFAULT_TOP_P", "0.9"))
38
  DEFAULT_MIN_P = float(os.getenv("DEFAULT_MIN_P", "0.1"))
39
  DEFAULT_TOP_K = int(os.getenv("DEFAULT_TOP_K", "20"))
40
 
41
+ # Minimum expected size for a complete model file (10 GB safety margin)
42
+ MIN_MODEL_BYTES = 10 * 1024 ** 3
43
+
44
  MODEL_ALIAS = "gemma-4-26b"
45
  llm = None
46
 
47
+ # ── Model download ────────────────────────────────────────────────────────────
48
  def download_model():
 
49
  os.makedirs(MODEL_DIR, exist_ok=True)
 
 
 
 
 
 
 
 
50
 
51
+ # Check for existing complete file
52
+ if os.path.exists(MODEL_PATH):
53
+ size = os.path.getsize(MODEL_PATH)
54
+ if size >= MIN_MODEL_BYTES:
55
+ print(f"[model] Cached model found ({size / 1e9:.2f} GB) — skipping download.", flush=True)
56
+ return
57
+ print(f"[model] Incomplete file detected ({size / 1e9:.2f} GB) — re-downloading...", flush=True)
58
+ os.remove(MODEL_PATH)
59
+
60
+ url = f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}"
61
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
62
+ tmp_path = MODEL_PATH + ".tmp"
63
+
64
+ print(f"[model] Connecting to HuggingFace...", flush=True)
65
+
66
+ with requests.get(url, stream=True, headers=headers, timeout=60) as r:
67
+ r.raise_for_status()
68
+ total = int(r.headers.get("content-length", 0))
69
+ total_gb = total / (1024 ** 3)
70
+
71
+ print(f"[model] Downloading {MODEL_FILE}", flush=True)
72
+ print(f"[model] Total size : {total_gb:.2f} GB", flush=True)
73
+ print(f"[model] Destination: {MODEL_PATH}", flush=True)
74
+ print(f"[model] {'─' * 52}", flush=True)
75
+
76
+ downloaded = 0
77
+ last_step = -1 # tracks which 5%-band was last printed
78
+ chunk_size = 8 * 1024 * 1024 # 8 MB chunks
79
+
80
+ with open(tmp_path, "wb") as f:
81
+ for chunk in r.iter_content(chunk_size=chunk_size):
82
+ if not chunk:
83
+ continue
84
+ f.write(chunk)
85
+ downloaded += len(chunk)
86
+
87
+ if total > 0:
88
+ pct = downloaded / total * 100
89
+ step = int(pct) // 5 # 0–20
90
+ if step > last_step:
91
+ last_step = step
92
+ filled = step
93
+ empty = 20 - filled
94
+ bar = "█" * filled + "░" * empty
95
+ gb_done = downloaded / (1024 ** 3)
96
+ speed_mb = (downloaded / (time.monotonic() + 1e-9)) / 1e6
97
+ print(
98
+ f"[model] |{bar}| {pct:5.1f}% "
99
+ f"{gb_done:.2f}/{total_gb:.2f} GB",
100
+ flush=True,
101
+ )
102
+
103
+ # Atomic rename — avoids half-written files on crash/restart
104
+ os.rename(tmp_path, MODEL_PATH)
105
+ final_size = os.path.getsize(MODEL_PATH)
106
+ print(f"[model] {'─' * 52}", flush=True)
107
+ print(f"[model] Download complete! {final_size / 1e9:.2f} GB saved to {MODEL_PATH}", flush=True)
108
+
109
+
110
+ # ── Model load ────────────────────────────────────────────────────────────────
111
  def load_model():
112
  global llm
113
  from llama_cpp import Llama
114
  download_model()
115
+ print(f"[model] Loading {MODEL_FILE} into RAM (ctx={N_CTX}, threads={N_THREADS})...", flush=True)
116
  llm = Llama(
117
  model_path = MODEL_PATH,
118
  n_ctx = N_CTX,
119
  n_threads = N_THREADS,
120
  n_batch = 512,
121
+ n_gpu_layers = 0,
122
  verbose = False,
123
+ chat_format = None,
124
  )
125
+ print(f"[model] Gemma 4 26B ready!", flush=True)
126
 
127
  # ── Self-ping ─────────────────────────────────────────────────────────────────
128
  async def self_ping_loop():
 
132
  try:
133
  async with httpx.AsyncClient(timeout=15) as c:
134
  r = await c.get(f"{SPACE_URL}/health")
135
+ print(f"[ping] {r.status_code}", flush=True)
136
  except Exception as e:
137
+ print(f"[ping] failed: {e}", flush=True)
138
 
139
  # ── App ───────────────────────────────────────────────────────────────────────
140
  @asynccontextmanager
 
283
  messages = msgs,
284
  temperature = req.temperature,
285
  top_p = req.top_p,
286
+ min_p = DEFAULT_MIN_P,
287
  top_k = req.top_k,
288
  max_tokens = req.max_tokens,
289
  stop = req.stop_sequences,
 
365
  <body>
366
  <h1>Gemma 4 26B A4B</h1>
367
  <p class="tagline">Coding-tuned · Anthropic &amp; OpenAI compatible · HuggingFace Spaces</p>
 
368
  <div class="badges">
369
  <span class="badge"><span class="dot"></span>{{ST}}</span>
370
  <span class="badge" style="color:#9ca3af">IQ3_XXS · 11.2 GB</span>
371
  <span class="badge" style="color:#9ca3af">ctx 4096 · 2 vCPU · 16 GB RAM</span>
372
  <span class="badge" style="color:#9ca3af">temp 0.3 · top-k 20 · min-p 0.1</span>
373
  </div>
 
374
  <div class="cards">
375
  <div class="card">
376
  <div class="card-title">Claude Code setup</div>
 
400
  -d '{
401
  "model": "gemma-4-26b",
402
  "messages": [
403
+ {"role":"user","content":"hello"}
 
404
  ]
405
  }'</pre>
406
  </div>
407
  </div>
 
408
  <div class="tip">
409
+ <strong>First boot:</strong> The model (~11.2 GB) downloads on first start — allow 5–10 min.
410
+ Watch the container logs for a live progress bar.
411
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">/health</code> returns
412
  <code style="background:#0d1b26;padding:1px 5px;border-radius:4px">model_loaded: false</code>
413
+ until ready. Subsequent restarts load from disk in ~60 s.
414
  </div>
 
415
  <table class="ep-table">
416
  <thead><tr><th>Method</th><th>Path</th><th>Notes</th></tr></thead>
417
  <tbody>
 
421
  <tr><td><span class="method post">POST</span></td><td class="path">/v1/messages</td><td class="note">Anthropic-compatible · used by Claude Code</td></tr>
422
  </tbody>
423
  </table>
 
424
  <footer>
425
  Gemma 4 26B A4B · unsloth UD-IQ3_XXS · llama-cpp-python + OpenBLAS<br>
426
  Self-pings /health every 25 min · April 2026