SeaWolf-AI committed on
Commit
c8a5e69
·
1 Parent(s): d2fff68

Fix gemma4 runtime error: switch to Transformers backend + Darwin-4B-David

Browse files

Root cause: requirements.txt pinned transformers>=4.45.0 from PyPI, which
has no gemma4 architecture. vLLM additionally lacks a Gemma4 model
registration, so even a newer transformers would not have fixed the vLLM
codepath. Result: all 3 engine-init tiers crashed with

Value error, The checkpoint you are trying to load has model type
gemma4 but Transformers does not recognize this architecture.

Changes:
- requirements.txt: drop vllm / aither-kvcache / optimum-quanto,
install transformers @ git+https://github.com/huggingface/transformers.git
(gemma4 is only present in the dev tree, unreleased).
- app.py: remove vLLM LLMEngine / SamplingParams / TokensPrompt /
TriAttention path. Load Gemma4ForConditionalGeneration directly with
dtype=bfloat16, device_map=auto. Stream via TextIteratorStreamer in a
background thread. Adapt MTILogitsProcessor to the Transformers
LogitsProcessor API (batched entropy mask). Switch MODEL_ID to
FINAL-Bench/Darwin-4B-David and point the tokenizer pre-download and
extra_special_tokens patch at the same repo. Remove dead _attn,
TRIATT_ENABLED, vllm-specific health fields.
- Dockerfile: replace vllm/vllm-openai base with nvidia/cuda runtime
image and pip-install transformers from git. Note: the Space runs
under sdk gradio so the Dockerfile is ignored; kept in sync for
future sdk docker flips.

Files changed (3) hide show
  1. Dockerfile +20 -13
  2. app.py +110 -138
  3. requirements.txt +8 -8
Dockerfile CHANGED
@@ -1,24 +1,31 @@
1
- FROM vllm/vllm-openai:gemma4
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive
4
  ENV PYTHONUNBUFFERED=1
5
 
6
- # transformers must be installed from source for the Gemma 4 model_type
7
- RUN pip install --no-cache-dir \
8
- "git+https://github.com/huggingface/transformers.git" \
9
- gradio>=5.0 \
10
- fastapi \
11
- uvicorn \
12
- httpx \
13
- requests \
14
- PyMuPDF
15
 
16
- # TriAttention (optional)
17
- RUN pip install --no-cache-dir aither-kvcache || true
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  WORKDIR /app
20
  COPY . /app
21
 
22
  EXPOSE 7860
23
 
24
- CMD ["python3", "app.py"]
 
1
+ FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive
4
  ENV PYTHONUNBUFFERED=1
5
 
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ python3 python3-pip python3-dev git curl ca-certificates \
8
+ && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
 
9
 
10
+ # Gemma4 (model_type="gemma4") is only available in the Transformers git dev
11
+ # branch. Installing from PyPI WILL fail at runtime with:
12
+ # "The checkpoint you are trying to load has model type `gemma4` but
13
+ # Transformers does not recognize this architecture."
14
+ # Keep this install line pointed at git+https until gemma4 lands in a release.
15
+ RUN pip install --no-cache-dir --upgrade pip && \
16
+ pip install --no-cache-dir \
17
+ "torch>=2.4.0" \
18
+ "git+https://github.com/huggingface/transformers.git" \
19
+ "accelerate>=1.0.0" \
20
+ "huggingface_hub" \
21
+ "sentencepiece" "protobuf" \
22
+ "gradio>=5.0" \
23
+ "fastapi" "uvicorn" "httpx" "requests" \
24
+ "Pillow" "PyMuPDF" "openai"
25
 
26
  WORKDIR /app
27
  COPY . /app
28
 
29
  EXPOSE 7860
30
 
31
+ CMD ["python3", "app.py"]
app.py CHANGED
@@ -1,20 +1,9 @@
1
- # Gemma 4 E4B - vLLM + MTI + TriAttention
2
- # Multimodal (Vision+Audio+Text) - Effective 4.5B - Apache 2.0
3
- # MTI: +9-11% reasoning accuracy (training-free)
4
- # TriAttention: ~10x KV cache compression
5
  import sys, os, signal, time, uuid
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
- # -- TriAttention attempt --
9
- TRIATT_ENABLED = False
10
- try:
11
- import aither_kvcache
12
- os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
13
- TRIATT_ENABLED = True
14
- print("[TRIATT] aither-kvcache -> VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
15
- except ImportError:
16
- print("[TRIATT] aither-kvcache not found -> standard attention", flush=True)
17
-
18
  import base64, re, json
19
  from typing import Generator, Optional
20
  from threading import Thread
@@ -33,10 +22,10 @@ import pathlib, secrets
33
  # ==============================================================================
34
  # 1. CONFIG
35
  # ==============================================================================
36
- MODEL_ID = "DavidAU/gemma-4-E4B-it-The-DECKARD-Expresso-Universe-HERETIC-UNCENSORED-Thinking"
37
- MODEL_NAME = "DECKARD-E4B-Opus"
38
  MODEL_CAP = {
39
- "arch": "Gemma4 PLE", "active": "4.5B", "total": "~8B",
40
  "ctx": "128K", "thinking": True, "vision": True, "audio": True,
41
  "max_tokens": 16384, "temp_max": 2.0,
42
  }
@@ -51,11 +40,14 @@ PRESETS = {
51
 
52
  # ==============================================================================
53
  # 2. MTI -- Minimal Test-Time Intervention (arxiv 2510.13940)
 
54
  # ==============================================================================
55
- class MTILogitsProcessor:
 
 
56
  """
57
- Apply CFG only to high-entropy (uncertain) tokens -> reasoning accuracy +9~11%.
58
- Serving-time only, no training needed. Intervenes on only ~15% of all tokens.
59
  """
60
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
61
  self.cfg_scale = cfg_scale
@@ -63,17 +55,18 @@ class MTILogitsProcessor:
63
  self._interventions = 0
64
  self._total = 0
65
 
66
- def __call__(self, token_ids, logits):
67
- self._total += 1
68
- probs = torch.softmax(logits, dim=-1)
69
- entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
70
-
71
- if entropy.item() > self.entropy_threshold:
72
- mean_logit = logits.mean(dim=-1, keepdim=True)
73
- guided = logits + self.cfg_scale * (logits - mean_logit)
74
- self._interventions += 1
75
- return guided
76
- return logits
 
77
 
78
  @property
79
  def intervention_rate(self):
@@ -82,103 +75,78 @@ class MTILogitsProcessor:
82
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
83
 
84
  # ==============================================================================
85
- # 3. vLLM ENGINE -- Gemma 4 Day-0 support, no patch needed
86
  # ==============================================================================
87
- from vllm.engine.arg_utils import EngineArgs
88
- from vllm.engine.llm_engine import LLMEngine
89
- from vllm import SamplingParams, TokensPrompt
90
- from transformers import AutoTokenizer
91
-
92
- # -- Gemma 4 tokenizer compatibility patch --
93
- # transformers 5.5.0+ raises a .keys() error when extra_special_tokens is a list
94
  from huggingface_hub import hf_hub_download
95
  import tempfile, shutil
96
 
97
- _tok_source = "google/gemma-4-E4B-it"
98
- _tok_dir = tempfile.mkdtemp()
 
 
 
 
99
 
100
- # download the tokenizer files
101
  for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
102
- "special_tokens_map.json", "chat_template.jinja"]:
103
  try:
104
  _p = hf_hub_download(_tok_source, _fname)
105
  shutil.copy(_p, os.path.join(_tok_dir, _fname))
106
  except Exception:
107
  pass
108
 
109
- # patch tokenizer_config.json: extra_special_tokens list -> dict
110
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
111
  if os.path.exists(_tc_path):
112
- with open(_tc_path) as f:
113
- _tc = json.load(f)
114
- est = _tc.get("extra_special_tokens", None)
115
- if isinstance(est, list):
116
- _tc["extra_special_tokens"] = {tok: tok for tok in est} if est else {}
117
- with open(_tc_path, "w") as f:
118
- json.dump(_tc, f, indent=2)
119
- print(f"[vLLM] Patched extra_special_tokens: list({len(est)}) -> dict", flush=True)
120
-
121
- tokenizer = AutoTokenizer.from_pretrained(_tok_dir, trust_remote_code=True)
122
- print(f"[vLLM] Tokenizer loaded (vocab={len(tokenizer)})", flush=True)
123
-
124
- engine = None
125
- MAX_MODEL_LEN = 32768
126
-
127
- # ์‹œ๋„ 1: TriAttention + 32K
128
- if engine is None and TRIATT_ENABLED:
129
- try:
130
- print(f"[vLLM] Try 1: TriAttention + {MAX_MODEL_LEN}", flush=True)
131
- engine = LLMEngine.from_engine_args(EngineArgs(
132
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
133
- max_model_len=MAX_MODEL_LEN,
134
- gpu_memory_utilization=0.92,
135
- trust_remote_code=True,
136
- limit_mm_per_prompt={"image": 0, "audio": 0},
137
- ))
138
- print(f"[vLLM] OK TriAttention engine ready", flush=True)
139
- except Exception as e:
140
- print(f"[vLLM] X TriAttention failed: {e}", flush=True)
141
- os.environ.pop("VLLM_ATTENTION_BACKEND", None)
142
- TRIATT_ENABLED = False
143
- engine = None
144
-
145
- # ์‹œ๋„ 2: ํ‘œ์ค€ + 16K
146
- if engine is None:
147
- MAX_MODEL_LEN = 16384
148
  try:
149
- print(f"[vLLM] Try 2: Standard + {MAX_MODEL_LEN}", flush=True)
150
- engine = LLMEngine.from_engine_args(EngineArgs(
151
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
152
- max_model_len=MAX_MODEL_LEN,
153
- gpu_memory_utilization=0.92,
154
- trust_remote_code=True,
155
- limit_mm_per_prompt={"image": 0, "audio": 0},
156
- ))
157
- print(f"[vLLM] OK Standard engine ready", flush=True)
158
  except Exception as e:
159
- print(f"[vLLM] X 16K failed: {e}", flush=True)
160
- engine = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # ์‹œ๋„ 3: ์ตœ์†Œ 8K
163
- if engine is None:
164
- MAX_MODEL_LEN = 8192
165
- try:
166
- print(f"[vLLM] Try 3: Minimal + {MAX_MODEL_LEN}", flush=True)
167
- engine = LLMEngine.from_engine_args(EngineArgs(
168
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
169
- max_model_len=MAX_MODEL_LEN,
170
- gpu_memory_utilization=0.90,
171
- trust_remote_code=True,
172
- limit_mm_per_prompt={"image": 0, "audio": 0},
173
- ))
174
- print(f"[vLLM] OK Minimal engine ready", flush=True)
175
- except Exception as e:
176
- print(f"[vLLM] XXX All failed: {e}", flush=True)
177
- sys.exit(1)
178
 
 
 
 
 
 
 
 
179
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
180
- _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
181
- print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
 
 
182
 
183
  # ==============================================================================
184
  # 4. THINKING MODE HELPERS
@@ -211,27 +179,31 @@ def format_response(raw: str) -> str:
211
  return raw
212
 
213
  # ==============================================================================
214
- # 5. GENERATION -- vLLM Engine + MTI Streaming
215
  # ==============================================================================
216
- def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
217
- """vLLM ์—”์ง„ ์ƒ์„ฑ + Queue ์ŠคํŠธ๋ฆฌ๋ฐ"""
218
  try:
219
- request_id = str(uuid.uuid4())
220
- token_ids = tokenizer.encode(prompt_text)
221
- engine.add_request(request_id, TokensPrompt(prompt_token_ids=token_ids), params)
222
-
223
- prev_len = 0
224
- while engine.has_unfinished_requests():
225
- step_outputs = engine.step()
226
- for output in step_outputs:
227
- text = output.outputs[0].text
228
- if len(text) > prev_len:
229
- queue.put(text[prev_len:])
230
- prev_len = len(text)
231
- if output.finished:
232
- queue.put(None)
233
- return
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  queue.put(None)
236
  except Exception as e:
237
  queue.put(f"\n\n**โŒ Engine error:** `{e}`")
@@ -288,19 +260,20 @@ def generate_reply(
288
 
289
  input_len = len(tokenizer.encode(prompt_text))
290
  print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
291
- f"temp={temperature}, MTI=on, Attn={_attn}", flush=True)
292
 
293
  mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
294
 
295
- params = SamplingParams(
296
- max_tokens=max_new_tokens,
297
- temperature=max(float(temperature), 0.01) if temperature > 0.01 else 0.0,
 
 
298
  top_p=float(top_p),
299
- logits_processors=[mti],
300
  )
301
 
302
- queue = Queue()
303
- thread = Thread(target=_engine_generate, args=(prompt_text, params, queue))
304
  thread.start()
305
 
306
  output = ""
@@ -424,8 +397,7 @@ async def oauth_logout(request: Request):
424
  async def health():
425
  return {
426
  "status": "ok", "model": MODEL_ID,
427
- "backend": "vLLM-Engine",
428
- "attention": "TriAttention" if TRIATT_ENABLED else "Standard",
429
  "mti": "enabled",
430
  "max_tokens": MODEL_CAP["max_tokens"],
431
  "max_model_len": MAX_MODEL_LEN,
@@ -480,5 +452,5 @@ signal.signal(signal.SIGTERM, _shutdown)
480
  signal.signal(signal.SIGINT, _shutdown)
481
 
482
  if __name__ == "__main__":
483
- print(f"[BOOT] {MODEL_NAME} - vLLM - {_attn} - MTI - max_len={MAX_MODEL_LEN} - Ready", flush=True)
484
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # Darwin-4B-David (Gemma4) - Transformers backend + MTI
2
+ # Multimodal (Vision+Audio+Text) - Apache 2.0
3
+ # MTI: +9-11% reasoning accuracy (training-free), Transformers LogitsProcessor
 
4
  import sys, os, signal, time, uuid
5
  print(f"[BOOT] Python {sys.version}", flush=True)
6
 
 
 
 
 
 
 
 
 
 
 
7
  import base64, re, json
8
  from typing import Generator, Optional
9
  from threading import Thread
 
22
  # ==============================================================================
23
  # 1. CONFIG
24
  # ==============================================================================
25
+ MODEL_ID = "FINAL-Bench/Darwin-4B-David"
26
+ MODEL_NAME = "Darwin-4B-David"
27
  MODEL_CAP = {
28
+ "arch": "Gemma4", "active": "4B", "total": "4B",
29
  "ctx": "128K", "thinking": True, "vision": True, "audio": True,
30
  "max_tokens": 16384, "temp_max": 2.0,
31
  }
 
40
 
41
  # ==============================================================================
42
  # 2. MTI -- Minimal Test-Time Intervention (arxiv 2510.13940)
43
+ # Transformers LogitsProcessor API: __call__(input_ids, scores) -> scores
44
  # ==============================================================================
45
+ from transformers import LogitsProcessor, LogitsProcessorList
46
+
47
+ class MTILogitsProcessor(LogitsProcessor):
48
  """
49
+ High-entropy (uncertain) tokens only -> apply CFG-style sharpening.
50
+ Training-free serving-time intervention, ~15% of tokens affected.
51
  """
52
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
53
  self.cfg_scale = cfg_scale
 
55
  self._interventions = 0
56
  self._total = 0
57
 
58
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
    """Sharpen next-token logits for uncertain rows only (MTI).

    scores has shape (batch_size, vocab_size). A row whose softmax entropy
    exceeds self.entropy_threshold gets a CFG-style push away from its mean
    logit, scaled by self.cfg_scale; every other row passes through unchanged.
    input_ids is unused but required by the Transformers LogitsProcessor API.
    """
    self._total += int(scores.shape[0])
    dist = torch.softmax(scores, dim=-1)
    row_entropy = -(dist * torch.log(dist.clamp_min(1e-10))).sum(dim=-1)  # (batch_size,)
    uncertain = row_entropy > self.entropy_threshold  # (batch_size,) bool mask
    if uncertain.any().item():
        center = scores.mean(dim=-1, keepdim=True)
        sharpened = scores + self.cfg_scale * (scores - center)
        scores = torch.where(uncertain.unsqueeze(-1), sharpened, scores)
        self._interventions += int(uncertain.sum().item())
    return scores
70
 
71
  @property
72
  def intervention_rate(self):
 
75
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
76
 
77
  # ==============================================================================
78
+ # 3. TOKENIZER + MODEL LOAD (Transformers from source)
79
  # ==============================================================================
80
+ from transformers import (
81
+ AutoTokenizer,
82
+ Gemma4ForConditionalGeneration,
83
+ TextIteratorStreamer,
84
+ )
 
 
85
  from huggingface_hub import hf_hub_download
86
  import tempfile, shutil
87
 
88
+ # ---- Tokenizer with extra_special_tokens patch ----
89
+ # Transformers 5.5.x (git) has a regression where tokenizer_config.json with
90
+ # extra_special_tokens stored as a list crashes during load (.keys() call on
91
+ # a list). We pre-download, patch if needed, then load from the local copy.
92
+ _tok_source = MODEL_ID
93
+ _tok_dir = tempfile.mkdtemp(prefix="darwin_tok_")
94
 
 
95
  for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
96
+ "special_tokens_map.json", "chat_template.jinja"]:
97
  try:
98
  _p = hf_hub_download(_tok_source, _fname)
99
  shutil.copy(_p, os.path.join(_tok_dir, _fname))
100
  except Exception:
101
  pass
102
 
 
103
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
104
  if os.path.exists(_tc_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  try:
106
+ with open(_tc_path) as f:
107
+ _tc = json.load(f)
108
+ est = _tc.get("extra_special_tokens", None)
109
+ if isinstance(est, list):
110
+ _tc["extra_special_tokens"] = {tok: tok for tok in est} if est else {}
111
+ with open(_tc_path, "w") as f:
112
+ json.dump(_tc, f, indent=2)
113
+ print(f"[Tokenizer] Patched extra_special_tokens: list({len(est)}) -> dict", flush=True)
 
114
  except Exception as e:
115
+ print(f"[Tokenizer] Patch skipped: {e}", flush=True)
116
+
117
+ tokenizer = AutoTokenizer.from_pretrained(_tok_dir)
118
+ print(f"[Tokenizer] Loaded (vocab={len(tokenizer)}) from {_tok_source}", flush=True)
119
+
120
# ---- Model ----
print(f"[Transformers] Loading {MODEL_ID} (this may take a while for a 16GB checkpoint)...", flush=True)

def _load_checkpoint():
    """Call from_pretrained with the modern `dtype` kwarg, retrying with the
    legacy `torch_dtype` name when the installed transformers predates it."""
    kwargs = {"dtype": torch.bfloat16, "device_map": "auto", "low_cpu_mem_usage": True}
    try:
        return Gemma4ForConditionalGeneration.from_pretrained(MODEL_ID, **kwargs)
    except TypeError:
        # Older transformers signatures used torch_dtype instead of dtype.
        kwargs["torch_dtype"] = kwargs.pop("dtype")
        return Gemma4ForConditionalGeneration.from_pretrained(MODEL_ID, **kwargs)

model = _load_checkpoint()
model.eval()
_device = next(model.parameters()).device
print(f"[Transformers] Model loaded on {_device}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ # Resolve max model length (text config for multimodal Gemma4).
139
+ try:
140
+ _text_cfg = model.config.get_text_config()
141
+ except AttributeError:
142
+ _text_cfg = getattr(model.config, "text_config", model.config)
143
+ MAX_MODEL_LEN = int(getattr(_text_cfg, "max_position_embeddings", 16384))
144
+ # Clamp generation max_tokens to what the runtime can actually hold.
145
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
146
+ print(f"[Transformers] max_position_embeddings={MAX_MODEL_LEN}, "
147
+ f"max_tokens={MODEL_CAP['max_tokens']}", flush=True)
148
+
149
+ BACKEND_NAME = "Transformers"
150
 
151
  # ==============================================================================
152
  # 4. THINKING MODE HELPERS
 
179
  return raw
180
 
181
  # ==============================================================================
182
+ # 5. GENERATION -- Transformers TextIteratorStreamer + MTI
183
  # ==============================================================================
184
def _engine_generate(prompt_text: str, gen_kwargs: dict, mti: MTILogitsProcessor, queue: Queue):
    """Run model.generate in a background thread and stream tokens into queue.

    Decoded text chunks (str) are pushed into `queue` as they arrive, followed
    by a `None` sentinel when generation finishes. On failure a user-visible
    markdown error string is pushed, also followed by the sentinel, so the
    consumer loop can never hang waiting for an end marker.
    """
    try:
        inputs = tokenizer(prompt_text, return_tensors="pt").to(_device)
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=False, timeout=120.0,
        )

        # pad_token_id can legitimately be 0 (falsy) — Gemma-family tokenizers
        # use pad=0 — so test against None explicitly; `pad or eos` would
        # silently remap a valid pad id of 0 to eos.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id

        full_kwargs = {
            **inputs,
            "streamer": streamer,
            "logits_processor": LogitsProcessorList([mti]),
            "pad_token_id": pad_id,
            **gen_kwargs,  # caller-supplied sampling options win on conflicts
        }

        gen_thread = Thread(target=model.generate, kwargs=full_kwargs)
        gen_thread.start()

        for chunk in streamer:
            if chunk:
                queue.put(chunk)
        gen_thread.join()
        queue.put(None)
    except Exception as e:
        queue.put(f"\n\n**โŒ Engine error:** `{e}`")
        queue.put(None)  # sentinel even on failure so the reader terminates
 
260
 
261
  input_len = len(tokenizer.encode(prompt_text))
262
  print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
263
+ f"temp={temperature}, MTI=on, Backend={BACKEND_NAME}", flush=True)
264
 
265
  mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
266
 
267
+ do_sample = float(temperature) > 0.01
268
+ gen_kwargs = dict(
269
+ max_new_tokens=max_new_tokens,
270
+ do_sample=do_sample,
271
+ temperature=max(float(temperature), 0.01) if do_sample else 1.0,
272
  top_p=float(top_p),
 
273
  )
274
 
275
+ queue: Queue = Queue()
276
+ thread = Thread(target=_engine_generate, args=(prompt_text, gen_kwargs, mti, queue))
277
  thread.start()
278
 
279
  output = ""
 
397
  async def health():
398
  return {
399
  "status": "ok", "model": MODEL_ID,
400
+ "backend": BACKEND_NAME,
 
401
  "mti": "enabled",
402
  "max_tokens": MODEL_CAP["max_tokens"],
403
  "max_model_len": MAX_MODEL_LEN,
 
452
  signal.signal(signal.SIGINT, _shutdown)
453
 
454
  if __name__ == "__main__":
455
+ print(f"[BOOT] {MODEL_NAME} - {BACKEND_NAME} - MTI - max_len={MAX_MODEL_LEN} - Ready", flush=True)
456
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -6,14 +6,14 @@ uvicorn
6
  fastapi
7
  requests
8
  PyMuPDF
9
- torch
10
- transformers>=4.45.0
11
- accelerate>=0.26.0
 
 
 
 
 
12
  sentencepiece
13
  protobuf
14
- # ── TriAttention KV Cache Optimization ──
15
- aither-kvcache[triton]>=2.0.0
16
- # ── Fallback: Quantized KV Cache ──
17
- optimum-quanto
18
  openai
19
- vllm
 
6
  fastapi
7
  requests
8
  PyMuPDF
9
+ torch>=2.4.0
10
+ # Gemma4 (model_type="gemma4") is only available in the Transformers dev branch.
11
+ # PyPI releases of transformers do NOT recognize this architecture, which is
12
+ # what caused the "The checkpoint you are trying to load has model type
13
+ # `gemma4` but Transformers does not recognize this architecture" runtime
14
+ # error. Do NOT pin a PyPI version here.
15
+ transformers @ git+https://github.com/huggingface/transformers.git
16
+ accelerate>=1.0.0
17
  sentencepiece
18
  protobuf
 
 
 
 
19
  openai