SeaWolf-AI committed on
Commit
c8a5e69
·
1 Parent(s): d2fff68

Fix gemma4 runtime error: switch to Transformers backend + Darwin-4B-David

Browse files

Root cause: requirements.txt pinned transformers>=4.45.0 from PyPI, which
has no gemma4 architecture. vLLM additionally lacks a Gemma4 model
registration, so even a newer transformers would not have fixed the vLLM
codepath. Result: all 3 engine-init tiers crashed with

Value error, The checkpoint you are trying to load has model type
gemma4 but Transformers does not recognize this architecture.

Changes:
- requirements.txt: drop vllm / aither-kvcache / optimum-quanto,
install transformers @ git+https://github.com/huggingface/transformers.git
(gemma4 is only present in the dev tree, unreleased).
- app.py: remove vLLM LLMEngine / SamplingParams / TokensPrompt /
TriAttention path. Load Gemma4ForConditionalGeneration directly with
dtype=bfloat16, device_map=auto. Stream via TextIteratorStreamer in a
background thread. Adapt MTILogitsProcessor to the Transformers
LogitsProcessor API (batched entropy mask). Switch MODEL_ID to
FINAL-Bench/Darwin-4B-David and point the tokenizer pre-download and
extra_special_tokens patch at the same repo. Remove dead _attn,
TRIATT_ENABLED, vllm-specific health fields.
- Dockerfile: replace vllm/vllm-openai base with nvidia/cuda runtime
image and pip-install transformers from git. Note: the Space runs
under sdk gradio so the Dockerfile is ignored; kept in sync for
future sdk docker flips.

Files changed (3) hide show
  1. Dockerfile +20 -13
  2. app.py +110 -138
  3. requirements.txt +8 -8
Dockerfile CHANGED
@@ -1,24 +1,31 @@
1
- FROM vllm/vllm-openai:gemma4
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive
4
  ENV PYTHONUNBUFFERED=1
5
 
6
- # transformers must be installed from source for the Gemma 4 model_type
7
- RUN pip install --no-cache-dir \
8
- "git+https://github.com/huggingface/transformers.git" \
9
- gradio>=5.0 \
10
- fastapi \
11
- uvicorn \
12
- httpx \
13
- requests \
14
- PyMuPDF
15
 
16
- # TriAttention (optional)
17
- RUN pip install --no-cache-dir aither-kvcache || true
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  WORKDIR /app
20
  COPY . /app
21
 
22
  EXPOSE 7860
23
 
24
- CMD ["python3", "app.py"]
 
1
+ FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
2
 
3
  ENV DEBIAN_FRONTEND=noninteractive
4
  ENV PYTHONUNBUFFERED=1
5
 
6
+ RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ python3 python3-pip python3-dev git curl ca-certificates \
8
+ && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
 
9
 
10
+ # Gemma4 (model_type="gemma4") is only available in the Transformers git dev
11
+ # branch. Installing from PyPI WILL fail at runtime with:
12
+ # "The checkpoint you are trying to load has model type `gemma4` but
13
+ # Transformers does not recognize this architecture."
14
+ # Keep this install line pointed at git+https until gemma4 lands in a release.
15
+ RUN pip install --no-cache-dir --upgrade pip && \
16
+ pip install --no-cache-dir \
17
+ "torch>=2.4.0" \
18
+ "git+https://github.com/huggingface/transformers.git" \
19
+ "accelerate>=1.0.0" \
20
+ "huggingface_hub" \
21
+ "sentencepiece" "protobuf" \
22
+ "gradio>=5.0" \
23
+ "fastapi" "uvicorn" "httpx" "requests" \
24
+ "Pillow" "PyMuPDF" "openai"
25
 
26
  WORKDIR /app
27
  COPY . /app
28
 
29
  EXPOSE 7860
30
 
31
+ CMD ["python3", "app.py"]
app.py CHANGED
@@ -1,20 +1,9 @@
1
- # Gemma 4 E4B - vLLM + MTI + TriAttention
2
- # Multimodal (Vision+Audio+Text) - Effective 4.5B - Apache 2.0
3
- # MTI: +9-11% reasoning accuracy (training-free)
4
- # TriAttention: ~10x KV cache compression
5
  import sys, os, signal, time, uuid
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
- # -- TriAttention attempt --
9
- TRIATT_ENABLED = False
10
- try:
11
- import aither_kvcache
12
- os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
13
- TRIATT_ENABLED = True
14
- print("[TRIATT] aither-kvcache -> VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
15
- except ImportError:
16
- print("[TRIATT] aither-kvcache not found -> standard attention", flush=True)
17
-
18
  import base64, re, json
19
  from typing import Generator, Optional
20
  from threading import Thread
@@ -33,10 +22,10 @@ import pathlib, secrets
33
  # ==============================================================================
34
  # 1. CONFIG
35
  # ==============================================================================
36
- MODEL_ID = "DavidAU/gemma-4-E4B-it-The-DECKARD-Expresso-Universe-HERETIC-UNCENSORED-Thinking"
37
- MODEL_NAME = "DECKARD-E4B-Opus"
38
  MODEL_CAP = {
39
- "arch": "Gemma4 PLE", "active": "4.5B", "total": "~8B",
40
  "ctx": "128K", "thinking": True, "vision": True, "audio": True,
41
  "max_tokens": 16384, "temp_max": 2.0,
42
  }
@@ -51,11 +40,14 @@ PRESETS = {
51
 
52
  # ==============================================================================
53
  # 2. MTI -- Minimal Test-Time Intervention (arxiv 2510.13940)
 
54
  # ==============================================================================
55
- class MTILogitsProcessor:
 
 
56
  """
57
- Apply CFG only to high-entropy (uncertain) tokens -> reasoning accuracy +9~11%.
58
- Serving-time only, no training needed. Intervenes on only ~15% of all tokens.
59
  """
60
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
61
  self.cfg_scale = cfg_scale
@@ -63,17 +55,18 @@ class MTILogitsProcessor:
63
  self._interventions = 0
64
  self._total = 0
65
 
66
- def __call__(self, token_ids, logits):
67
- self._total += 1
68
- probs = torch.softmax(logits, dim=-1)
69
- entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
70
-
71
- if entropy.item() > self.entropy_threshold:
72
- mean_logit = logits.mean(dim=-1, keepdim=True)
73
- guided = logits + self.cfg_scale * (logits - mean_logit)
74
- self._interventions += 1
75
- return guided
76
- return logits
 
77
 
78
  @property
79
  def intervention_rate(self):
@@ -82,103 +75,78 @@ class MTILogitsProcessor:
82
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
83
 
84
  # ==============================================================================
85
- # 3. vLLM ENGINE -- Gemma 4 Day-0 support, no patch needed
86
  # ==============================================================================
87
- from vllm.engine.arg_utils import EngineArgs
88
- from vllm.engine.llm_engine import LLMEngine
89
- from vllm import SamplingParams, TokensPrompt
90
- from transformers import AutoTokenizer
91
-
92
- # -- Gemma 4 tokenizer compatibility patch --
93
- # transformers 5.5.0+ raises a .keys() error when extra_special_tokens is a list
94
  from huggingface_hub import hf_hub_download
95
  import tempfile, shutil
96
 
97
- _tok_source = "google/gemma-4-E4B-it"
98
- _tok_dir = tempfile.mkdtemp()
 
 
 
 
99
 
100
- # download the tokenizer files
101
  for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
102
- "special_tokens_map.json", "chat_template.jinja"]:
103
  try:
104
  _p = hf_hub_download(_tok_source, _fname)
105
  shutil.copy(_p, os.path.join(_tok_dir, _fname))
106
  except Exception:
107
  pass
108
 
109
- # patch tokenizer_config.json: extra_special_tokens list -> dict
110
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
111
  if os.path.exists(_tc_path):
112
- with open(_tc_path) as f:
113
- _tc = json.load(f)
114
- est = _tc.get("extra_special_tokens", None)
115
- if isinstance(est, list):
116
- _tc["extra_special_tokens"] = {tok: tok for tok in est} if est else {}
117
- with open(_tc_path, "w") as f:
118
- json.dump(_tc, f, indent=2)
119
- print(f"[vLLM] Patched extra_special_tokens: list({len(est)}) -> dict", flush=True)
120
-
121
- tokenizer = AutoTokenizer.from_pretrained(_tok_dir, trust_remote_code=True)
122
- print(f"[vLLM] Tokenizer loaded (vocab={len(tokenizer)})", flush=True)
123
-
124
- engine = None
125
- MAX_MODEL_LEN = 32768
126
-
127
- # ์‹œ๋„ 1: TriAttention + 32K
128
- if engine is None and TRIATT_ENABLED:
129
- try:
130
- print(f"[vLLM] Try 1: TriAttention + {MAX_MODEL_LEN}", flush=True)
131
- engine = LLMEngine.from_engine_args(EngineArgs(
132
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
133
- max_model_len=MAX_MODEL_LEN,
134
- gpu_memory_utilization=0.92,
135
- trust_remote_code=True,
136
- limit_mm_per_prompt={"image": 0, "audio": 0},
137
- ))
138
- print(f"[vLLM] OK TriAttention engine ready", flush=True)
139
- except Exception as e:
140
- print(f"[vLLM] X TriAttention failed: {e}", flush=True)
141
- os.environ.pop("VLLM_ATTENTION_BACKEND", None)
142
- TRIATT_ENABLED = False
143
- engine = None
144
-
145
- # ์‹œ๋„ 2: ํ‘œ์ค€ + 16K
146
- if engine is None:
147
- MAX_MODEL_LEN = 16384
148
  try:
149
- print(f"[vLLM] Try 2: Standard + {MAX_MODEL_LEN}", flush=True)
150
- engine = LLMEngine.from_engine_args(EngineArgs(
151
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
152
- max_model_len=MAX_MODEL_LEN,
153
- gpu_memory_utilization=0.92,
154
- trust_remote_code=True,
155
- limit_mm_per_prompt={"image": 0, "audio": 0},
156
- ))
157
- print(f"[vLLM] OK Standard engine ready", flush=True)
158
  except Exception as e:
159
- print(f"[vLLM] X 16K failed: {e}", flush=True)
160
- engine = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # ์‹œ๋„ 3: ์ตœ์†Œ 8K
163
- if engine is None:
164
- MAX_MODEL_LEN = 8192
165
- try:
166
- print(f"[vLLM] Try 3: Minimal + {MAX_MODEL_LEN}", flush=True)
167
- engine = LLMEngine.from_engine_args(EngineArgs(
168
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
169
- max_model_len=MAX_MODEL_LEN,
170
- gpu_memory_utilization=0.90,
171
- trust_remote_code=True,
172
- limit_mm_per_prompt={"image": 0, "audio": 0},
173
- ))
174
- print(f"[vLLM] OK Minimal engine ready", flush=True)
175
- except Exception as e:
176
- print(f"[vLLM] XXX All failed: {e}", flush=True)
177
- sys.exit(1)
178
 
 
 
 
 
 
 
 
179
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
180
- _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
181
- print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
 
 
182
 
183
  # ==============================================================================
184
  # 4. THINKING MODE HELPERS
@@ -211,27 +179,31 @@ def format_response(raw: str) -> str:
211
  return raw
212
 
213
  # ==============================================================================
214
- # 5. GENERATION -- vLLM Engine + MTI Streaming
215
  # ==============================================================================
216
- def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
217
- """vLLM ์—”์ง„ ์ƒ์„ฑ + Queue ์ŠคํŠธ๋ฆฌ๋ฐ"""
218
  try:
219
- request_id = str(uuid.uuid4())
220
- token_ids = tokenizer.encode(prompt_text)
221
- engine.add_request(request_id, TokensPrompt(prompt_token_ids=token_ids), params)
222
-
223
- prev_len = 0
224
- while engine.has_unfinished_requests():
225
- step_outputs = engine.step()
226
- for output in step_outputs:
227
- text = output.outputs[0].text
228
- if len(text) > prev_len:
229
- queue.put(text[prev_len:])
230
- prev_len = len(text)
231
- if output.finished:
232
- queue.put(None)
233
- return
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  queue.put(None)
236
  except Exception as e:
237
  queue.put(f"\n\n**โŒ Engine error:** `{e}`")
@@ -288,19 +260,20 @@ def generate_reply(
288
 
289
  input_len = len(tokenizer.encode(prompt_text))
290
  print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
291
- f"temp={temperature}, MTI=on, Attn={_attn}", flush=True)
292
 
293
  mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
294
 
295
- params = SamplingParams(
296
- max_tokens=max_new_tokens,
297
- temperature=max(float(temperature), 0.01) if temperature > 0.01 else 0.0,
 
 
298
  top_p=float(top_p),
299
- logits_processors=[mti],
300
  )
301
 
302
- queue = Queue()
303
- thread = Thread(target=_engine_generate, args=(prompt_text, params, queue))
304
  thread.start()
305
 
306
  output = ""
@@ -424,8 +397,7 @@ async def oauth_logout(request: Request):
424
  async def health():
425
  return {
426
  "status": "ok", "model": MODEL_ID,
427
- "backend": "vLLM-Engine",
428
- "attention": "TriAttention" if TRIATT_ENABLED else "Standard",
429
  "mti": "enabled",
430
  "max_tokens": MODEL_CAP["max_tokens"],
431
  "max_model_len": MAX_MODEL_LEN,
@@ -480,5 +452,5 @@ signal.signal(signal.SIGTERM, _shutdown)
480
  signal.signal(signal.SIGINT, _shutdown)
481
 
482
  if __name__ == "__main__":
483
- print(f"[BOOT] {MODEL_NAME} - vLLM - {_attn} - MTI - max_len={MAX_MODEL_LEN} - Ready", flush=True)
484
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ # Darwin-4B-David (Gemma4) - Transformers backend + MTI
2
+ # Multimodal (Vision+Audio+Text) - Apache 2.0
3
+ # MTI: +9-11% reasoning accuracy (training-free), Transformers LogitsProcessor
 
4
  import sys, os, signal, time, uuid
5
  print(f"[BOOT] Python {sys.version}", flush=True)
6
 
 
 
 
 
 
 
 
 
 
 
7
  import base64, re, json
8
  from typing import Generator, Optional
9
  from threading import Thread
 
22
  # ==============================================================================
23
  # 1. CONFIG
24
  # ==============================================================================
25
+ MODEL_ID = "FINAL-Bench/Darwin-4B-David"
26
+ MODEL_NAME = "Darwin-4B-David"
27
  MODEL_CAP = {
28
+ "arch": "Gemma4", "active": "4B", "total": "4B",
29
  "ctx": "128K", "thinking": True, "vision": True, "audio": True,
30
  "max_tokens": 16384, "temp_max": 2.0,
31
  }
 
40
 
41
  # ==============================================================================
42
  # 2. MTI -- Minimal Test-Time Intervention (arxiv 2510.13940)
43
+ # Transformers LogitsProcessor API: __call__(input_ids, scores) -> scores
44
  # ==============================================================================
45
+ from transformers import LogitsProcessor, LogitsProcessorList
46
+
47
+ class MTILogitsProcessor(LogitsProcessor):
48
  """
49
+ High-entropy (uncertain) tokens only -> apply CFG-style sharpening.
50
+ Training-free serving-time intervention, ~15% of tokens affected.
51
  """
52
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
53
  self.cfg_scale = cfg_scale
 
55
  self._interventions = 0
56
  self._total = 0
57
 
58
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
    """Sharpen next-token logits for uncertain rows only (MTI).

    scores has shape (batch_size, vocab_size). A row whose softmax entropy
    exceeds self.entropy_threshold gets a CFG-style push away from its mean
    logit, scaled by self.cfg_scale; every other row passes through unchanged.
    input_ids is unused but required by the Transformers LogitsProcessor API.
    """
    self._total += int(scores.shape[0])
    dist = torch.softmax(scores, dim=-1)
    row_entropy = -(dist * torch.log(dist.clamp_min(1e-10))).sum(dim=-1)  # (batch_size,)
    uncertain = row_entropy > self.entropy_threshold  # (batch_size,) bool mask
    if uncertain.any().item():
        center = scores.mean(dim=-1, keepdim=True)
        sharpened = scores + self.cfg_scale * (scores - center)
        scores = torch.where(uncertain.unsqueeze(-1), sharpened, scores)
        self._interventions += int(uncertain.sum().item())
    return scores
70
 
71
  @property
72
  def intervention_rate(self):
 
75
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
76
 
77
  # ==============================================================================
78
+ # 3. TOKENIZER + MODEL LOAD (Transformers from source)
79
  # ==============================================================================
80
+ from transformers import (
81
+ AutoTokenizer,
82
+ Gemma4ForConditionalGeneration,
83
+ TextIteratorStreamer,
84
+ )
 
 
85
  from huggingface_hub import hf_hub_download
86
  import tempfile, shutil
87
 
88
+ # ---- Tokenizer with extra_special_tokens patch ----
89
+ # Transformers 5.5.x (git) has a regression where tokenizer_config.json with
90
+ # extra_special_tokens stored as a list crashes during load (.keys() call on
91
+ # a list). We pre-download, patch if needed, then load from the local copy.
92
+ _tok_source = MODEL_ID
93
+ _tok_dir = tempfile.mkdtemp(prefix="darwin_tok_")
94
 
 
95
  for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
96
+ "special_tokens_map.json", "chat_template.jinja"]:
97
  try:
98
  _p = hf_hub_download(_tok_source, _fname)
99
  shutil.copy(_p, os.path.join(_tok_dir, _fname))
100
  except Exception:
101
  pass
102
 
 
103
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
104
  if os.path.exists(_tc_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  try:
106
+ with open(_tc_path) as f:
107
+ _tc = json.load(f)
108
+ est = _tc.get("extra_special_tokens", None)
109
+ if isinstance(est, list):
110
+ _tc["extra_special_tokens"] = {tok: tok for tok in est} if est else {}
111
+ with open(_tc_path, "w") as f:
112
+ json.dump(_tc, f, indent=2)
113
+ print(f"[Tokenizer] Patched extra_special_tokens: list({len(est)}) -> dict", flush=True)
 
114
  except Exception as e:
115
+ print(f"[Tokenizer] Patch skipped: {e}", flush=True)
116
+
117
+ tokenizer = AutoTokenizer.from_pretrained(_tok_dir)
118
+ print(f"[Tokenizer] Loaded (vocab={len(tokenizer)}) from {_tok_source}", flush=True)
119
+
120
# ---- Model ----
print(f"[Transformers] Loading {MODEL_ID} (this may take a while for a 16GB checkpoint)...", flush=True)

def _load_checkpoint():
    """Call from_pretrained with the modern `dtype` kwarg, retrying with the
    legacy `torch_dtype` name when the installed transformers predates it."""
    kwargs = {"dtype": torch.bfloat16, "device_map": "auto", "low_cpu_mem_usage": True}
    try:
        return Gemma4ForConditionalGeneration.from_pretrained(MODEL_ID, **kwargs)
    except TypeError:
        # Older transformers signatures used torch_dtype instead of dtype.
        kwargs["torch_dtype"] = kwargs.pop("dtype")
        return Gemma4ForConditionalGeneration.from_pretrained(MODEL_ID, **kwargs)

model = _load_checkpoint()
model.eval()
_device = next(model.parameters()).device
print(f"[Transformers] Model loaded on {_device}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ # Resolve max model length (text config for multimodal Gemma4).
139
+ try:
140
+ _text_cfg = model.config.get_text_config()
141
+ except AttributeError:
142
+ _text_cfg = getattr(model.config, "text_config", model.config)
143
+ MAX_MODEL_LEN = int(getattr(_text_cfg, "max_position_embeddings", 16384))
144
+ # Clamp generation max_tokens to what the runtime can actually hold.
145
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
146
+ print(f"[Transformers] max_position_embeddings={MAX_MODEL_LEN}, "
147
+ f"max_tokens={MODEL_CAP['max_tokens']}", flush=True)
148
+
149
+ BACKEND_NAME = "Transformers"
150
 
151
  # ==============================================================================
152
  # 4. THINKING MODE HELPERS
 
179
  return raw
180
 
181
  # ==============================================================================
182
+ # 5. GENERATION -- Transformers TextIteratorStreamer + MTI
183
  # ==============================================================================
184
def _engine_generate(prompt_text: str, gen_kwargs: dict, mti: MTILogitsProcessor, queue: Queue):
    """Run model.generate in a background thread and stream tokens into queue.

    Decoded text chunks (str) are pushed into `queue` as they arrive, followed
    by a `None` sentinel when generation finishes. On failure a user-visible
    markdown error string is pushed, also followed by the sentinel, so the
    consumer loop can never hang waiting for an end marker.
    """
    try:
        inputs = tokenizer(prompt_text, return_tensors="pt").to(_device)
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=False, timeout=120.0,
        )

        # pad_token_id can legitimately be 0 (falsy) — Gemma-family tokenizers
        # use pad=0 — so test against None explicitly; `pad or eos` would
        # silently remap a valid pad id of 0 to eos.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id

        full_kwargs = {
            **inputs,
            "streamer": streamer,
            "logits_processor": LogitsProcessorList([mti]),
            "pad_token_id": pad_id,
            **gen_kwargs,  # caller-supplied sampling options win on conflicts
        }

        gen_thread = Thread(target=model.generate, kwargs=full_kwargs)
        gen_thread.start()

        for chunk in streamer:
            if chunk:
                queue.put(chunk)
        gen_thread.join()
        queue.put(None)
    except Exception as e:
        queue.put(f"\n\n**โŒ Engine error:** `{e}`")
        queue.put(None)  # sentinel even on failure so the reader terminates
 
260
 
261
  input_len = len(tokenizer.encode(prompt_text))
262
  print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
263
+ f"temp={temperature}, MTI=on, Backend={BACKEND_NAME}", flush=True)
264
 
265
  mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
266
 
267
+ do_sample = float(temperature) > 0.01
268
+ gen_kwargs = dict(
269
+ max_new_tokens=max_new_tokens,
270
+ do_sample=do_sample,
271
+ temperature=max(float(temperature), 0.01) if do_sample else 1.0,
272
  top_p=float(top_p),
 
273
  )
274
 
275
+ queue: Queue = Queue()
276
+ thread = Thread(target=_engine_generate, args=(prompt_text, gen_kwargs, mti, queue))
277
  thread.start()
278
 
279
  output = ""
 
397
  async def health():
398
  return {
399
  "status": "ok", "model": MODEL_ID,
400
+ "backend": BACKEND_NAME,
 
401
  "mti": "enabled",
402
  "max_tokens": MODEL_CAP["max_tokens"],
403
  "max_model_len": MAX_MODEL_LEN,
 
452
  signal.signal(signal.SIGINT, _shutdown)
453
 
454
  if __name__ == "__main__":
455
+ print(f"[BOOT] {MODEL_NAME} - {BACKEND_NAME} - MTI - max_len={MAX_MODEL_LEN} - Ready", flush=True)
456
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -6,14 +6,14 @@ uvicorn
6
  fastapi
7
  requests
8
  PyMuPDF
9
- torch
10
- transformers>=4.45.0
11
- accelerate>=0.26.0
 
 
 
 
 
12
  sentencepiece
13
  protobuf
14
- # ── TriAttention KV Cache Optimization ──
15
- aither-kvcache[triton]>=2.0.0
16
- # ── Fallback: Quantized KV Cache ──
17
- optimum-quanto
18
  openai
19
- vllm
 
6
  fastapi
7
  requests
8
  PyMuPDF
9
+ torch>=2.4.0
10
+ # Gemma4 (model_type="gemma4") is only available in the Transformers dev branch.
11
+ # PyPI releases of transformers do NOT recognize this architecture, which is
12
+ # what caused the "The checkpoint you are trying to load has model type
13
+ # `gemma4` but Transformers does not recognize this architecture" runtime
14
+ # error. Do NOT pin a PyPI version here.
15
+ transformers @ git+https://github.com/huggingface/transformers.git
16
+ accelerate>=1.0.0
17
  sentencepiece
18
  protobuf
 
 
 
 
19
  openai