Darwin-4B-david

Paused

App Files Files Community

SeaWolf-AI committed on Apr 9

Commit

e4bf98e

verified ·

1 Parent(s): 1bf9a4a

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -267

app.py CHANGED Viewed

@@ -1,21 +1,21 @@
 """
-🧬 Darwin-9B-Opus — vLLM + TriAttention + MTI
-Docker Space · Qwen3.5 9B · BF16 · TriAttention 10x KV · MTI +9% reasoning
 """
 import sys, os, signal, time, uuid
 print(f"[BOOT] Python {sys.version}", flush=True)
-# ── vLLM 설정 ──
-# ── TriAttention 활성화 시도 ──
 TRIATT_ENABLED = False
 try:
     import aither_kvcache
     os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
     TRIATT_ENABLED = True
-    print("[TRIATT] aither-kvcache found → VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
 except ImportError:
-    print("[TRIATT] aither-kvcache not installed → standard attention", flush=True)
 import base64, re, json
 from typing import Generator, Optional
@@ -32,70 +32,51 @@ from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 from urllib.parse import urlencode
 import pathlib, secrets
-import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 # ══════════════════════════════════════════════════════════════════════════════
 # 1.  CONFIG
 # ══════════════════════════════════════════════════════════════════════════════
-MODEL_ID   = "FINAL-Bench/Darwin-9B-Opus"
-MODEL_NAME = "Darwin-9B-Opus"
 MODEL_CAP  = {
-    "arch": "Qwen3.5 Dense", "active": "9B",
-    "ctx": "131K", "thinking": True, "vision": False,
-    "max_tokens": 32768, "temp_max": 1.5,
 }
 PRESETS = {
-    "general":   "You are Darwin-9B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
-    "code":      "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
-    "math":      "You are a world-class mathematician. Break problems step-by-step. Show full working. Use LaTeX where helpful.",
-    "creative":  "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
-    "translate": "You are a professional translator fluent in 201 languages. Provide accurate, natural-sounding translations with cultural context.",
-    "research":  "You are a rigorous research analyst. Provide structured, well-reasoned analysis. Identify assumptions and acknowledge uncertainty.",
 }
 # ══════════════════════════════════════════════════════════════════════════════
 # 2.  MTI — Minimal Test-Time Intervention (arxiv 2510.13940)
 # ══════════════════════════════════════════════════════════════════════════════
-# 학습 없이 서빙 시 고엔트로피 토큰에만 CFG를 적용하여 추론 정확도 +9~11%.
-# 핵심 발견: 추론 불확실성은 고도로 국소화 — 소수 고엔트로피 토큰만이
-# 출력 정확도에 결정적 영향. 나머지 토큰은 건드리지 않아 비용 최소.
 class MTILogitsProcessor:
     """
-    Minimal Test-Time Intervention — selective CFG on high-entropy tokens only.
-    고엔트로피(불확실) 토큰에서만 classifier-free guidance를 적용:
-    - entropy > threshold → logits를 mean 기준으로 cfg_scale만큼 증폭
-    - entropy <= threshold → logits 그대로 통과 (비용 0)
-    효과: DeepSeek-R1-7B 기준 6개 벤치마크 평균 +9.28%
-    비용: 전체 토큰 중 ~15%만 개입 → 추론 오버헤드 미미
     """
     def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
         self.cfg_scale = cfg_scale
         self.entropy_threshold = entropy_threshold
         self._interventions = 0
         self._total = 0
     def __call__(self, token_ids, logits):
         self._total += 1
-        # 현재 토큰의 엔트로피 계산
         probs = torch.softmax(logits, dim=-1)
         entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
         if entropy.item() > self.entropy_threshold:
-            # 고엔트로피 토큰 → CFG 가이드
-            # unconditional logits를 mean으로 근사 (KV 캐시 재활용)
             mean_logit = logits.mean(dim=-1, keepdim=True)
             guided = logits + self.cfg_scale * (logits - mean_logit)
             self._interventions += 1
             return guided
         return logits
     @property
     def intervention_rate(self):
         return self._interventions / max(self._total, 1)
@@ -103,198 +84,88 @@ class MTILogitsProcessor:
 print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
-# 3.  vLLM ENGINE — 방어적 초기화 (TriAttention + MTI)
 # ══════════════════════════════════════════════════════════════════════════════
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm import SamplingParams, TokensPrompt
 from transformers import AutoTokenizer
-from huggingface_hub import snapshot_download
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 print(f"[vLLM] Tokenizer loaded ✓", flush=True)
-# ── 모델 다운로드 (로컬 복사, symlink 아닌 실제 파일) ──
-print(f"[vLLM] Downloading {MODEL_ID} to /app/model ...", flush=True)
-MODEL_PATH = snapshot_download(MODEL_ID, local_dir="/app/model")
-print(f"[vLLM] Downloaded → {MODEL_PATH}", flush=True)
-print(f"[vLLM] Files: {os.listdir(MODEL_PATH)[:15]}", flush=True)
-_config_path = os.path.join(MODEL_PATH, "config.json")
-try:
-    with open(_config_path) as f:
-        _config = json.load(f)
-    _orig_arch = _config.get("architectures", [])
-    # 1. ConditionalGeneration → CausalLM
-    if any("ConditionalGeneration" in a for a in _orig_arch):
-        _config["architectures"] = [a.replace("ConditionalGeneration", "CausalLM") for a in _orig_arch]
-    # 2. 멀티모달 설정 블록만 제거 (정상 키는 보존)
-    _mm_config_keys = ["vision_config", "video_config", "audio_config",
-                       "visual_config", "video_processor_config",
-                       "image_processor_config"]
-    _removed_keys = []
-    for key in list(_config.keys()):
-        if key in _mm_config_keys:
-            del _config[key]
-            _removed_keys.append(key)
-    # 3. auto_map에서 멀티모달 참조만 제거 (AutoProcessor 등은 유지)
-    if "auto_map" in _config:
-        _auto = _config["auto_map"]
-        _mm_auto_keys = [k for k in _auto if k in ["AutoImageProcessor", "AutoVideoProcessor", "AutoFeatureExtractor"]]
-        for k in _mm_auto_keys:
-            del _auto[k]
-            _removed_keys.append(f"auto_map.{k}")
-    # 4. model_type이 멀티모달을 가리키면 텍스트 전용으로
-    mt = _config.get("model_type", "")
-    if mt in ["qwen3_5_vl", "qwen2_vl", "qwen2_5_vl"]:
-        _config["model_type"] = mt.replace("_vl", "")
-        _removed_keys.append(f"model_type: {mt} → {_config['model_type']}")
-    with open(_config_path, "w") as f:
-        json.dump(_config, f, indent=2)
-    print(f"[vLLM] Config patched → {_config['architectures']}", flush=True)
-    if _removed_keys:
-        print(f"[vLLM] Removed MM keys: {_removed_keys}", flush=True)
-    # 5. preprocessor_config.json 패치 — video processor 참조 제거
-    _preproc_path = os.path.join(MODEL_PATH, "preprocessor_config.json")
-    if os.path.exists(_preproc_path):
-        try:
-            with open(_preproc_path) as f:
-                _preproc = json.load(f)
-            # video 관련 키 제거
-            _video_keys = [k for k in _preproc if "video" in k.lower()]
-            for k in _video_keys:
-                del _preproc[k]
-                _removed_keys.append(f"preproc.{k}")
-            # processor_class가 비디오를 참조하면 제거
-            if "processor_class" in _preproc:
-                _removed_keys.append(f"preproc.processor_class={_preproc['processor_class']}")
-            with open(_preproc_path, "w") as f:
-                json.dump(_preproc, f, indent=2)
-            print(f"[vLLM] preprocessor_config.json patched", flush=True)
-        except Exception as e:
-            print(f"[vLLM] preprocessor patch error: {e}", flush=True)
-    # video_preprocessor_config.json이 있으면 삭제 (이건 필요 없음)
-    _vidproc = os.path.join(MODEL_PATH, "video_preprocessor_config.json")
-    if os.path.exists(_vidproc):
-        os.remove(_vidproc)
-        print(f"[vLLM] Removed video_preprocessor_config.json", flush=True)
-    print(f"[vLLM] Preprocessor files handled", flush=True)
-except Exception as e:
-    print(f"[vLLM] Config patch failed: {e} — proceeding with original", flush=True)
-# ── 단계적 엔진 초기화: 실패 시 점진적 fallback ──
 engine = None
-MAX_MODEL_LEN = 32768  # L4 24GB 안전값 (TriAttention 시 확장 가능)
 # Attempt 1: TriAttention + 32K
 if engine is None and TRIATT_ENABLED:
     try:
-        print(f"[vLLM] Try 1: TriAttention + max_model_len={MAX_MODEL_LEN}", flush=True)
-        args = EngineArgs(
-            model=MODEL_PATH, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.92,
             trust_remote_code=True,
-        )
-        engine = LLMEngine.from_engine_args(args)
-        print(f"[vLLM] ✓ TriAttention engine ready (max_len={MAX_MODEL_LEN})", flush=True)
     except Exception as e:
         print(f"[vLLM] ✗ TriAttention failed: {e}", flush=True)
         os.environ.pop("VLLM_ATTENTION_BACKEND", None)
         TRIATT_ENABLED = False
         engine = None
-# 시도 2: 표준 어텐션 + 16K
 if engine is None:
     MAX_MODEL_LEN = 16384
     try:
-        print(f"[vLLM] Try 2: Standard attention + max_model_len={MAX_MODEL_LEN}", flush=True)
-        args = EngineArgs(
-            model=MODEL_PATH, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.92,
             trust_remote_code=True,
-        )
-        engine = LLMEngine.from_engine_args(args)
-        print(f"[vLLM] ✓ Standard engine ready (max_len={MAX_MODEL_LEN})", flush=True)
     except Exception as e:
         print(f"[vLLM] ✗ 16K failed: {e}", flush=True)
         engine = None
-# 시도 3: 최소 설정 8K
 if engine is None:
     MAX_MODEL_LEN = 8192
     try:
-        print(f"[vLLM] Try 3: Minimal + max_model_len={MAX_MODEL_LEN}", flush=True)
-        args = EngineArgs(
-            model=MODEL_PATH, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.90,
             trust_remote_code=True,
-        )
-        engine = LLMEngine.from_engine_args(args)
-        print(f"[vLLM] ✓ Minimal engine ready (max_len={MAX_MODEL_LEN})", flush=True)
     except Exception as e:
-        print(f"[vLLM] ✗✗✗ All attempts failed: {e}", flush=True)
         sys.exit(1)
-# max_tokens 조정
 MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
-attn_mode = "TriAttention" if TRIATT_ENABLED else "Standard"
-print(f"[vLLM] Final: {attn_mode}, max_model_len={MAX_MODEL_LEN}, "
-      f"max_tokens={MODEL_CAP['max_tokens']}", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
-# 4.  THINKING MODE HELPERS (기존 동일)
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
     m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
-    return (m.group(1).strip(), text[m.end():].strip()) if m else ("", text)
-def _is_thinking_line(line: str) -> bool:
-    l = line.strip()
-    if not l: return True
-    think_starts = [
-        "The user", "the user", "This is", "this is", "I should", "I need to",
-        "Let me", "let me", "My task", "my task", "I'll ", "I will",
-        "Since ", "since ", "Now,", "now,", "So,", "so,", "First,", "first,",
-        "Okay", "okay", "Alright", "Hmm", "Wait", "Actually",
-        "The question", "the question", "The input", "the input",
-        "The request", "the request", "The prompt", "the prompt",
-        "Thinking Process", "Thinking process", "**Thinking",
-        "Step ", "step ", "Approach:", "Analysis:", "Reasoning:",
-        "1. **", "2. **", "3. **", "4. **", "5. **",
-    ]
-    for s in think_starts:
-        if l.startswith(s): return True
-    if l.startswith(("- ", "* ", "○ ")) and any(c.isascii() and c.isalpha() for c in l[:20]):
-        if not any(ord(c) > 0x1100 for c in l[:30]): return True
-    return False
-def _split_thinking_answer(raw: str) -> tuple:
-    lines = raw.split("\n")
-    answer_start = -1
-    for i, line in enumerate(lines):
-        if not _is_thinking_line(line):
-            if any(ord(c) > 0x1100 for c in line.strip()[:10]):
-                answer_start = i; break
-            if i > 2 and not _is_thinking_line(line):
-                if all(not lines[j].strip() for j in range(max(0,i-2), i)):
-                    answer_start = i; break
-    if answer_start > 0:
-        return "\n".join(lines[:answer_start]).strip(), "\n".join(lines[answer_start:]).strip()
-    return "", raw
 def format_response(raw: str) -> str:
     chain, answer = parse_think_blocks(raw)
@@ -303,30 +174,22 @@ def format_response(raw: str) -> str:
             "<details>\n<summary>🧠 Reasoning Chain — click to expand</summary>\n\n"
             f"{chain}\n\n</details>\n\n{answer}"
         )
     if "<think>" in raw and "</think>" not in raw:
         think_len = len(raw) - raw.index("<think>") - 7
-        return f"🧠 Reasoning... ({think_len} chars)"
-    first_line = raw.strip().split("\n")[0] if raw.strip() else ""
-    if _is_thinking_line(first_line) and len(raw) > 20:
-        thinking, answer = _split_thinking_answer(raw)
-        if thinking and answer:
-            return (
-                f"<details>\n<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
-                f"{thinking}\n\n</details>\n\n{answer}"
-            )
-        elif thinking and not answer:
-            return f"🧠 Reasoning... ({len(raw)} chars)"
     return raw
 # ══════════════════════════════════════════════════════════════════════════════
-# 5.  GENERATION — vLLM Engine + TriAttention + MTI Streaming
 # ══════════════════════════════════════════════════════════════════════════════
 def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
-    """vLLM 엔진으로 생성 + Queue로 스트리밍"""
     try:
         request_id = str(uuid.uuid4())
-        # 토크나이즈
         token_ids = tokenizer.encode(prompt_text)
         engine.add_request(request_id, TokensPrompt(prompt_token_ids=token_ids), params)
@@ -334,16 +197,13 @@ def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
         while engine.has_unfinished_requests():
             step_outputs = engine.step()
             for output in step_outputs:
-                if output.finished:
-                    text = output.outputs[0].text
-                    if len(text) > prev_len:
-                        queue.put(text[prev_len:])
-                    queue.put(None)
-                    return
                 text = output.outputs[0].text
                 if len(text) > prev_len:
                     queue.put(text[prev_len:])
                     prev_len = len(text)
         queue.put(None)
     except Exception as e:
@@ -352,20 +212,13 @@ def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
 def generate_reply(
-    message:        str,
-    history:        list,
-    thinking_mode:  str,
-    image_input,
-    system_prompt:  str,
-    max_new_tokens: int,
-    temperature:    float,
-    top_p:          float,
 ) -> Generator[str, None, None]:
     max_new_tokens = min(int(max_new_tokens), MODEL_CAP["max_tokens"])
     temperature    = min(float(temperature),  MODEL_CAP["temp_max"])
-    # ── 메시지 구성 ──
     messages: list[dict] = []
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
@@ -383,17 +236,14 @@ def generate_reply(
                 _, clean = parse_think_blocks(text)
                 messages.append({"role":"assistant","content":clean})
         else:
-            try:
-                u, a = (turn[0] or None), (turn[1] if len(turn)>1 else None)
-            except (IndexError, TypeError):
-                continue
             def _txt(v):
                 if v is None: return None
                 if isinstance(v, list):
-                    return " ".join(p.get("text","") for p in v
-                                    if isinstance(p,dict) and p.get("type")=="text")
                 return str(v)
-            ut = _txt(u); at = _txt(a)
             if ut: messages.append({"role":"user","content":ut})
             if at:
                 _, clean = parse_think_blocks(at)
@@ -401,7 +251,6 @@ def generate_reply(
     messages.append({"role": "user", "content": message})
-    # ── 프롬프트 구성 ──
     try:
         prompt_text = tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
@@ -412,9 +261,8 @@ def generate_reply(
     input_len = len(tokenizer.encode(prompt_text))
     print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
-          f"temp={temperature}, MTI=on, TriAtt=on", flush=True)
-    # ── MTI LogitsProcessor + SamplingParams ──
     mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
     params = SamplingParams(
@@ -424,7 +272,6 @@ def generate_reply(
         logits_processors=[mti],
     )
-    # ── vLLM 엔진 스트리밍 ──
     queue = Queue()
     thread = Thread(target=_engine_generate, args=(prompt_text, params, queue))
     thread.start()
@@ -433,8 +280,7 @@ def generate_reply(
     try:
         while True:
             chunk = queue.get(timeout=120)
-            if chunk is None:
-                break
             output += chunk
             yield format_response(output)
     except Exception as e:
@@ -445,8 +291,8 @@ def generate_reply(
     if output:
         mti_rate = f"{mti.intervention_rate*100:.1f}%"
-        print(f"[GEN] Done — {len(output)} chars, MTI intervention={mti_rate} "
-              f"({mti._interventions}/{mti._total} tokens)", flush=True)
         yield format_response(output)
     else:
         yield "**⚠️ 모델이 빈 응답을 반환했습니다.** 다시 시도해 주세요."
@@ -457,20 +303,17 @@ def generate_reply(
 # ══════════════════════════════════════════════════════════════════════════════
 with gr.Blocks(title=MODEL_NAME) as gradio_demo:
     thinking_toggle = gr.Radio(
-        choices=["⚡ Fast Mode  (direct answer)",
-                 "🧠 Thinking Mode  (chain-of-thought reasoning)"],
-        value="⚡ Fast Mode  (direct answer)",
-        visible=False,
     )
     image_input    = gr.Textbox(value="", visible=False)
     system_prompt  = gr.Textbox(value=PRESETS["general"], visible=False)
     max_new_tokens = gr.Slider(minimum=64, maximum=MODEL_CAP["max_tokens"], value=4096, visible=False)
-    temperature    = gr.Slider(minimum=0.0, maximum=1.5, value=0.6,  visible=False)
-    top_p          = gr.Slider(minimum=0.1, maximum=1.0, value=0.9,  visible=False)
     gr.ChatInterface(
-        fn=generate_reply,
-        api_name="chat",
         additional_inputs=[
             thinking_toggle, image_input,
             system_prompt, max_new_tokens, temperature, top_p,
@@ -478,7 +321,7 @@ with gr.Blocks(title=MODEL_NAME) as gradio_demo:
     )
 # ══════════════════════════════════════════════════════════════════════════════
-# 7.  FASTAPI — index.html + OAuth + APIs
 # ══════════════════════════════════════════════════════════════════════════════
 fapp    = FastAPI()
 SESSIONS: dict[str, dict] = {}
@@ -488,17 +331,15 @@ CLIENT_ID     = os.getenv("OAUTH_CLIENT_ID", "")
 CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
 SPACE_HOST    = os.getenv("SPACE_HOST", "localhost:7860")
 REDIRECT_URI  = f"https://{SPACE_HOST}/login/callback"
-print(f"[OAuth] CLIENT_ID set: {bool(CLIENT_ID)}")
-print(f"[OAuth] SPACE_HOST: {SPACE_HOST}")
 HF_AUTH_URL   = "https://huggingface.co/oauth/authorize"
 HF_TOKEN_URL  = "https://huggingface.co/oauth/token"
 HF_USER_URL   = "https://huggingface.co/oauth/userinfo"
 SCOPES        = os.getenv("OAUTH_SCOPES", "openid profile")
-def _sid(req: Request) -> Optional[str]:
-    return req.cookies.get("mc_session")
-def _user(req: Request) -> Optional[dict]:
     sid = _sid(req)
     return SESSIONS.get(sid) if sid else None
@@ -514,16 +355,14 @@ async def oauth_user(request: Request):
 @fapp.get("/oauth/login")
 async def oauth_login(request: Request):
-    if not CLIENT_ID:
-        return RedirectResponse("/?oauth_error=not_configured")
     state = secrets.token_urlsafe(16)
     params = {"response_type":"code","client_id":CLIENT_ID,"redirect_uri":REDIRECT_URI,"scope":SCOPES,"state":state}
     return RedirectResponse(f"{HF_AUTH_URL}?{urlencode(params)}", status_code=302)
 @fapp.get("/login/callback")
 async def oauth_callback(code: str = "", error: str = "", state: str = ""):
-    if error or not code:
-        return RedirectResponse("/?auth_error=1")
     basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
     async with httpx.AsyncClient() as client:
         tok = await client.post(HF_TOKEN_URL, data={"grant_type":"authorization_code","code":code,"redirect_uri":REDIRECT_URI},
@@ -557,13 +396,13 @@ async def oauth_logout(request: Request):
 @fapp.get("/health")
 async def health():
     return {
-        "status": "ok",
-        "model": MODEL_ID,
         "backend": "vLLM-Engine",
-        "kv_cache": "triattention-vllm" if TRIATT_ENABLED else "standard",
         "mti": "enabled",
         "max_tokens": MODEL_CAP["max_tokens"],
-        "max_model_len": 65536,
     }
 BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
@@ -572,18 +411,17 @@ BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 async def api_search(request: Request):
     body = await request.json()
     query = body.get("query", "").strip()
-    if not query: return JSONResponse({"error": "empty query"}, status_code=400)
-    if not BRAVE_API_KEY: return JSONResponse({"error": "BRAVE_API_KEY not set"}, status_code=500)
     try:
         r = requests.get("https://api.search.brave.com/res/v1/web/search",
             headers={"X-Subscription-Token": BRAVE_API_KEY, "Accept": "application/json"},
             params={"q": query, "count": 5}, timeout=10)
         r.raise_for_status()
         results = r.json().get("web", {}).get("results", [])
-        items = [{"title": i.get("title",""), "desc": i.get("description",""), "url": i.get("url","")} for i in results[:5]]
-        return JSONResponse({"results": items})
     except Exception as e:
-        return JSONResponse({"error": str(e)}, status_code=500)
 @fapp.post("/api/extract-pdf")
 async def api_extract_pdf(request: Request):
@@ -598,12 +436,10 @@ async def api_extract_pdf(request: Request):
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             for page in doc: text += page.get_text() + "\n"
         except ImportError:
-            content = pdf_bytes.decode("utf-8", errors="ignore")
-            text = re.sub(r'[^\x20-\x7E\n\r\uAC00-\uD7A3\u3040-\u309F\u30A0-\u30FF]', '', content)
-        text = text.strip()[:8000]
-        return JSONResponse({"text": text, "chars": len(text)})
     except Exception as e:
-        return JSONResponse({"error": str(e)}, status_code=500)
 # ══════════════════════════════════════════════════════════════════════════════
 # 8.  MOUNT & RUN
@@ -611,13 +447,11 @@ async def api_extract_pdf(request: Request):
 app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
 def _shutdown(sig, frame):
-    print("[BOOT] Shutting down...", flush=True)
     sys.exit(0)
 signal.signal(signal.SIGTERM, _shutdown)
 signal.signal(signal.SIGINT, _shutdown)
 if __name__ == "__main__":
-    _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
-    print(f"[BOOT] {MODEL_NAME} · vLLM Engine · BF16 · {_attn} · MTI · max_len={MAX_MODEL_LEN} · Ready", flush=True)
     uvicorn.run(app, host="0.0.0.0", port=7860)

 """
+🧬 Gemma 4 E4B — vLLM + MTI + TriAttention
+Multimodal (Vision+Audio+Text) · Effective 4.5B · Apache 2.0
+MTI: +9~11% reasoning accuracy (training-free)
+TriAttention: ~10x KV cache compression
 """
 import sys, os, signal, time, uuid
 print(f"[BOOT] Python {sys.version}", flush=True)
+# ── TriAttention attempt ──
 TRIATT_ENABLED = False
 try:
     import aither_kvcache
     os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
     TRIATT_ENABLED = True
+    print("[TRIATT] aither-kvcache → VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
 except ImportError:
+    print("[TRIATT] aither-kvcache not found → standard attention", flush=True)
 import base64, re, json
 from typing import Generator, Optional
 from urllib.parse import urlencode
 import pathlib, secrets
 # ══════════════════════════════════════════════════════════════════════════════
 # 1.  CONFIG
 # ══════════════════════════════════════════════════════════════════════════════
+MODEL_ID   = "google/gemma-4-E4B-it"
+MODEL_NAME = "Gemma-4-E4B"
 MODEL_CAP  = {
+    "arch": "Gemma4 PLE", "active": "4.5B", "total": "~8B",
+    "ctx": "128K", "thinking": True, "vision": True, "audio": True,
+    "max_tokens": 16384, "temp_max": 2.0,
 }
 PRESETS = {
+    "general":   "You are Gemma 4 E4B, a highly capable multimodal AI. Think step by step for complex questions.",
+    "code":      "You are an expert software engineer. Write clean, efficient, well-commented code.",
+    "math":      "You are a world-class mathematician. Break problems step-by-step. Show full working.",
+    "creative":  "You are a brilliant creative writer. Be imaginative, vivid, and engaging.",
+    "vision":    "You are an expert at analyzing images. Describe what you see in detail, extract text, and answer questions about visual content.",
 }
 # ══════════════════════════════════════════════════════════════════════════════
 # 2.  MTI — Minimal Test-Time Intervention (arxiv 2510.13940)
 # ══════════════════════════════════════════════════════════════════════════════
 class MTILogitsProcessor:
     """Sharpen logits on high-entropy decoding steps; pass all others through.

     Logits processor in the style of "Minimal Test-Time Intervention"
     (arXiv 2510.13940). For each call the entropy of the next-token
     distribution is measured. When it exceeds ``entropy_threshold`` the logits
     are pushed away from their mean by ``cfg_scale``::

         guided = logits + cfg_scale * (logits - mean(logits))

     Otherwise the input tensor is returned untouched. The instance also counts
     how many calls it saw and how many it modified.

     NOTE(review): the original author's notes claim a +9-11% reasoning
     accuracy gain with roughly 15% of tokens modified; nothing in this class
     verifies those figures.

     Args:
         cfg_scale: Multiplier applied to the mean-centred logits when a step
             is modified.
         entropy_threshold: Entropy (in nats) above which a step is modified.
     """

     def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
         self.cfg_scale = cfg_scale
         self.entropy_threshold = entropy_threshold
         # Running counters backing ``intervention_rate``.
         self._interventions = 0
         self._total = 0

     def __call__(self, token_ids, logits):
         """Return guided logits for an uncertain step, else ``logits`` itself.

         Args:
             token_ids: Tokens generated so far. Unused; accepted because the
                 processor is called with ``(token_ids, logits)``.
             logits: Next-token logits. ``.item()`` below requires the entropy
                 to be a single value, so this is presumably a 1-D tensor for
                 one sequence — confirm against the serving engine.

         Returns:
             The same ``logits`` object when no intervention happens, otherwise
             a new tensor with the same dtype as ``logits``.
         """
         self._total += 1
         # Work in float32: in float16 an additive 1e-10 epsilon underflows to
         # zero, so a zero-probability token would yield 0 * log(0) = NaN and
         # the threshold test below would silently never pass.
         logits_f32 = logits.float()
         probs = torch.softmax(logits_f32, dim=-1)
         # entr(p) = -p * ln(p), defined as 0 at p == 0, so no epsilon needed.
         entropy = torch.special.entr(probs).sum(dim=-1)
         if entropy.item() > self.entropy_threshold:
             mean_logit = logits_f32.mean(dim=-1, keepdim=True)
             guided = logits_f32 + self.cfg_scale * (logits_f32 - mean_logit)
             self._interventions += 1
             # Hand back the dtype the caller gave us.
             return guided.to(logits.dtype)
         return logits

     @property
     def intervention_rate(self):
         """Fraction of calls that were modified (0.0 before the first call)."""
         return self._interventions / max(self._total, 1)
 print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
+# 3.  vLLM ENGINE — Gemma 4 supported on day 0, no patching required
 # ══════════════════════════════════════════════════════════════════════════════
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm import SamplingParams, TokensPrompt
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 print(f"[vLLM] Tokenizer loaded ✓", flush=True)
 engine = None
+MAX_MODEL_LEN = 32768
 # Attempt 1: TriAttention + 32K
 if engine is None and TRIATT_ENABLED:
     try:
+        print(f"[vLLM] Try 1: TriAttention + {MAX_MODEL_LEN}", flush=True)
+        engine = LLMEngine.from_engine_args(EngineArgs(
+            model=MODEL_ID, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.92,
             trust_remote_code=True,
+            limit_mm_per_prompt={"image": 0, "audio": 0},
+        ))
+        print(f"[vLLM] ✓ TriAttention engine ready", flush=True)
     except Exception as e:
         print(f"[vLLM] ✗ TriAttention failed: {e}", flush=True)
         os.environ.pop("VLLM_ATTENTION_BACKEND", None)
         TRIATT_ENABLED = False
         engine = None
+# Attempt 2: standard attention + 16K
 if engine is None:
     MAX_MODEL_LEN = 16384
     try:
+        print(f"[vLLM] Try 2: Standard + {MAX_MODEL_LEN}", flush=True)
+        engine = LLMEngine.from_engine_args(EngineArgs(
+            model=MODEL_ID, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.92,
             trust_remote_code=True,
+            limit_mm_per_prompt={"image": 0, "audio": 0},
+        ))
+        print(f"[vLLM] ✓ Standard engine ready", flush=True)
     except Exception as e:
         print(f"[vLLM] ✗ 16K failed: {e}", flush=True)
         engine = None
+# Attempt 3: minimal 8K
 if engine is None:
     MAX_MODEL_LEN = 8192
     try:
+        print(f"[vLLM] Try 3: Minimal + {MAX_MODEL_LEN}", flush=True)
+        engine = LLMEngine.from_engine_args(EngineArgs(
+            model=MODEL_ID, dtype="bfloat16",
             max_model_len=MAX_MODEL_LEN,
             gpu_memory_utilization=0.90,
             trust_remote_code=True,
+            limit_mm_per_prompt={"image": 0, "audio": 0},
+        ))
+        print(f"[vLLM] ✓ Minimal engine ready", flush=True)
     except Exception as e:
+        print(f"[vLLM] ✗✗✗ All failed: {e}", flush=True)
         sys.exit(1)
 MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
+_attn = "TriAttention" if TRIATT_ENABLED else "Standard"
+print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
+# 4.  THINKING MODE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
     """Split raw model output into ``(reasoning, answer)``.

     Two reasoning wrappers are recognised, tried in this order:

     1. ``<|channel|>thought\\n ... <channel|>`` (the Gemma 4 style block)
     2. ``<think> ... </think>``

     The first wrapper that matches wins. Its inner text becomes the reasoning
     and everything after the match becomes the answer, both stripped of
     surrounding whitespace. When neither wrapper is found (including a block
     that has been opened but not yet closed) the reasoning is empty and
     ``text`` is returned unchanged as the answer.
     """
     wrappers = (
         r"<\|channel\|>thought\s*\n(.*?)<channel\|>",
         r"<think>(.*?)</think>\s*",
     )
     for wrapper in wrappers:
         found = re.search(wrapper, text, re.DOTALL)
         if found is None:
             continue
         reasoning = found.group(1).strip()
         remainder = text[found.end():].strip()
         return reasoning, remainder
     return "", text
 def format_response(raw: str) -> str:
     chain, answer = parse_think_blocks(raw)
             "<details>\n<summary>🧠 Reasoning Chain — click to expand</summary>\n\n"
             f"{chain}\n\n</details>\n\n{answer}"
         )
+    # Gemma 4 thinking in progress
+    if "<|channel|>thought" in raw and "<channel|>" not in raw:
+        think_len = len(raw) - raw.index("<|channel|>thought") - 18
+        return f"🧠 Thinking... ({think_len} chars)"
     if "<think>" in raw and "</think>" not in raw:
         think_len = len(raw) - raw.index("<think>") - 7
+        return f"🧠 Thinking... ({think_len} chars)"
     return raw
 # ══════════════════════════════════════════════════════════════════════════════
+# 5.  GENERATION — vLLM Engine + MTI Streaming
 # ══════════════════════════════════════════════════════════════════════════════
 def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
+    """vLLM 엔진 생성 + Queue 스트리밍"""
     try:
         request_id = str(uuid.uuid4())
         token_ids = tokenizer.encode(prompt_text)
         engine.add_request(request_id, TokensPrompt(prompt_token_ids=token_ids), params)
         while engine.has_unfinished_requests():
             step_outputs = engine.step()
             for output in step_outputs:
                 text = output.outputs[0].text
                 if len(text) > prev_len:
                     queue.put(text[prev_len:])
                     prev_len = len(text)
+                if output.finished:
+                    queue.put(None)
+                    return
         queue.put(None)
     except Exception as e:
 def generate_reply(
+    message, history, thinking_mode, image_input,
+    system_prompt, max_new_tokens, temperature, top_p,
 ) -> Generator[str, None, None]:
     max_new_tokens = min(int(max_new_tokens), MODEL_CAP["max_tokens"])
     temperature    = min(float(temperature),  MODEL_CAP["temp_max"])
     messages: list[dict] = []
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
                 _, clean = parse_think_blocks(text)
                 messages.append({"role":"assistant","content":clean})
         else:
+            try: u, a = (turn[0] or None), (turn[1] if len(turn)>1 else None)
+            except: continue
             def _txt(v):
                 if v is None: return None
                 if isinstance(v, list):
+                    return " ".join(p.get("text","") for p in v if isinstance(p,dict) and p.get("type")=="text")
                 return str(v)
+            ut, at = _txt(u), _txt(a)
             if ut: messages.append({"role":"user","content":ut})
             if at:
                 _, clean = parse_think_blocks(at)
     messages.append({"role": "user", "content": message})
     try:
         prompt_text = tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True,
     input_len = len(tokenizer.encode(prompt_text))
     print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
+          f"temp={temperature}, MTI=on, Attn={_attn}", flush=True)
     mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
     params = SamplingParams(
         logits_processors=[mti],
     )
     queue = Queue()
     thread = Thread(target=_engine_generate, args=(prompt_text, params, queue))
     thread.start()
     try:
         while True:
             chunk = queue.get(timeout=120)
+            if chunk is None: break
             output += chunk
             yield format_response(output)
     except Exception as e:
     if output:
         mti_rate = f"{mti.intervention_rate*100:.1f}%"
+        print(f"[GEN] Done — {len(output)} chars, MTI={mti_rate} "
+              f"({mti._interventions}/{mti._total})", flush=True)
         yield format_response(output)
     else:
         yield "**⚠️ 모델이 빈 응답을 반환했습니다.** 다시 시도해 주세요."
 # ══════════════════════════════════════════════════════════════════════════════
 with gr.Blocks(title=MODEL_NAME) as gradio_demo:
     thinking_toggle = gr.Radio(
+        choices=["⚡ Fast Mode", "🧠 Thinking Mode"],
+        value="⚡ Fast Mode", visible=False,
     )
     image_input    = gr.Textbox(value="", visible=False)
     system_prompt  = gr.Textbox(value=PRESETS["general"], visible=False)
     max_new_tokens = gr.Slider(minimum=64, maximum=MODEL_CAP["max_tokens"], value=4096, visible=False)
+    temperature    = gr.Slider(minimum=0.0, maximum=2.0, value=0.6, visible=False)
+    top_p          = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)
     gr.ChatInterface(
+        fn=generate_reply, api_name="chat",
         additional_inputs=[
             thinking_toggle, image_input,
             system_prompt, max_new_tokens, temperature, top_p,
     )
 # ══════════════════════════════════════════════════════════════════════════════
+# 7.  FASTAPI
 # ══════════════════════════════════════════════════════════════════════════════
 # FastAPI application that carries the OAuth, health and JSON API routes.
 fapp    = FastAPI()
 # Module-level (process-local) session store. Looked up by the value of the
 # "mc_session" cookie; the code that populates it is not shown in this section.
 SESSIONS: dict[str, dict] = {}
 # OAuth client secret; empty string when the env var is unset. It is joined
 # with CLIENT_ID (defined outside this section) into a base64 Basic
 # credential in the OAuth callback.
 CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
 # Public host of the deployment; falls back to "localhost:7860".
 SPACE_HOST    = os.getenv("SPACE_HOST", "localhost:7860")
 # Callback URL sent to the OAuth provider. The scheme is always https, and
 # the path matches the "/login/callback" route registered on fapp.
 REDIRECT_URI  = f"https://{SPACE_HOST}/login/callback"
 # Hugging Face OAuth endpoints: authorization, token exchange, user info.
 HF_AUTH_URL   = "https://huggingface.co/oauth/authorize"
 HF_TOKEN_URL  = "https://huggingface.co/oauth/token"
 HF_USER_URL   = "https://huggingface.co/oauth/userinfo"
 # Space-separated OAuth scopes requested at login.
 SCOPES        = os.getenv("OAUTH_SCOPES", "openid profile")
 # Startup diagnostic: prints only whether CLIENT_ID is set (as a bool),
 # never its value.
+print(f"[OAuth] CLIENT_ID={bool(CLIENT_ID)}, SPACE_HOST={SPACE_HOST}")
+def _sid(req): return req.cookies.get("mc_session")
+def _user(req):
     sid = _sid(req)
     return SESSIONS.get(sid) if sid else None
 @fapp.get("/oauth/login")
 async def oauth_login(request: Request):
     """Start the Hugging Face OAuth authorization-code flow.

     Redirects to ``/?oauth_error=not_configured`` when ``CLIENT_ID`` is empty;
     otherwise redirects (302) to ``HF_AUTH_URL`` with the client id, redirect
     URI, scopes and a freshly generated random ``state`` in the query string.
     """
     if not CLIENT_ID:
         return RedirectResponse("/?oauth_error=not_configured")
     # NOTE(review): the state token is generated here but not stored anywhere
     # in this function, so it cannot be compared on return — confirm whether
     # the callback validates it some other way.
     query = urlencode({
         "response_type": "code",
         "client_id": CLIENT_ID,
         "redirect_uri": REDIRECT_URI,
         "scope": SCOPES,
         "state": secrets.token_urlsafe(16),
     })
     authorize_url = f"{HF_AUTH_URL}?{query}"
     return RedirectResponse(authorize_url, status_code=302)
 @fapp.get("/login/callback")
 async def oauth_callback(code: str = "", error: str = "", state: str = ""):
+    if error or not code: return RedirectResponse("/?auth_error=1")
     basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
     async with httpx.AsyncClient() as client:
         tok = await client.post(HF_TOKEN_URL, data={"grant_type":"authorization_code","code":code,"redirect_uri":REDIRECT_URI},
 @fapp.get("/health")
 async def health():
     """Report service status and the static model/backend configuration.

     The ``attention`` field reflects ``TRIATT_ENABLED``; every other value is
     either a constant or read from module-level configuration.
     """
     if TRIATT_ENABLED:
         attention_backend = "TriAttention"
     else:
         attention_backend = "Standard"
     payload = {
         "status": "ok",
         "model": MODEL_ID,
         "backend": "vLLM-Engine",
         "attention": attention_backend,
         "mti": "enabled",
         "max_tokens": MODEL_CAP["max_tokens"],
         "max_model_len": MAX_MODEL_LEN,
         "multimodal": "vision+audio",
     }
     return payload
 # Brave Search subscription token, sent as the X-Subscription-Token header by
 # api_search. Empty string when unset, in which case api_search answers 500
 # with {"error": "no key"}.
 BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 async def api_search(request: Request):
     """Proxy a web search to the Brave Search API.

     Expects a JSON object body with a string ``query`` field.

     Returns:
         JSONResponse with one of:
         * ``{"results": [{"title", "desc", "url"}, ...]}`` — at most five
           entries, on success;
         * ``{"error": "invalid json"}`` (400) — body is not parseable JSON;
         * ``{"error": "empty"}`` (400) — query missing, blank, or not a string;
         * ``{"error": "no key"}`` (500) — ``BRAVE_API_KEY`` is unset;
         * ``{"error": <message>}`` (500) — the upstream call or its decoding
           failed.
     """
     try:
         body = await request.json()
     except ValueError:
         # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
         return JSONResponse({"error": "invalid json"}, status_code=400)
     # Tolerate a non-object body or a non-string query instead of crashing
     # on .get / .strip; both are reported as an empty query.
     raw_query = body.get("query", "") if isinstance(body, dict) else ""
     query = raw_query.strip() if isinstance(raw_query, str) else ""
     if not query:
         return JSONResponse({"error": "empty"}, status_code=400)
     if not BRAVE_API_KEY:
         return JSONResponse({"error": "no key"}, status_code=500)
     try:
         # Use the async HTTP client (as the OAuth callback does) so the event
         # loop is not blocked for up to 10 s while Brave responds.
         async with httpx.AsyncClient(timeout=10) as client:
             r = await client.get(
                 "https://api.search.brave.com/res/v1/web/search",
                 headers={"X-Subscription-Token": BRAVE_API_KEY, "Accept": "application/json"},
                 params={"q": query, "count": 5},
             )
         r.raise_for_status()
         results = r.json().get("web", {}).get("results", [])
         return JSONResponse({
             "results": [
                 {
                     "title": item.get("title", ""),
                     "desc": item.get("description", ""),
                     "url": item.get("url", ""),
                 }
                 for item in results[:5]
             ]
         })
     except Exception as e:
         # Handler boundary: report any upstream/decoding failure to the caller.
         return JSONResponse({"error": str(e)}, status_code=500)
 @fapp.post("/api/extract-pdf")
 async def api_extract_pdf(request: Request):
             doc = fitz.open(stream=pdf_bytes, filetype="pdf")
             for page in doc: text += page.get_text() + "\n"
         except ImportError:
+            text = pdf_bytes.decode("utf-8", errors="ignore")
+        return JSONResponse({"text": text.strip()[:8000], "chars": len(text)})
     except Exception as e:
+        return JSONResponse({"error": str(e)}, 500)
 # ══════════════════════════════════════════════════════════════════════════════
 # 8.  MOUNT & RUN
 # Mount the Gradio UI on the FastAPI app under /gradio; the returned object
 # is the ASGI app handed to uvicorn in the __main__ guard.
 app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
 def _shutdown(sig, frame):
+    print("[BOOT] Shutdown", flush=True)
     sys.exit(0)
 # Route both termination (SIGTERM) and interrupt (SIGINT) to _shutdown.
 signal.signal(signal.SIGTERM, _shutdown)
 signal.signal(signal.SIGINT, _shutdown)
 # Script entry point: print a one-line readiness banner, then serve the
 # combined FastAPI + Gradio app on all interfaces, port 7860.
 if __name__ == "__main__":
+    print(f"[BOOT] {MODEL_NAME} · vLLM · {_attn} · MTI · max_len={MAX_MODEL_LEN} · Ready", flush=True)
     uvicorn.run(app, host="0.0.0.0", port=7860)