SeaWolf-AI committed on
Commit
d2fff68
·
verified ·
1 Parent(s): d028344

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -524
app.py CHANGED
@@ -1,21 +1,19 @@
1
- """
2
- 🧬 Gemma 4 E4B β€” vLLM + MTI + TriAttention
3
- Multimodal (Vision+Audio+Text) Β· Effective 4.5B Β· Apache 2.0
4
- MTI: +9~11% reasoning accuracy (training-free)
5
- TriAttention: ~10x KV cache compression
6
- """
7
  import sys, os, signal, time, uuid
8
  print(f"[BOOT] Python {sys.version}", flush=True)
9
 
10
- # ── TriAttention μ‹œλ„ ──
11
  TRIATT_ENABLED = False
12
  try:
13
  import aither_kvcache
14
  os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
15
  TRIATT_ENABLED = True
16
- print("[TRIATT] aither-kvcache β†’ VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
17
  except ImportError:
18
- print("[TRIATT] aither-kvcache not found β†’ standard attention", flush=True)
19
 
20
  import base64, re, json
21
  from typing import Generator, Optional
@@ -32,9 +30,9 @@ from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
32
  from urllib.parse import urlencode
33
  import pathlib, secrets
34
 
35
- # ══════════════════════════════════════════════════════════════════════════════
36
  # 1. CONFIG
37
- # ══════════════════════════════════════════════════════════════════════════════
38
  MODEL_ID = "DavidAU/gemma-4-E4B-it-The-DECKARD-Expresso-Universe-HERETIC-UNCENSORED-Thinking"
39
  MODEL_NAME = "DECKARD-E4B-Opus"
40
  MODEL_CAP = {
@@ -51,12 +49,12 @@ PRESETS = {
51
  "vision": "You are an expert at analyzing images. Describe what you see in detail, extract text, and answer questions about visual content.",
52
  }
53
 
54
- # ══════════════════════════════════════════════════════════════════════════════
55
- # 2. MTI β€” Minimal Test-Time Intervention (arxiv 2510.13940)
56
- # ══════════════════════════════════════════════════════════════════════════════
57
  class MTILogitsProcessor:
58
  """
59
- κ³ μ—”νŠΈλ‘œν”Ό(λΆˆν™•μ‹€) ν† ν°μ—λ§Œ CFG 적용 β†’ μΆ”λ‘  정확도 +9~11%.
60
  ν•™μŠ΅ 없이 μ„œλΉ™ μ‹œ 적용. 전체 ν† ν°μ˜ ~15%μ—λ§Œ κ°œμž….
61
  """
62
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
@@ -83,15 +81,15 @@ class MTILogitsProcessor:
83
 
84
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
85
 
86
- # ══════════════════════════════════════════════════════════════════════════════
87
- # 3. vLLM ENGINE β€” Gemma 4 Day 0 지원, 패치 λΆˆν•„μš”
88
- # ══════════════════════════════════════════════════════════════════════════════
89
  from vllm.engine.arg_utils import EngineArgs
90
  from vllm.engine.llm_engine import LLMEngine
91
  from vllm import SamplingParams, TokensPrompt
92
  from transformers import AutoTokenizer
93
 
94
- # ── Gemma 4 ν† ν¬λ‚˜μ΄μ € ν˜Έν™˜μ„± 패치 ──
95
  # transformers 5.5.0+μ—μ„œ extra_special_tokensκ°€ list일 λ•Œ .keys() μ—λŸ¬
96
  from huggingface_hub import hf_hub_download
97
  import tempfile, shutil
@@ -108,7 +106,7 @@ for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
108
  except Exception:
109
  pass
110
 
111
- # tokenizer_config.json 패치: extra_special_tokens list β†’ dict
112
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
113
  if os.path.exists(_tc_path):
114
  with open(_tc_path) as f:
@@ -137,9 +135,9 @@ if engine is None and TRIATT_ENABLED:
137
  trust_remote_code=True,
138
  limit_mm_per_prompt={"image": 0, "audio": 0},
139
  ))
140
- print(f"[vLLM] βœ“ TriAttention engine ready", flush=True)
141
  except Exception as e:
142
- print(f"[vLLM] βœ— TriAttention failed: {e}", flush=True)
143
  os.environ.pop("VLLM_ATTENTION_BACKEND", None)
144
  TRIATT_ENABLED = False
145
  engine = None
@@ -156,9 +154,9 @@ if engine is None:
156
  trust_remote_code=True,
157
  limit_mm_per_prompt={"image": 0, "audio": 0},
158
  ))
159
- print(f"[vLLM] βœ“ Standard engine ready", flush=True)
160
  except Exception as e:
161
- print(f"[vLLM] βœ— 16K failed: {e}", flush=True)
162
  engine = None
163
 
164
  # μ‹œλ„ 3: μ΅œμ†Œ 8K
@@ -173,18 +171,18 @@ if engine is None:
173
  trust_remote_code=True,
174
  limit_mm_per_prompt={"image": 0, "audio": 0},
175
  ))
176
- print(f"[vLLM] βœ“ Minimal engine ready", flush=True)
177
  except Exception as e:
178
- print(f"[vLLM] βœ—βœ—βœ— All failed: {e}", flush=True)
179
  sys.exit(1)
180
 
181
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
182
  _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
183
  print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
184
 
185
- # ══════════════════════════════════════════════════════════════════════════════
186
  # 4. THINKING MODE HELPERS
187
- # ══════════════════════════════════════════════════════════════════════════════
188
  def parse_think_blocks(text: str) -> tuple[str, str]:
189
  # Gemma 4 thinking format: <|channel|>thought\n...<channel|>answer
190
  m = re.search(r"<\|channel\|>thought\s*\n(.*?)<channel\|>", text, re.DOTALL)
@@ -200,7 +198,7 @@ def format_response(raw: str) -> str:
200
  chain, answer = parse_think_blocks(raw)
201
  if chain:
202
  return (
203
- "<details>\n<summary>🧠 Reasoning Chain β€” click to expand</summary>\n\n"
204
  f"{chain}\n\n</details>\n\n{answer}"
205
  )
206
  # Gemma 4 thinking in progress
@@ -212,9 +210,9 @@ def format_response(raw: str) -> str:
212
  return f"🧠 Thinking... ({think_len} chars)"
213
  return raw
214
 
215
- # ══════════════════════════════════════════════════════════════════════════════
216
- # 5. GENERATION β€” vLLM Engine + MTI Streaming
217
- # ══════════════════════════════════════════════════════════════════════════════
218
  def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
219
  """vLLM μ—”μ§„ 생성 + Queue 슀트리밍"""
220
  try:
@@ -320,16 +318,16 @@ def generate_reply(
320
 
321
  if output:
322
  mti_rate = f"{mti.intervention_rate*100:.1f}%"
323
- print(f"[GEN] Done β€” {len(output)} chars, MTI={mti_rate} "
324
  f"({mti._interventions}/{mti._total})", flush=True)
325
  yield format_response(output)
326
  else:
327
  yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
328
 
329
 
330
- # ══════════════════════════════════════════════════════════════════════════════
331
  # 6. GRADIO BLOCKS
332
- # ══════════════════════════════════════════════════════════════════════════════
333
  with gr.Blocks(title=MODEL_NAME) as gradio_demo:
334
  thinking_toggle = gr.Radio(
335
  choices=["⚑ Fast Mode", "🧠 Thinking Mode"],
@@ -349,9 +347,9 @@ with gr.Blocks(title=MODEL_NAME) as gradio_demo:
349
  ],
350
  )
351
 
352
- # ══════════════════════════════════════════════════════════════════════════════
353
  # 7. FASTAPI
354
- # ══════════════════════════════════════════════════════════════════════════════
355
  fapp = FastAPI()
356
  SESSIONS: dict[str, dict] = {}
357
  HTML = pathlib.Path(__file__).parent / "index.html"
@@ -470,494 +468,9 @@ async def api_extract_pdf(request: Request):
470
  except Exception as e:
471
  return JSONResponse({"error": str(e)}, 500)
472
 
473
- # ══════════════════════════════════════════════════════════════════════════════
474
  # 8. MOUNT & RUN
475
- # ══════════════════════════════════════════════════════════════════════════════
476
- app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
477
-
478
- def _shutdown(sig, frame):
479
- print("[BOOT] Shutdown", flush=True)
480
- sys.exit(0)
481
- signal.signal(signal.SIGTERM, _shutdown)
482
- signal.signal(signal.SIGINT, _shutdown)
483
-
484
- if __name__ == "__main__":
485
- print(f"[BOOT] {MODEL_NAME} - vLLM - {_attn} - MTI - max_len={MAX_MODEL_LEN} - Ready", flush=True)
486
- uvicorn.run(app, host="0.0.0.0", port=7860)"""
487
- 🧬 Gemma 4 E4B β€” vLLM + MTI + TriAttention
488
- Multimodal (Vision+Audio+Text) Β· Effective 4.5B Β· Apache 2.0
489
- MTI: +9~11% reasoning accuracy (training-free)
490
- TriAttention: ~10x KV cache compression
491
- """
492
- import sys, os, signal, time, uuid
493
- print(f"[BOOT] Python {sys.version}", flush=True)
494
-
495
- # ── TriAttention μ‹œλ„ ──
496
- TRIATT_ENABLED = False
497
- try:
498
- import aither_kvcache
499
- os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
500
- TRIATT_ENABLED = True
501
- print("[TRIATT] aither-kvcache β†’ VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
502
- except ImportError:
503
- print("[TRIATT] aither-kvcache not found β†’ standard attention", flush=True)
504
-
505
- import base64, re, json
506
- from typing import Generator, Optional
507
- from threading import Thread
508
- from queue import Queue
509
-
510
- import torch
511
- import gradio as gr
512
- print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)
513
-
514
- import requests, httpx, uvicorn
515
- from fastapi import FastAPI, Request
516
- from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
517
- from urllib.parse import urlencode
518
- import pathlib, secrets
519
-
520
- # ══════════════════════════════════════════════════════════════════════════════
521
- # 1. CONFIG
522
- # ══════════════════════════════════════════════════════════════════════════════
523
- MODEL_ID = "DavidAU/gemma-4-E4B-it-The-DECKARD-Expresso-Universe-HERETIC-UNCENSORED-Thinking"
524
- MODEL_NAME = "DECKARD-E4B-Opus"
525
- MODEL_CAP = {
526
- "arch": "Gemma4 PLE", "active": "4.5B", "total": "~8B",
527
- "ctx": "128K", "thinking": True, "vision": True, "audio": True,
528
- "max_tokens": 16384, "temp_max": 2.0,
529
- }
530
-
531
- PRESETS = {
532
- "general": "You are a highly capable multimodal AI assistant. Think deeply and provide thorough, insightful responses.",
533
- "code": "You are an expert software engineer. Write clean, efficient, well-commented code.",
534
- "math": "You are a world-class mathematician. Break problems step-by-step. Show full working.",
535
- "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging.",
536
- "vision": "You are an expert at analyzing images. Describe what you see in detail, extract text, and answer questions about visual content.",
537
- }
538
-
539
- # ══════════════════════════════════════════════════════════════════════════════
540
- # 2. MTI β€” Minimal Test-Time Intervention (arxiv 2510.13940)
541
- # ══════════════════════════════════════════════════════════════════════════════
542
- class MTILogitsProcessor:
543
- """
544
- κ³ μ—”νŠΈλ‘œν”Ό(λΆˆν™•μ‹€) ν† ν°μ—λ§Œ CFG 적용 β†’ μΆ”λ‘  정확도 +9~11%.
545
- ν•™μŠ΅ 없이 μ„œλΉ™ μ‹œ 적용. 전체 ν† ν°μ˜ ~15%μ—λ§Œ κ°œμž….
546
- """
547
- def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
548
- self.cfg_scale = cfg_scale
549
- self.entropy_threshold = entropy_threshold
550
- self._interventions = 0
551
- self._total = 0
552
-
553
- def __call__(self, token_ids, logits):
554
- self._total += 1
555
- probs = torch.softmax(logits, dim=-1)
556
- entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
557
-
558
- if entropy.item() > self.entropy_threshold:
559
- mean_logit = logits.mean(dim=-1, keepdim=True)
560
- guided = logits + self.cfg_scale * (logits - mean_logit)
561
- self._interventions += 1
562
- return guided
563
- return logits
564
-
565
- @property
566
- def intervention_rate(self):
567
- return self._interventions / max(self._total, 1)
568
-
569
- print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
570
-
571
- # ══════════════════════════════════════════════════════════════════════════════
572
- # 3. vLLM ENGINE β€” Gemma 4 Day 0 지원, 패치 λΆˆν•„μš”
573
- # ══════════════════════════════════════════════════════════════════════════════
574
- from vllm.engine.arg_utils import EngineArgs
575
- from vllm.engine.llm_engine import LLMEngine
576
- from vllm import SamplingParams, TokensPrompt
577
- from transformers import AutoTokenizer
578
-
579
- # ── Gemma 4 ν† ν¬λ‚˜μ΄μ € ν˜Έν™˜μ„± 패치 ──
580
- # transformers 5.5.0+μ—μ„œ extra_special_tokensκ°€ list일 λ•Œ .keys() μ—λŸ¬
581
- from huggingface_hub import hf_hub_download
582
- import tempfile, shutil
583
-
584
- _tok_source = "google/gemma-4-E4B-it"
585
- _tok_dir = tempfile.mkdtemp()
586
-
587
- # ν† ν¬λ‚˜μ΄μ € νŒŒμΌλ“€ λ‹€μš΄λ‘œλ“œ
588
- for _fname in ["tokenizer_config.json", "tokenizer.json", "tokenizer.model",
589
- "special_tokens_map.json", "chat_template.jinja"]:
590
- try:
591
- _p = hf_hub_download(_tok_source, _fname)
592
- shutil.copy(_p, os.path.join(_tok_dir, _fname))
593
- except Exception:
594
- pass
595
-
596
- # tokenizer_config.json 패치: extra_special_tokens list β†’ dict
597
- _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
598
- if os.path.exists(_tc_path):
599
- with open(_tc_path) as f:
600
- _tc = json.load(f)
601
- est = _tc.get("extra_special_tokens", None)
602
- if isinstance(est, list):
603
- _tc["extra_special_tokens"] = {tok: tok for tok in est} if est else {}
604
- with open(_tc_path, "w") as f:
605
- json.dump(_tc, f, indent=2)
606
- print(f"[vLLM] Patched extra_special_tokens: list({len(est)}) -> dict", flush=True)
607
-
608
- tokenizer = AutoTokenizer.from_pretrained(_tok_dir, trust_remote_code=True)
609
- print(f"[vLLM] Tokenizer loaded (vocab={len(tokenizer)})", flush=True)
610
-
611
- engine = None
612
- MAX_MODEL_LEN = 32768
613
-
614
- # μ‹œλ„ 1: TriAttention + 32K
615
- if engine is None and TRIATT_ENABLED:
616
- try:
617
- print(f"[vLLM] Try 1: TriAttention + {MAX_MODEL_LEN}", flush=True)
618
- engine = LLMEngine.from_engine_args(EngineArgs(
619
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
620
- max_model_len=MAX_MODEL_LEN,
621
- gpu_memory_utilization=0.92,
622
- trust_remote_code=True,
623
- limit_mm_per_prompt={"image": 0, "audio": 0},
624
- ))
625
- print(f"[vLLM] βœ“ TriAttention engine ready", flush=True)
626
- except Exception as e:
627
- print(f"[vLLM] βœ— TriAttention failed: {e}", flush=True)
628
- os.environ.pop("VLLM_ATTENTION_BACKEND", None)
629
- TRIATT_ENABLED = False
630
- engine = None
631
-
632
- # μ‹œλ„ 2: ν‘œμ€€ + 16K
633
- if engine is None:
634
- MAX_MODEL_LEN = 16384
635
- try:
636
- print(f"[vLLM] Try 2: Standard + {MAX_MODEL_LEN}", flush=True)
637
- engine = LLMEngine.from_engine_args(EngineArgs(
638
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
639
- max_model_len=MAX_MODEL_LEN,
640
- gpu_memory_utilization=0.92,
641
- trust_remote_code=True,
642
- limit_mm_per_prompt={"image": 0, "audio": 0},
643
- ))
644
- print(f"[vLLM] βœ“ Standard engine ready", flush=True)
645
- except Exception as e:
646
- print(f"[vLLM] βœ— 16K failed: {e}", flush=True)
647
- engine = None
648
-
649
- # μ‹œλ„ 3: μ΅œμ†Œ 8K
650
- if engine is None:
651
- MAX_MODEL_LEN = 8192
652
- try:
653
- print(f"[vLLM] Try 3: Minimal + {MAX_MODEL_LEN}", flush=True)
654
- engine = LLMEngine.from_engine_args(EngineArgs(
655
- model=MODEL_ID, tokenizer=_tok_dir, dtype="bfloat16",
656
- max_model_len=MAX_MODEL_LEN,
657
- gpu_memory_utilization=0.90,
658
- trust_remote_code=True,
659
- limit_mm_per_prompt={"image": 0, "audio": 0},
660
- ))
661
- print(f"[vLLM] βœ“ Minimal engine ready", flush=True)
662
- except Exception as e:
663
- print(f"[vLLM] βœ—βœ—βœ— All failed: {e}", flush=True)
664
- sys.exit(1)
665
-
666
- MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
667
- _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
668
- print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
669
-
670
- # ══════════════════════════════════════════════════════════════════════════════
671
- # 4. THINKING MODE HELPERS
672
- # ══════════════════════════════════════════════════════════════════════════════
673
- def parse_think_blocks(text: str) -> tuple[str, str]:
674
- # Gemma 4 thinking format: <|channel|>thought\n...<channel|>answer
675
- m = re.search(r"<\|channel\|>thought\s*\n(.*?)<channel\|>", text, re.DOTALL)
676
- if m:
677
- return m.group(1).strip(), text[m.end():].strip()
678
- # Fallback: <think>...</think>
679
- m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
680
- if m:
681
- return m.group(1).strip(), text[m.end():].strip()
682
- return "", text
683
-
684
- def format_response(raw: str) -> str:
685
- chain, answer = parse_think_blocks(raw)
686
- if chain:
687
- return (
688
- "<details>\n<summary>🧠 Reasoning Chain β€” click to expand</summary>\n\n"
689
- f"{chain}\n\n</details>\n\n{answer}"
690
- )
691
- # Gemma 4 thinking in progress
692
- if "<|channel|>thought" in raw and "<channel|>" not in raw:
693
- think_len = len(raw) - raw.index("<|channel|>thought") - 18
694
- return f"🧠 Thinking... ({think_len} chars)"
695
- if "<think>" in raw and "</think>" not in raw:
696
- think_len = len(raw) - raw.index("<think>") - 7
697
- return f"🧠 Thinking... ({think_len} chars)"
698
- return raw
699
-
700
- # ══════════════════════════════════════════════════════════════════════════════
701
- # 5. GENERATION β€” vLLM Engine + MTI Streaming
702
- # ══════════════════════════════════════════════════════════════════════════════
703
- def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
704
- """vLLM μ—”μ§„ 생성 + Queue 슀트리밍"""
705
- try:
706
- request_id = str(uuid.uuid4())
707
- token_ids = tokenizer.encode(prompt_text)
708
- engine.add_request(request_id, TokensPrompt(prompt_token_ids=token_ids), params)
709
-
710
- prev_len = 0
711
- while engine.has_unfinished_requests():
712
- step_outputs = engine.step()
713
- for output in step_outputs:
714
- text = output.outputs[0].text
715
- if len(text) > prev_len:
716
- queue.put(text[prev_len:])
717
- prev_len = len(text)
718
- if output.finished:
719
- queue.put(None)
720
- return
721
-
722
- queue.put(None)
723
- except Exception as e:
724
- queue.put(f"\n\n**❌ Engine error:** `{e}`")
725
- queue.put(None)
726
-
727
-
728
- def generate_reply(
729
- message, history, thinking_mode, image_input,
730
- system_prompt, max_new_tokens, temperature, top_p,
731
- ) -> Generator[str, None, None]:
732
-
733
- max_new_tokens = min(int(max_new_tokens), MODEL_CAP["max_tokens"])
734
- temperature = min(float(temperature), MODEL_CAP["temp_max"])
735
-
736
- messages: list[dict] = []
737
- if system_prompt.strip():
738
- messages.append({"role": "system", "content": system_prompt.strip()})
739
-
740
- for turn in history:
741
- if isinstance(turn, dict):
742
- role = turn.get("role", "")
743
- raw = turn.get("content") or ""
744
- text = (" ".join(p.get("text","") for p in raw
745
- if isinstance(p,dict) and p.get("type")=="text")
746
- if isinstance(raw, list) else str(raw))
747
- if role == "user":
748
- messages.append({"role":"user","content":text})
749
- elif role == "assistant":
750
- _, clean = parse_think_blocks(text)
751
- messages.append({"role":"assistant","content":clean})
752
- else:
753
- try: u, a = (turn[0] or None), (turn[1] if len(turn)>1 else None)
754
- except: continue
755
- def _txt(v):
756
- if v is None: return None
757
- if isinstance(v, list):
758
- return " ".join(p.get("text","") for p in v if isinstance(p,dict) and p.get("type")=="text")
759
- return str(v)
760
- ut, at = _txt(u), _txt(a)
761
- if ut: messages.append({"role":"user","content":ut})
762
- if at:
763
- _, clean = parse_think_blocks(at)
764
- messages.append({"role":"assistant","content":clean})
765
-
766
- messages.append({"role": "user", "content": message})
767
-
768
- try:
769
- prompt_text = tokenizer.apply_chat_template(
770
- messages, tokenize=False, add_generation_prompt=True,
771
- )
772
- except Exception as e:
773
- yield f"**❌ Template error:** `{e}`"
774
- return
775
-
776
- input_len = len(tokenizer.encode(prompt_text))
777
- print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
778
- f"temp={temperature}, MTI=on, Attn={_attn}", flush=True)
779
-
780
- mti = MTILogitsProcessor(cfg_scale=1.5, entropy_threshold=2.0)
781
-
782
- params = SamplingParams(
783
- max_tokens=max_new_tokens,
784
- temperature=max(float(temperature), 0.01) if temperature > 0.01 else 0.0,
785
- top_p=float(top_p),
786
- logits_processors=[mti],
787
- )
788
-
789
- queue = Queue()
790
- thread = Thread(target=_engine_generate, args=(prompt_text, params, queue))
791
- thread.start()
792
-
793
- output = ""
794
- try:
795
- while True:
796
- chunk = queue.get(timeout=120)
797
- if chunk is None: break
798
- output += chunk
799
- yield format_response(output)
800
- except Exception as e:
801
- if not output:
802
- yield f"**❌ Streaming error:** `{e}`"
803
-
804
- thread.join(timeout=5)
805
-
806
- if output:
807
- mti_rate = f"{mti.intervention_rate*100:.1f}%"
808
- print(f"[GEN] Done β€” {len(output)} chars, MTI={mti_rate} "
809
- f"({mti._interventions}/{mti._total})", flush=True)
810
- yield format_response(output)
811
- else:
812
- yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
813
-
814
-
815
- # ══════════════════════════════════════════════════════════════════════════════
816
- # 6. GRADIO BLOCKS
817
- # ══════════════════════════════════════════════════════════════════════════════
818
- with gr.Blocks(title=MODEL_NAME) as gradio_demo:
819
- thinking_toggle = gr.Radio(
820
- choices=["⚑ Fast Mode", "🧠 Thinking Mode"],
821
- value="⚑ Fast Mode", visible=False,
822
- )
823
- image_input = gr.Textbox(value="", visible=False)
824
- system_prompt = gr.Textbox(value=PRESETS["general"], visible=False)
825
- max_new_tokens = gr.Slider(minimum=64, maximum=MODEL_CAP["max_tokens"], value=4096, visible=False)
826
- temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.6, visible=False)
827
- top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)
828
-
829
- gr.ChatInterface(
830
- fn=generate_reply, api_name="chat",
831
- additional_inputs=[
832
- thinking_toggle, image_input,
833
- system_prompt, max_new_tokens, temperature, top_p,
834
- ],
835
- )
836
-
837
- # ══════════════════════════════════════════════════════════════════════════════
838
- # 7. FASTAPI
839
- # ══════════════════════════════════════════════════════════════════════════════
840
- fapp = FastAPI()
841
- SESSIONS: dict[str, dict] = {}
842
- HTML = pathlib.Path(__file__).parent / "index.html"
843
-
844
- CLIENT_ID = os.getenv("OAUTH_CLIENT_ID", "")
845
- CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
846
- SPACE_HOST = os.getenv("SPACE_HOST", "localhost:7860")
847
- REDIRECT_URI = f"https://{SPACE_HOST}/login/callback"
848
- HF_AUTH_URL = "https://huggingface.co/oauth/authorize"
849
- HF_TOKEN_URL = "https://huggingface.co/oauth/token"
850
- HF_USER_URL = "https://huggingface.co/oauth/userinfo"
851
- SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")
852
-
853
- print(f"[OAuth] CLIENT_ID={bool(CLIENT_ID)}, SPACE_HOST={SPACE_HOST}")
854
-
855
- def _sid(req): return req.cookies.get("mc_session")
856
- def _user(req):
857
- sid = _sid(req)
858
- return SESSIONS.get(sid) if sid else None
859
-
860
- @fapp.get("/")
861
- async def root(request: Request):
862
- html = HTML.read_text(encoding="utf-8") if HTML.exists() else "<h2>index.html missing</h2>"
863
- return HTMLResponse(html)
864
-
865
- @fapp.get("/oauth/user")
866
- async def oauth_user(request: Request):
867
- u = _user(request)
868
- return JSONResponse(u) if u else JSONResponse({"logged_in": False}, status_code=401)
869
-
870
- @fapp.get("/oauth/login")
871
- async def oauth_login(request: Request):
872
- if not CLIENT_ID: return RedirectResponse("/?oauth_error=not_configured")
873
- state = secrets.token_urlsafe(16)
874
- params = {"response_type":"code","client_id":CLIENT_ID,"redirect_uri":REDIRECT_URI,"scope":SCOPES,"state":state}
875
- return RedirectResponse(f"{HF_AUTH_URL}?{urlencode(params)}", status_code=302)
876
-
877
- @fapp.get("/login/callback")
878
- async def oauth_callback(code: str = "", error: str = "", state: str = ""):
879
- if error or not code: return RedirectResponse("/?auth_error=1")
880
- basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
881
- async with httpx.AsyncClient() as client:
882
- tok = await client.post(HF_TOKEN_URL, data={"grant_type":"authorization_code","code":code,"redirect_uri":REDIRECT_URI},
883
- headers={"Accept":"application/json","Authorization":f"Basic {basic}"})
884
- if tok.status_code != 200: return RedirectResponse("/?auth_error=1")
885
- access_token = tok.json().get("access_token", "")
886
- if not access_token: return RedirectResponse("/?auth_error=1")
887
- uinfo = await client.get(HF_USER_URL, headers={"Authorization":f"Bearer {access_token}"})
888
- if uinfo.status_code != 200: return RedirectResponse("/?auth_error=1")
889
- user = uinfo.json()
890
- sid = secrets.token_urlsafe(32)
891
- SESSIONS[sid] = {
892
- "logged_in": True,
893
- "username": user.get("preferred_username", user.get("name", "User")),
894
- "name": user.get("name", ""),
895
- "avatar": user.get("picture", ""),
896
- "profile": f"https://huggingface.co/{user.get('preferred_username', '')}",
897
- }
898
- resp = RedirectResponse("/")
899
- resp.set_cookie("mc_session", sid, httponly=True, samesite="lax", secure=True, max_age=60*60*24*7)
900
- return resp
901
-
902
- @fapp.get("/oauth/logout")
903
- async def oauth_logout(request: Request):
904
- sid = _sid(request)
905
- if sid and sid in SESSIONS: del SESSIONS[sid]
906
- resp = RedirectResponse("/")
907
- resp.delete_cookie("mc_session")
908
- return resp
909
-
910
- @fapp.get("/health")
911
- async def health():
912
- return {
913
- "status": "ok", "model": MODEL_ID,
914
- "backend": "vLLM-Engine",
915
- "attention": "TriAttention" if TRIATT_ENABLED else "Standard",
916
- "mti": "enabled",
917
- "max_tokens": MODEL_CAP["max_tokens"],
918
- "max_model_len": MAX_MODEL_LEN,
919
- "multimodal": "vision+audio",
920
- }
921
-
922
- BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
923
-
924
- @fapp.post("/api/search")
925
- async def api_search(request: Request):
926
- body = await request.json()
927
- query = body.get("query", "").strip()
928
- if not query: return JSONResponse({"error": "empty"}, 400)
929
- if not BRAVE_API_KEY: return JSONResponse({"error": "no key"}, 500)
930
- try:
931
- r = requests.get("https://api.search.brave.com/res/v1/web/search",
932
- headers={"X-Subscription-Token": BRAVE_API_KEY, "Accept": "application/json"},
933
- params={"q": query, "count": 5}, timeout=10)
934
- r.raise_for_status()
935
- results = r.json().get("web", {}).get("results", [])
936
- return JSONResponse({"results": [{"title":i.get("title",""),"desc":i.get("description",""),"url":i.get("url","")} for i in results[:5]]})
937
- except Exception as e:
938
- return JSONResponse({"error": str(e)}, 500)
939
-
940
- @fapp.post("/api/extract-pdf")
941
- async def api_extract_pdf(request: Request):
942
- try:
943
- body = await request.json()
944
- b64 = body.get("data", "")
945
- if "," in b64: b64 = b64.split(",", 1)[1]
946
- pdf_bytes = base64.b64decode(b64)
947
- text = ""
948
- try:
949
- import fitz
950
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
951
- for page in doc: text += page.get_text() + "\n"
952
- except ImportError:
953
- text = pdf_bytes.decode("utf-8", errors="ignore")
954
- return JSONResponse({"text": text.strip()[:8000], "chars": len(text)})
955
- except Exception as e:
956
- return JSONResponse({"error": str(e)}, 500)
957
-
958
- # ══════════════════════════════════════════════════════════════════════════════
959
- # 8. MOUNT & RUN
960
- # ══════════════════════════════════════════════════════════════════════════════
961
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
962
 
963
  def _shutdown(sig, frame):
 
1
+ # Gemma 4 E4B - vLLM + MTI + TriAttention
2
+ # Multimodal (Vision+Audio+Text) - Effective 4.5B - Apache 2.0
3
+ # MTI: +9-11% reasoning accuracy (training-free)
4
+ # TriAttention: ~10x KV cache compression
 
 
5
  import sys, os, signal, time, uuid
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
+ # -- TriAttention μ‹œλ„ --
9
  TRIATT_ENABLED = False
10
  try:
11
  import aither_kvcache
12
  os.environ["VLLM_ATTENTION_BACKEND"] = "CUSTOM"
13
  TRIATT_ENABLED = True
14
+ print("[TRIATT] aither-kvcache -> VLLM_ATTENTION_BACKEND=CUSTOM", flush=True)
15
  except ImportError:
16
+ print("[TRIATT] aither-kvcache not found -> standard attention", flush=True)
17
 
18
  import base64, re, json
19
  from typing import Generator, Optional
 
30
  from urllib.parse import urlencode
31
  import pathlib, secrets
32
 
33
+ # ==============================================================================
34
  # 1. CONFIG
35
+ # ==============================================================================
36
  MODEL_ID = "DavidAU/gemma-4-E4B-it-The-DECKARD-Expresso-Universe-HERETIC-UNCENSORED-Thinking"
37
  MODEL_NAME = "DECKARD-E4B-Opus"
38
  MODEL_CAP = {
 
49
  "vision": "You are an expert at analyzing images. Describe what you see in detail, extract text, and answer questions about visual content.",
50
  }
51
 
52
+ # ==============================================================================
53
+ # 2. MTI -- Minimal Test-Time Intervention (arxiv 2510.13940)
54
+ # ==============================================================================
55
  class MTILogitsProcessor:
56
  """
57
+ κ³ μ—”νŠΈλ‘œν”Ό(λΆˆν™•μ‹€) ν† ν°μ—λ§Œ CFG 적용 -> μΆ”λ‘  정확도 +9~11%.
58
  ν•™μŠ΅ 없이 μ„œλΉ™ μ‹œ 적용. 전체 ν† ν°μ˜ ~15%μ—λ§Œ κ°œμž….
59
  """
60
  def __init__(self, cfg_scale: float = 1.5, entropy_threshold: float = 2.0):
 
81
 
82
  print("[MTI] MTILogitsProcessor ready (cfg=1.5, threshold=2.0)", flush=True)
83
 
84
+ # ==============================================================================
85
+ # 3. vLLM ENGINE -- Gemma 4 Day 0 지원, 패치 λΆˆν•„μš”
86
+ # ==============================================================================
87
  from vllm.engine.arg_utils import EngineArgs
88
  from vllm.engine.llm_engine import LLMEngine
89
  from vllm import SamplingParams, TokensPrompt
90
  from transformers import AutoTokenizer
91
 
92
+ # -- Gemma 4 ν† ν¬λ‚˜μ΄μ € ν˜Έν™˜μ„± 패치 --
93
  # transformers 5.5.0+μ—μ„œ extra_special_tokensκ°€ list일 λ•Œ .keys() μ—λŸ¬
94
  from huggingface_hub import hf_hub_download
95
  import tempfile, shutil
 
106
  except Exception:
107
  pass
108
 
109
+ # tokenizer_config.json 패치: extra_special_tokens list -> dict
110
  _tc_path = os.path.join(_tok_dir, "tokenizer_config.json")
111
  if os.path.exists(_tc_path):
112
  with open(_tc_path) as f:
 
135
  trust_remote_code=True,
136
  limit_mm_per_prompt={"image": 0, "audio": 0},
137
  ))
138
+ print(f"[vLLM] OK TriAttention engine ready", flush=True)
139
  except Exception as e:
140
+ print(f"[vLLM] X TriAttention failed: {e}", flush=True)
141
  os.environ.pop("VLLM_ATTENTION_BACKEND", None)
142
  TRIATT_ENABLED = False
143
  engine = None
 
154
  trust_remote_code=True,
155
  limit_mm_per_prompt={"image": 0, "audio": 0},
156
  ))
157
+ print(f"[vLLM] OK Standard engine ready", flush=True)
158
  except Exception as e:
159
+ print(f"[vLLM] X 16K failed: {e}", flush=True)
160
  engine = None
161
 
162
  # μ‹œλ„ 3: μ΅œμ†Œ 8K
 
171
  trust_remote_code=True,
172
  limit_mm_per_prompt={"image": 0, "audio": 0},
173
  ))
174
+ print(f"[vLLM] OK Minimal engine ready", flush=True)
175
  except Exception as e:
176
+ print(f"[vLLM] XXX All failed: {e}", flush=True)
177
  sys.exit(1)
178
 
179
  MODEL_CAP["max_tokens"] = min(MODEL_CAP["max_tokens"], MAX_MODEL_LEN)
180
  _attn = "TriAttention" if TRIATT_ENABLED else "Standard"
181
  print(f"[vLLM] Final: {_attn}, max_len={MAX_MODEL_LEN}, max_tokens={MODEL_CAP['max_tokens']}", flush=True)
182
 
183
+ # ==============================================================================
184
  # 4. THINKING MODE HELPERS
185
+ # ==============================================================================
186
  def parse_think_blocks(text: str) -> tuple[str, str]:
187
  # Gemma 4 thinking format: <|channel|>thought\n...<channel|>answer
188
  m = re.search(r"<\|channel\|>thought\s*\n(.*?)<channel\|>", text, re.DOTALL)
 
198
  chain, answer = parse_think_blocks(raw)
199
  if chain:
200
  return (
201
+ "<details>\n<summary>🧠 Reasoning Chain -- click to expand</summary>\n\n"
202
  f"{chain}\n\n</details>\n\n{answer}"
203
  )
204
  # Gemma 4 thinking in progress
 
210
  return f"🧠 Thinking... ({think_len} chars)"
211
  return raw
212
 
213
+ # ==============================================================================
214
+ # 5. GENERATION -- vLLM Engine + MTI Streaming
215
+ # ==============================================================================
216
  def _engine_generate(prompt_text: str, params: SamplingParams, queue: Queue):
217
  """vLLM μ—”μ§„ 생성 + Queue 슀트리밍"""
218
  try:
 
318
 
319
  if output:
320
  mti_rate = f"{mti.intervention_rate*100:.1f}%"
321
+ print(f"[GEN] Done -- {len(output)} chars, MTI={mti_rate} "
322
  f"({mti._interventions}/{mti._total})", flush=True)
323
  yield format_response(output)
324
  else:
325
  yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
326
 
327
 
328
+ # ==============================================================================
329
  # 6. GRADIO BLOCKS
330
+ # ==============================================================================
331
  with gr.Blocks(title=MODEL_NAME) as gradio_demo:
332
  thinking_toggle = gr.Radio(
333
  choices=["⚑ Fast Mode", "🧠 Thinking Mode"],
 
347
  ],
348
  )
349
 
350
+ # ==============================================================================
351
  # 7. FASTAPI
352
+ # ==============================================================================
353
  fapp = FastAPI()
354
  SESSIONS: dict[str, dict] = {}
355
  HTML = pathlib.Path(__file__).parent / "index.html"
 
468
  except Exception as e:
469
  return JSONResponse({"error": str(e)}, 500)
470
 
471
+ # ==============================================================================
472
  # 8. MOUNT & RUN
473
+ # ==============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
475
 
476
  def _shutdown(sig, frame):