Darwin-9B-Opus

Running on L4

App Files Files Community

SeaWolf-AI committed 8 days ago

Commit

afc2199

verified ·

1 Parent(s): 4cbb479

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -120

app.py CHANGED Viewed

@@ -24,7 +24,7 @@ from transformers import (
 )
 from PIL import Image
 import requests
-import httpx, uvicorn
 from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 from urllib.parse import urlencode
@@ -55,11 +55,11 @@ PRESETS = {
 }
 # ══════════════════════════════════════════════════════════════════════════════
-# 2.  MODEL LOADING  (ZeroGPU: CPU at import, GPU at inference)
 # ══════════════════════════════════════════════════════════════════════════════
 print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
-IS_VISION = True   # 모델이 vision 지원하는지 여부
 processor = None
 tokenizer = None
 model     = None
@@ -73,48 +73,41 @@ except Exception as e:
     IS_VISION = False
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-try:
-    if IS_VISION:
-        model = AutoModelForImageTextToText.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-        )
-        print("[MODEL] AutoModelForImageTextToText loaded ✓", flush=True)
-    else:
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True,
-        )
-        print("[MODEL] AutoModelForCausalLM loaded ✓", flush=True)
-except Exception as e:
-    print(f"[MODEL] bfloat16 load failed: {e}", flush=True)
     print("[MODEL] Retrying with 4-bit quantization...", flush=True)
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_use_double_quant=True,
     )
-    ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
     model = ModelClass.from_pretrained(
-        MODEL_ID,
-        quantization_config=bnb_config,
-        device_map="auto",
-        trust_remote_code=True,
     )
     print("[MODEL] 4-bit quantized model loaded ✓", flush=True)
 # 토크나이저 결정
 _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
-print(f"[MODEL] Ready — device: {model.device}, dtype: {model.dtype}", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
-# 3.  THINKING MODE HELPERS  (기존 로직 유지)
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
     m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
@@ -156,20 +149,15 @@ def _split_thinking_answer(raw: str) -> tuple:
                     answer_start = i
                     break
     if answer_start > 0:
-        thinking = "\n".join(lines[:answer_start]).strip()
-        answer = "\n".join(lines[answer_start:]).strip()
-        return thinking, answer
     return "", raw
 def format_response(raw: str) -> str:
     chain, answer = parse_think_blocks(raw)
     if chain:
         return (
-            "<details>\n"
-            "<summary>🧠 Reasoning Chain — click to expand</summary>\n\n"
-            f"{chain}\n\n"
-            "</details>\n\n"
-            f"{answer}"
         )
     if "<think>" in raw and "</think>" not in raw:
         think_len = len(raw) - raw.index("<think>") - 7
@@ -179,11 +167,8 @@ def format_response(raw: str) -> str:
         thinking, answer = _split_thinking_answer(raw)
         if thinking and answer:
             return (
-                f"<details>\n"
-                f"<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
-                f"{thinking}\n\n"
-                f"</details>\n\n"
-                f"{answer}"
             )
         elif thinking and not answer:
             return f"🧠 Reasoning... ({len(raw)} chars)"
@@ -193,7 +178,6 @@ def format_response(raw: str) -> str:
 # 4.  IMAGE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def _load_image_from_source(src: str) -> Optional[Image.Image]:
-    """base64 data URI 또는 URL → PIL Image"""
     try:
         if src.startswith("data:"):
             _, b64 = src.split(",", 1)
@@ -207,32 +191,11 @@ def _load_image_from_source(src: str) -> Optional[Image.Image]:
     return None
 # ══════════════════════════════════════════════════════════════════════════════
-# 5.  GENERATION  — ZeroGPU + TextIteratorStreamer
 # ══════════════════════════════════════════════════════════════════════════════
 @spaces.GPU(duration=180)
-def _run_generation(input_ids, attention_mask, pixel_values, image_grid_thw,
-                    max_new_tokens, temperature, top_p, streamer):
-    """GPU 할당 후 실행되는 실제 생성 함수"""
-    gen_kwargs = dict(
-        input_ids=input_ids.to(model.device),
-        attention_mask=attention_mask.to(model.device),
-        max_new_tokens=max_new_tokens,
-        do_sample=temperature > 0.01,
-        temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
-        top_p=top_p,
-        streamer=streamer,
-        use_cache=True,
-    )
-    # vision inputs (있으면)
-    if pixel_values is not None:
-        gen_kwargs["pixel_values"] = pixel_values.to(model.device)
-    if image_grid_thw is not None:
-        gen_kwargs["image_grid_thw"] = image_grid_thw.to(model.device)
-    with torch.inference_mode():
-        model.generate(**gen_kwargs)
 def generate_reply(
     message:        str,
     history:        list,
@@ -252,7 +215,6 @@ def generate_reply(
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
-    # history  (프론트엔드: [user, assistant] 튜플 리스트)
     for turn in history:
         if isinstance(turn, dict):
             role = turn.get("role", "")
@@ -292,8 +254,7 @@ def generate_reply(
         if pil_image:
             has_image = True
-    if IS_VISION and has_image:
-        # Vision 모드: 이미지 + 텍스트
         messages.append({
             "role": "user",
             "content": [
@@ -308,62 +269,48 @@ def generate_reply(
     try:
         if IS_VISION and processor is not None:
             text_prompt = processor.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
             )
             if has_image and pil_image:
                 inputs = processor(
-                    text=[text_prompt],
-                    images=[pil_image],
-                    return_tensors="pt",
-                    padding=True,
                 )
             else:
                 inputs = processor(
-                    text=[text_prompt],
-                    return_tensors="pt",
-                    padding=True,
                 )
         else:
-            # text-only 모드
             text_prompt = tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
             )
             inputs = tokenizer(text_prompt, return_tensors="pt")
     except Exception as e:
         yield f"**❌ Tokenization error:** `{e}`"
         return
-    # ── Streamer 설정 ──
-    decode_tok = _tok
-    streamer = TextIteratorStreamer(decode_tok, skip_special_tokens=True, skip_prompt=True)
-    # ── 텐서 추출 ──
-    input_ids      = inputs["input_ids"]
-    attention_mask  = inputs.get("attention_mask", torch.ones_like(input_ids))
-    pixel_values    = inputs.get("pixel_values", None)
-    image_grid_thw  = inputs.get("image_grid_thw", None)
-    print(f"[GEN] tokens={input_ids.shape[-1]}, max_new={max_new_tokens}, "
           f"temp={temperature}, vision={has_image}", flush=True)
-    # ── 스레드에서 생성 실행 ──
-    thread = Thread(
-        target=_run_generation,
-        kwargs=dict(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            top_p=float(top_p),
-            streamer=streamer,
-        ),
     )
     thread.start()
     output = ""
@@ -496,14 +443,10 @@ async def oauth_logout(request: Request):
 @fapp.get("/health")
 async def health():
     return {
-        "status": "ok",
-        "model": MODEL_ID,
-        "vision": IS_VISION,
-        "device": str(model.device),
-        "dtype": str(model.dtype),
     }
-# ── Web Search API (Brave) ──
 BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 @fapp.post("/api/search")
@@ -528,7 +471,6 @@ async def api_search(request: Request):
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
-# ── PDF Text Extraction ──
 @fapp.post("/api/extract-pdf")
 async def api_extract_pdf(request: Request):
     try:
@@ -551,9 +493,17 @@ async def api_extract_pdf(request: Request):
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
-# ── Mount ──
 app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
-if __name__ == "__main__":
-    print(f"[BOOT] Darwin-35B-A3B-Opus · ZeroGPU Direct Serving", flush=True)
     uvicorn.run(app, host="0.0.0.0", port=7860)

 )
 from PIL import Image
 import requests
+import httpx
 from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
 from urllib.parse import urlencode
 }
 # ══════════════════════════════════════════════════════════════════════════════
+# 2.  MODEL LOADING
 # ══════════════════════════════════════════════════════════════════════════════
 print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
+IS_VISION = True
 processor = None
 tokenizer = None
 model     = None
     IS_VISION = False
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+# 모델 로드 — dtype= 우선, 실패 시 torch_dtype= 폴백, 최종 4bit
+_load_ok = False
+ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
+for attempt, load_kwargs in enumerate([
+    dict(dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
+    dict(torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
+]):
+    try:
+        model = ModelClass.from_pretrained(MODEL_ID, **load_kwargs)
+        print(f"[MODEL] {ModelClass.__name__} loaded (attempt {attempt+1}) ✓", flush=True)
+        _load_ok = True
+        break
+    except Exception as e:
+        print(f"[MODEL] Attempt {attempt+1} failed: {e}", flush=True)
+if not _load_ok:
     print("[MODEL] Retrying with 4-bit quantization...", flush=True)
     from transformers import BitsAndBytesConfig
     bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True, bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
     )
     model = ModelClass.from_pretrained(
+        MODEL_ID, quantization_config=bnb_config,
+        device_map="auto", trust_remote_code=True,
     )
     print("[MODEL] 4-bit quantized model loaded ✓", flush=True)
 # 토크나이저 결정
 _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
+print(f"[MODEL] Ready — vision={IS_VISION}, dtype={model.dtype}", flush=True)
 # ══════════════════════════════════════════════════════════════════════════════
+# 3.  THINKING MODE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
     m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
                     answer_start = i
                     break
     if answer_start > 0:
+        return "\n".join(lines[:answer_start]).strip(), "\n".join(lines[answer_start:]).strip()
     return "", raw
 def format_response(raw: str) -> str:
     chain, answer = parse_think_blocks(raw)
     if chain:
         return (
+            "<details>\n<summary>🧠 Reasoning Chain — click to expand</summary>\n\n"
+            f"{chain}\n\n</details>\n\n{answer}"
         )
     if "<think>" in raw and "</think>" not in raw:
         think_len = len(raw) - raw.index("<think>") - 7
         thinking, answer = _split_thinking_answer(raw)
         if thinking and answer:
             return (
+                f"<details>\n<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
+                f"{thinking}\n\n</details>\n\n{answer}"
             )
         elif thinking and not answer:
             return f"🧠 Reasoning... ({len(raw)} chars)"
 # 4.  IMAGE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def _load_image_from_source(src: str) -> Optional[Image.Image]:
     try:
         if src.startswith("data:"):
             _, b64 = src.split(",", 1)
     return None
 # ══════════════════════════════════════════════════════════════════════════════
+# 5.  GENERATION  — ★ @spaces.GPU on Gradio fn (핵심 수정) ★
+#     ZeroGPU는 Gradio 이벤트 함수에 @spaces.GPU가 있어야 감지합니다.
+#     내부 서브함수가 아닌, ChatInterface의 fn에 직접 데코레이션!
 # ══════════════════════════════════════════════════════════════════════════════
 @spaces.GPU(duration=180)
 def generate_reply(
     message:        str,
     history:        list,
     if system_prompt.strip():
         messages.append({"role": "system", "content": system_prompt.strip()})
     for turn in history:
         if isinstance(turn, dict):
             role = turn.get("role", "")
         if pil_image:
             has_image = True
+    if IS_VISION and has_image and pil_image:
         messages.append({
             "role": "user",
             "content": [
     try:
         if IS_VISION and processor is not None:
             text_prompt = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True,
             )
             if has_image and pil_image:
                 inputs = processor(
+                    text=[text_prompt], images=[pil_image],
+                    return_tensors="pt", padding=True,
                 )
             else:
                 inputs = processor(
+                    text=[text_prompt], return_tensors="pt", padding=True,
                 )
         else:
             text_prompt = tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True,
             )
             inputs = tokenizer(text_prompt, return_tensors="pt")
     except Exception as e:
         yield f"**❌ Tokenization error:** `{e}`"
         return
+    # ── GPU로 이동 ──
+    inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+    # ── Streamer ──
+    streamer = TextIteratorStreamer(_tok, skip_special_tokens=True, skip_prompt=True)
+    input_len = inputs["input_ids"].shape[-1]
+    print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
           f"temp={temperature}, vision={has_image}", flush=True)
+    # ── generate → 별도 스레드 (GPU 컨텍스트는 이 함수가 유지) ──
+    gen_kwargs = dict(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=temperature > 0.01,
+        temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
+        top_p=float(top_p),
+        streamer=streamer,
+        use_cache=True,
     )
+    thread = Thread(target=model.generate, kwargs=gen_kwargs)
     thread.start()
     output = ""
 @fapp.get("/health")
 async def health():
     return {
+        "status": "ok", "model": MODEL_ID,
+        "vision": IS_VISION, "dtype": str(model.dtype),
     }
 BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
 @fapp.post("/api/search")
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
 @fapp.post("/api/extract-pdf")
 async def api_extract_pdf(request: Request):
     try:
     except Exception as e:
         return JSONResponse({"error": str(e)}, status_code=500)
+# ══════════════════════════════════════════════════════════════════════════════
+# 8.  MOUNT & LAUNCH
+#     ★ 핵심: uvicorn.run() 사용 금지! ★
+#     HF Spaces ZeroGPU 런타임이 모듈 스캔 → 'app' 변수 감지 → 자동 서빙.
+#     uvicorn.run()을 호출하면 ZeroGPU wrapper를 우회하여 즉시 종료됨.
+# ══════════════════════════════════════════════════════════════════════════════
 app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
+print("[BOOT] Darwin-35B-A3B-Opus · ZeroGPU Direct Serving · Ready", flush=True)
+# ── 로컬 개발 전용 (SPACE_ID 없을 때만) ──
+if __name__ == "__main__" and not os.getenv("SPACE_ID"):
+    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)