SeaWolf-AI committed on
Commit
d422c24
Β·
verified Β·
1 Parent(s): 3e3322c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -185
app.py CHANGED
@@ -1,47 +1,48 @@
1
  """
2
- 🧬 Darwin-35B-A3B-Opus β€” ZeroGPU Direct Serving
3
- transformers + @spaces.GPU Β· Vision support Β· Streaming
4
  """
5
- import sys
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import base64, os, re, json, io
9
  from typing import Generator, Optional
10
- from threading import Thread
11
 
12
- # ── Core imports ──────────────────────────────────────────────────────────
13
- import torch
14
- import spaces
15
  import gradio as gr
16
- print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)
17
-
18
- from transformers import (
19
- AutoProcessor,
20
- AutoModelForImageTextToText,
21
- AutoModelForCausalLM,
22
- AutoTokenizer,
23
- TextIteratorStreamer,
24
- )
25
- from PIL import Image
26
- import requests
27
- import httpx
28
  from fastapi import FastAPI, Request
29
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
30
  from urllib.parse import urlencode
31
  import pathlib, secrets
32
 
33
- # SSL κ²½κ³  λ¬΄μ‹œ
34
  import urllib3
35
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
36
 
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
- MODEL_ID = "FINAL-Bench/Darwin-35B-A3B-Opus"
41
- MODEL_NAME = "Darwin-35B-A3B-Opus"
 
42
  MODEL_CAP = {
43
  "arch": "MoE", "active": "3B / 35B total",
44
- "ctx": "262K", "thinking": True, "vision": True,
45
  "max_tokens": 16384, "temp_max": 1.5,
46
  }
47
 
@@ -55,56 +56,44 @@ PRESETS = {
55
  }
56
 
57
  # ══════════════════════════════════════════════════════════════════════════════
58
- # 2. MODEL LOADING
59
  # ══════════════════════════════════════════════════════════════════════════════
60
- print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)
61
-
62
- IS_VISION = True
63
- processor = None
64
- tokenizer = None
65
- model = None
66
-
67
- try:
68
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
69
- print("[MODEL] AutoProcessor loaded (vision mode)", flush=True)
70
- except Exception as e:
71
- print(f"[MODEL] AutoProcessor failed: {e}", flush=True)
72
- print("[MODEL] Falling back to AutoTokenizer (text-only mode)", flush=True)
73
- IS_VISION = False
74
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
75
-
76
- # λͺ¨λΈ λ‘œλ“œ β€” dtype= μš°μ„ , μ‹€νŒ¨ μ‹œ torch_dtype= 폴백, μ΅œμ’… 4bit
77
- _load_ok = False
78
- ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
79
-
80
- for attempt, load_kwargs in enumerate([
81
- dict(dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
82
- dict(torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True),
83
- ]):
84
  try:
85
- model = ModelClass.from_pretrained(MODEL_ID, **load_kwargs)
86
- print(f"[MODEL] {ModelClass.__name__} loaded (attempt {attempt+1}) βœ“", flush=True)
87
- _load_ok = True
88
- break
 
 
 
 
 
 
 
 
 
 
 
89
  except Exception as e:
90
- print(f"[MODEL] Attempt {attempt+1} failed: {e}", flush=True)
91
-
92
- if not _load_ok:
93
- print("[MODEL] Retrying with 4-bit quantization...", flush=True)
94
- from transformers import BitsAndBytesConfig
95
- bnb_config = BitsAndBytesConfig(
96
- load_in_4bit=True, bnb_4bit_quant_type="nf4",
97
- bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,
98
- )
99
- model = ModelClass.from_pretrained(
100
- MODEL_ID, quantization_config=bnb_config,
101
- device_map="auto", trust_remote_code=True,
102
- )
103
- print("[MODEL] 4-bit quantized model loaded βœ“", flush=True)
104
 
105
- # ν† ν¬λ‚˜μ΄μ € κ²°μ •
106
- _tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
107
- print(f"[MODEL] Ready β€” vision={IS_VISION}, dtype={model.dtype}", flush=True)
 
 
 
 
 
 
 
 
108
 
109
  # ══════════════════════════════════════════════════════════════════════════════
110
  # 3. THINKING MODE HELPERS
@@ -175,27 +164,8 @@ def format_response(raw: str) -> str:
175
  return raw
176
 
177
  # ══════════════════════════════════════════════════════════════════════════════
178
- # 4. IMAGE HELPERS
179
- # ══════════════════════════════════════════════════════════════════════════════
180
- def _load_image_from_source(src: str) -> Optional[Image.Image]:
181
- try:
182
- if src.startswith("data:"):
183
- _, b64 = src.split(",", 1)
184
- return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
185
- elif src.startswith("http"):
186
- resp = requests.get(src, timeout=15)
187
- resp.raise_for_status()
188
- return Image.open(io.BytesIO(resp.content)).convert("RGB")
189
- except Exception as e:
190
- print(f"[IMG] Failed to load image: {e}", flush=True)
191
- return None
192
-
193
  # ══════════════════════════════════════════════════════════════════════════════
194
- # 5. GENERATION β€” β˜… @spaces.GPU on Gradio fn (핡심 μˆ˜μ •) β˜…
195
- # ZeroGPUλŠ” Gradio 이벀트 ν•¨μˆ˜μ— @spaces.GPUκ°€ μžˆμ–΄μ•Ό κ°μ§€ν•©λ‹ˆλ‹€.
196
- # λ‚΄λΆ€ μ„œλΈŒν•¨μˆ˜κ°€ μ•„λ‹Œ, ChatInterface의 fn에 직접 λ°μ½”λ ˆμ΄μ…˜!
197
- # ══════════════════════════════════════════════════════════════════════════════
198
- @spaces.GPU(duration=180)
199
  def generate_reply(
200
  message: str,
201
  history: list,
@@ -245,98 +215,44 @@ def generate_reply(
245
  _, clean = parse_think_blocks(at)
246
  messages.append({"role":"assistant","content":clean})
247
 
248
- # ── ν˜„μž¬ λ©”μ‹œμ§€ (이미지 포함 κ°€λŠ₯) ──
249
- has_image = False
250
- pil_image = None
251
-
252
- if image_input and isinstance(image_input, str) and image_input.strip():
253
- pil_image = _load_image_from_source(image_input)
254
- if pil_image:
255
- has_image = True
256
-
257
- if IS_VISION and has_image and pil_image:
258
- messages.append({
259
- "role": "user",
260
- "content": [
261
- {"type": "image", "image": pil_image},
262
- {"type": "text", "text": message},
263
- ]
264
- })
265
- else:
266
- messages.append({"role": "user", "content": message})
267
-
268
- # ── ν† ν¬λ‚˜μ΄μ¦ˆ ──
269
- try:
270
- if IS_VISION and processor is not None:
271
- text_prompt = processor.apply_chat_template(
272
- messages, tokenize=False, add_generation_prompt=True,
273
- )
274
- if has_image and pil_image:
275
- inputs = processor(
276
- text=[text_prompt], images=[pil_image],
277
- return_tensors="pt", padding=True,
278
- )
279
- else:
280
- inputs = processor(
281
- text=[text_prompt], return_tensors="pt", padding=True,
282
- )
283
- else:
284
- text_prompt = tokenizer.apply_chat_template(
285
- messages, tokenize=False, add_generation_prompt=True,
286
- )
287
- inputs = tokenizer(text_prompt, return_tensors="pt")
288
- except Exception as e:
289
- yield f"**❌ Tokenization error:** `{e}`"
290
- return
291
-
292
- # ── GPU둜 이동 ──
293
- inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
294
-
295
- # ── Streamer ──
296
- streamer = TextIteratorStreamer(_tok, skip_special_tokens=True, skip_prompt=True)
297
-
298
- input_len = inputs["input_ids"].shape[-1]
299
- print(f"[GEN] tokens={input_len}, max_new={max_new_tokens}, "
300
- f"temp={temperature}, vision={has_image}", flush=True)
301
-
302
- # ── generate β†’ 별도 μŠ€λ ˆλ“œ (GPU μ»¨ν…μŠ€νŠΈλŠ” 이 ν•¨μˆ˜κ°€ μœ μ§€) ──
303
- gen_kwargs = dict(
304
- **inputs,
305
- max_new_tokens=max_new_tokens,
306
- do_sample=temperature > 0.01,
307
- temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
308
- top_p=float(top_p),
309
- streamer=streamer,
310
- use_cache=True,
311
- )
312
 
313
- thread = Thread(target=model.generate, kwargs=gen_kwargs)
314
- thread.start()
315
 
316
- output = ""
317
  try:
318
- for text in streamer:
319
- output += text
320
- yield format_response(output)
321
- except Exception as e:
322
- if output:
323
- yield format_response(output)
324
- else:
325
- yield f"**❌ Generation error:** `{e}`"
326
 
327
- thread.join()
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
- if output:
330
- print(f"[GEN] Done β€” {len(output)} chars", flush=True)
331
- yield format_response(output)
332
- else:
333
- yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
334
 
335
 
336
  # ══════════════════════════════════════════════════════════════════════════════
337
- # 6. GRADIO BLOCKS
338
  # ══════════════════════════════════════════════════════════════════════════════
339
- with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
340
  thinking_toggle = gr.Radio(
341
  choices=["⚑ Fast Mode (direct answer)",
342
  "🧠 Thinking Mode (chain-of-thought reasoning)"],
@@ -359,7 +275,7 @@ with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
359
  )
360
 
361
  # ══════════════════════════════════════════════════════════════════════════════
362
- # 7. FASTAPI β€” index.html + OAuth + μœ ν‹Έ API
363
  # ══════════════════════════════════════════════════════════════════════════════
364
  fapp = FastAPI()
365
  SESSIONS: dict[str, dict] = {}
@@ -379,7 +295,6 @@ SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")
379
 
380
  def _sid(req: Request) -> Optional[str]:
381
  return req.cookies.get("mc_session")
382
-
383
  def _user(req: Request) -> Optional[dict]:
384
  sid = _sid(req)
385
  return SESSIONS.get(sid) if sid else None
@@ -419,7 +334,6 @@ async def oauth_callback(code: str = "", error: str = "", state: str = ""):
419
  if uinfo.status_code != 200:
420
  return RedirectResponse("/?auth_error=1")
421
  user = uinfo.json()
422
-
423
  sid = secrets.token_urlsafe(32)
424
  SESSIONS[sid] = {
425
  "logged_in": True,
@@ -442,11 +356,9 @@ async def oauth_logout(request: Request):
442
 
443
  @fapp.get("/health")
444
  async def health():
445
- return {
446
- "status": "ok", "model": MODEL_ID,
447
- "vision": IS_VISION, "dtype": str(model.dtype),
448
- }
449
 
 
450
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
451
 
452
  @fapp.post("/api/search")
@@ -471,6 +383,7 @@ async def api_search(request: Request):
471
  except Exception as e:
472
  return JSONResponse({"error": str(e)}, status_code=500)
473
 
 
474
  @fapp.post("/api/extract-pdf")
475
  async def api_extract_pdf(request: Request):
476
  try:
@@ -494,14 +407,10 @@ async def api_extract_pdf(request: Request):
494
  return JSONResponse({"error": str(e)}, status_code=500)
495
 
496
  # ══════════════════════════════════════════════════════════════════════════════
497
- # 8. MOUNT & LAUNCH
498
- # @spaces.GPUλŠ” λͺ¨λ“ˆ λ‘œλ“œ μ‹œ μžλ™ 감지됨 (generate_reply에 λ°μ½”λ ˆμ΄μ…˜).
499
- # uvicorn.run()으둜 μ„œλ²„λ₯Ό μ‹œμž‘ν•΄μ•Ό ν”„λ‘œμ„ΈμŠ€κ°€ μœ μ§€λ©λ‹ˆλ‹€.
500
  # ══════════════════════════════════════════════════════════════════════════════
501
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
502
 
503
- print("[BOOT] Darwin-35B-A3B-Opus Β· ZeroGPU Direct Serving Β· Ready", flush=True)
504
-
505
  if __name__ == "__main__":
506
- import uvicorn
507
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  """
2
+ 🧬 Darwin-35B-A3B-Opus Q8 GGUF β€” llama-cpp-python Direct Serving
3
+ μ „μš© GPU Β· OpenAI-compatible streaming Β· μ»€μŠ€ν…€ ν”„λ‘ νŠΈμ—”λ“œ
4
  """
5
+ import sys, subprocess
6
  print(f"[BOOT] Python {sys.version}", flush=True)
7
 
8
+ # ── llama-cpp-python CUDA μ„€μΉ˜ 확인 ──
9
+ try:
10
+ from llama_cpp import Llama
11
+ print("[BOOT] llama-cpp-python already installed", flush=True)
12
+ except ImportError:
13
+ print("[BOOT] Installing llama-cpp-python with CUDA...", flush=True)
14
+ subprocess.check_call([
15
+ sys.executable, "-m", "pip", "install",
16
+ "llama-cpp-python", "--no-cache-dir", "--prefer-binary",
17
+ "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu124",
18
+ ])
19
+ from llama_cpp import Llama
20
+ print("[BOOT] llama-cpp-python installed βœ“", flush=True)
21
+
22
  import base64, os, re, json, io
23
  from typing import Generator, Optional
 
24
 
 
 
 
25
  import gradio as gr
26
+ print(f"[BOOT] gradio {gr.__version__}", flush=True)
27
+
28
+ import requests, httpx, uvicorn
 
 
 
 
 
 
 
 
 
29
  from fastapi import FastAPI, Request
30
  from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
31
  from urllib.parse import urlencode
32
  import pathlib, secrets
33
 
 
34
  import urllib3
35
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
36
 
37
  # ══════════════════════════════════════════════════════════════════════════════
38
  # 1. MODEL CONFIG
39
  # ══════════════════════════════════════════════════════════════════════════════
40
+ REPO_ID = "FINAL-Bench/Darwin-35B-A3B-Opus-Q8-GGUF"
41
+ GGUF_FILE = "darwin-35b-a3b-opus-q8_0-00001-of-00003.gguf"
42
+ MODEL_NAME = "Darwin-35B-A3B-Opus-Q8"
43
  MODEL_CAP = {
44
  "arch": "MoE", "active": "3B / 35B total",
45
+ "ctx": "262K", "thinking": True, "vision": False,
46
  "max_tokens": 16384, "temp_max": 1.5,
47
  }
48
 
 
56
  }
57
 
58
  # ══════════════════════════════════════════════════════════════════════════════
59
+ # 2. VRAM 감지 + λͺ¨λΈ λ‘œλ”©
60
  # ══════════════════════════════════════════════════════════════════════════════
61
def detect_gpu_layers() -> int:
    """Pick ``n_gpu_layers`` for llama.cpp based on available VRAM.

    Returns:
        -1 to offload ALL layers on >=40 GB cards (e.g. A100 40GB),
        a partial layer count on smaller GPUs (24 GB -> 28, 16 GB -> 18,
        otherwise 10), and 0 (CPU-only) when CUDA is unavailable or
        detection fails.
    """
    try:
        import torch
        if torch.cuda.is_available():
            # BUGFIX: the attribute is `total_memory` (bytes); `total_mem`
            # does not exist, so the original raised AttributeError here,
            # which the broad except below swallowed — silently forcing
            # CPU-only mode (0 layers) on every CUDA machine.
            vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            print(f"[GPU] {torch.cuda.get_device_name(0)} β€” {vram_gb:.1f} GB VRAM", flush=True)
            if vram_gb >= 40:    # e.g. A100 40GB — offload every layer
                return -1        # -1 = all layers
            elif vram_gb >= 24:  # e.g. A10G 24GB — about 28 layers
                return 28
            elif vram_gb >= 16:  # e.g. T4 16GB — about 18 layers
                return 18
            else:
                return 10
        else:
            print("[GPU] No CUDA device found, CPU-only mode", flush=True)
            return 0
    except Exception as e:
        # Best-effort: any detection failure degrades to CPU-only.
        print(f"[GPU] Detection failed: {e}, using CPU", flush=True)
        return 0

# Environment variables override auto-detection.
N_GPU_LAYERS = int(os.getenv("N_GPU_LAYERS", str(detect_gpu_layers())))
N_CTX = int(os.getenv("N_CTX", "32768"))
 
 
 
 
 
 
 
 
 
85
 
86
+ print(f"[MODEL] Loading {REPO_ID} ...", flush=True)
87
+ print(f"[MODEL] n_gpu_layers={N_GPU_LAYERS}, n_ctx={N_CTX}", flush=True)
88
+
89
+ llm = Llama.from_pretrained(
90
+ repo_id=REPO_ID,
91
+ filename=GGUF_FILE,
92
+ n_gpu_layers=N_GPU_LAYERS,
93
+ n_ctx=N_CTX,
94
+ verbose=True,
95
+ )
96
+ print(f"[MODEL] {MODEL_NAME} loaded βœ“", flush=True)
97
 
98
  # ══════════════════════════════════════════════════════════════════════════════
99
  # 3. THINKING MODE HELPERS
 
164
  return raw
165
 
166
  # ══════════════════════════════════════════════════════════════════════════════
167
+ # 4. GENERATION β€” llama-cpp-python 슀트리밍 (μ΄ˆκ°„λ‹¨)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # ══════════════════════════════════════════════════════════════════════════════
 
 
 
 
 
169
  def generate_reply(
170
  message: str,
171
  history: list,
 
215
  _, clean = parse_think_blocks(at)
216
  messages.append({"role":"assistant","content":clean})
217
 
218
+ # PDF ν…μŠ€νŠΈκ°€ image_input에 λ“€μ–΄μ˜¬ 수 있음 (ν”„λ‘ νŠΈμ—”λ“œ ν˜Έν™˜)
219
+ messages.append({"role": "user", "content": message})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ print(f"[GEN] msgs={len(messages)}, max_new={max_new_tokens}, temp={temperature}", flush=True)
 
222
 
223
+ # ── llama-cpp 슀트리밍 β€” μ‹¬ν”Œ! ──
224
  try:
225
+ stream = llm.create_chat_completion(
226
+ messages=messages,
227
+ max_tokens=max_new_tokens,
228
+ temperature=max(temperature, 0.01) if temperature > 0.01 else 0.0,
229
+ top_p=float(top_p),
230
+ stream=True,
231
+ )
 
232
 
233
+ raw = ""
234
+ for chunk in stream:
235
+ delta = chunk.get("choices", [{}])[0].get("delta", {})
236
+ token = delta.get("content", "")
237
+ if token:
238
+ raw += token
239
+ yield format_response(raw)
240
+
241
+ if raw:
242
+ print(f"[GEN] Done β€” {len(raw)} chars", flush=True)
243
+ yield format_response(raw)
244
+ else:
245
+ yield "**⚠️ λͺ¨λΈμ΄ 빈 응닡을 λ°˜ν™˜ν–ˆμŠ΅λ‹ˆλ‹€.** λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
246
 
247
+ except Exception as e:
248
+ print(f"[GEN] Error: {e}", flush=True)
249
+ yield f"**❌ Generation error:** `{e}`"
 
 
250
 
251
 
252
  # ══════════════════════════════════════════════════════════════════════════════
253
+ # 5. GRADIO BLOCKS
254
  # ══════════════════════════════════════════════════════════════════════════════
255
+ with gr.Blocks(title=MODEL_NAME) as gradio_demo:
256
  thinking_toggle = gr.Radio(
257
  choices=["⚑ Fast Mode (direct answer)",
258
  "🧠 Thinking Mode (chain-of-thought reasoning)"],
 
275
  )
276
 
277
  # ══════════════════════════════════════════════════════════════════════════════
278
+ # 6. FASTAPI β€” index.html + OAuth + μœ ν‹Έ API
279
  # ══════════════════════════════════════════════════════════════════════════════
280
  fapp = FastAPI()
281
  SESSIONS: dict[str, dict] = {}
 
295
 
296
  def _sid(req: Request) -> Optional[str]:
297
  return req.cookies.get("mc_session")
 
298
  def _user(req: Request) -> Optional[dict]:
299
  sid = _sid(req)
300
  return SESSIONS.get(sid) if sid else None
 
334
  if uinfo.status_code != 200:
335
  return RedirectResponse("/?auth_error=1")
336
  user = uinfo.json()
 
337
  sid = secrets.token_urlsafe(32)
338
  SESSIONS[sid] = {
339
  "logged_in": True,
 
356
 
357
  @fapp.get("/health")
358
  async def health():
359
+ return {"status": "ok", "model": MODEL_NAME, "gpu_layers": N_GPU_LAYERS, "ctx": N_CTX}
 
 
 
360
 
361
+ # ── Web Search API (Brave) ──
362
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
363
 
364
  @fapp.post("/api/search")
 
383
  except Exception as e:
384
  return JSONResponse({"error": str(e)}, status_code=500)
385
 
386
+ # ── PDF Text Extraction ──
387
  @fapp.post("/api/extract-pdf")
388
  async def api_extract_pdf(request: Request):
389
  try:
 
407
  return JSONResponse({"error": str(e)}, status_code=500)
408
 
409
  # ══════════════════════════════════════════════════════════════════════════════
410
+ # 7. MOUNT & RUN β€” μ „μš© GPUμ΄λ―€λ‘œ uvicorn.run() 정상 μ‚¬μš©
 
 
411
  # ══════════════════════════════════════════════════════════════════════════════
412
  app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")
413
 
 
 
414
  if __name__ == "__main__":
415
+ print(f"[BOOT] {MODEL_NAME} Β· llama-cpp Β· GPU layers: {N_GPU_LAYERS}", flush=True)
416
  uvicorn.run(app, host="0.0.0.0", port=7860)