Spaces: Running on L4
Create app.py
app.py ADDED
@@ -0,0 +1,559 @@
"""
🧬 Darwin-35B-A3B-Opus · ZeroGPU Direct Serving
transformers + @spaces.GPU · Vision support · Streaming
"""
import sys
print(f"[BOOT] Python {sys.version}", flush=True)

import base64, os, re, json, io
from typing import Generator, Optional
from threading import Thread

# ── Core imports ──────────────────────────────────────────────────────────
import torch
import spaces
import gradio as gr
print(f"[BOOT] gradio {gr.__version__}, torch {torch.__version__}", flush=True)

from transformers import (
    AutoProcessor,
    AutoModelForImageTextToText,
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
from PIL import Image
import requests
import httpx, uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, RedirectResponse, JSONResponse
from urllib.parse import urlencode
import pathlib, secrets

# Suppress SSL warnings
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ──────────────────────────────────────────────────────────────────────────────
# 1. MODEL CONFIG
# ──────────────────────────────────────────────────────────────────────────────
MODEL_ID = "FINAL-Bench/Darwin-35B-A3B-Opus"
MODEL_NAME = "Darwin-35B-A3B-Opus"
MODEL_CAP = {
    "arch": "MoE", "active": "3B / 35B total",
    "ctx": "262K", "thinking": True, "vision": True,
    "max_tokens": 16384, "temp_max": 1.5,
}

PRESETS = {
    "general": "You are Darwin-35B-A3B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
    "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
    "math": "You are a world-class mathematician. Break problems down step by step. Show full working. Use LaTeX where helpful.",
    "creative": "You are a brilliant creative writer. Be imaginative, vivid, and engaging. Adapt tone and style to the request.",
    "translate": "You are a professional translator fluent in 201 languages. Provide accurate, natural-sounding translations with cultural context.",
    "research": "You are a rigorous research analyst. Provide structured, well-reasoned analysis. Identify assumptions and acknowledge uncertainty.",
}

# ──────────────────────────────────────────────────────────────────────────────
# 2. MODEL LOADING (ZeroGPU: CPU at import, GPU at inference)
# ──────────────────────────────────────────────────────────────────────────────
print(f"[MODEL] Loading {MODEL_ID} ...", flush=True)

IS_VISION = True  # whether the model supports vision input
processor = None
tokenizer = None
model = None

try:
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    print("[MODEL] AutoProcessor loaded (vision mode)", flush=True)
except Exception as e:
    print(f"[MODEL] AutoProcessor failed: {e}", flush=True)
    print("[MODEL] Falling back to AutoTokenizer (text-only mode)", flush=True)
    IS_VISION = False
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

try:
    if IS_VISION:
        model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("[MODEL] AutoModelForImageTextToText loaded ✓", flush=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("[MODEL] AutoModelForCausalLM loaded ✓", flush=True)
except Exception as e:
    print(f"[MODEL] bfloat16 load failed: {e}", flush=True)
    print("[MODEL] Retrying with 4-bit quantization...", flush=True)
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    ModelClass = AutoModelForImageTextToText if IS_VISION else AutoModelForCausalLM
    model = ModelClass.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    print("[MODEL] 4-bit quantized model loaded ✓", flush=True)

# Pick the tokenizer (vision processors wrap one; otherwise use the plain tokenizer)
_tok = processor.tokenizer if (processor and hasattr(processor, 'tokenizer')) else (processor or tokenizer)
print(f"[MODEL] Ready · device: {model.device}, dtype: {model.dtype}", flush=True)
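# NOTE: on ZeroGPU hardware there is no CUDA device at import time, so
# device_map="auto" is expected to resolve to CPU here; the weights only reach
# a GPU while a @spaces.GPU-decorated call (_run_generation below) holds an
# allocation.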
# ──────────────────────────────────────────────────────────────────────────────
# 3. THINKING MODE HELPERS (existing logic kept as-is)
# ──────────────────────────────────────────────────────────────────────────────
def parse_think_blocks(text: str) -> tuple[str, str]:
    m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
    return (m.group(1).strip(), text[m.end():].strip()) if m else ("", text)
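# Example: parse_think_blocks("<think>2+2=4</think>The answer is 4.")
# returns ("2+2=4", "The answer is 4."); without a closed <think> block it
# returns ("", text) unchanged.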
def _is_thinking_line(line: str) -> bool:
    l = line.strip()
    if not l:
        return True
    think_starts = [
        "The user", "the user", "This is", "this is", "I should", "I need to",
        "Let me", "let me", "My task", "my task", "I'll ", "I will",
        "Since ", "since ", "Now,", "now,", "So,", "so,", "First,", "first,",
        "Okay", "okay", "Alright", "Hmm", "Wait", "Actually",
        "The question", "the question", "The input", "the input",
        "The request", "the request", "The prompt", "the prompt",
        "Thinking Process", "Thinking process", "**Thinking",
        "Step ", "step ", "Approach:", "Analysis:", "Reasoning:",
        "1. **", "2. **", "3. **", "4. **", "5. **",
    ]
    for s in think_starts:
        if l.startswith(s):
            return True
    if l.startswith(("- ", "* ", "• ")) and any(c.isascii() and c.isalpha() for c in l[:20]):
        # Bullet lines containing no CJK/Hangul characters are treated as reasoning
        if not any(ord(c) > 0x1100 for c in l[:30]):
            return True
    return False

def _split_thinking_answer(raw: str) -> tuple:
    lines = raw.split("\n")
    answer_start = -1
    for i, line in enumerate(lines):
        if not _is_thinking_line(line):
            # CJK/Hangul characters near the start of a line mark the answer
            if any(ord(c) > 0x1100 for c in line.strip()[:10]):
                answer_start = i
                break
            if i > 2 and not _is_thinking_line(line):
                if all(not lines[j].strip() for j in range(max(0, i - 2), i)):
                    answer_start = i
                    break
    if answer_start > 0:
        thinking = "\n".join(lines[:answer_start]).strip()
        answer = "\n".join(lines[answer_start:]).strip()
        return thinking, answer
    return "", raw

def format_response(raw: str) -> str:
    chain, answer = parse_think_blocks(raw)
    if chain:
        return (
            "<details>\n"
            "<summary>🧠 Reasoning Chain · click to expand</summary>\n\n"
            f"{chain}\n\n"
            "</details>\n\n"
            f"{answer}"
        )
    if "<think>" in raw and "</think>" not in raw:
        think_len = len(raw) - raw.index("<think>") - 7  # 7 == len("<think>")
        return f"🧠 Reasoning... ({think_len} chars)"
    first_line = raw.strip().split("\n")[0] if raw.strip() else ""
    if _is_thinking_line(first_line) and len(raw) > 20:
        thinking, answer = _split_thinking_answer(raw)
        if thinking and answer:
            return (
                f"<details>\n"
                f"<summary>🧠 Reasoning Chain ({len(thinking)} chars)</summary>\n\n"
                f"{thinking}\n\n"
                f"</details>\n\n"
                f"{answer}"
            )
        elif thinking and not answer:
            return f"🧠 Reasoning... ({len(raw)} chars)"
    return raw

# ──────────────────────────────────────────────────────────────────────────────
# 4. IMAGE HELPERS
# ──────────────────────────────────────────────────────────────────────────────
def _load_image_from_source(src: str) -> Optional[Image.Image]:
    """base64 data URI or URL → PIL Image"""
    try:
        if src.startswith("data:"):
            _, b64 = src.split(",", 1)
            return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
        elif src.startswith("http"):
            resp = requests.get(src, timeout=15)
            resp.raise_for_status()
            return Image.open(io.BytesIO(resp.content)).convert("RGB")
    except Exception as e:
        print(f"[IMG] Failed to load image: {e}", flush=True)
    return None
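# Accepted sources: a data URI such as "data:image/png;base64,iVBORw0..." or an
# http(s) URL; anything else (e.g. a local file path) falls through to None.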
# ──────────────────────────────────────────────────────────────────────────────
# 5. GENERATION · ZeroGPU + TextIteratorStreamer
# ──────────────────────────────────────────────────────────────────────────────
@spaces.GPU(duration=180)
def _run_generation(input_ids, attention_mask, pixel_values, image_grid_thw,
                    max_new_tokens, temperature, top_p, streamer):
    """The actual generation call, executed once a GPU has been allocated."""
    gen_kwargs = dict(
        input_ids=input_ids.to(model.device),
        attention_mask=attention_mask.to(model.device),
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0.01,
        temperature=max(temperature, 0.01) if temperature > 0.01 else 1.0,
        top_p=top_p,
        streamer=streamer,
        use_cache=True,
    )
    # vision inputs (if present)
    if pixel_values is not None:
        gen_kwargs["pixel_values"] = pixel_values.to(model.device)
    if image_grid_thw is not None:
        gen_kwargs["image_grid_thw"] = image_grid_thw.to(model.device)

    with torch.inference_mode():
        model.generate(**gen_kwargs)
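# Design note: TextIteratorStreamer is a producer/consumer queue. The
# @spaces.GPU call above runs in a worker thread (see generate_reply below),
# pushing decoded tokens into the streamer while the main thread iterates it
# and yields partial text to Gradio.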
def generate_reply(
    message: str,
    history: list,
    thinking_mode: str,
    image_input,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> Generator[str, None, None]:

    max_new_tokens = min(int(max_new_tokens), MODEL_CAP["max_tokens"])
    temperature = min(float(temperature), MODEL_CAP["temp_max"])

    # ── Build the message list ──
    messages: list[dict] = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt.strip()})

    # history (frontend sends a list of [user, assistant] pairs or message dicts)
    for turn in history:
        if isinstance(turn, dict):
            role = turn.get("role", "")
            raw = turn.get("content") or ""
            text = (" ".join(p.get("text", "") for p in raw
                             if isinstance(p, dict) and p.get("type") == "text")
                    if isinstance(raw, list) else str(raw))
            if role == "user":
                messages.append({"role": "user", "content": text})
            elif role == "assistant":
                _, clean = parse_think_blocks(text)
                messages.append({"role": "assistant", "content": clean})
        else:
            try:
                u, a = (turn[0] or None), (turn[1] if len(turn) > 1 else None)
            except (IndexError, TypeError):
                continue
            def _txt(v):
                if v is None: return None
                if isinstance(v, list):
                    return " ".join(p.get("text", "") for p in v
                                    if isinstance(p, dict) and p.get("type") == "text")
                return str(v)
            ut = _txt(u)
            at = _txt(a)
            if ut: messages.append({"role": "user", "content": ut})
            if at:
                _, clean = parse_think_blocks(at)
                messages.append({"role": "assistant", "content": clean})

    # ── Current message (may include an image) ──
    has_image = False
    pil_image = None

    if image_input and isinstance(image_input, str) and image_input.strip():
        pil_image = _load_image_from_source(image_input)
        if pil_image:
            has_image = True

    if IS_VISION and has_image:
        # Vision mode: image + text
        messages.append({
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": message},
            ]
        })
    else:
        messages.append({"role": "user", "content": message})

    # ── Tokenize ──
    try:
        if IS_VISION and processor is not None:
            text_prompt = processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            if has_image and pil_image:
                inputs = processor(
                    text=[text_prompt],
                    images=[pil_image],
                    return_tensors="pt",
                    padding=True,
                )
            else:
                inputs = processor(
                    text=[text_prompt],
                    return_tensors="pt",
                    padding=True,
                )
        else:
            # text-only mode
            text_prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            inputs = tokenizer(text_prompt, return_tensors="pt")
    except Exception as e:
        yield f"**❌ Tokenization error:** `{e}`"
        return

    # ── Create the streamer ──
    decode_tok = _tok
    streamer = TextIteratorStreamer(decode_tok, skip_special_tokens=True, skip_prompt=True)

    # ── Extract tensors ──
    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))
    pixel_values = inputs.get("pixel_values", None)
    image_grid_thw = inputs.get("image_grid_thw", None)

    print(f"[GEN] tokens={input_ids.shape[-1]}, max_new={max_new_tokens}, "
          f"temp={temperature}, vision={has_image}", flush=True)

    # ── Run generation in a background thread ──
    thread = Thread(
        target=_run_generation,
        kwargs=dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=float(top_p),
            streamer=streamer,
        ),
    )
    thread.start()

    output = ""
    try:
        for text in streamer:
            output += text
            yield format_response(output)
    except Exception as e:
        if output:
            yield format_response(output)
        else:
            yield f"**❌ Generation error:** `{e}`"

    thread.join()

    if output:
        print(f"[GEN] Done · {len(output)} chars", flush=True)
        yield format_response(output)
    else:
        yield "**⚠️ The model returned an empty response.** Please try again."
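# Rough client-side sketch (hypothetical host; exact argument names depend on
# the installed gradio_client version):
#   from gradio_client import Client
#   client = Client("https://<space-host>/gradio/")
#   client.predict(message="Hello", api_name="/chat")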
# ──────────────────────────────────────────────────────────────────────────────
# 6. GRADIO BLOCKS
# ──────────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
    thinking_toggle = gr.Radio(
        choices=["⚡ Fast Mode (direct answer)",
                 "🧠 Thinking Mode (chain-of-thought reasoning)"],
        value="⚡ Fast Mode (direct answer)",
        visible=False,
    )
    image_input = gr.Textbox(value="", visible=False)
    system_prompt = gr.Textbox(value=PRESETS["general"], visible=False)
    max_new_tokens = gr.Slider(minimum=64, maximum=16384, value=4096, visible=False)
    temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.6, visible=False)
    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)

    gr.ChatInterface(
        fn=generate_reply,
        api_name="chat",
        additional_inputs=[
            thinking_toggle, image_input,
            system_prompt, max_new_tokens, temperature, top_p,
        ],
    )

# ──────────────────────────────────────────────────────────────────────────────
# 7. FASTAPI · index.html + OAuth + utility APIs
# ──────────────────────────────────────────────────────────────────────────────
fapp = FastAPI()
SESSIONS: dict[str, dict] = {}
HTML = pathlib.Path(__file__).parent / "index.html"

CLIENT_ID = os.getenv("OAUTH_CLIENT_ID", "")
CLIENT_SECRET = os.getenv("OAUTH_CLIENT_SECRET", "")
SPACE_HOST = os.getenv("SPACE_HOST", "localhost:7860")
REDIRECT_URI = f"https://{SPACE_HOST}/login/callback"

print(f"[OAuth] CLIENT_ID set: {bool(CLIENT_ID)}")
print(f"[OAuth] SPACE_HOST: {SPACE_HOST}")
HF_AUTH_URL = "https://huggingface.co/oauth/authorize"
HF_TOKEN_URL = "https://huggingface.co/oauth/token"
HF_USER_URL = "https://huggingface.co/oauth/userinfo"
SCOPES = os.getenv("OAUTH_SCOPES", "openid profile")

def _sid(req: Request) -> Optional[str]:
    return req.cookies.get("mc_session")

def _user(req: Request) -> Optional[dict]:
    sid = _sid(req)
    return SESSIONS.get(sid) if sid else None

@fapp.get("/")
async def root(request: Request):
    html = HTML.read_text(encoding="utf-8") if HTML.exists() else "<h2>index.html missing</h2>"
    return HTMLResponse(html)

@fapp.get("/oauth/user")
async def oauth_user(request: Request):
    u = _user(request)
    return JSONResponse(u) if u else JSONResponse({"logged_in": False}, status_code=401)

@fapp.get("/oauth/login")
async def oauth_login(request: Request):
    if not CLIENT_ID:
        return RedirectResponse("/?oauth_error=not_configured")
    state = secrets.token_urlsafe(16)
    params = {"response_type": "code", "client_id": CLIENT_ID, "redirect_uri": REDIRECT_URI, "scope": SCOPES, "state": state}
    return RedirectResponse(f"{HF_AUTH_URL}?{urlencode(params)}", status_code=302)

@fapp.get("/login/callback")
async def oauth_callback(code: str = "", error: str = "", state: str = ""):
    if error or not code:
        return RedirectResponse("/?auth_error=1")
    basic = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()
    async with httpx.AsyncClient() as client:
        tok = await client.post(HF_TOKEN_URL,
                                data={"grant_type": "authorization_code", "code": code, "redirect_uri": REDIRECT_URI},
                                headers={"Accept": "application/json", "Authorization": f"Basic {basic}"})
        if tok.status_code != 200:
            return RedirectResponse("/?auth_error=1")
        access_token = tok.json().get("access_token", "")
        if not access_token:
            return RedirectResponse("/?auth_error=1")
        uinfo = await client.get(HF_USER_URL, headers={"Authorization": f"Bearer {access_token}"})
        if uinfo.status_code != 200:
            return RedirectResponse("/?auth_error=1")
        user = uinfo.json()

    sid = secrets.token_urlsafe(32)
    SESSIONS[sid] = {
        "logged_in": True,
        "username": user.get("preferred_username", user.get("name", "User")),
        "name": user.get("name", ""),
        "avatar": user.get("picture", ""),
        "profile": f"https://huggingface.co/{user.get('preferred_username', '')}",
    }
    resp = RedirectResponse("/")
    resp.set_cookie("mc_session", sid, httponly=True, samesite="lax", secure=True, max_age=60*60*24*7)
    return resp
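# NOTE: the "state" minted in /oauth/login is received here but never compared
# against a stored copy, so it does not provide CSRF protection as written.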
@fapp.get("/oauth/logout")
async def oauth_logout(request: Request):
    sid = _sid(request)
    if sid and sid in SESSIONS: del SESSIONS[sid]
    resp = RedirectResponse("/")
    resp.delete_cookie("mc_session")
    return resp

@fapp.get("/health")
async def health():
    return {
        "status": "ok",
        "model": MODEL_ID,
        "vision": IS_VISION,
        "device": str(model.device),
        "dtype": str(model.dtype),
    }
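# Quick smoke test: GET /health returns JSON like
#   {"status": "ok", "model": "...", "vision": true, "device": "...", "dtype": "..."}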
# ── Web Search API (Brave) ──
BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")

@fapp.post("/api/search")
async def api_search(request: Request):
    body = await request.json()
    query = body.get("query", "").strip()
    if not query:
        return JSONResponse({"error": "empty query"}, status_code=400)
    key = BRAVE_API_KEY
    if not key:
        return JSONResponse({"error": "BRAVE_API_KEY not set"}, status_code=500)
    try:
        r = requests.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers={"X-Subscription-Token": key, "Accept": "application/json"},
            params={"q": query, "count": 5}, timeout=10,
        )
        r.raise_for_status()
        results = r.json().get("web", {}).get("results", [])
        items = [{"title": item.get("title", ""), "desc": item.get("description", ""), "url": item.get("url", "")}
                 for item in results[:5]]
        return JSONResponse({"results": items})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
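# Illustrative call (hypothetical host):
#   curl -X POST https://<space-host>/api/search \
#        -H "Content-Type: application/json" -d '{"query": "zerogpu"}'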
# ── PDF Text Extraction ──
@fapp.post("/api/extract-pdf")
async def api_extract_pdf(request: Request):
    try:
        body = await request.json()
        b64 = body.get("data", "")
        if "," in b64:
            b64 = b64.split(",", 1)[1]
        pdf_bytes = base64.b64decode(b64)
        text = ""
        try:
            import fitz  # PyMuPDF
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            for page in doc:
                text += page.get_text() + "\n"
        except ImportError:
            # Crude fallback: keep only ASCII, Hangul, and kana characters
            content = pdf_bytes.decode("utf-8", errors="ignore")
            text = re.sub(r'[^\x20-\x7E\n\r\uAC00-\uD7A3\u3040-\u309F\u30A0-\u30FF]', '', content)
        text = text.strip()[:8000]
        return JSONResponse({"text": text, "chars": len(text)})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)

# ── Mount ──
app = gr.mount_gradio_app(fapp, gradio_demo, path="/gradio")

if __name__ == "__main__":
    print("[BOOT] Darwin-35B-A3B-Opus · ZeroGPU Direct Serving", flush=True)
    uvicorn.run(app, host="0.0.0.0", port=7860)