SeaWolf-AI committed on
Commit
5603963
·
verified ·
1 Parent(s): 09cae0a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -41
app.py CHANGED
@@ -32,13 +32,32 @@ except ImportError as e:
32
  SGLANG_BASE = os.getenv("DARWIN_API", "http://localhost:7947")
33
  SGLANG_URL = f"{SGLANG_BASE}/v1/chat/completions"
34
 
35
- MODEL_NAME = "Darwin-35B-A3B-Opus"
36
- MODEL_CAP = {
37
- "arch": "MoE", "active": "3B / 35B total",
38
- "ctx": "262K", "thinking": True, "vision": True,
39
- "max_tokens": 16384, "temp_max": 1.5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }
41
 
 
 
 
 
42
  PRESETS = {
43
  "general": "You are Darwin-35B-A3B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
44
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
@@ -155,11 +174,14 @@ def generate_reply(
155
  max_new_tokens: int,
156
  temperature: float,
157
  top_p: float,
 
158
  ) -> Generator[str, None, None]:
159
 
 
 
160
  use_think = "Thinking" in thinking_mode
161
- max_new_tokens = min(int(max_new_tokens), MODEL_CAP["max_tokens"])
162
- temperature = min(float(temperature), MODEL_CAP["temp_max"])
163
 
164
  messages: list[dict] = []
165
  if system_prompt.strip():
@@ -196,7 +218,7 @@ def generate_reply(
196
  user_text = build_user_message(message, use_think)
197
 
198
  # Vision: image input handling
199
- if image_input and MODEL_CAP["vision"]:
200
  import io
201
  from PIL import Image as PILImage
202
 
@@ -218,41 +240,75 @@ def generate_reply(
218
  content = user_text
219
  messages.append({"role":"user","content":content})
220
 
221
- # Stream from SGLang
222
- try:
223
- resp = requests.post(SGLANG_URL, json={
224
- "model": "FINAL-Bench/Darwin-35B-A3B-Opus",
225
- "messages": messages,
226
- "max_tokens": max_new_tokens,
227
- "temperature": temperature,
228
- "top_p": float(top_p),
229
- "stream": True,
230
- }, stream=True, timeout=600, verify=False)
231
-
232
- raw = ""
233
- for line in resp.iter_lines(decode_unicode=True):
234
- if not line or not line.startswith("data: "):
235
- continue
236
- payload = line[6:]
237
- if payload.strip() == "[DONE]":
238
- break
239
- try:
240
- chunk = json.loads(payload)
241
- delta = chunk.get("choices", [{}])[0].get("delta", {})
242
- token = delta.get("content", "")
243
- if token:
244
- raw += token
245
- yield format_response(raw)
246
- except (json.JSONDecodeError, IndexError, KeyError):
247
- continue
248
 
249
- if raw:
250
- yield format_response(raw)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- except requests.exceptions.ConnectionError:
253
- yield "**❌ SGLang 서버 연결 실패.** `localhost:7947`에 서버가 실행 중인지 확인하세요."
254
- except Exception as exc:
255
- yield f"**Error:** `{exc}`"
 
 
 
 
 
 
 
 
256
 
257
 
258
  # ══════════════════════════════════════════════════════════════════════════════
@@ -270,6 +326,7 @@ with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
270
  max_new_tokens = gr.Slider(minimum=64, maximum=16384, value=4096, visible=False)
271
  temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.6, visible=False)
272
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)
 
273
 
274
  gr.ChatInterface(
275
  fn=generate_reply,
@@ -277,6 +334,7 @@ with gr.Blocks(title="Darwin-35B-A3B-Opus") as gradio_demo:
277
  additional_inputs=[
278
  thinking_toggle, image_input,
279
  system_prompt, max_new_tokens, temperature, top_p,
 
280
  ],
281
  )
282
 
@@ -374,6 +432,14 @@ async def health():
374
  except:
375
  return {"status":"ok","sglang":"disconnected"}
376
 
 
 
 
 
 
 
 
 
377
  # ── Web Search API (Brave) ──────────────────────────────────────────────
378
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
379
 
 
32
  SGLANG_BASE = os.getenv("DARWIN_API", "http://localhost:7947")
33
  SGLANG_URL = f"{SGLANG_BASE}/v1/chat/completions"
34
 
35
+ # Multi-model config
36
+ MODELS = {
37
+ "Darwin-35B-A3B-Opus": {
38
+ "id": "FINAL-Bench/Darwin-35B-A3B-Opus",
39
+ "api": os.getenv("DARWIN_API", "http://localhost:7947"),
40
+ "arch": "MoE", "active": "3B / 35B total",
41
+ "ctx": "262K", "thinking": True, "vision": True,
42
+ "max_tokens": 16384, "temp_max": 1.5,
43
+ "desc": "Original BF16 · SGLang",
44
+ "badge": "BF16",
45
+ },
46
+ "Darwin-35B-A3B-Opus-Q8-GGUF": {
47
+ "id": "FINAL-Bench/Darwin-35B-A3B-Opus-Q8-GGUF",
48
+ "api": os.getenv("DARWIN_GGUF_API", "http://localhost:8080"),
49
+ "arch": "MoE", "active": "3B / 35B total",
50
+ "ctx": "262K", "thinking": True, "vision": False,
51
+ "max_tokens": 16384, "temp_max": 1.5,
52
+ "desc": "Q8_0 GGUF · llama.cpp",
53
+ "badge": "Q8 GGUF",
54
+ },
55
  }
56
 
57
+ DEFAULT_MODEL = "Darwin-35B-A3B-Opus"
58
+ MODEL_NAME = DEFAULT_MODEL
59
+ MODEL_CAP = MODELS[DEFAULT_MODEL]
60
+
61
  PRESETS = {
62
  "general": "You are Darwin-35B-A3B-Opus, a highly capable reasoning model created by VIDRAFT via evolutionary merge. Think step by step for complex questions.",
63
  "code": "You are an expert software engineer. Write clean, efficient, well-commented code. Explain your approach before writing. Use modern best practices.",
 
174
  max_new_tokens: int,
175
  temperature: float,
176
  top_p: float,
177
+ model_name: str = "Darwin-35B-A3B-Opus",
178
  ) -> Generator[str, None, None]:
179
 
180
+ mcfg = MODELS.get(model_name, MODELS[DEFAULT_MODEL])
181
+ api_url = f"{mcfg['api']}/v1/chat/completions"
182
  use_think = "Thinking" in thinking_mode
183
+ max_new_tokens = min(int(max_new_tokens), mcfg["max_tokens"])
184
+ temperature = min(float(temperature), mcfg["temp_max"])
185
 
186
  messages: list[dict] = []
187
  if system_prompt.strip():
 
218
  user_text = build_user_message(message, use_think)
219
 
220
  # Vision: image input handling
221
+ if image_input and mcfg["vision"]:
222
  import io
223
  from PIL import Image as PILImage
224
 
 
240
  content = user_text
241
  messages.append({"role":"user","content":content})
242
 
243
+ # Stream from API (with fallback)
244
+ H100_API = os.getenv("H100_API", "")
245
+ api_urls = [api_url]
246
+ if H100_API:
247
+ api_urls.append(f"{H100_API.rstrip('/')}/v1/chat/completions")
248
+
249
+ request_body = {
250
+ "model": mcfg["id"],
251
+ "messages": messages,
252
+ "max_tokens": max_new_tokens,
253
+ "temperature": temperature,
254
+ "top_p": float(top_p),
255
+ "stream": True,
256
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
+ for i, url in enumerate(api_urls):
259
+ try:
260
+ label = "Primary" if i == 0 else "Fallback(H100)"
261
+ print(f"[API] {label}: {url}", flush=True)
262
+
263
+ resp = requests.post(url, json=request_body,
264
+ stream=True, timeout=15, verify=False)
265
+
266
+ if resp.status_code != 200:
267
+ raise requests.exceptions.ConnectionError(f"HTTP {resp.status_code}")
268
+
269
+ raw = ""
270
+ got_token = False
271
+ for line in resp.iter_lines(decode_unicode=True):
272
+ if not line or not line.startswith("data: "):
273
+ continue
274
+ payload = line[6:]
275
+ if payload.strip() == "[DONE]":
276
+ break
277
+ try:
278
+ chunk = json.loads(payload)
279
+ delta = chunk.get("choices", [{}])[0].get("delta", {})
280
+ token = delta.get("content", "")
281
+ if token:
282
+ raw += token
283
+ got_token = True
284
+ yield format_response(raw)
285
+ except (json.JSONDecodeError, IndexError, KeyError):
286
+ continue
287
+
288
+ if raw:
289
+ yield format_response(raw)
290
+
291
+ if got_token:
292
+ print(f"[API] {label} OK — {len(raw)} chars", flush=True)
293
+ return # 성공 시 종료
294
+
295
+ # 토큰 0개면 다음 API로
296
+ if not got_token and i < len(api_urls) - 1:
297
+ print(f"[API] {label} returned no tokens, trying fallback...", flush=True)
298
+ continue
299
 
300
+ except (requests.exceptions.ConnectionError,
301
+ requests.exceptions.Timeout,
302
+ requests.exceptions.ReadTimeout) as e:
303
+ print(f"[API] {label} failed: {e}", flush=True)
304
+ if i < len(api_urls) - 1:
305
+ print(f"[API] Switching to fallback...", flush=True)
306
+ continue
307
+ else:
308
+ yield f"**❌ 모든 API 연결 실패.**\n\n- Primary: `{api_urls[0]}`\n- Fallback: `{url}`"
309
+ except Exception as exc:
310
+ yield f"**Error:** `{exc}`"
311
+ return
312
 
313
 
314
  # ══════════════════════════════════════════════════════════════════════════════
 
326
  max_new_tokens = gr.Slider(minimum=64, maximum=16384, value=4096, visible=False)
327
  temperature = gr.Slider(minimum=0.0, maximum=1.5, value=0.6, visible=False)
328
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, visible=False)
329
+ model_selector = gr.Textbox(value=DEFAULT_MODEL, visible=False)
330
 
331
  gr.ChatInterface(
332
  fn=generate_reply,
 
334
  additional_inputs=[
335
  thinking_toggle, image_input,
336
  system_prompt, max_new_tokens, temperature, top_p,
337
+ model_selector,
338
  ],
339
  )
340
 
 
432
  except:
433
  return {"status":"ok","sglang":"disconnected"}
434
 
435
+ @fapp.get("/api/models")
436
+ async def api_models():
437
+ return JSONResponse({
438
+ "models": {k: {"desc": v["desc"], "badge": v["badge"], "vision": v["vision"], "ctx": v["ctx"]}
439
+ for k, v in MODELS.items()},
440
+ "default": DEFAULT_MODEL,
441
+ })
442
+
443
  # ── Web Search API (Brave) ──────────────────────────────────────────────
444
  BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")
445