Gemma-4-Multi

Running on Zero

App Files Files Community

SeaWolf-AI committed 16 days ago

Commit

c98aa0c

verified ·

1 Parent(s): b8875ff

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -129

app.py CHANGED Viewed

@@ -85,34 +85,23 @@ THINKING_END = "<channel|>"
 _loaded_model_name = None
 _model = None
 _processor = None
-_strip_tokens = []
 def _load_model(model_name: str):
-    """Load or switch model. Unloads previous model first."""
     global _loaded_model_name, _model, _processor, _strip_tokens
     if _loaded_model_name == model_name and _model is not None:
-        return  # Already loaded
     model_cfg = MODELS[model_name]
     model_id = model_cfg["id"]
     print(f"[MODEL] Loading {model_name} ({model_id})...", flush=True)
-    # Unload previous model
-    if _model is not None:
-        del _model
-        _model = None
-        torch.cuda.empty_cache()
-        import gc; gc.collect()
-        print(f"[MODEL] Unloaded previous model", flush=True)
     _processor = AutoProcessor.from_pretrained(model_id)
     _model = AutoModelForMultimodalLM.from_pretrained(
         model_id, device_map="auto", dtype=torch.bfloat16,
     )
-    # Build strip tokens list (keep thinking delimiters)
     _keep = {THINKING_START, THINKING_END}
     _strip_tokens = sorted(
         (t for t in _processor.tokenizer.all_special_tokens if t not in _keep),
@@ -123,7 +112,7 @@ def _load_model(model_name: str):
     print(f"[MODEL] ✓ {model_name} loaded ({model_cfg['arch']}, {model_cfg['active']} active)", flush=True)
-# Load default model at startup
 _load_model(DEFAULT_MODEL)
@@ -137,17 +126,14 @@ def _strip_special_tokens(text: str) -> str:
 # 3.  THINKING MODE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
-    """Parse <|channel>...<channel|> thinking blocks"""
     m = re.search(r"<\|channel\>(.*?)<channel\|>\s*", text, re.DOTALL)
     if m:
         return (m.group(1).strip(), text[m.end():].strip())
-    # Fallback: <think>...</think>
     m = re.search(r"<think>(.*?)</think>\s*", text, re.DOTALL)
     return (m.group(1).strip(), text[m.end():].strip()) if m else ("", text)
 def format_response(raw: str) -> str:
-    """Format response with thinking blocks collapsed"""
     chain, answer = parse_think_blocks(raw)
     if chain:
         return (
@@ -157,7 +143,6 @@ def format_response(raw: str) -> str:
             "</details>\n\n"
             f"{answer}"
         )
-    # Thinking in progress
     if THINKING_START in raw and THINKING_END not in raw:
         think_len = len(raw) - raw.index(THINKING_START) - len(THINKING_START)
         return f"🧠 Reasoning... ({think_len} chars)"
@@ -240,13 +225,8 @@ def generate_reply(
     max_new_tokens: int,
     temperature:    float,
     top_p:          float,
-    model_choice:   str,
 ) -> Generator[str, None, None]:
-    """Main generation function — builds messages, calls GPU inference."""
-    # ── Model switching ──
-    target_model = model_choice if model_choice in MODELS else DEFAULT_MODEL
-    _load_model(target_model)
     use_think = "Thinking" in thinking_mode
     max_new_tokens = min(int(max_new_tokens), 8192)
@@ -256,17 +236,12 @@ def generate_reply(
     if system_prompt.strip():
         messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]})
-    # Process history
     for turn in history:
         if isinstance(turn, dict):
             role = turn.get("role", "")
             raw = turn.get("content") or ""
             if isinstance(raw, list):
-                text_parts = []
-                for p in raw:
-                    if isinstance(p, dict) and p.get("type") == "text":
-                        text_parts.append(p.get("text", ""))
-                text = " ".join(text_parts)
             else:
                 text = str(raw)
             if role == "user":
@@ -275,34 +250,13 @@ def generate_reply(
                 _, clean = parse_think_blocks(text)
                 messages.append({"role": "assistant", "content": [{"type": "text", "text": clean}]})
-    # ── User message with optional vision ──
     user_content: list[dict] = []
-    if image_input:
-        try:
-            import io
-            from PIL import Image as PILImage
-            if isinstance(image_input, str) and image_input.startswith("data:"):
-                _, b64_data = image_input.split(",", 1)
-                img_bytes = base64.b64decode(b64_data)
-            elif isinstance(image_input, str) and os.path.isfile(image_input):
-                with open(image_input, "rb") as f:
-                    img_bytes = f.read()
-            else:
-                buf = io.BytesIO()
-                if not isinstance(image_input, PILImage.Image):
-                    image_input = PILImage.fromarray(image_input)
-                image_input.save(buf, format="JPEG")
-                img_bytes = buf.getvalue()
-            b64 = base64.b64encode(img_bytes).decode()
-            user_content.append({
-                "type": "image",
-                "url": f"data:image/jpeg;base64,{b64}",
-            })
-        except Exception as e:
-            print(f"[VISION] Image processing error: {e}", flush=True)
     user_content.append({"type": "text", "text": message})
     messages.append({"role": "user", "content": user_content})
@@ -340,8 +294,6 @@ def generate_reply(
         yield f"**❌ Generation error:** `{e}`"
-# ══════════════════════════════════════════════════════════════════════════════
 # ══════════════════════════════════════════════════════════════════════════════
 # 6.  GRADIO UI
 # ══════════════════════════════════════════════════════════════════════════════
@@ -351,109 +303,76 @@ footer { display: none !important; }
 .gradio-container { background: #faf8f5 !important; }
 #send-btn { background: linear-gradient(135deg, #6d28d9, #7c3aed) !important; border: none !important; border-radius: 12px !important; color: white !important; font-size: 18px !important; min-width: 48px !important; }
 #chatbot { border: 1.5px solid #e4dfd8 !important; border-radius: 14px !important; background: rgba(255,255,255,.65) !important; }
-.model-info-box { padding: 10px 14px; border-radius: 10px; border: 1.5px solid rgba(109,40,217,.2); background: linear-gradient(135deg, rgba(109,40,217,.04), rgba(16,185,129,.03)); font-size: 12px; line-height: 1.6; }
-.model-info-box b { color: #6d28d9; }
-.model-info-box .stats { font-size: 10px; color: #78716c; margin-top: 4px; }
 """
-# Model info display (updates when dropdown changes)
-def _model_info_html(name):
-    m = MODELS.get(name, MODELS[DEFAULT_MODEL])
-    return (
-        f'<div class="model-info-box">'
-        f'<b>{"⚡" if m["arch"]=="MoE" else "🏆"} {name}</b> '
-        f'<span style="font-size:9px;padding:2px 6px;border-radius:6px;background:rgba(109,40,217,.08);color:#6d28d9;font-weight:700">{m["arch"]}</span><br>'
-        f'<div class="stats">{m["active"]} active / {m["total"]} total · 👁️ Vision · {m["ctx"]} context<br>{m["desc"]}</div>'
-        f'</div>'
-    )
 with gr.Blocks(title="Gemma 4 Playground") as demo:
-    gr.Markdown("## 💎 Gemma 4 Playground\nGoogle DeepMind · Dense 31B or MoE 26B-A4B · Vision · Thinking · Apache 2.0")
     with gr.Row():
-        # ══ Sidebar ══
         with gr.Column(scale=0, min_width=300):
-            gr.Markdown("#### Select Model")
-            model_dd = gr.Dropdown(
-                choices=list(MODELS.keys()), value=DEFAULT_MODEL,
-                label="Model", elem_id="model-dd",
-                info="MoE=Fast inference | Dense=Best quality",
-            )
-            model_info = gr.HTML(value=_model_info_html(DEFAULT_MODEL))
             gr.Markdown("---")
-            gr.Markdown("#### 👁️ Vision")
-            image_input = gr.Image(label="Upload image", type="filepath", height=150)
             gr.Markdown("---")
             gr.Markdown("#### Settings")
-            thinking_radio = gr.Radio(
-                ["⚡ Fast", "🧠 Thinking"], value="⚡ Fast", label="Mode",
-            )
-            sys_prompt = gr.Textbox(
-                value=PRESETS["general"], label="System Prompt", lines=2,
-            )
-            preset_dd = gr.Dropdown(
-                choices=list(PRESETS.keys()), value="general", label="Preset",
-            )
             max_tok = gr.Slider(64, 8192, value=4096, step=64, label="Max Tokens")
             temp = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature")
             topp = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
             clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
-        # ══ Chat ══
         with gr.Column(scale=3):
             chatbot = gr.Chatbot(elem_id="chatbot", show_label=False, height=600)
             with gr.Row():
                 chat_input = gr.Textbox(
-                    placeholder="Message Gemma 4…", show_label=False,
-                    scale=7, autofocus=True, lines=1, max_lines=4,
                 )
                 send_btn = gr.Button("↑", variant="primary", scale=0, min_width=48, elem_id="send-btn")
-    # ── Events: model info update ──
-    model_dd.change(
-        fn=_model_info_html,
-        inputs=[model_dd],
-        outputs=[model_info],
-    )
-    # ── Events: preset → system prompt ──
-    preset_dd.change(
-        fn=lambda k: PRESETS.get(k, PRESETS["general"]),
-        inputs=[preset_dd],
-        outputs=[sys_prompt],
-    )
-    # ── Chat logic ──
     def user_msg(msg, hist):
-        if not msg.strip():
-            return "", hist
         return "", hist + [{"role": "user", "content": msg}]
-    def bot_reply(hist, think, img, sysp, maxt, tmp, tp, model):
-        if not hist or hist[-1]["role"] != "user":
-            return hist
         txt, past = hist[-1]["content"], hist[:-1]
         hist = hist + [{"role": "assistant", "content": ""}]
-        for chunk in generate_reply(txt, past, think, img, sysp, maxt, tmp, tp, model):
             hist[-1]["content"] = chunk
             yield hist
-    ins = [chatbot, thinking_radio, image_input, sys_prompt, max_tok, temp, topp, model_dd]
-    send_btn.click(
-        user_msg, [chat_input, chatbot], [chat_input, chatbot], queue=False
-    ).then(
-        bot_reply, ins, chatbot
-    )
-    chat_input.submit(
-        user_msg, [chat_input, chatbot], [chat_input, chatbot], queue=False
-    ).then(
-        bot_reply, ins, chatbot
-    )
     clear_btn.click(lambda: [], None, chatbot, queue=False)
@@ -461,5 +380,5 @@ with gr.Blocks(title="Gemma 4 Playground") as demo:
 # 7.  LAUNCH
 # ══════════════════════════════════════════════════════════════════════════════
 if __name__ == "__main__":
-    print(f"[BOOT] Gemma 4 Playground · Default: {DEFAULT_MODEL}", flush=True)
-    demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS)

 _loaded_model_name = None
 _model = None
 _processor = None
 def _load_model(model_name: str):
+    """Load model at startup only. ZeroGPU packs tensors once — no runtime switching."""
     global _loaded_model_name, _model, _processor, _strip_tokens
     if _loaded_model_name == model_name and _model is not None:
+        return
     model_cfg = MODELS[model_name]
     model_id = model_cfg["id"]
     print(f"[MODEL] Loading {model_name} ({model_id})...", flush=True)
     _processor = AutoProcessor.from_pretrained(model_id)
     _model = AutoModelForMultimodalLM.from_pretrained(
         model_id, device_map="auto", dtype=torch.bfloat16,
     )
     _keep = {THINKING_START, THINKING_END}
     _strip_tokens = sorted(
         (t for t in _processor.tokenizer.all_special_tokens if t not in _keep),
     print(f"[MODEL] ✓ {model_name} loaded ({model_cfg['arch']}, {model_cfg['active']} active)", flush=True)
+# Load default model at startup (ZeroGPU will pack tensors — cannot switch later)
 _load_model(DEFAULT_MODEL)
 # 3.  THINKING MODE HELPERS
 # ══════════════════════════════════════════════════════════════════════════════
 def parse_think_blocks(text: str) -> tuple[str, str]:
     """Split model output into ``(reasoning, answer)``.

     The ``<|channel>...<channel|>`` delimiters are tried first, then
     ``<think>...</think>``. For the first delimiter pair that matches, the
     stripped block contents are returned together with the stripped text that
     follows the block; text located before the block is not part of the
     answer. When neither pair forms a complete block the result is
     ``("", text)`` with ``text`` returned unmodified.
     """
     for pattern in (
         r"<\|channel\>(.*?)<channel\|>\s*",
         r"<think>(.*?)</think>\s*",
     ):
         found = re.search(pattern, text, re.DOTALL)
         if found is not None:
             reasoning = found.group(1).strip()
             answer = text[found.end():].strip()
             return (reasoning, answer)
     return ("", text)
 def format_response(raw: str) -> str:
     chain, answer = parse_think_blocks(raw)
     if chain:
         return (
             "</details>\n\n"
             f"{answer}"
         )
     if THINKING_START in raw and THINKING_END not in raw:
         think_len = len(raw) - raw.index(THINKING_START) - len(THINKING_START)
         return f"🧠 Reasoning... ({think_len} chars)"
     max_new_tokens: int,
     temperature:    float,
     top_p:          float,
 ) -> Generator[str, None, None]:
+    """Main generation function."""
     use_think = "Thinking" in thinking_mode
     max_new_tokens = min(int(max_new_tokens), 8192)
     if system_prompt.strip():
         messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]})
     for turn in history:
         if isinstance(turn, dict):
             role = turn.get("role", "")
             raw = turn.get("content") or ""
             if isinstance(raw, list):
+                text = " ".join(p.get("text", "") for p in raw if isinstance(p, dict) and p.get("type") == "text")
             else:
                 text = str(raw)
             if role == "user":
                 _, clean = parse_think_blocks(text)
                 messages.append({"role": "assistant", "content": [{"type": "text", "text": clean}]})
+    # ── User message with optional image ──
     user_content: list[dict] = []
+    # IMAGE: pass filepath directly as URL (Gemma 4 processor handles it)
+    if image_input and isinstance(image_input, str) and os.path.isfile(image_input):
+        user_content.append({"type": "image", "url": image_input})
+        print(f"[VISION] Image attached: {image_input}", flush=True)
     user_content.append({"type": "text", "text": message})
     messages.append({"role": "user", "content": user_content})
         yield f"**❌ Generation error:** `{e}`"
 # ══════════════════════════════════════════════════════════════════════════════
 # 6.  GRADIO UI
 # ══════════════════════════════════════════════════════════════════════════════
 .gradio-container { background: #faf8f5 !important; }
 #send-btn { background: linear-gradient(135deg, #6d28d9, #7c3aed) !important; border: none !important; border-radius: 12px !important; color: white !important; font-size: 18px !important; min-width: 48px !important; }
 #chatbot { border: 1.5px solid #e4dfd8 !important; border-radius: 14px !important; background: rgba(255,255,255,.65) !important; }
+.model-box { padding: 10px 14px; border-radius: 10px; border: 1.5px solid rgba(109,40,217,.2); background: linear-gradient(135deg, rgba(109,40,217,.04), rgba(16,185,129,.03)); font-size: 12px; line-height: 1.6; }
+.model-box b { color: #6d28d9; }
+.model-box .st { font-size: 10px; color: #78716c; margin-top: 4px; }
 """
+_mcfg = MODELS[DEFAULT_MODEL]
+MODEL_INFO_HTML = (
+    f'<div class="model-box">'
+    f'<b>{"⚡" if _mcfg["arch"]=="MoE" else "🏆"} {DEFAULT_MODEL}</b> '
+    f'<span style="font-size:9px;padding:2px 6px;border-radius:6px;background:rgba(109,40,217,.08);color:#6d28d9;font-weight:700">{_mcfg["arch"]}</span><br>'
+    f'<div class="st">{_mcfg["active"]} active / {_mcfg["total"]} total · 👁️ Vision · {_mcfg["ctx"]} context</div>'
+    f'<div class="st">{_mcfg["desc"]}</div>'
+    f'<div class="st" style="margin-top:6px">'
+    f'<a href="https://huggingface.co/{MODELS[DEFAULT_MODEL]["id"]}" target="_blank" style="color:#6d28d9;font-weight:700;text-decoration:none">🤗 Model Card ↗</a> · '
+    f'<a href="https://deepmind.google/models/gemma/gemma-4/" target="_blank" style="color:#059669;font-weight:700;text-decoration:none">🔬 DeepMind ↗</a>'
+    f'</div></div>'
+)
 with gr.Blocks(title="Gemma 4 Playground") as demo:
+    gr.Markdown("## 💎 Gemma 4 Playground\nGoogle DeepMind · Apache 2.0 · Vision · Thinking")
     with gr.Row():
+        # ── Sidebar ──
         with gr.Column(scale=0, min_width=300):
+            gr.Markdown("#### Current Model")
+            gr.HTML(MODEL_INFO_HTML)
             gr.Markdown("---")
+            gr.Markdown("#### 👁️ Upload Image")
+            image_input = gr.Image(label=None, type="filepath", height=160)
             gr.Markdown("---")
             gr.Markdown("#### Settings")
+            thinking_radio = gr.Radio(["⚡ Fast", "🧠 Thinking"], value="⚡ Fast", label="Mode")
+            sys_prompt = gr.Textbox(value=PRESETS["general"], label="System Prompt", lines=2)
+            preset_dd = gr.Dropdown(choices=list(PRESETS.keys()), value="general", label="Preset")
             max_tok = gr.Slider(64, 8192, value=4096, step=64, label="Max Tokens")
             temp = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Temperature")
             topp = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
             clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
+        # ── Chat ──
         with gr.Column(scale=3):
             chatbot = gr.Chatbot(elem_id="chatbot", show_label=False, height=600)
             with gr.Row():
                 chat_input = gr.Textbox(
+                    placeholder="Message Gemma 4… (upload image in sidebar for vision)",
+                    show_label=False, scale=7, autofocus=True, lines=1, max_lines=4,
                 )
                 send_btn = gr.Button("↑", variant="primary", scale=0, min_width=48, elem_id="send-btn")
+    # ── Events ──
+    preset_dd.change(fn=lambda k: PRESETS.get(k, PRESETS["general"]), inputs=[preset_dd], outputs=[sys_prompt])
     def user_msg(msg, hist):
+        if not msg.strip(): return "", hist
         return "", hist + [{"role": "user", "content": msg}]
+    def bot_reply(hist, think, img, sysp, maxt, tmp, tp):
+        if not hist or hist[-1]["role"] != "user": return hist
         txt, past = hist[-1]["content"], hist[:-1]
         hist = hist + [{"role": "assistant", "content": ""}]
+        for chunk in generate_reply(txt, past, think, img, sysp, maxt, tmp, tp):
             hist[-1]["content"] = chunk
             yield hist
+    ins = [chatbot, thinking_radio, image_input, sys_prompt, max_tok, temp, topp]
+    send_btn.click(user_msg, [chat_input, chatbot], [chat_input, chatbot], queue=False).then(bot_reply, ins, chatbot)
+    chat_input.submit(user_msg, [chat_input, chatbot], [chat_input, chatbot], queue=False).then(bot_reply, ins, chatbot)
     clear_btn.click(lambda: [], None, chatbot, queue=False)
 # 7.  LAUNCH
 # ══════════════════════════════════════════════════════════════════════════════
 if __name__ == "__main__":
+    print(f"[BOOT] Gemma 4 Playground · Model: {DEFAULT_MODEL}", flush=True)
+    demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS, ssr_mode=False)