ScottzillaSystems committed on
Commit 430a1df · verified · 1 Parent(s): 69cd95f

Upload app.py

Files changed (1)
app.py +298 -0
app.py ADDED
@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
Agent Zero — HF Spaces Native Version
Loads your actual ScottzillaSystems model weights directly via transformers.
No TGI endpoints, no LiteLLM proxy, no Docker Compose — works on any HF Space.

Models are loaded on demand and cached. Switch between models via the dropdown.
Uses @spaces.GPU for ZeroGPU compatibility on zero-a10g hardware.
"""

import os
import re
import json
import asyncio
from pathlib import Path
from typing import List, Dict, Optional, Any
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

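# The imports above imply the Space's Python dependencies. A minimal sketch of
# requirements.txt, assuming current stable releases (version pins are not part
# of this commit):
#
#     gradio
#     spaces
#     torch
#     transformers
#     accelerate      # required for device_map="auto"
#     bitsandbytes    # required for the load_in_8bit path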

# ─── Configuration ───────────────────────────────────────────────────────────

AVAILABLE_MODELS = {
    "cydonia-24b": {
        "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
        "description": "Cydonia 24B — Mistral-based general purpose",
        "tier": "T2",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "qwen3.5-27b": {
        "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
        "description": "Qwen3.5 27B — Claude Opus distilled reasoning",
        "tier": "T3",
        "device_map": "auto",
        "max_new_tokens": 4096,
    },
    "qwen3.5-9b": {
        "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
        "description": "Qwen3.5 9B — Fast general purpose, daily driver",
        "tier": "T1",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "chatgpt5": {
        "repo": "ScottzillaSystems/ChatGPT-5-Chat",
        "description": "ChatGPT-5 494M — Ultra-fast router/classification",
        "tier": "T0",
        "device_map": "auto",
        "max_new_tokens": 1024,
    },
    "fallen-command": {
        "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        "description": "Fallen Command 111B — Flagship reasoning",
        "tier": "T4",
        "device_map": "auto",
        "load_in_8bit": True,
        "max_new_tokens": 4096,
    },
}
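
# Adding another checkpoint only needs one more entry with the same keys. A
# hypothetical example (this repo id is illustrative, not an actual upload):
#
#     AVAILABLE_MODELS["my-model"] = {
#         "repo": "ScottzillaSystems/My-Model",  # hypothetical repo id
#         "description": "My Model - short description",
#         "tier": "T1",
#         "device_map": "auto",
#         "max_new_tokens": 2048,
#     }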

DEFAULT_MODEL = "qwen3.5-9b"

# Global model cache (persists across requests on paid hardware)
_model_cache: Dict[str, Any] = {}
_tokenizer_cache: Dict[str, Any] = {}


# ─── Model Loading ───────────────────────────────────────────────────────────

def load_model(model_key: str):
    """Load model and tokenizer, caching in memory."""
    if model_key in _model_cache:
        return _model_cache[model_key], _tokenizer_cache[model_key]

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        raise ValueError(f"Unknown model: {model_key}. Available: {list(AVAILABLE_MODELS.keys())}")

    repo_id = config["repo"]
    print(f"[AgentZero] ⏳ Loading {model_key} from {repo_id}...")

    tokenizer = AutoTokenizer.from_pretrained(
        repo_id,
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN"),
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {
        "pretrained_model_name_or_path": repo_id,
        "trust_remote_code": True,
        "token": os.getenv("HF_TOKEN"),
        "torch_dtype": torch.bfloat16,
        "device_map": config.get("device_map", "auto"),
    }
    if config.get("load_in_8bit"):
        load_kwargs["load_in_8bit"] = True

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)

    _model_cache[model_key] = model
    _tokenizer_cache[model_key] = tokenizer

    print(f"[AgentZero] ✅ {model_key} loaded successfully")
    return model, tokenizer
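
# Note: newer transformers releases prefer an explicit BitsAndBytesConfig over
# passing load_in_8bit directly to from_pretrained as done above. A minimal
# sketch of the equivalent call (assumes bitsandbytes is installed; not wired
# into load_model):
#
#     from transformers import BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         repo_id,
#         quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#         device_map="auto",
#         token=os.getenv("HF_TOKEN"),
#     )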


def unload_model(model_key: str):
    """Free GPU memory."""
    if model_key in _model_cache:
        del _model_cache[model_key]
        del _tokenizer_cache[model_key]
        torch.cuda.empty_cache()
        print(f"[AgentZero] 🔄 Unloaded {model_key}")
        return f"✅ {model_key} unloaded — memory freed"
    return f"ℹ️ {model_key} was not loaded"


def get_model_status():
    """Report which models are loaded."""
    loaded = list(_model_cache.keys())
    if not loaded:
        return "No models loaded"
    return f"Loaded: {', '.join(loaded)} | GPU memory: {torch.cuda.memory_allocated() // 1024**3 if torch.cuda.is_available() else 0}GB used"


# ─── Inference ───────────────────────────────────────────────────────────────

@spaces.GPU(duration=120)
def generate_stream(model_key: str, messages: List[Dict[str, str]], max_new_tokens: Optional[int] = None, temperature: float = 0.7):
    """Stream tokens from the model."""
    model, tokenizer = load_model(model_key)
    config = AVAILABLE_MODELS[model_key]

    if max_new_tokens is None:
        max_new_tokens = config.get("max_new_tokens", 2048)

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True,
    )

    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True,
    )

    gen_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    for text in streamer:
        yield text

    thread.join()
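
# Minimal usage sketch for generate_stream outside the Gradio UI (illustrative
# only, never called by the app); the prompt and token budget are arbitrary:
#
#     pieces = []
#     for piece in generate_stream(
#         DEFAULT_MODEL,
#         [{"role": "user", "content": "Say hello in one sentence."}],
#         max_new_tokens=64,
#     ):
#         pieces.append(piece)
#     print("".join(pieces))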


# ─── Gradio UI ───────────────────────────────────────────────────────────────

CSS = """
.agent-zero-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; }
.agent-zero-header h1 { color: #e94560; margin: 0; font-size: 2em; }
.agent-zero-header p { color: #a0a0b0; margin: 8px 0 0 0; }
.model-info { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; margin-bottom: 8px; }
.tier-badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; font-weight: bold; margin-left: 6px; }
.tier-T0 { background: #00d4aa; color: #000; }
.tier-T1 { background: #00a8e8; color: #000; }
.tier-T2 { background: #f7b731; color: #000; }
.tier-T3 { background: #e94560; color: #fff; }
.tier-T4 { background: #9b59b6; color: #fff; }
.status-bar { font-size: 0.85em; color: #6c6c8a; padding: 8px; background: #0f0f23; border-radius: 6px; }
"""


def create_ui():
    with gr.Blocks(css=CSS, title="Agent Zero — Native") as demo:
        with gr.Column(elem_classes="agent-zero-header"):
            gr.HTML("""
                <h1>🤖 Agent Zero</h1>
                <p>Autonomous multi-model agent — loading YOUR weights directly via transformers</p>
            """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Model")
                model_dropdown = gr.Dropdown(
                    choices=list(AVAILABLE_MODELS.keys()),
                    value=DEFAULT_MODEL,
                    label="Active Model",
                )
                model_info = gr.Markdown("Select a model to see details")

                with gr.Accordion("🧠 Catalog", open=False):
                    catalog_html = "<table style='width:100%'>"
                    for k, v in AVAILABLE_MODELS.items():
                        catalog_html += f"<tr><td><b>{k}</b> <span class='tier-badge tier-{v['tier']}'>{v['tier']}</span></td><td style='font-size:0.9em'>{v['description']}</td></tr>"
                    catalog_html += "</table>"
                    gr.HTML(catalog_html)

                with gr.Accordion("🔧 Settings", open=False):
                    max_tokens_slider = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
                    temperature_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")

                status_bar = gr.Textbox(label="System Status", value="Ready — no models loaded", interactive=False, elem_classes="status-bar")

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Agent Zero", type="messages", height=550)
                with gr.Row():
                    msg_input = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
                    send_btn = gr.Button("Send", scale=1, variant="primary")
                with gr.Row():
                    clear_btn = gr.Button("🗑 Clear")
                    unload_btn = gr.Button("🔄 Unload Model")
                    status_btn = gr.Button("📊 Status")

        # ─── Callbacks ───

        def update_model_info(model_key):
            config = AVAILABLE_MODELS.get(model_key, {})
            return f"""<div class="model-info">
<b>{config.get('description', 'Unknown')}</b><br>
Tier: <span class="tier-badge tier-{config.get('tier', 'T0')}">{config.get('tier', 'T0')}</span> |
Max tokens: {config.get('max_new_tokens', 'N/A')}<br>
<code>{config.get('repo', 'N/A')}</code>
</div>"""

        model_dropdown.change(update_model_info, inputs=model_dropdown, outputs=model_info)

        async def chat_fn(message, history, model_key, max_tok, temp):
            if not message.strip():
                yield history, "", ""
                return

            history = history or []
            history.append({"role": "user", "content": message})
            yield history, "", f"⏳ Loading {model_key}..."

            try:
                messages = [{"role": h["role"], "content": h["content"]} for h in history]

                response_text = ""
                for chunk in generate_stream(model_key, messages, max_tok, temp):
                    response_text += chunk
                    if history and history[-1]["role"] == "assistant":
                        history[-1]["content"] = response_text
                    else:
                        history.append({"role": "assistant", "content": response_text})
                    yield history, "", get_model_status()

            except Exception as e:
                error_msg = f"❌ Error: {str(e)}\n\nTry a smaller model or check status."
                history.append({"role": "assistant", "content": error_msg})
                yield history, "", get_model_status()

        send_btn.click(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )
        msg_input.submit(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )

        clear_btn.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg_input, status_bar])
        unload_btn.click(
            lambda m: f"{unload_model(m)} | {get_model_status()}",
            inputs=model_dropdown, outputs=status_bar,
        )
        status_btn.click(lambda: get_model_status(), outputs=status_bar)

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )
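
# Streaming handlers in Gradio run through the request queue (enabled by default
# in Gradio 4.x). If streamed replies stall under concurrent load, the queue can
# be tuned before launching - a sketch, assuming Gradio 4.x:
#
#     demo.queue(max_size=32).launch(
#         server_name="0.0.0.0",
#         server_port=int(os.getenv("PORT", "7860")),
#     )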