"""
Agent Zero — HF Spaces Native Version
Loads your actual ScottzillaSystems model weights directly via transformers.
No TGI endpoints, no LiteLLM proxy, no Docker Compose — works on any HF Space.
"""
|
|
import asyncio
import gc
import json
import os
import re
from pathlib import Path
from threading import Thread
from typing import Any, Dict, List, Optional

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
|
|
|
| |
|
|
# Catalog of loadable checkpoints, keyed by the short name shown in the UI.
# Fields of each entry:
#   repo           - Hub repository id handed to ``from_pretrained``
#   description    - blurb rendered in the catalog table and the model card
#   tier           - badge label; also selects the ``tier-<tier>`` CSS class
#   device_map     - forwarded to ``from_pretrained``
#   max_new_tokens - generation budget used when the caller passes none
#   load_in_8bit   - optional flag requesting 8-bit loading
AVAILABLE_MODELS = {
    "cydonia-24b": {
        "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
        "description": "Cydonia 24B — Mistral-based general purpose",
        "tier": "T2",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "qwen3.5-27b": {
        "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
        "description": "Qwen3.5 27B — Claude Opus distilled reasoning",
        "tier": "T3",
        "device_map": "auto",
        "max_new_tokens": 4096,
    },
    "qwen3.5-9b": {
        "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
        "description": "Qwen3.5 9B — Fast general purpose, daily driver",
        "tier": "T1",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "chatgpt5": {
        "repo": "ScottzillaSystems/ChatGPT-5-Chat",
        "description": "ChatGPT-5 494M — Ultra-fast router/classification",
        "tier": "T0",
        "device_map": "auto",
        "max_new_tokens": 1024,
    },
    "fallen-command": {
        "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        "description": "Fallen Command 111B — Flagship reasoning",
        "tier": "T4",
        "device_map": "auto",
        "load_in_8bit": True,
        "max_new_tokens": 4096,
    },
}

# Key of the model preselected in the dropdown.
DEFAULT_MODEL = "qwen3.5-9b"

# Process-wide caches of loaded objects, keyed by AVAILABLE_MODELS key.
_model_cache: Dict[str, Any] = {}
_tokenizer_cache: Dict[str, Any] = {}
|
|
|
|
| |
|
|
def load_model(model_key: str):
    """Return the ``(model, tokenizer)`` pair for *model_key*, loading on first use.

    Loaded objects are stored in the module-level caches, so repeated calls
    with the same key return the same instances without touching the Hub.

    Args:
        model_key: A key of ``AVAILABLE_MODELS``.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: If *model_key* is not a key of ``AVAILABLE_MODELS``.
    """
    if model_key in _model_cache:
        return _model_cache[model_key], _tokenizer_cache[model_key]

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        raise ValueError(f"Unknown model: {model_key}")

    repo_id = config["repo"]
    hf_token = os.getenv("HF_TOKEN")
    print(f"[AgentZero] Loading {model_key} from {repo_id}...")

    # NOTE(review): trust_remote_code=True lets the repository supply its own
    # Python modelling code; only list repositories you control.
    tokenizer = AutoTokenizer.from_pretrained(
        repo_id, trust_remote_code=True, token=hf_token,
    )
    if tokenizer.pad_token is None:
        # generate_stream passes pad_token_id, so make sure one exists.
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {
        "pretrained_model_name_or_path": repo_id,
        "trust_remote_code": True,
        "token": hf_token,
        "torch_dtype": torch.bfloat16,
        "device_map": config.get("device_map", "auto"),
    }
    if config.get("load_in_8bit"):
        # The bare ``load_in_8bit=True`` keyword of from_pretrained is
        # deprecated; the supported spelling is a BitsAndBytesConfig passed
        # as ``quantization_config``. Imported here because only quantized
        # catalog entries need it.
        from transformers import BitsAndBytesConfig

        load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)

    _model_cache[model_key] = model
    _tokenizer_cache[model_key] = tokenizer

    print(f"[AgentZero] {model_key} loaded")
    return model, tokenizer
|
|
|
|
def unload_model(model_key: str):
    """Drop *model_key* from the caches and release its GPU memory.

    Args:
        model_key: Key under which the model was cached by ``load_model``.

    Returns:
        ``"Unloaded <key>"`` when the model was cached, otherwise
        ``"<key> not loaded"``.
    """
    if model_key not in _model_cache:
        return f"{model_key} not loaded"

    del _model_cache[model_key]
    # pop() rather than del: never fail half-way if the two caches disagree.
    _tokenizer_cache.pop(model_key, None)

    # empty_cache() can only hand back blocks that no tensor references any
    # more, so collect cyclic garbage first to actually free the weights.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return f"Unloaded {model_key}"
|
|
|
|
def get_status():
    """Return a one-line summary of cached models and allocated GPU memory.

    The memory figure is ``torch.cuda.memory_allocated()`` floored to whole
    GiB, or 0 when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        gpu_gb = torch.cuda.memory_allocated() // 1024**3
    else:
        gpu_gb = 0

    names = ", ".join(_model_cache) if _model_cache else "none"
    return f"Loaded: {names} | GPU: {gpu_gb}GB"
|
|
|
|
| |
|
|
@spaces.GPU(duration=120)
def generate_stream(model_key, messages, max_new_tokens=None, temperature=0.7):
    """Stream the model's reply to *messages* as it is generated.

    Args:
        model_key: Key of ``AVAILABLE_MODELS``; the model is loaded on demand.
        messages: Chat history as a list of ``{"role", "content"}`` dicts,
            rendered through the tokenizer's chat template.
        max_new_tokens: Generation budget; ``None`` selects the model's
            catalog default (2048 if the entry has none).
        temperature: Sampling temperature.

    Yields:
        Decoded text fragments, prompt and special tokens excluded.

    Raises:
        ValueError: If *model_key* is unknown (raised by ``load_model``).
        Exception: Any error raised by ``model.generate`` is re-raised here
            once the stream has been drained.
    """
    model, tokenizer = load_model(model_key)
    if max_new_tokens is None:
        max_new_tokens = AVAILABLE_MODELS[model_key].get("max_new_tokens", 2048)

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered template already carries its special tokens; letting the
    # tokenizer add them again would duplicate BOS at the start of the prompt.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, add_special_tokens=False)
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),  # UI sliders may deliver a float
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    failures = []

    def _run_generation():
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # handed to the consumer below
            failures.append(exc)
            # Push the stop signal so the consuming loop cannot block forever
            # on a stream that will never be finished.
            streamer.end()

    worker = Thread(target=_run_generation, daemon=True)
    worker.start()
    for text in streamer:
        yield text
    worker.join()

    if failures:
        raise failures[0]
|
|
|
|
| |
|
|
# Stylesheet handed to gr.Blocks(css=...). One literal per rule; the pieces
# concatenate to a single string that starts and ends with a newline.
CSS = (
    "\n"
    # Page header banner.
    ".az-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; }\n"
    ".az-header h1 { color: #e94560; margin: 0; font-size: 2em; }\n"
    ".az-header p { color: #a0a0b0; margin: 4px 0 0 0; }\n"
    # Card describing the selected model.
    ".model-card { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; }\n"
    # Tier badges, one class per tier label.
    ".tier-T0 { background: #00d4aa; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }\n"
    ".tier-T1 { background: #00a8e8; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }\n"
    ".tier-T2 { background: #f7b731; color: #000; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }\n"
    ".tier-T3 { background: #e94560; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }\n"
    ".tier-T4 { background: #9b59b6; color: #fff; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; }\n"
)
|
|
|
|
def create_ui():
    """Build the Gradio Blocks application and return it (not yet launched).

    Layout: a header banner, a left column (model dropdown, model card,
    catalog, generation settings, status box) and a right column (chat
    window, message box, action buttons).

    Returns:
        The configured ``gr.Blocks`` instance.
    """

    def update_info(k):
        """Render the model card HTML for catalog key *k*."""
        c = AVAILABLE_MODELS.get(k, {})
        tier = c.get("tier", "T0")
        return (
            f"<div class='model-card'><b>{c.get('description', '?')}</b><br>"
            f"<span class='tier-{tier}'>{tier}</span> | "
            f"{c.get('max_new_tokens', '?')} tokens<br>"
            f"<code>{c.get('repo', '?')}</code></div>"
        )

    # A plain (non-async) generator on purpose: the token loop blocks while
    # the model generates, and Gradio runs synchronous handlers in a worker
    # thread, whereas an async handler would stall the event loop.
    def chat_fn(message, history, mk, mtok, tmp):
        """Append the user message and stream the assistant reply.

        Yields ``(history, message_box_value, status_text)`` tuples.
        """
        history = history or []
        if not (message or "").strip():
            # Nothing to send: leave the conversation untouched and stop.
            yield history, "", ""
            return

        history.append({"role": "user", "content": message})
        yield history, "", f"Loading {mk}..."
        try:
            # Forward only role/content to the chat template.
            msgs = [{"role": h["role"], "content": h["content"]} for h in history]
            out = ""
            for chunk in generate_stream(mk, msgs, mtok, tmp):
                out += chunk
                if history and history[-1]["role"] == "assistant":
                    history[-1]["content"] = out
                else:
                    history.append({"role": "assistant", "content": out})
                yield history, "", get_status()
        except Exception as e:  # UI boundary: show the failure in the chat
            history.append({"role": "assistant", "content": f"❌ Error: {e}"})
            yield history, "", get_status()

    def unload_and_report(mk):
        """Unload *mk* and report the outcome together with the new status."""
        return f"{unload_model(mk)} | {get_status()}"

    with gr.Blocks(css=CSS, title="Agent Zero v2") as demo:
        with gr.Column(elem_classes="az-header"):
            gr.HTML("<h1>🤖 Agent Zero v2</h1><p>Loading YOUR model weights — no proxies, no TGI, no lies</p>")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Model")
                model_dd = gr.Dropdown(choices=list(AVAILABLE_MODELS.keys()), value=DEFAULT_MODEL, label="Active Model")
                model_info = gr.Markdown("Select a model")

                with gr.Accordion("Catalog", open=False):
                    rows = "".join(
                        f"<tr><td><b>{k}</b></td><td><span class='tier-{v['tier']}'>{v['tier']}</span></td><td>{v['description']}</td></tr>"
                        for k, v in AVAILABLE_MODELS.items()
                    )
                    gr.HTML(f"<table width='100%'>{rows}</table>")

                with gr.Accordion("Settings", open=False):
                    max_tok = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
                    temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")

                status = gr.Textbox(value="Ready", label="Status", interactive=False)

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(type="messages", height=550, label="Agent Zero v2")
                with gr.Row():
                    msg = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
                    send = gr.Button("Send", scale=1, variant="primary")
                with gr.Row():
                    clear = gr.Button("🗑 Clear")
                    unload = gr.Button("🔌 Unload")
                    statbtn = gr.Button("📊 Status")

        model_dd.change(update_info, model_dd, model_info)

        chat_inputs = [msg, chatbot, model_dd, max_tok, temp]
        chat_outputs = [chatbot, msg, status]
        send.click(chat_fn, chat_inputs, chat_outputs)
        msg.submit(chat_fn, chat_inputs, chat_outputs)
        clear.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg, status])
        # One output only: naming the status box twice made the second value
        # overwrite the unload message.
        unload.click(unload_and_report, model_dd, status)
        statbtn.click(get_status, outputs=status)

    return demo
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every interface.
    demo = create_ui()
    # The listening port is taken from $PORT and falls back to 7860.
    listen_port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, share=False)