"""
Agent Zero – HF Spaces Native Version

Loads your actual ScottzillaSystems model weights directly via transformers.
No TGI endpoints, no LiteLLM proxy, no Docker Compose – works on any HF Space.

Models are loaded on demand and cached. Switch between models via the dropdown.
Uses @spaces.GPU for ZeroGPU compatibility on zero-a10g hardware.
"""
|
|
import gc
import os
from threading import Thread
from typing import Any, Dict, List, Optional

import gradio as gr
import spaces
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
|
|
|
|
|
|
AVAILABLE_MODELS = {
    "cydonia-24b": {
        "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
        "description": "Cydonia 24B – Mistral-based general purpose",
        "tier": "T2",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "qwen3.5-27b": {
        "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
        "description": "Qwen3.5 27B – Claude Opus distilled reasoning",
        "tier": "T3",
        "device_map": "auto",
        "max_new_tokens": 4096,
    },
    "qwen3.5-9b": {
        "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
        "description": "Qwen3.5 9B – Fast general purpose, daily driver",
        "tier": "T1",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "chatgpt5": {
        "repo": "ScottzillaSystems/ChatGPT-5-Chat",
        "description": "ChatGPT-5 494M – Ultra-fast router/classification",
        "tier": "T0",
        "device_map": "auto",
        "max_new_tokens": 1024,
    },
    "fallen-command": {
        "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        "description": "Fallen Command 111B – Flagship reasoning",
        "tier": "T4",
        "device_map": "auto",
        "load_in_8bit": True,
        "max_new_tokens": 4096,
    },
}
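
# Adding a model is just another registry entry. A minimal sketch – the repo
# id below is hypothetical, not an existing ScottzillaSystems repo:
#
#   AVAILABLE_MODELS["my-model"] = {
#       "repo": "ScottzillaSystems/My-Model",  # hypothetical repo id
#       "description": "My Model – one-line blurb shown in the catalog",
#       "tier": "T1",
#       "device_map": "auto",
#       "max_new_tokens": 2048,
#   }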
|
|
DEFAULT_MODEL = "qwen3.5-9b"
|
|
# In-memory caches so a model is only loaded once per Space session.
_model_cache: Dict[str, Any] = {}
_tokenizer_cache: Dict[str, Any] = {}
|
|
|
|
|
|
def load_model(model_key: str):
    """Load model and tokenizer, caching both in memory."""
    if model_key in _model_cache:
        return _model_cache[model_key], _tokenizer_cache[model_key]

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        raise ValueError(f"Unknown model: {model_key}. Available: {list(AVAILABLE_MODELS.keys())}")

    repo_id = config["repo"]
    print(f"[AgentZero] ⏳ Loading {model_key} from {repo_id}...")

    tokenizer = AutoTokenizer.from_pretrained(
        repo_id,
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN"),
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {
        "pretrained_model_name_or_path": repo_id,
        "trust_remote_code": True,
        "token": os.getenv("HF_TOKEN"),
        "torch_dtype": torch.bfloat16,
        "device_map": config.get("device_map", "auto"),
    }
    if config.get("load_in_8bit"):
        # Passing load_in_8bit directly to from_pretrained is deprecated;
        # wrap it in a BitsAndBytesConfig (requires bitsandbytes installed).
        load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)

    _model_cache[model_key] = model
    _tokenizer_cache[model_key] = tokenizer

    print(f"[AgentZero] ✅ {model_key} loaded successfully")
    return model, tokenizer
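
# Minimal usage sketch (assumes HF_TOKEN is set and the chosen weights fit on
# this Space's GPU); load_model blocks until the weights are cached:
#
#   model, tokenizer = load_model(DEFAULT_MODEL)
#   print(model.device, tokenizer.__class__.__name__)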
|
|
|
|
def unload_model(model_key: str):
    """Free GPU memory held by a cached model."""
    if model_key in _model_cache:
        del _model_cache[model_key]
        del _tokenizer_cache[model_key]
        gc.collect()  # drop the last Python references before clearing CUDA caches
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print(f"[AgentZero] 🗑️ Unloaded {model_key}")
        return f"✅ {model_key} unloaded – memory freed"
    return f"ℹ️ {model_key} was not loaded"
|
|
|
|
def get_model_status():
    """Report which models are loaded and rough GPU memory use."""
    loaded = list(_model_cache.keys())
    if not loaded:
        return "No models loaded"
    gpu_gb = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0.0
    return f"Loaded: {', '.join(loaded)} | GPU memory: {gpu_gb:.1f}GB used"
|
|
|
|
|
|
@spaces.GPU(duration=120)
def generate_stream(model_key: str, messages: List[Dict[str, str]], max_new_tokens: Optional[int] = None, temperature: float = 0.7):
    """Stream generated text chunks from the selected model."""
    model, tokenizer = load_model(model_key)
    config = AVAILABLE_MODELS[model_key]

    if max_new_tokens is None:
        max_new_tokens = config.get("max_new_tokens", 2048)

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True,
    )

    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True,
    )

    gen_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks until completion, so run it in a background thread
    # and drain the streamer from this one.
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    for text in streamer:
        yield text

    thread.join()
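
# Minimal sketch of consuming the stream outside the UI (assumes the default
# model loads on the available hardware):
#
#   for piece in generate_stream(DEFAULT_MODEL, [{"role": "user", "content": "Hello"}]):
#       print(piece, end="", flush=True)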
|
|
|
|
|
|
| CSS = """ |
| .agent-zero-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; } |
| .agent-zero-header h1 { color: #e94560; margin: 0; font-size: 2em; } |
| .agent-zero-header p { color: #a0a0b0; margin: 8px 0 0 0; } |
| .model-info { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; margin-bottom: 8px; } |
| .tier-badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; font-weight: bold; margin-left: 6px; } |
| .tier-T0 { background: #00d4aa; color: #000; } |
| .tier-T1 { background: #00a8e8; color: #000; } |
| .tier-T2 { background: #f7b731; color: #000; } |
| .tier-T3 { background: #e94560; color: #fff; } |
| .tier-T4 { background: #9b59b6; color: #fff; } |
| .status-bar { font-size: 0.85em; color: #6c6c8a; padding: 8px; background: #0f0f23; border-radius: 6px; } |
| """ |
|
|
|
|
def create_ui():
    with gr.Blocks(css=CSS, title="Agent Zero – Native") as demo:
        with gr.Column(elem_classes="agent-zero-header"):
            gr.HTML("""
                <h1>🤖 Agent Zero</h1>
                <p>Autonomous multi-model agent – loading YOUR weights directly via transformers</p>
            """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Model")
                model_dropdown = gr.Dropdown(
                    choices=list(AVAILABLE_MODELS.keys()),
                    value=DEFAULT_MODEL,
                    label="Active Model",
                )
                model_info = gr.Markdown("Select a model to see details")

                with gr.Accordion("🧰 Catalog", open=False):
                    catalog_html = "<table style='width:100%'>"
                    for k, v in AVAILABLE_MODELS.items():
                        catalog_html += f"<tr><td><b>{k}</b> <span class='tier-badge tier-{v['tier']}'>{v['tier']}</span></td><td style='font-size:0.9em'>{v['description']}</td></tr>"
                    catalog_html += "</table>"
                    gr.HTML(catalog_html)

                with gr.Accordion("🔧 Settings", open=False):
                    max_tokens_slider = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
                    temperature_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")

                status_bar = gr.Textbox(label="System Status", value="Ready – no models loaded", interactive=False, elem_classes="status-bar")

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Agent Zero", type="messages", height=550)
                with gr.Row():
                    msg_input = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
                    send_btn = gr.Button("Send", scale=1, variant="primary")
                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear")
                    unload_btn = gr.Button("🔄 Unload Model")
                    status_btn = gr.Button("📊 Status")
|
|
|
|
        def update_model_info(model_key):
            config = AVAILABLE_MODELS.get(model_key, {})
            return f"""<div class="model-info">
            <b>{config.get('description', 'Unknown')}</b><br>
            Tier: <span class="tier-badge tier-{config.get('tier', 'T0')}">{config.get('tier', 'T0')}</span> |
            Max tokens: {config.get('max_new_tokens', 'N/A')}<br>
            <code>{config.get('repo', 'N/A')}</code>
            </div>"""

        model_dropdown.change(update_model_info, inputs=model_dropdown, outputs=model_info)
|
|
        # A plain (sync) generator: Gradio runs it in a worker thread, so the
        # blocking token loop below cannot stall the event loop the way it
        # would inside an async generator.
        def chat_fn(message, history, model_key, max_tok, temp):
            if not message.strip():
                yield history, "", ""
                return  # nothing to do on an empty message

            history = history or []
            history.append({"role": "user", "content": message})
            yield history, "", f"⏳ Loading {model_key}..."

            try:
                messages = [{"role": h["role"], "content": h["content"]} for h in history]

                response_text = ""
                for chunk in generate_stream(model_key, messages, max_tok, temp):
                    response_text += chunk
                    if history and history[-1]["role"] == "assistant":
                        history[-1]["content"] = response_text
                    else:
                        history.append({"role": "assistant", "content": response_text})
                    yield history, "", get_model_status()

            except Exception as e:
                error_msg = f"❌ Error: {e}\n\nTry a smaller model or check status."
                history.append({"role": "assistant", "content": error_msg})
                yield history, "", get_model_status()
|
|
        send_btn.click(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )
        msg_input.submit(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )

        clear_btn.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg_input, status_bar])
        # A component may appear only once in outputs, so fold the unload
        # message and the refreshed status into a single status_bar string.
        unload_btn.click(
            lambda m: f"{unload_model(m)} | {get_model_status()}",
            inputs=model_dropdown, outputs=status_bar,
        )
        status_btn.click(get_model_status, outputs=status_bar)
|
|
    return demo
|
|
|
|
if __name__ == "__main__":
    demo = create_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )