ScottzillaSystems committed on
Commit 430a1df · verified · 1 Parent(s): 69cd95f

Upload app.py

Files changed (1)
app.py +298 -0
app.py ADDED
@@ -0,0 +1,298 @@
#!/usr/bin/env python3
"""
Agent Zero — HF Spaces Native Version
Loads your actual ScottzillaSystems model weights directly via transformers.
No TGI endpoints, no LiteLLM proxy, no Docker Compose — works on any HF Space.

Models are loaded on demand and cached. Switch between models via the dropdown.
Uses @spaces.GPU for ZeroGPU compatibility on zero-a10g hardware.
"""

import os
import re
import json
import asyncio
from pathlib import Path
from typing import List, Dict, Optional, Any
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

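# The imports above imply the Space's Python dependencies. A minimal sketch of
# requirements.txt, assuming current stable releases (version pins are not part
# of this commit):
#
#     gradio
#     spaces
#     torch
#     transformers
#     accelerate      # required for device_map="auto"
#     bitsandbytes    # required for the load_in_8bit path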

# ─── Configuration ───────────────────────────────────────────────────────────

AVAILABLE_MODELS = {
    "cydonia-24b": {
        "repo": "ScottzillaSystems/Cydonia-24B-v4.1",
        "description": "Cydonia 24B — Mistral-based general purpose",
        "tier": "T2",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "qwen3.5-27b": {
        "repo": "ScottzillaSystems/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
        "description": "Qwen3.5 27B — Claude Opus distilled reasoning",
        "tier": "T3",
        "device_map": "auto",
        "max_new_tokens": 4096,
    },
    "qwen3.5-9b": {
        "repo": "ScottzillaSystems/Qwen3.5-9B-Chat",
        "description": "Qwen3.5 9B — Fast general purpose, daily driver",
        "tier": "T1",
        "device_map": "auto",
        "max_new_tokens": 2048,
    },
    "chatgpt5": {
        "repo": "ScottzillaSystems/ChatGPT-5-Chat",
        "description": "ChatGPT-5 494M — Ultra-fast router/classification",
        "tier": "T0",
        "device_map": "auto",
        "max_new_tokens": 1024,
    },
    "fallen-command": {
        "repo": "ScottzillaSystems/Fallen-Command-A-111B-Chat",
        "description": "Fallen Command 111B — Flagship reasoning",
        "tier": "T4",
        "device_map": "auto",
        "load_in_8bit": True,
        "max_new_tokens": 4096,
    },
}
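
# Adding another checkpoint only needs one more entry with the same keys. A
# hypothetical example (this repo id is illustrative, not an actual upload):
#
#     AVAILABLE_MODELS["my-model"] = {
#         "repo": "ScottzillaSystems/My-Model",  # hypothetical repo id
#         "description": "My Model - short description",
#         "tier": "T1",
#         "device_map": "auto",
#         "max_new_tokens": 2048,
#     }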

DEFAULT_MODEL = "qwen3.5-9b"

# Global model cache (persists across requests on paid hardware)
_model_cache: Dict[str, Any] = {}
_tokenizer_cache: Dict[str, Any] = {}


# ─── Model Loading ───────────────────────────────────────────────────────────

def load_model(model_key: str):
    """Load model and tokenizer, caching in memory."""
    if model_key in _model_cache:
        return _model_cache[model_key], _tokenizer_cache[model_key]

    config = AVAILABLE_MODELS.get(model_key)
    if not config:
        raise ValueError(f"Unknown model: {model_key}. Available: {list(AVAILABLE_MODELS.keys())}")

    repo_id = config["repo"]
    print(f"[AgentZero] ⏳ Loading {model_key} from {repo_id}...")

    tokenizer = AutoTokenizer.from_pretrained(
        repo_id,
        trust_remote_code=True,
        token=os.getenv("HF_TOKEN"),
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {
        "pretrained_model_name_or_path": repo_id,
        "trust_remote_code": True,
        "token": os.getenv("HF_TOKEN"),
        "torch_dtype": torch.bfloat16,
        "device_map": config.get("device_map", "auto"),
    }
    if config.get("load_in_8bit"):
        load_kwargs["load_in_8bit"] = True

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)

    _model_cache[model_key] = model
    _tokenizer_cache[model_key] = tokenizer

    print(f"[AgentZero] ✅ {model_key} loaded successfully")
    return model, tokenizer
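
# Note: newer transformers releases prefer an explicit BitsAndBytesConfig over
# passing load_in_8bit directly to from_pretrained as done above. A minimal
# sketch of the equivalent call (assumes bitsandbytes is installed; not wired
# into load_model):
#
#     from transformers import BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         repo_id,
#         quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#         device_map="auto",
#         token=os.getenv("HF_TOKEN"),
#     )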


def unload_model(model_key: str):
    """Free GPU memory."""
    if model_key in _model_cache:
        del _model_cache[model_key]
        del _tokenizer_cache[model_key]
        torch.cuda.empty_cache()
        print(f"[AgentZero] 🔄 Unloaded {model_key}")
        return f"✅ {model_key} unloaded — memory freed"
    return f"ℹ️ {model_key} was not loaded"


def get_model_status():
    """Report which models are loaded."""
    loaded = list(_model_cache.keys())
    if not loaded:
        return "No models loaded"
    return f"Loaded: {', '.join(loaded)} | GPU memory: {torch.cuda.memory_allocated() // 1024**3 if torch.cuda.is_available() else 0}GB used"


# ─── Inference ───────────────────────────────────────────────────────────────

@spaces.GPU(duration=120)
def generate_stream(model_key: str, messages: List[Dict[str, str]], max_new_tokens: Optional[int] = None, temperature: float = 0.7):
    """Stream tokens from the model."""
    model, tokenizer = load_model(model_key)
    config = AVAILABLE_MODELS[model_key]

    if max_new_tokens is None:
        max_new_tokens = config.get("max_new_tokens", 2048)

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True,
    )

    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True,
    )

    gen_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    for text in streamer:
        yield text

    thread.join()
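
# Minimal usage sketch for generate_stream outside the Gradio UI (illustrative
# only, never called by the app); the prompt and token budget are arbitrary:
#
#     pieces = []
#     for piece in generate_stream(
#         DEFAULT_MODEL,
#         [{"role": "user", "content": "Say hello in one sentence."}],
#         max_new_tokens=64,
#     ):
#         pieces.append(piece)
#     print("".join(pieces))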


# ─── Gradio UI ───────────────────────────────────────────────────────────────

CSS = """
.agent-zero-header { text-align: center; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 12px; margin-bottom: 16px; }
.agent-zero-header h1 { color: #e94560; margin: 0; font-size: 2em; }
.agent-zero-header p { color: #a0a0b0; margin: 8px 0 0 0; }
.model-info { background: #0f0f23; padding: 12px; border-radius: 8px; border-left: 4px solid #e94560; margin-bottom: 8px; }
.tier-badge { display: inline-block; padding: 2px 8px; border-radius: 4px; font-size: 0.8em; font-weight: bold; margin-left: 6px; }
.tier-T0 { background: #00d4aa; color: #000; }
.tier-T1 { background: #00a8e8; color: #000; }
.tier-T2 { background: #f7b731; color: #000; }
.tier-T3 { background: #e94560; color: #fff; }
.tier-T4 { background: #9b59b6; color: #fff; }
.status-bar { font-size: 0.85em; color: #6c6c8a; padding: 8px; background: #0f0f23; border-radius: 6px; }
"""


def create_ui():
    with gr.Blocks(css=CSS, title="Agent Zero — Native") as demo:
        with gr.Column(elem_classes="agent-zero-header"):
            gr.HTML("""
                <h1>🤖 Agent Zero</h1>
                <p>Autonomous multi-model agent — loading YOUR weights directly via transformers</p>
            """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Model")
                model_dropdown = gr.Dropdown(
                    choices=list(AVAILABLE_MODELS.keys()),
                    value=DEFAULT_MODEL,
                    label="Active Model",
                )
                model_info = gr.Markdown("Select a model to see details")

                with gr.Accordion("🧠 Catalog", open=False):
                    catalog_html = "<table style='width:100%'>"
                    for k, v in AVAILABLE_MODELS.items():
                        catalog_html += f"<tr><td><b>{k}</b> <span class='tier-badge tier-{v['tier']}'>{v['tier']}</span></td><td style='font-size:0.9em'>{v['description']}</td></tr>"
                    catalog_html += "</table>"
                    gr.HTML(catalog_html)

                with gr.Accordion("🔧 Settings", open=False):
                    max_tokens_slider = gr.Slider(128, 4096, value=2048, step=128, label="Max New Tokens")
                    temperature_slider = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")

                status_bar = gr.Textbox(label="System Status", value="Ready — no models loaded", interactive=False, elem_classes="status-bar")

            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Agent Zero", type="messages", height=550)
                with gr.Row():
                    msg_input = gr.Textbox(placeholder="Ask anything... model loads on first send", show_label=False, scale=8)
                    send_btn = gr.Button("Send", scale=1, variant="primary")
                with gr.Row():
                    clear_btn = gr.Button("🗑 Clear")
                    unload_btn = gr.Button("🔄 Unload Model")
                    status_btn = gr.Button("📊 Status")

        # ─── Callbacks ───

        def update_model_info(model_key):
            config = AVAILABLE_MODELS.get(model_key, {})
            return f"""<div class="model-info">
<b>{config.get('description', 'Unknown')}</b><br>
Tier: <span class="tier-badge tier-{config.get('tier', 'T0')}">{config.get('tier', 'T0')}</span> |
Max tokens: {config.get('max_new_tokens', 'N/A')}<br>
<code>{config.get('repo', 'N/A')}</code>
</div>"""

        model_dropdown.change(update_model_info, inputs=model_dropdown, outputs=model_info)

        async def chat_fn(message, history, model_key, max_tok, temp):
            if not message.strip():
                yield history, "", ""
                return

            history = history or []
            history.append({"role": "user", "content": message})
            yield history, "", f"⏳ Loading {model_key}..."

            try:
                messages = [{"role": h["role"], "content": h["content"]} for h in history]

                response_text = ""
                for chunk in generate_stream(model_key, messages, max_tok, temp):
                    response_text += chunk
                    if history and history[-1]["role"] == "assistant":
                        history[-1]["content"] = response_text
                    else:
                        history.append({"role": "assistant", "content": response_text})
                    yield history, "", get_model_status()

            except Exception as e:
                error_msg = f"❌ Error: {str(e)}\n\nTry a smaller model or check status."
                history.append({"role": "assistant", "content": error_msg})
                yield history, "", get_model_status()

        send_btn.click(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )
        msg_input.submit(
            chat_fn,
            inputs=[msg_input, chatbot, model_dropdown, max_tokens_slider, temperature_slider],
            outputs=[chatbot, msg_input, status_bar],
        )

        clear_btn.click(lambda: ([], "", "Ready"), outputs=[chatbot, msg_input, status_bar])
        unload_btn.click(
            lambda m: f"{unload_model(m)} | {get_model_status()}",
            inputs=model_dropdown, outputs=status_bar,
        )
        status_btn.click(lambda: get_model_status(), outputs=status_bar)

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False,
    )
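
# Streaming handlers in Gradio run through the request queue (enabled by default
# in Gradio 4.x). If streamed replies stall under concurrent load, the queue can
# be tuned before launching - a sketch, assuming Gradio 4.x:
#
#     demo.queue(max_size=32).launch(
#         server_name="0.0.0.0",
#         server_port=int(os.getenv("PORT", "7860")),
#     )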