ScottzillaSystems committed on
Commit
3c03ca6
·
verified ·
1 Parent(s): 07846a9

Upload fleet_manager.py

Browse files
Files changed (1) hide show
  1. fleet_manager.py +369 -0
fleet_manager.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Agent Zero Fleet Manager
4
+ Manages 4 autonomous Agent Zero instances with workspace isolation,
5
+ self-healing, config persistence, inter-agent delegation, and persistent storage.
6
+
7
+ Usage:
8
+ python fleet_manager.py --check # Check all 4 agents
9
+ python fleet_manager.py --provision-all # Provision entire fleet
10
+ python fleet_manager.py --set-hw agent-zero-main zero-a10g
11
+ python fleet_manager.py --enable-storage agent-zero-main
12
+ python fleet_manager.py --deploy-prompts agent-zero-main
13
+ python fleet_manager.py --setup-pipeline # Configure inter-agent task queue
14
+ """
15
+
16
import argparse
import json
import os
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional

from huggingface_hub import HfApi, SpaceHardware
24
+
25
+
26
# ─── Fleet Configuration ─────────────────────────────────────────────────────

# Static registry of the four managed Agent Zero Spaces, keyed by agent name.
# Each entry records:
#   repo               -- Hub repo id of the Space
#   role               -- "sovereign" (orchestrator) or "specialist"
#   description        -- human-readable summary of the agent's job
#   models             -- model catalog the agent serves
#   hardware_target    -- hardware tier the Space should run on
#   persistent_storage -- whether the Space needs a persistent /data volume
FLEET = {
    "agent-zero-main": {
        "repo": "ScottzillaSystems/agentzerov2",
        "role": "sovereign",  # the only non-specialist: receives and delegates tasks
        "description": "Main orchestrator — receives all tasks, delegates to specialists",
        "models": ["qwen3.5-27b", "cydonia-24b", "fallen-command"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
    "agent-zero-pentesting": {
        "repo": "ScottzillaSystems/agent-zero-pentesting",
        "role": "specialist",
        "description": "Security specialist — writes findings to shared task queue",
        "models": ["qwen3.5-9b"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
    "agent-zero-finops": {
        "repo": "ScottzillaSystems/agent-zero-finops",
        "role": "specialist",
        "description": "Cost optimization specialist — monitors billing, suggests cuts",
        "models": ["chatgpt5"],
        # CPU-only: this agent does no local inference-heavy work
        "hardware_target": "cpu-basic",
        "persistent_storage": False,
    },
    "agent-zero-creative": {
        "repo": "ScottzillaSystems/agent-zero-adult-entertainment",
        "role": "specialist",
        "description": "Creative specialist — content generation, image workflows",
        "models": ["qwen3.5-9b"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
}
62
+
63
+
64
# ─── Prompts ─────────────────────────────────────────────────────────────────

# Startup health-check checklist, uploaded to each Space as
# prompts/self_diagnostic.md by deploy_prompts(). The text is consumed by the
# remote agent (not by this script), so it is kept verbatim.
SELF_DIAGNOSTIC_PROMPT = """# Agent Zero Self-Diagnostic Protocol

Run this diagnostic immediately on startup and report results:

## 1. System Health
- [ ] GPU available: `torch.cuda.is_available()` → report True/False
- [ ] GPU memory: `torch.cuda.get_device_properties(0).total_memory / 1e9` → report GB
- [ ] Current model loaded: report model_key
- [ ] Model cache status: list all keys in _model_cache

## 2. Model Loading Test
For each model in your catalog:
1. Call `load_model(model_key)`
2. Report: success/failure, load time, GPU memory after load
3. Call `unload_model(model_key)`
4. Report: memory freed

## 3. Inference Test
Send: "What is 2+2?"
Expected: Response contains "4"
Report: latency_ms, tokens_generated, tokens/sec

## 4. File System Test
- [ ] Write test file to `/workspace/projects/test.txt`
- [ ] Read it back
- [ ] Report: persistent storage working (survives restart?)

## 5. Network Test
- [ ] Can reach huggingface.co?
- [ ] Can reach your model repos?

## 6. Inter-Agent Test (if applicable)
- [ ] Can write to `/workspace/shared/task_queue/`?
- [ ] Can read from `/workspace/shared/task_queue/`?

Report all results in a single JSON block.
"""
103
+
104
# Operating instructions for unattended mode, uploaded to each Space as
# prompts/autonomous_loop.md by deploy_prompts(). Defines the poll/execute/
# archive cycle over the shared task-queue directories that
# setup_inter_agent_pipeline() provisions. Consumed by the remote agent, so
# the text is kept verbatim.
AUTONOMOUS_LOOP_PROMPT = """# Agent Zero Autonomous Operation Loop

You are now running autonomously. Follow this loop indefinitely:

## Loop Cycle (every 60 seconds)

1. **SCAN** `/workspace/shared/task_queue/` for new task files
2. **READ** each task file (JSON format)
3. **EXECUTE** the task using your loaded model
4. **WRITE** results to `/workspace/shared/results/{task_id}.json`
5. **ARCHIVE** completed task to `/workspace/shared/completed/`
6. **SELF-HEAL** if model fails:
   - Unload current model
   - Try fallback model from catalog
   - If all fail, pause and alert
7. **REPORT** status to stdout every cycle

## Task File Format
```json
{
  "task_id": "uuid",
  "source_agent": "agent-zero-pentesting",
  "priority": 1-5,
  "task_type": "analysis|generation|review|action",
  "prompt": "...",
  "context": {},
  "deadline": "ISO8601",
  "created_at": "ISO8601"
}
```

## Result File Format
```json
{
  "task_id": "uuid",
  "completed_by": "agent-zero-main",
  "completed_at": "ISO8601",
  "status": "success|failure|partial",
  "output": "...",
  "model_used": "cydonia-24b",
  "tokens_used": 1234,
  "latency_ms": 5600
}
```

## Self-Healing Rules
- If OOM: unload all models, wait 10s, reload smallest model
- If model download fails: retry 3x with exponential backoff
- If GPU not available: switch to CPU mode (slower but functional)
- If task queue full (>100): process highest priority first

Begin autonomous operation now.
"""
157
+
158
+
159
# ─── Fleet Manager ───────────────────────────────────────────────────────────

class AgentZeroFleetManager:
    """Provisions, monitors, and heals the Agent Zero Spaces listed in FLEET.

    All Hub operations go through one authenticated HfApi client; the token
    comes from the constructor or the HF_TOKEN environment variable.
    """

    def __init__(self, token: Optional[str] = None):
        # Fall back to HF_TOKEN so the CLI can run unattended (cron, CI).
        self.api = HfApi(token=token or os.getenv("HF_TOKEN"))
        # In-memory history of every check_fleet() report from this session.
        self.status_log: List[Dict] = []

    def _require_config(self, agent_name: str) -> Dict:
        """Return the FLEET entry for agent_name, or raise ValueError."""
        config = FLEET.get(agent_name)
        if not config:
            raise ValueError(f"Unknown agent: {agent_name}")
        return config

    def check_fleet(self) -> Dict:
        """Check status of all 4 agent instances.

        Returns a report dict:
          checked_at -- timezone-aware UTC ISO8601 timestamp of the check
          agents     -- one status dict per FLEET entry (always full coverage)
          issues     -- human-readable strings for anything unhealthy
        """
        report = {
            # datetime.utcnow() is deprecated (3.12+); use aware UTC instead.
            "checked_at": datetime.now(timezone.utc).isoformat(),
            "agents": [],
            "issues": [],
        }

        for name, config in FLEET.items():
            repo_id = config["repo"]
            try:
                runtime = self.api.get_space_runtime(repo_id)
                agent_report = {
                    "name": name,
                    "repo": repo_id,
                    "stage": runtime.stage,
                    "hardware": runtime.hardware,
                    "requested_hardware": runtime.requested_hardware,
                    "role": config["role"],
                    "healthy": runtime.stage == "RUNNING",
                }
                report["agents"].append(agent_report)

                if runtime.stage != "RUNNING":
                    report["issues"].append(f"{name}: {runtime.stage} (expected RUNNING)")
                if runtime.hardware != config["hardware_target"]:
                    report["issues"].append(
                        f"{name}: hardware={runtime.hardware} (target={config['hardware_target']})"
                    )

            except Exception as e:
                # A failed lookup still yields an agent entry, so the report
                # always covers the whole fleet.
                report["agents"].append({
                    "name": name,
                    "repo": repo_id,
                    "stage": "ERROR",
                    "error": str(e),
                })
                report["issues"].append(f"{name}: API error - {e}")

        self.status_log.append(report)
        return report

    def set_hardware(self, agent_name: str, hardware: str):
        """Request a hardware tier (e.g. 'zero-a10g') for an agent Space.

        Raises ValueError for an unknown agent or unrecognized hardware name.
        """
        config = self._require_config(agent_name)
        repo_id = config["repo"]

        # Map 'zero-a10g' -> SpaceHardware.ZERO_A10G; surface an unrecognized
        # tier as a clear ValueError rather than a bare AttributeError.
        enum_name = hardware.upper().replace("-", "_")
        try:
            hardware_enum = getattr(SpaceHardware, enum_name)
        except AttributeError:
            raise ValueError(f"Unknown hardware tier: {hardware}") from None

        self.api.request_space_hardware(repo_id, hardware=hardware_enum)
        print(f"[Fleet] Set {agent_name} -> {hardware}")

    def enable_persistent_storage(self, agent_name: str):
        """Enable the persistent /data volume on an agent Space.

        Best-effort: HTTP failures are reported on stdout rather than raised,
        so --provision-all can continue with the remaining agents.
        """
        config = self._require_config(agent_name)
        repo_id = config["repo"]
        # Local import: `requests` is only needed for this one raw endpoint,
        # which huggingface_hub does not wrap.
        import requests
        resp = requests.put(
            f"https://huggingface.co/api/spaces/{repo_id}/volumes",
            headers={"Authorization": f"Bearer {self.api.token}"},
            json={"data": True},
            timeout=30,  # don't let a stuck request hang provisioning forever
        )
        if resp.status_code == 200:
            print(f"[Fleet] Persistent storage enabled for {agent_name}")
        else:
            print(f"[Fleet] Failed to enable storage for {agent_name}: {resp.status_code}")

    def restart_agent(self, agent_name: str):
        """Restart an agent Space (triggers a rebuild on the Hub)."""
        config = self._require_config(agent_name)
        self.api.restart_space(config["repo"])
        print(f"[Fleet] Restarted {agent_name}")

    def print_report(self, report: Dict):
        """Print a formatted fleet status report (as produced by check_fleet)."""
        print("\n" + "=" * 70)
        print(f"🤖 AGENT ZERO FLEET REPORT — {report['checked_at']}")
        print("=" * 70)

        for agent in report["agents"]:
            emoji = "🟢" if agent.get("healthy") else "🔴"
            print(f"\n{emoji} {agent['name']} ({agent['role']})")
            print(f"   Repo: {agent['repo']}")
            print(f"   Stage: {agent.get('stage', 'unknown')}")
            print(f"   Hardware: {agent.get('hardware', 'none')}")
            if "error" in agent:
                print(f"   ❌ Error: {agent['error']}")

        if report["issues"]:
            print(f"\n⚠️ ISSUES ({len(report['issues'])}):")
            for issue in report["issues"]:
                print(f"   - {issue}")
        else:
            print("\n✅ All agents healthy")

        print("=" * 70 + "\n")

    def deploy_prompts(self, agent_name: str):
        """Upload the diagnostic and autonomous-loop prompts to an agent Space."""
        config = self._require_config(agent_name)
        repo_id = config["repo"]

        self.api.upload_file(
            path_or_fileobj=SELF_DIAGNOSTIC_PROMPT.encode(),
            path_in_repo="prompts/self_diagnostic.md",
            repo_id=repo_id,
            repo_type="space",
        )

        self.api.upload_file(
            path_or_fileobj=AUTONOMOUS_LOOP_PROMPT.encode(),
            path_in_repo="prompts/autonomous_loop.md",
            repo_id=repo_id,
            repo_type="space",
        )

        print(f"[Fleet] Prompts deployed to {agent_name}")

    def setup_inter_agent_pipeline(self):
        """Upload the shared task-queue bootstrap script to every Space.

        The script creates the /workspace/shared directories that the
        autonomous loop reads and writes.
        """
        # NOTE(review): chmod 777 is world-writable — acceptable only inside
        # an isolated Space container; revisit if these paths are ever shared.
        shared_setup = """#!/bin/bash
mkdir -p /workspace/shared/task_queue
mkdir -p /workspace/shared/results
mkdir -p /workspace/shared/completed
mkdir -p /workspace/projects
chmod 777 /workspace/shared/task_queue
chmod 777 /workspace/shared/results
chmod 777 /workspace/shared/completed
chmod 777 /workspace/projects
echo "Shared directories ready"
"""
        for name, config in FLEET.items():
            self.api.upload_file(
                path_or_fileobj=shared_setup.encode(),
                path_in_repo="setup_shared.sh",
                repo_id=config["repo"],
                repo_type="space",
            )

        print("[Fleet] Inter-agent pipeline configured")
317
+
318
+
319
def main():
    """CLI entry point: perform exactly one fleet-management action per run."""
    parser = argparse.ArgumentParser(description="Agent Zero Fleet Manager")
    parser.add_argument("--check", action="store_true", help="Check fleet status")
    parser.add_argument("--set-hw", nargs=2, metavar=("AGENT", "HW"), help="Set hardware")
    parser.add_argument("--enable-storage", metavar="AGENT", help="Enable persistent storage")
    parser.add_argument("--restart", metavar="AGENT", help="Restart agent")
    parser.add_argument("--deploy-prompts", metavar="AGENT", help="Deploy prompts")
    parser.add_argument("--setup-pipeline", action="store_true", help="Setup inter-agent pipeline")
    parser.add_argument("--provision-all", action="store_true", help="Provision entire fleet")
    opts = parser.parse_args()

    fleet = AgentZeroFleetManager()

    if opts.check:
        fleet.print_report(fleet.check_fleet())
    elif opts.set_hw:
        agent, hw = opts.set_hw
        fleet.set_hardware(agent, hw)
    elif opts.enable_storage:
        fleet.enable_persistent_storage(opts.enable_storage)
    elif opts.restart:
        fleet.restart_agent(opts.restart)
    elif opts.deploy_prompts:
        fleet.deploy_prompts(opts.deploy_prompts)
    elif opts.setup_pipeline:
        fleet.setup_inter_agent_pipeline()
    elif opts.provision_all:
        print("[Fleet] Provisioning entire fleet...")
        # Best-effort per agent: one failure must not abort the rest.
        for agent_name, agent_cfg in FLEET.items():
            print(f"\n--- {agent_name} ---")
            try:
                fleet.set_hardware(agent_name, agent_cfg["hardware_target"])
                if agent_cfg["persistent_storage"]:
                    fleet.enable_persistent_storage(agent_name)
                fleet.restart_agent(agent_name)
            except Exception as e:
                print(f"Failed: {e}")
        print("\n[Fleet] Provisioning complete. Waiting for builds...")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()