ScottzillaSystems committed on
Commit
3c03ca6
·
verified ·
1 Parent(s): 07846a9

Upload fleet_manager.py

Browse files
Files changed (1) hide show
  1. fleet_manager.py +369 -0
fleet_manager.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Agent Zero Fleet Manager
4
+ Manages 4 autonomous Agent Zero instances with workspace isolation,
5
+ self-healing, config persistence, inter-agent delegation, and persistent storage.
6
+
7
+ Usage:
8
+ python fleet_manager.py --check # Check all 4 agents
9
+ python fleet_manager.py --provision-all # Provision entire fleet
10
+ python fleet_manager.py --set-hw agent-zero-main zero-a10g
11
+ python fleet_manager.py --enable-storage agent-zero-main
12
+ python fleet_manager.py --deploy-prompts agent-zero-main
13
+ python fleet_manager.py --setup-pipeline # Configure inter-agent task queue
14
+ """
15
+
16
import argparse
import json
import os
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional

from huggingface_hub import HfApi, SpaceHardware
24
+
25
+
26
# ─── Fleet Configuration ─────────────────────────────────────────────────────

# Static registry of the four managed Agent Zero Spaces, keyed by agent name.
# Each entry records:
#   repo               -- Hub repo id of the Space
#   role               -- "sovereign" (orchestrator) or "specialist"
#   description        -- human-readable summary of the agent's job
#   models             -- model catalog the agent serves
#   hardware_target    -- hardware tier the Space should run on
#   persistent_storage -- whether the Space needs a persistent /data volume
FLEET = {
    "agent-zero-main": {
        "repo": "ScottzillaSystems/agentzerov2",
        "role": "sovereign",  # the only non-specialist: receives and delegates tasks
        "description": "Main orchestrator — receives all tasks, delegates to specialists",
        "models": ["qwen3.5-27b", "cydonia-24b", "fallen-command"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
    "agent-zero-pentesting": {
        "repo": "ScottzillaSystems/agent-zero-pentesting",
        "role": "specialist",
        "description": "Security specialist — writes findings to shared task queue",
        "models": ["qwen3.5-9b"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
    "agent-zero-finops": {
        "repo": "ScottzillaSystems/agent-zero-finops",
        "role": "specialist",
        "description": "Cost optimization specialist — monitors billing, suggests cuts",
        "models": ["chatgpt5"],
        # CPU-only: this agent does no local inference-heavy work
        "hardware_target": "cpu-basic",
        "persistent_storage": False,
    },
    "agent-zero-creative": {
        "repo": "ScottzillaSystems/agent-zero-adult-entertainment",
        "role": "specialist",
        "description": "Creative specialist — content generation, image workflows",
        "models": ["qwen3.5-9b"],
        "hardware_target": "zero-a10g",
        "persistent_storage": True,
    },
}
62
+
63
+
64
# ─── Prompts ─────────────────────────────────────────────────────────────────

# Startup health-check checklist, uploaded to each Space as
# prompts/self_diagnostic.md by deploy_prompts(). The text is consumed by the
# remote agent (not by this script), so it is kept verbatim.
SELF_DIAGNOSTIC_PROMPT = """# Agent Zero Self-Diagnostic Protocol

Run this diagnostic immediately on startup and report results:

## 1. System Health
- [ ] GPU available: `torch.cuda.is_available()` → report True/False
- [ ] GPU memory: `torch.cuda.get_device_properties(0).total_memory / 1e9` → report GB
- [ ] Current model loaded: report model_key
- [ ] Model cache status: list all keys in _model_cache

## 2. Model Loading Test
For each model in your catalog:
1. Call `load_model(model_key)`
2. Report: success/failure, load time, GPU memory after load
3. Call `unload_model(model_key)`
4. Report: memory freed

## 3. Inference Test
Send: "What is 2+2?"
Expected: Response contains "4"
Report: latency_ms, tokens_generated, tokens/sec

## 4. File System Test
- [ ] Write test file to `/workspace/projects/test.txt`
- [ ] Read it back
- [ ] Report: persistent storage working (survives restart?)

## 5. Network Test
- [ ] Can reach huggingface.co?
- [ ] Can reach your model repos?

## 6. Inter-Agent Test (if applicable)
- [ ] Can write to `/workspace/shared/task_queue/`?
- [ ] Can read from `/workspace/shared/task_queue/`?

Report all results in a single JSON block.
"""
103
+
104
# Operating instructions for unattended mode, uploaded to each Space as
# prompts/autonomous_loop.md by deploy_prompts(). Defines the poll/execute/
# archive cycle over the shared task-queue directories that
# setup_inter_agent_pipeline() provisions. Consumed by the remote agent, so
# the text is kept verbatim.
AUTONOMOUS_LOOP_PROMPT = """# Agent Zero Autonomous Operation Loop

You are now running autonomously. Follow this loop indefinitely:

## Loop Cycle (every 60 seconds)

1. **SCAN** `/workspace/shared/task_queue/` for new task files
2. **READ** each task file (JSON format)
3. **EXECUTE** the task using your loaded model
4. **WRITE** results to `/workspace/shared/results/{task_id}.json`
5. **ARCHIVE** completed task to `/workspace/shared/completed/`
6. **SELF-HEAL** if model fails:
   - Unload current model
   - Try fallback model from catalog
   - If all fail, pause and alert
7. **REPORT** status to stdout every cycle

## Task File Format
```json
{
  "task_id": "uuid",
  "source_agent": "agent-zero-pentesting",
  "priority": 1-5,
  "task_type": "analysis|generation|review|action",
  "prompt": "...",
  "context": {},
  "deadline": "ISO8601",
  "created_at": "ISO8601"
}
```

## Result File Format
```json
{
  "task_id": "uuid",
  "completed_by": "agent-zero-main",
  "completed_at": "ISO8601",
  "status": "success|failure|partial",
  "output": "...",
  "model_used": "cydonia-24b",
  "tokens_used": 1234,
  "latency_ms": 5600
}
```

## Self-Healing Rules
- If OOM: unload all models, wait 10s, reload smallest model
- If model download fails: retry 3x with exponential backoff
- If GPU not available: switch to CPU mode (slower but functional)
- If task queue full (>100): process highest priority first

Begin autonomous operation now.
"""
157
+
158
+
159
# ─── Fleet Manager ───────────────────────────────────────────────────────────

class AgentZeroFleetManager:
    """Provisions, monitors, and heals the Agent Zero Spaces listed in FLEET.

    All Hub operations go through one authenticated HfApi client; the token
    comes from the constructor or the HF_TOKEN environment variable.
    """

    def __init__(self, token: Optional[str] = None):
        # Fall back to HF_TOKEN so the CLI can run unattended (cron, CI).
        self.api = HfApi(token=token or os.getenv("HF_TOKEN"))
        # In-memory history of every check_fleet() report from this session.
        self.status_log: List[Dict] = []

    def _require_config(self, agent_name: str) -> Dict:
        """Return the FLEET entry for agent_name, or raise ValueError."""
        config = FLEET.get(agent_name)
        if not config:
            raise ValueError(f"Unknown agent: {agent_name}")
        return config

    def check_fleet(self) -> Dict:
        """Check status of all 4 agent instances.

        Returns a report dict:
          checked_at -- timezone-aware UTC ISO8601 timestamp of the check
          agents     -- one status dict per FLEET entry (always full coverage)
          issues     -- human-readable strings for anything unhealthy
        """
        report = {
            # datetime.utcnow() is deprecated (3.12+); use aware UTC instead.
            "checked_at": datetime.now(timezone.utc).isoformat(),
            "agents": [],
            "issues": [],
        }

        for name, config in FLEET.items():
            repo_id = config["repo"]
            try:
                runtime = self.api.get_space_runtime(repo_id)
                agent_report = {
                    "name": name,
                    "repo": repo_id,
                    "stage": runtime.stage,
                    "hardware": runtime.hardware,
                    "requested_hardware": runtime.requested_hardware,
                    "role": config["role"],
                    "healthy": runtime.stage == "RUNNING",
                }
                report["agents"].append(agent_report)

                if runtime.stage != "RUNNING":
                    report["issues"].append(f"{name}: {runtime.stage} (expected RUNNING)")
                if runtime.hardware != config["hardware_target"]:
                    report["issues"].append(
                        f"{name}: hardware={runtime.hardware} (target={config['hardware_target']})"
                    )

            except Exception as e:
                # A failed lookup still yields an agent entry, so the report
                # always covers the whole fleet.
                report["agents"].append({
                    "name": name,
                    "repo": repo_id,
                    "stage": "ERROR",
                    "error": str(e),
                })
                report["issues"].append(f"{name}: API error - {e}")

        self.status_log.append(report)
        return report

    def set_hardware(self, agent_name: str, hardware: str):
        """Request a hardware tier (e.g. 'zero-a10g') for an agent Space.

        Raises ValueError for an unknown agent or unrecognized hardware name.
        """
        config = self._require_config(agent_name)
        repo_id = config["repo"]

        # Map 'zero-a10g' -> SpaceHardware.ZERO_A10G; surface an unrecognized
        # tier as a clear ValueError rather than a bare AttributeError.
        enum_name = hardware.upper().replace("-", "_")
        try:
            hardware_enum = getattr(SpaceHardware, enum_name)
        except AttributeError:
            raise ValueError(f"Unknown hardware tier: {hardware}") from None

        self.api.request_space_hardware(repo_id, hardware=hardware_enum)
        print(f"[Fleet] Set {agent_name} -> {hardware}")

    def enable_persistent_storage(self, agent_name: str):
        """Enable the persistent /data volume on an agent Space.

        Best-effort: HTTP failures are reported on stdout rather than raised,
        so --provision-all can continue with the remaining agents.
        """
        config = self._require_config(agent_name)
        repo_id = config["repo"]
        # Local import: `requests` is only needed for this one raw endpoint,
        # which huggingface_hub does not wrap.
        import requests
        resp = requests.put(
            f"https://huggingface.co/api/spaces/{repo_id}/volumes",
            headers={"Authorization": f"Bearer {self.api.token}"},
            json={"data": True},
            timeout=30,  # don't let a stuck request hang provisioning forever
        )
        if resp.status_code == 200:
            print(f"[Fleet] Persistent storage enabled for {agent_name}")
        else:
            print(f"[Fleet] Failed to enable storage for {agent_name}: {resp.status_code}")

    def restart_agent(self, agent_name: str):
        """Restart an agent Space (triggers a rebuild on the Hub)."""
        config = self._require_config(agent_name)
        self.api.restart_space(config["repo"])
        print(f"[Fleet] Restarted {agent_name}")

    def print_report(self, report: Dict):
        """Print a formatted fleet status report (as produced by check_fleet)."""
        print("\n" + "=" * 70)
        print(f"🤖 AGENT ZERO FLEET REPORT — {report['checked_at']}")
        print("=" * 70)

        for agent in report["agents"]:
            emoji = "🟢" if agent.get("healthy") else "🔴"
            print(f"\n{emoji} {agent['name']} ({agent['role']})")
            print(f"   Repo: {agent['repo']}")
            print(f"   Stage: {agent.get('stage', 'unknown')}")
            print(f"   Hardware: {agent.get('hardware', 'none')}")
            if "error" in agent:
                print(f"   ❌ Error: {agent['error']}")

        if report["issues"]:
            print(f"\n⚠️ ISSUES ({len(report['issues'])}):")
            for issue in report["issues"]:
                print(f"   - {issue}")
        else:
            print("\n✅ All agents healthy")

        print("=" * 70 + "\n")

    def deploy_prompts(self, agent_name: str):
        """Upload the diagnostic and autonomous-loop prompts to an agent Space."""
        config = self._require_config(agent_name)
        repo_id = config["repo"]

        self.api.upload_file(
            path_or_fileobj=SELF_DIAGNOSTIC_PROMPT.encode(),
            path_in_repo="prompts/self_diagnostic.md",
            repo_id=repo_id,
            repo_type="space",
        )

        self.api.upload_file(
            path_or_fileobj=AUTONOMOUS_LOOP_PROMPT.encode(),
            path_in_repo="prompts/autonomous_loop.md",
            repo_id=repo_id,
            repo_type="space",
        )

        print(f"[Fleet] Prompts deployed to {agent_name}")

    def setup_inter_agent_pipeline(self):
        """Upload the shared task-queue bootstrap script to every Space.

        The script creates the /workspace/shared directories that the
        autonomous loop reads and writes.
        """
        # NOTE(review): chmod 777 is world-writable — acceptable only inside
        # an isolated Space container; revisit if these paths are ever shared.
        shared_setup = """#!/bin/bash
mkdir -p /workspace/shared/task_queue
mkdir -p /workspace/shared/results
mkdir -p /workspace/shared/completed
mkdir -p /workspace/projects
chmod 777 /workspace/shared/task_queue
chmod 777 /workspace/shared/results
chmod 777 /workspace/shared/completed
chmod 777 /workspace/projects
echo "Shared directories ready"
"""
        for name, config in FLEET.items():
            self.api.upload_file(
                path_or_fileobj=shared_setup.encode(),
                path_in_repo="setup_shared.sh",
                repo_id=config["repo"],
                repo_type="space",
            )

        print("[Fleet] Inter-agent pipeline configured")
317
+
318
+
319
def main():
    """CLI entry point: perform exactly one fleet-management action per run."""
    parser = argparse.ArgumentParser(description="Agent Zero Fleet Manager")
    parser.add_argument("--check", action="store_true", help="Check fleet status")
    parser.add_argument("--set-hw", nargs=2, metavar=("AGENT", "HW"), help="Set hardware")
    parser.add_argument("--enable-storage", metavar="AGENT", help="Enable persistent storage")
    parser.add_argument("--restart", metavar="AGENT", help="Restart agent")
    parser.add_argument("--deploy-prompts", metavar="AGENT", help="Deploy prompts")
    parser.add_argument("--setup-pipeline", action="store_true", help="Setup inter-agent pipeline")
    parser.add_argument("--provision-all", action="store_true", help="Provision entire fleet")
    opts = parser.parse_args()

    fleet = AgentZeroFleetManager()

    if opts.check:
        fleet.print_report(fleet.check_fleet())
    elif opts.set_hw:
        agent, hw = opts.set_hw
        fleet.set_hardware(agent, hw)
    elif opts.enable_storage:
        fleet.enable_persistent_storage(opts.enable_storage)
    elif opts.restart:
        fleet.restart_agent(opts.restart)
    elif opts.deploy_prompts:
        fleet.deploy_prompts(opts.deploy_prompts)
    elif opts.setup_pipeline:
        fleet.setup_inter_agent_pipeline()
    elif opts.provision_all:
        print("[Fleet] Provisioning entire fleet...")
        # Best-effort per agent: one failure must not abort the rest.
        for agent_name, agent_cfg in FLEET.items():
            print(f"\n--- {agent_name} ---")
            try:
                fleet.set_hardware(agent_name, agent_cfg["hardware_target"])
                if agent_cfg["persistent_storage"]:
                    fleet.enable_persistent_storage(agent_name)
                fleet.restart_agent(agent_name)
            except Exception as e:
                print(f"Failed: {e}")
        print("\n[Fleet] Provisioning complete. Waiting for builds...")
    else:
        parser.print_help()


if __name__ == "__main__":
    main()