""" PULSE — Agent Nervous System & Scheduler The heartbeat, ReAct loop, timetable, and identity layer connecting all 7 ki-fusion-labs agent spaces. Connected spaces: FORGE — skill registry RELAY — communication hub MEMORY — multi-tier memory KANBAN — task board NEXUS — LLM routing (OpenAI-compatible) VAULT — file workspace + execution KNOWLEDGE — knowledge base (if available) Agent lifecycle per heartbeat tick: 1. Check RELAY inbox for messages 2. Check KANBAN for assigned open tasks 3. Check timetable for due jobs 4. If anything found → ReAct loop via NEXUS 5. ReAct: Thought → Action(tool) → Observation → repeat 6. Write results to MEMORY / KANBAN / RELAY / VAULT 7. Sleep → next tick """ import os, uuid, json, asyncio, time, re, logging from pathlib import Path from datetime import datetime, timezone, timedelta from typing import Optional, Any from collections import defaultdict import httpx from fastapi import FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse from apscheduler.schedulers.asyncio import AsyncIOScheduler from apscheduler.triggers.cron import CronTrigger from apscheduler.triggers.date import DateTrigger logging.basicConfig(level=logging.INFO) log = logging.getLogger("pulse") BASE = Path(__file__).parent for d in ["data", "logs", "traces"]: (BASE / d).mkdir(exist_ok=True) # ── Space URLs ───────────────────────────────────────────────────── SPACES = { "pulse": os.environ.get("PULSE_URL", "http://localhost:7860"), "relay": os.environ.get("RELAY_URL", "https://chris4k-agent-relay.hf.space"), "memory": os.environ.get("MEMORY_URL", "https://chris4k-agent-memory.hf.space"), "kanban": os.environ.get("KANBAN_URL", "https://chris4k-agent-kanban-board.hf.space"), "nexus": os.environ.get("NEXUS_URL", "https://chris4k-agent-nexus.hf.space"), "vault": os.environ.get("VAULT_URL", "https://chris4k-agent-vault.hf.space"), "forge": os.environ.get("FORGE_URL", "https://chris4k-agent-forge.hf.space"), 
"knowledge": os.environ.get("KNOWLEDGE_URL", "https://chris4k-agent-knowledge.hf.space"), } NEXUS_MODEL = os.environ.get("NEXUS_MODEL", "nexus-auto") REACT_MAX = int(os.environ.get("REACT_MAX_STEPS", "6")) # ── FORGE new infrastructure ──────────────────────────────────────── PROMPTS_URL = os.environ.get("PROMPTS_URL", "https://chris4k-agent-prompts.hf.space") TRACE_URL = os.environ.get("TRACE_URL", "https://chris4k-agent-trace.hf.space") LEARN_URL = os.environ.get("LEARN_URL", "https://chris4k-agent-learn.hf.space") LOOP_URL = os.environ.get("LOOP_URL", "https://chris4k-agent-loop.hf.space") HARNESS_URL = os.environ.get("HARNESS_URL", "https://chris4k-agent-harness.hf.space") APPROVE_URL = os.environ.get("APPROVE_URL", "https://chris4k-agent-approve.hf.space") COMPLIANCE_URL = os.environ.get("COMPLIANCE_URL", "https://chris4k-agent-compliance.hf.space") BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY", "") # Risky tools that require approval gate RISKY_TOOLS = {"vault_exec"} RISKY_RUNTIMES = {"bash", "git"} # within vault_exec these trigger approve RISKY_PATTERNS = {"rm ", "rmdir", "git push", "git force", "dd ", "chmod 777"} # ── Persona cache (fetched from agent-prompts, refreshed every 5min) ─ _persona_cache: dict = {} # agent_name → {system_prompt, max_steps, ...} _persona_loaded: dict = {} # agent_name → timestamp PERSONA_TTL = 300 PULSE_FALLBACK_PROMPT = ( "You are a specialized agent in the FORGE AI ecosystem. Execute assigned tasks using " "the ReAct loop: Thought → Action → Observation. Log every action, move kanban cards, " "reserve LLM slots before long tasks. Never hallucinate tool results." ) def get_agent_persona(agent_name: str) -> dict: """Fetch agent persona from agent-prompts. 
Falls back to local defaults.""" now = time.time() if agent_name in _persona_cache and (now - _persona_loaded.get(agent_name, 0)) < PERSONA_TTL: return _persona_cache[agent_name] try: import urllib.request as ureq with ureq.urlopen(f"{PROMPTS_URL}/api/personas/{agent_name}", timeout=3) as r: data = json.loads(r.read()) if data and data.get("system_prompt"): _persona_cache[agent_name] = data _persona_loaded[agent_name] = now log.info(f"[PROMPTS] Persona loaded for {agent_name}") return data except Exception as e: log.debug(f"[PROMPTS] Persona fetch failed for {agent_name}: {e}") # Fallback: return local agent config's persona field return {"system_prompt": PULSE_FALLBACK_PROMPT, "max_steps": REACT_MAX} def emit_trace(agent: str, event_type: str, payload: dict, status: str = "ok"): """Fire-and-forget trace event to agent-trace. Never blocks.""" try: import urllib.request as ureq body = json.dumps({"agent": agent, "event_type": event_type, "status": status, "payload": payload}).encode() req = ureq.Request(f"{TRACE_URL}/api/trace", data=body, headers={"Content-Type": "application/json"}, method="POST") ureq.urlopen(req, timeout=2) except Exception: pass def render_prompt(prompt_id: str, variables: dict) -> str: """Fetch rendered prompt from agent-prompts. Returns empty string on failure.""" try: import urllib.request as ureq, urllib.parse params = urllib.parse.urlencode({k: str(v) for k, v in variables.items()}) with ureq.urlopen(f"{PROMPTS_URL}/api/prompts/{prompt_id}/render?{params}", timeout=3) as r: return json.loads(r.read()).get("rendered", "") except Exception: return "" # ── LLM Fallback Chain ───────────────────────────────────────────── # Priority order (matches your actual infra): # 1. NEXUS → routes to ki-fusion-labs.de (RTX 5090) when server is ON # → falls back to HF serverless inside NEXUS when server is OFF # 2. Anthropic claude-haiku (ANTHROPIC_API_KEY secret) # 3. HF Inference API (HF_TOKEN secret, rate-limited, use sparingly) # 4. 
# NEXUS local_cpu model (Qwen 0.5B inside NEXUS container, always available)
#
# Every failure logs the HTTP status + full response body so you can see WHY it failed.

ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
FALLBACK_HF_MODEL = os.environ.get("FALLBACK_HF_MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
# Env-overridable so a model deprecation does not require a code change.
ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-haiku-4-5-20251001")

# Track provider health to skip recently-failed ones faster
_provider_failures: dict = {}  # provider → last_fail_ts
_PROVIDER_COOLDOWN = 30        # seconds to skip a failed provider (was 120 — too long)


def _provider_ok(name: str) -> bool:
    """True when *name* is not inside its failure cooldown window."""
    ts = _provider_failures.get(name, 0)
    return (time.time() - ts) > _PROVIDER_COOLDOWN


def _provider_fail(name: str):
    """Mark *name* as failed, starting its cooldown."""
    _provider_failures[name] = time.time()
    log.warning(f"[LLM] Marking {name} as failed (cooldown {_PROVIDER_COOLDOWN}s)")


def _provider_success(name: str):
    """Clear any recorded failure for *name*."""
    _provider_failures.pop(name, None)


async def _call_nexus(messages: list, model: str, max_tokens: int) -> str:
    """Call NEXUS's OpenAI-compatible chat endpoint; raises on non-2xx."""
    payload = {"model": model, "messages": messages,
               "max_tokens": max_tokens, "temperature": 0.3}
    # 120s read timeout: NEXUS may chain ki_fusion(6s fail) + hf(30s) + local_cpu(30-60s).
    # 35s was too short — PULSE timed out before NEXUS even reached local_cpu.
    timeout = httpx.Timeout(connect=10.0, read=120.0, write=10.0, pool=5.0)
    async with httpx.AsyncClient(timeout=timeout) as c:
        r = await c.post(f"{SPACES['nexus']}/v1/chat/completions", json=payload)
        if not r.is_success:
            body = r.text[:400]
            log.error(f"[LLM] NEXUS {r.status_code}: {body}")
            raise RuntimeError(f"NEXUS HTTP {r.status_code}: {body}")
        return r.json()["choices"][0]["message"]["content"]


async def _call_nexus_local_cpu(messages: list, max_tokens: int) -> str:
    """Force NEXUS to use its built-in CPU model — always available, slow."""
    payload = {"model": "local_cpu", "messages": messages,
               "max_tokens": max_tokens, "temperature": 0.3}
    async with httpx.AsyncClient(timeout=90) as c:  # CPU model is slow
        r = await c.post(f"{SPACES['nexus']}/v1/chat/completions", json=payload)
        if not r.is_success:
            body = r.text[:400]
            log.error(f"[LLM] NEXUS/local_cpu {r.status_code}: {body}")
            raise RuntimeError(f"NEXUS/local_cpu HTTP {r.status_code}: {body}")
        return r.json()["choices"][0]["message"]["content"]


async def _call_anthropic(messages: list, system: str, max_tokens: int) -> str:
    """Call the Anthropic Messages API; system prompt goes in the top-level field."""
    url = "https://api.anthropic.com/v1/messages"
    headers = {"x-api-key": ANTHROPIC_KEY,
               "anthropic-version": "2023-06-01",
               "content-type": "application/json"}
    # Anthropic rejects system-role messages inside the messages array.
    msgs = [m for m in messages if m["role"] != "system"]
    payload = {"model": ANTHROPIC_MODEL, "max_tokens": max_tokens,
               "system": system, "messages": msgs}
    async with httpx.AsyncClient(timeout=40) as c:
        r = await c.post(url, headers=headers, json=payload)
        if not r.is_success:
            body = r.text[:400]
            log.error(f"[LLM] Anthropic {r.status_code}: {body}")
            raise RuntimeError(f"Anthropic HTTP {r.status_code}: {body}")
        return r.json()["content"][0]["text"]


async def _call_hf(messages: list, max_tokens: int) -> str:
    """Call the HF serverless Inference API (OpenAI-compatible route)."""
    url = f"https://api-inference.huggingface.co/models/{FALLBACK_HF_MODEL}/v1/chat/completions"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"model": FALLBACK_HF_MODEL, "messages": messages,
               "max_tokens": max_tokens, "temperature": 0.3}
    async with httpx.AsyncClient(timeout=60) as c:
        r = await c.post(url, headers=headers, json=payload)
        if not r.is_success:
            body = r.text[:400]
            log.error(f"[LLM] HF Inference {r.status_code}: {body}")
            raise RuntimeError(f"HF HTTP {r.status_code}: {body}")
        return r.json()["choices"][0]["message"]["content"]


async def call_llm(messages: list, system: str = "", max_tokens: int = 600) -> str:
    """
    LLM chain: NEXUS(RTX5090) → Anthropic → HF API → NEXUS(local_cpu)
    Logs full error bodies so you can see exactly what's failing.
    Skips providers that failed recently (cooldown).
    """
    errors = []
    # ── 1. NEXUS (primary: ki-fusion-labs RTX 5090 or HF serverless inside NEXUS)
    if _provider_ok("nexus"):
        try:
            content = await _call_nexus(messages, NEXUS_MODEL, max_tokens)
            _provider_success("nexus")
            log.info("[LLM] nexus → OK")
            return content
        except Exception as e:
            errors.append(f"nexus: {e}")
            _provider_fail("nexus")
            push_live({"type": "llm_fallback", "provider": "nexus_failed",
                       "reason": str(e)[:120]})
    else:
        log.info("[LLM] nexus in cooldown, skipping")

    # ── 2. Anthropic claude-haiku (fast, reliable, costs ~$0.00025/call)
    if ANTHROPIC_KEY and _provider_ok("anthropic"):
        try:
            content = await _call_anthropic(messages, system, max_tokens)
            _provider_success("anthropic")
            log.info("[LLM] anthropic → OK")
            push_live({"type": "llm_fallback", "provider": "anthropic"})
            return content
        except Exception as e:
            errors.append(f"anthropic: {e}")
            _provider_fail("anthropic")
    elif not ANTHROPIC_KEY:
        log.debug("[LLM] anthropic skipped (no key)")

    # ── 3. HF Inference API (rate-limited, use sparingly)
    if HF_TOKEN and _provider_ok("hf_inference"):
        try:
            content = await _call_hf(messages, max_tokens)
            _provider_success("hf_inference")
            log.info("[LLM] hf_inference → OK")
            push_live({"type": "llm_fallback", "provider": "hf_inference"})
            return content
        except Exception as e:
            errors.append(f"hf_inference: {e}")
            _provider_fail("hf_inference")
    elif not HF_TOKEN:
        log.debug("[LLM] hf_inference skipped (no token)")

    # ── 4. NEXUS again but force local_cpu model directly.
    # Only reached if NEXUS itself failed entirely (e.g. space is down).
    # If NEXUS is UP, it already tried local_cpu internally — we don't retry.
    # This covers the case where the NEXUS space itself is unreachable.
    if not _provider_ok("nexus"):
        try:
            log.warning("[LLM] All cloud providers failed — forcing NEXUS local_cpu model directly")
            content = await _call_nexus_local_cpu(messages, min(max_tokens, 300))
            log.info("[LLM] nexus/local_cpu → OK")
            push_live({"type": "llm_fallback", "provider": "local_cpu"})
            return content
        except Exception as e:
            errors.append(f"local_cpu: {e}")
            log.error(f"[LLM] local_cpu also failed: {e}")

    all_errors = " | ".join(errors)
    log.error(f"[LLM] ALL PROVIDERS FAILED: {all_errors}")
    raise RuntimeError(f"All LLM providers failed: {all_errors}")


# ── Persistence ────────────────────────────────────────────────────
AGENTS_FILE = BASE / "data" / "agents.json"
SCHEDULE_FILE = BASE / "data" / "schedule.json"
ACTIVITY_FILE = BASE / "data" / "activity.json"


def load_json(p: Path, default):
    """Read JSON from *p*, or return *default* when the file doesn't exist."""
    return json.loads(p.read_text()) if p.exists() else default


def save_json(p: Path, data):
    """Write *data* to *p* as pretty-printed UTF-8 JSON."""
    p.write_text(json.dumps(data, indent=2, ensure_ascii=False))


# ── Live feed ──────────────────────────────────────────────────────
live_queues: list[asyncio.Queue] = []
agent_status: dict[str, dict] = {}  # name → {running, last_tick, last_action, tick_count}


def push_live(event: dict):
    """Broadcast *event* to all SSE subscribers and append to the activity log."""
    event["ts"] = int(time.time())
    for q in live_queues:
        try:
            q.put_nowait(json.dumps(event))
        except Exception:  # a full/closed queue must not break the broadcast
            pass
    # Append to activity log (last 200)
    act = load_json(ACTIVITY_FILE, [])
    act.insert(0, event)
    save_json(ACTIVITY_FILE, act[:200])


# ── Space clients ──────────────────────────────────────────────────
HTTP_TIMEOUT = 20


async def space_get(space: str, path: str, params: Optional[dict] = None) -> Optional[Any]:
    """GET from a named space; returns parsed JSON or None on any error."""
    url = SPACES[space] + path
    try:
        async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as c:
            r = await c.get(url, params=params)
            r.raise_for_status()
            return r.json()
    except Exception as e:
        log.warning(f"space_get {space}{path}: {e}")
        return None


async def space_post(space: str, path: str, data: dict) -> Optional[Any]:
    """POST JSON to a named space; returns parsed JSON or None on any error."""
    url = SPACES[space] + path
    try:
        async with httpx.AsyncClient(timeout=HTTP_TIMEOUT) as c:
            r = await c.post(url, json=data)
            r.raise_for_status()
            return r.json()
    except Exception as e:
        log.warning(f"space_post {space}{path}: {e}")
        return None


# ── Sprint 5: Middleware helpers ───────────────────────────────────
async def harness_scan(agent: str, tool: str, content: str) -> tuple[bool, str]:
    """Scan tool output through agent-harness before LLM sees it.
    Returns (safe, sanitised_content). On harness unavailable, pass-through."""
    if not HARNESS_URL:
        return True, content
    try:
        async with httpx.AsyncClient(timeout=4) as c:
            r = await c.post(f"{HARNESS_URL}/api/scan/output",
                             json={"agent": agent, "tool": tool, "content": content})
            if r.status_code == 200:
                d = r.json()
                return d.get("safe", True), d.get("sanitised", content)
    except Exception as e:
        log.debug(f"[HARNESS] scan failed (pass-through): {e}")
    return True, content


async def request_approval(agent: str, tool: str, args: dict, risk: str = "high") -> tuple[bool, str]:
    """Gate risky tool calls through agent-approve. Returns (approved, reason).
    Timeout = auto-reject."""
    if not APPROVE_URL:
        log.warning("[APPROVE] APPROVE_URL not set — auto-approving (unsafe!)")
        return True, "approve_url_missing"
    try:
        async with httpx.AsyncClient(timeout=6) as c:
            r = await c.post(f"{APPROVE_URL}/api/approval/request",
                             json={"agent": agent, "tool": tool, "args": args,
                                   "risk": risk, "auto_timeout": 120})
            if r.status_code == 200:
                d = r.json()
                approval_id = d.get("id")
                # Poll for up to 90s (Telegram keyboard gives christof 2 min)
                for _ in range(18):
                    await asyncio.sleep(5)
                    pr = await c.get(f"{APPROVE_URL}/api/approval/{approval_id}")
                    if pr.status_code == 200:
                        pd = pr.json()
                        status = pd.get("status")
                        if status == "approved":
                            return True, "human_approved"
                        if status in ("rejected", "expired"):
                            return False, status
        return False, "timeout"
    except Exception as e:
        log.warning(f"[APPROVE] gate failed: {e} — blocking tool call")
        return False, f"approve_error: {e}"


async def compliance_scan(agent: str, content: str) -> tuple[bool, str, list]:
    """Scan content for PII before writing to memory.
    Returns (safe, redacted_content, pii_types_found)."""
    if not COMPLIANCE_URL:
        return True, content, []
    try:
        async with httpx.AsyncClient(timeout=4) as c:
            r = await c.post(f"{COMPLIANCE_URL}/api/scan/pii",
                             json={"text": content, "agent": agent, "redact": True})
            if r.status_code == 200:
                d = r.json()
                return (not d.get("pii_found", False),
                        d.get("redacted", content),
                        d.get("types_found", []))
    except Exception as e:
        log.debug(f"[COMPLIANCE] scan failed (pass-through): {e}")
    return True, content, []


async def web_search_brave(query: str, count: int = 5) -> str:
    """Brave Search API call. Returns formatted results."""
    if not BRAVE_API_KEY:
        return "web_search unavailable: BRAVE_API_KEY not configured"
    try:
        async with httpx.AsyncClient(timeout=8) as c:
            r = await c.get("https://api.search.brave.com/res/v1/web/search",
                            params={"q": query, "count": count, "text_decorations": False},
                            headers={"Accept": "application/json",
                                     "Accept-Encoding": "gzip",
                                     "X-Subscription-Token": BRAVE_API_KEY})
            r.raise_for_status()
            data = r.json()
        results = data.get("web", {}).get("results", [])
        if not results:
            return "no results found"
        lines = []
        for i, res in enumerate(results[:count], 1):
            lines.append(f"{i}. {res.get('title','?')} — {res.get('url','')}\n   {res.get('description','')[:200]}")
        return "\n\n".join(lines)
    except Exception as e:
        return f"web_search error: {e}"


async def fetch_url_content(url: str) -> str:
    """Fetch a URL and return stripped text (5000 char limit)."""
    try:
        async with httpx.AsyncClient(timeout=10, follow_redirects=True) as c:
            r = await c.get(url, headers={"User-Agent": "FORGE-Agent/1.0"})
            r.raise_for_status()
        ct = r.headers.get("content-type", "")
        if "html" in ct:
            text = re.sub(r"<[^>]+>", " ", r.text)
            text = re.sub(r"\s{2,}", " ", text).strip()
        else:
            text = r.text.strip()
        return text[:5000] + ("…[truncated]" if len(text) > 5000 else "")
    except Exception as e:
        return f"fetch_url error: {e}"


# ── Sprint 5: Saga Orchestrator ────────────────────────────────────
class SagaStep:
    """A single saga step: a forward action plus an optional undo action."""

    def __init__(self, name: str, forward, compensate=None):
        self.name = name
        self.forward = forward        # async callable → result str
        self.compensate = compensate  # async callable → None (undo)


class SagaOrchestrator:
    """Run a sequence of steps with automatic compensation on failure.
    Usage:
        saga = SagaOrchestrator(agent_name, saga_id)
        saga.add_step("reserve_slot", fwd=lambda: ..., comp=lambda: ...)
        saga.add_step("vault_write", fwd=lambda: ..., comp=lambda: ...)
        result = await saga.run()
    """

    def __init__(self, agent: str, saga_id: str = ""):
        self.agent = agent
        self.saga_id = saga_id or str(uuid.uuid4())[:8]
        self.steps: list[SagaStep] = []
        self.completed: list[tuple[str, str]] = []  # (name, result)

    def add_step(self, name: str, fwd, comp=None):
        """Append a step; steps execute in insertion order."""
        self.steps.append(SagaStep(name, fwd, comp))

    async def run(self) -> dict:
        """Execute all steps; on failure, compensate completed steps in reverse."""
        emit_trace(self.agent, "saga_start",
                   {"saga_id": self.saga_id, "steps": [s.name for s in self.steps]})
        for step in self.steps:
            try:
                result = await step.forward()
                self.completed.append((step.name, str(result)))
                log.info(f"[SAGA {self.saga_id}] {step.name} OK: {str(result)[:80]}")
            except Exception as e:
                log.error(f"[SAGA {self.saga_id}] {step.name} FAILED: {e} — compensating")
                emit_trace(self.agent, "saga_failed",
                           {"saga_id": self.saga_id, "failed_step": step.name,
                            "error": str(e)}, status="error")
                # Compensate in reverse order
                for name, _ in reversed(self.completed):
                    comp_step = next((s for s in self.steps if s.name == name), None)
                    if comp_step and comp_step.compensate:
                        try:
                            await comp_step.compensate()
                            log.info(f"[SAGA {self.saga_id}] compensated {name}")
                        except Exception as ce:
                            log.warning(f"[SAGA {self.saga_id}] compensate {name} failed: {ce}")
                # Alert christof
                try:
                    async with httpx.AsyncClient(timeout=4) as c:
                        await c.post(f"{SPACES['relay']}/api/notify", json={
                            "text": f"⚠️ SAGA {self.saga_id} failed at step {step.name}\nAgent: {self.agent}\nError: {str(e)[:200]}\nCompensations ran for: {[n for n,_ in self.completed]}",
                            "parse_mode": "HTML"})
                except Exception:
                    pass
                return {"ok": False, "saga_id": self.saga_id,
                        "failed_step": step.name, "error": str(e),
                        "saga_compensated": True}
        emit_trace(self.agent, "saga_complete",
                   {"saga_id": self.saga_id, "steps_completed": len(self.completed)})
        return {"ok": True, "saga_id": self.saga_id, "steps": dict(self.completed)}
asyncio.to_thread). # CodeAgent writes Python code to call these tools, enabling loops, conditionals, # and natural composition — far more powerful than JSON ReAct. try: from smolagents import CodeAgent, Tool, OpenAIServerModel, ToolCallingAgent from smolagents.monitoring import LogLevel SMOLAGENTS_OK = True except ImportError: SMOLAGENTS_OK = False log.warning("[SMOLAGENTS] not installed — install 'smolagents[litellm]'") def _sync_get(space: str, path: str, params: dict = {}) -> dict | None: url = SPACES.get(space, space) + path try: r = httpx.get(url, params=params, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.json() except Exception as e: log.warning(f"_sync_get {space}{path}: {e}") return None def _sync_post(space: str, path: str, data: dict) -> dict | None: url = SPACES.get(space, space) + path try: r = httpx.post(url, json=data, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.json() except Exception as e: log.warning(f"_sync_post {space}{path}: {e}") return None def _harness_scan_sync(agent: str, tool: str, content: str) -> str: """Synchronous harness scan — returns sanitised content.""" if not HARNESS_URL: return content try: r = httpx.post(f"{HARNESS_URL}/api/scan/output", json={"agent": agent, "tool": tool, "content": content}, timeout=4) if r.status_code == 200: d = r.json() return d.get("sanitised", content) except Exception: pass return content def _approve_sync(agent: str, tool: str, args: dict, risk: str = "high") -> tuple[bool, str]: """Synchronous approval gate. 
Polls up to 90s.""" if not APPROVE_URL: return True, "approve_url_missing" try: r = httpx.post(f"{APPROVE_URL}/api/approval/request", json={"agent": agent, "tool": tool, "args": args, "risk": risk, "auto_timeout": 120}, timeout=6) if r.status_code == 200: approval_id = r.json().get("id") for _ in range(18): time.sleep(5) pr = httpx.get(f"{APPROVE_URL}/api/approval/{approval_id}", timeout=4) if pr.status_code == 200: status = pr.json().get("status") if status == "approved": return True, "human_approved" if status in ("rejected", "expired"): return False, status return False, "timeout" except Exception as e: return False, f"approve_error: {e}" def _compliance_scan_sync(agent: str, content: str) -> str: """Compliance PII scan — returns redacted content.""" if not COMPLIANCE_URL: return content try: r = httpx.post(f"{COMPLIANCE_URL}/api/scan/pii", json={"text": content, "agent": agent, "redact": True}, timeout=4) if r.status_code == 200: return r.json().get("redacted", content) except Exception: pass return content # ── FORGE Tool classes ────────────────────────────────────────────── class RelaySendTool(Tool): name = "relay_send" description = "Send a message to an agent or broadcast via RELAY. Use for notifications, delegations, status updates." 
inputs = { "to": {"type":"string","description":"Recipient agent name or 'broadcast'"}, "subject": {"type":"string","description":"Message subject (short)"}, "body": {"type":"string","description":"Full message body"}, "priority": {"type":"string","description":"low | normal | high | urgent", "nullable":True}, "channel": {"type":"string","description":"internal | telegram | browser", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, to, subject, body, priority="normal", channel="internal"): r = _sync_post("relay", "/api/messages", { "from": self._agent, "to": to, "subject": subject, "body": body, "priority": priority or "normal", "channel": channel or "internal"}) return f"sent id={r.get('id','?')}" if r else "relay_send failed" class MemorySearchTool(Tool): name = "memory_search" description = "Search agent memory across tiers. Always search before answering questions — you may have relevant memories." inputs = { "query": {"type":"string","description":"Search query"}, "tier": {"type":"string","description":"all | episodic | semantic | procedural | working", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, query, tier="all"): r = _sync_get("memory", "/api/memories/search", {"q": query, "tier": tier or "all", "limit": 8}) if not r: return "no results" results = r if isinstance(r, list) else r.get("results", []) import json as _json return _json.dumps([{"content": m.get("content","")[:200], "tier": m.get("tier"), "tags": m.get("tags")} for m in results[:5]]) class MemoryStoreTool(Tool): name = "memory_store" description = "Store a memory in MEMORY space. Content is PII-scanned before writing." 
inputs = { "content": {"type":"string","description":"Memory content to store"}, "tier": {"type":"string","description":"episodic | semantic | procedural | working"}, "tags": {"type":"array","description":"List of tag strings", "nullable":True}, "importance": {"type":"integer","description":"0-10 importance score", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, content, tier="episodic", tags=None, importance=6): # Compliance: PII scan before writing safe_content = _compliance_scan_sync(self._agent, content) r = _sync_post("memory", "/api/memories", { "content": safe_content, "tier": tier, "tags": tags or [], "importance": importance or 6, "agent": self._agent}) return f"stored id={r.get('id','?')}" if r else "memory_store failed" class KanbanListTool(Tool): name = "kanban_list" description = "List tasks from KANBAN board. Filter by status and/or agent." inputs = { "status": {"type":"string","description":"todo | doing | done | blocked | failed", "nullable":True}, "agent": {"type":"string","description":"Filter by agent name", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, status=None, agent=None): params = {} if status: params["status"] = status if agent: params["agent"] = agent import json as _json r = _sync_get("kanban", "/api/tasks", params) or [] tasks = r if isinstance(r, list) else [] return _json.dumps([{"id":t.get("id"),"title":t.get("title"), "status":t.get("status"),"priority":t.get("priority")} for t in tasks[:8]]) class KanbanMoveTool(Tool): name = "kanban_move" description = "Move a task to a new status on the KANBAN board." 
inputs = { "id": {"type":"string","description":"Task ID"}, "status": {"type":"string","description":"todo | doing | done | blocked | failed"}, "slot_id": {"type":"string","description":"GPU slot ID if applicable", "nullable":True}, "react_steps": {"type":"integer","description":"Number of ReAct steps taken", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, id, status, slot_id=None, react_steps=None): payload = {"id": id, "status": status} if slot_id: payload["slot_id"] = slot_id if react_steps: payload["react_steps"] = react_steps r = _sync_post("kanban", "/api/move", payload) return f"moved {id} → {status}" if r else "kanban_move failed" class KanbanCreateTool(Tool): name = "kanban_create" description = "Create a new task on the KANBAN board and assign it to an agent." inputs = { "title": {"type":"string","description":"Short task title"}, "body": {"type":"string","description":"Full task description with context"}, "priority": {"type":"string","description":"low | medium | high | critical"}, "agent": {"type":"string","description":"Agent to assign task to"}, "est_minutes": {"type":"integer","description":"Estimated completion minutes", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, title, body, priority="medium", agent=None, est_minutes=None): payload = {"title": title, "body": body, "priority": priority, "agent": agent or self._agent, "type": "ai"} if est_minutes: payload["est_minutes"] = est_minutes r = _sync_post("kanban", "/api/tasks", payload) return f"created task id={r.get('id','?')}" if r else "kanban_create failed" class VaultExecTool(Tool): name = "vault_exec" description = ( "Execute code in VAULT workspace. Runtimes: python3, bash, node, npm, pip, git. " "IMPORTANT: cwd must be one of: code, reports, scratch, shared. 
" "Bash and git commands that are destructive require human approval." ) inputs = { "runtime": {"type":"string","description":"python3 | bash | node | npm | pip | git"}, "code": {"type":"string","description":"Code or command to execute"}, "cwd": {"type":"string","description":"Working directory: code | reports | scratch | shared", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, runtime, code, cwd="scratch"): _VALID_CWDS = {"code","reports","scratch","shared",""} safe_cwd = (cwd or "scratch").strip("/") if (cwd or "scratch").strip("/") in _VALID_CWDS else "scratch" # Approval gate for risky bash/git if runtime in RISKY_RUNTIMES or any(p in code for p in RISKY_PATTERNS): approved, reason = _approve_sync(self._agent, "vault_exec", {"runtime": runtime, "code": code[:200], "cwd": safe_cwd}, risk="high") if not approved: return f"vault_exec BLOCKED by approval gate: {reason}" r = _sync_post("vault", "/api/exec", { "runtime": runtime, "code": code, "cwd": safe_cwd, "timeout": 30}) if not r: return "vault_exec failed" out = _harness_scan_sync(self._agent, "vault_exec", f"exit={r.get('exit_code')} ms={r.get('ms')}\n{r.get('output','')[:500]}") return out class VaultReadTool(Tool): name = "vault_read" description = "Read a file from the VAULT workspace." inputs = {"path": {"type":"string","description":"File path relative to workspace"}} output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, path): r = _sync_get("vault", "/api/read", {"path": path}) return r.get("content","")[:800] if r else "vault_read failed" class VaultWriteTool(Tool): name = "vault_write" description = "Write a file to the VAULT workspace. Always write complete file content." inputs = { "path": {"type":"string","description":"File path, e.g. 
code/script.py"}, "content": {"type":"string","description":"Complete file content"}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, path, content): r = _sync_post("vault", "/api/write", {"path": path, "content": content, "agent": self._agent}) return f"written: {path} snap={r.get('snapshot',{}).get('id','?')}" if r else "vault_write failed" class ForgeSearchTool(Tool): name = "forge_search" description = "Search for skills and tools in the FORGE skill registry." inputs = {"query": {"type":"string","description":"Search query"}} output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, query): import json as _json r = _sync_get("forge", "/api/capabilities", {"q": query, "limit": 5}) items = r if isinstance(r, list) else (r.get("skills",[]) if r else []) return _json.dumps([{"name":s.get("name"),"description":s.get("description","")[:100]} for s in items[:5]]) class SlotReserveTool(Tool): name = "slot_reserve" description = "Reserve the RTX 5090 GPU slot before a long task. Returns slot_id or queue position." 
inputs = { "task_id": {"type":"string","description":"Task identifier"}, "est_minutes": {"type":"integer","description":"Estimated minutes needed (1-60)"}, "priority": {"type":"integer","description":"Priority 1=critical 5=normal 10=low", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, task_id, est_minutes=5, priority=5): r = _sync_post("nexus", "/api/slot/reserve", { "agent": self._agent, "task_id": task_id, "est_minutes": est_minutes, "priority": priority or 5}) if not r: return "slot_reserve failed" status = r.get("status","unknown") if status == "active": return f"slot ACTIVE slot_id={r['slot_id']} expires_in={est_minutes}min" elif status == "queued": return f"slot QUEUED position={r.get('queue_position')} eta={r.get('eta_seconds',0)}s holder={r.get('current_holder','?')} — wait or use local_cpu" return f"slot status={status}" class SlotReleaseTool(Tool): name = "slot_release" description = "Release the GPU slot when done. Always call after finishing to unblock other agents." inputs = {"slot_id": {"type":"string","description":"Slot ID from slot_reserve"}} output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, slot_id): r = _sync_post("nexus", "/api/slot/release", {"slot_id": slot_id}) return f"slot released (held {r.get('held_seconds',0)}s)" if r and r.get("released") else "slot_release failed" class SlotStatusTool(Tool): name = "slot_status" description = "Check who holds the GPU slot and current queue. Use before slot_reserve." 
inputs = {} output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self): r = _sync_get("nexus", "/api/slot/status", {}) if not r: return "slot_status failed" active = r.get("active") queue = r.get("queue", []) result = f"OCCUPIED by {active['agent']} expires_in={int(active.get('expires_at',0)-time.time())}s" if active else "FREE" if queue: result += f" | Queue: {[q['agent'] for q in queue]}" return result class TriggerAgentTool(Tool): name = "trigger_agent" description = "Wake another agent immediately with a task. Always call after delegate to ensure pickup." inputs = { "agent": {"type":"string","description":"Agent name to wake"}, "content": {"type":"string","description":"Task content or context for the agent"}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, agent, content=""): r = _sync_post("pulse", f"/api/trigger/{agent}", {"from": self._agent, "content": content or f"Task delegated by {self._agent}"}) return f"triggered {agent}" if r else f"trigger queued for {agent} (heartbeat pickup)" class WebSearchTool(Tool): name = "web_search" description = "Search the web via Brave Search API. Returns titles, URLs and snippets." 
inputs = { "query": {"type":"string","description":"Search query"}, "count": {"type":"integer","description":"Number of results 1-10, default 5", "nullable":True}, } output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, query, count=5): if not BRAVE_API_KEY: return "web_search unavailable: BRAVE_API_KEY not set" try: r = httpx.get("https://api.search.brave.com/res/v1/web/search", params={"q": query, "count": min(count or 5, 10), "text_decorations": False}, headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_API_KEY}, timeout=8) r.raise_for_status() results = r.json().get("web", {}).get("results", []) if not results: return "no results" lines = [f"{i}. {res.get('title','?')} — {res.get('url','')}\n {res.get('description','')[:200]}" for i, res in enumerate(results[:count or 5], 1)] return "\n\n".join(lines) except Exception as e: return f"web_search error: {e}" class FetchUrlTool(Tool): name = "fetch_url" description = "Fetch a URL and return stripped text (5000 char limit). Use after web_search." 
inputs = {"url": {"type":"string","description":"Full URL to fetch"}} output_type = "string" def __init__(self, agent_name): super().__init__(); self._agent = agent_name def forward(self, url): try: r = httpx.get(url, headers={"User-Agent": "FORGE-Agent/1.0"}, timeout=10, follow_redirects=True) r.raise_for_status() ct = r.headers.get("content-type", "") text = re.sub(r"<[^>]+>", " ", r.text) if "html" in ct else r.text text = re.sub(r"\s{2,}", " ", text).strip() return text[:5000] + ("…[truncated]" if len(text) > 5000 else "") except Exception as e: return f"fetch_url error: {e}" # ── FORGE OpenAI-compatible model (NEXUS backend) ─────────────────── def build_forge_model(cost_mode: str = "balanced") -> object | None: """Build smolagents model pointing at NEXUS (OpenAI-compatible).""" if not SMOLAGENTS_OK: return None nexus_url = SPACES.get("nexus", "") model_name = { "cheap": "nexus-fast", "balanced": "nexus-auto", "best": "nexus-best", }.get(cost_mode, "nexus-auto") try: from smolagents import OpenAIServerModel return OpenAIServerModel( model_id = model_name, api_base = nexus_url + "/v1", api_key = os.environ.get("NEXUS_API_KEY", "forge-internal"), ) except Exception as e: log.warning(f"[SMOLAGENTS] OpenAIServerModel failed: {e}") return None def build_agent_tools(agent_name: str) -> list: """Instantiate all FORGE tools for the given agent.""" return [ RelaySendTool(agent_name), MemorySearchTool(agent_name), MemoryStoreTool(agent_name), KanbanListTool(agent_name), KanbanMoveTool(agent_name), KanbanCreateTool(agent_name), VaultExecTool(agent_name), VaultReadTool(agent_name), VaultWriteTool(agent_name), ForgeSearchTool(agent_name), SlotReserveTool(agent_name), SlotReleaseTool(agent_name), SlotStatusTool(agent_name), TriggerAgentTool(agent_name), WebSearchTool(agent_name), FetchUrlTool(agent_name), ] # ── Step callback — trace + harness ──────────────────────────────── def make_step_callback(agent_name: str, trace: dict): """Returns a step callback that emits trace 
events and scans tool outputs.""" from smolagents.memory import ActionStep def _callback(step_log, agent=None): if not isinstance(step_log, ActionStep): return # Harness: scan tool output before LLM re-ingests obs = getattr(step_log, "observations", None) or "" if obs and HARNESS_URL: tool_name = "" if step_log.tool_calls: tool_name = step_log.tool_calls[0].name if hasattr(step_log.tool_calls[0], "name") else "" sanitised = _harness_scan_sync(agent_name, tool_name, str(obs)) if sanitised != str(obs): step_log.observations = sanitised # Trace step_info = { "step": getattr(step_log, "step_number", len(trace["steps"])), "thought": str(getattr(step_log, "model_output_message", ""))[:200], "tool": step_log.tool_calls[0].name if getattr(step_log, "tool_calls", None) else "", "obs": str(getattr(step_log, "observations", ""))[:200], "error": str(step_log.error) if getattr(step_log, "error", None) else "", } trace["steps"].append(step_info) push_live({"type": "step", "agent": agent_name, **step_info}) emit_trace(agent_name, "react_step", step_info, status="error" if step_info["error"] else "ok") return _callback # ── smolagents CodeAgent react_loop ──────────────────────────────── async def react_loop(agent: dict, trigger_type: str, trigger_content: str) -> dict: """ Run a smolagents CodeAgent for this agent tick. The agent writes Python code to call FORGE tools — loops, conditionals, multi-step composition all work naturally. Falls back to ToolCallingAgent if CodeAgent unavailable. 
""" name = agent["name"] cost_mode = agent.get("cost_mode", "balanced") max_steps = agent.get("max_react_steps", REACT_MAX) trace = {"agent": name, "trigger": trigger_type, "started": int(time.time()), "steps": [], "result": "", "ok": True} if not SMOLAGENTS_OK: trace["result"] = "smolagents not installed" trace["ok"] = False return trace # Fetch persona from agent-prompts (cached) persona_data = get_agent_persona(name) system_prompt = persona_data.get("system_prompt", agent.get("persona", "You are a helpful AI agent.")) max_steps = persona_data.get("max_steps", max_steps) # Load soul.md and user.md for context injection soul_ctx = "" try: sv = _sync_get("vault", "/api/read", {"path": "soul.md"}) if sv: soul_ctx = sv.get("content", "")[:500] except Exception: pass user_ctx = "" try: uv = _sync_get("vault", "/api/read", {"path": "user.md"}) if uv: user_ctx = uv.get("content", "")[:300] except Exception: pass # Auto-load skills from FORGE at ReAct start skills_ctx = "" try: skills = _sync_get("forge", "/api/capabilities", {"q": name, "limit": 5}) if skills: items = skills if isinstance(skills, list) else skills.get("skills", []) skills_ctx = "AVAILABLE SKILLS:\n" + "\n".join( f" - {s.get('name')}: {s.get('description','')[:80]}" for s in items[:5]) except Exception: pass full_system = "\n\n".join(filter(None, [system_prompt, soul_ctx, skills_ctx])) task = ( f"TRIGGER: {trigger_type}\n" f"CONTEXT: {trigger_content}\n" + (f"OPERATOR: {user_ctx}\n" if user_ctx else "") + f"UTC: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M')}\n" "Execute your assigned task using the available tools." 
) push_live({"type": "react_start", "agent": name, "trigger": trigger_type}) def _run_agent(): """Sync function — runs in thread pool via asyncio.to_thread.""" model = build_forge_model(cost_mode) if model is None: return {"ok": False, "result": "NEXUS model unavailable", "steps": []} tools = build_agent_tools(name) try: from smolagents import ToolCallingAgent agent_obj = ToolCallingAgent( tools = tools, model = model, max_steps = max_steps, instructions = full_system, verbosity_level = LogLevel.WARNING, step_callbacks = [make_step_callback(name, trace)], name = name, ) except Exception as e: log.error(f"[SMOLAGENTS] agent init failed: {e}") return {"ok": False, "result": str(e), "steps": []} try: result = agent_obj.run(task, reset=True) return {"ok": True, "result": str(result)[:500], "steps": trace["steps"]} except Exception as e: log.error(f"[SMOLAGENTS] agent.run failed: {e}") return {"ok": False, "result": str(e), "steps": trace["steps"]} try: outcome = await asyncio.to_thread(_run_agent) except Exception as e: outcome = {"ok": False, "result": str(e), "steps": []} trace["ok"] = outcome["ok"] trace["result"] = outcome["result"] trace["ms"] = int((time.time() - trace["started"]) * 1000) # Emit final trace to TRACE + LEARN emit_trace(name, "react_complete", {"result": trace["result"], "steps": len(trace["steps"]), "trigger": trigger_type, "ms": trace["ms"]}, status="ok" if trace["ok"] else "error") push_live({"type": "react_done", "agent": name, "ok": trace["ok"], "ms": trace["ms"], "steps": len(trace["steps"])}) return trace # ── Heartbeat engine ─────────────────────────────────────────────── scheduler = AsyncIOScheduler(timezone="UTC") # ── Heartbeat engine ─────────────────────────────────────────────── scheduler = AsyncIOScheduler(timezone="UTC") async def agent_tick(agent_name: str, trigger_type: str = "heartbeat", content: str = ""): agents = load_json(AGENTS_FILE, []) agent = next((a for a in agents if a["name"] == agent_name), None) if not agent or not 
agent.get("enabled", True): return if agent_status.get(agent_name, {}).get("running"): log.info(f"Agent {agent_name} already running, skip tick") return agent_status[agent_name] = {**agent_status.get(agent_name,{}), "running": True, "last_tick": int(time.time())} push_live({"type":"heartbeat","agent":agent_name,"trigger":trigger_type}) try: # Build context from multiple sources context_parts = [] if content: context_parts.append(content) # Check relay inbox — read AND ack so same messages don't repeat next tick inbox = await space_get("relay", f"/api/inbox/{agent_name}", {"unread":"true","limit":"5"}) if isinstance(inbox, list) and inbox: msgs = [f"[from:{m.get('from')}] {m.get('subject','')} — {m.get('body','')[:150]}" for m in inbox[:5]] context_parts.append("UNREAD MESSAGES:\n" + "\n".join(msgs)) # ACK each message so it doesn't re-appear next tick for m in inbox[:5]: mid = m.get("id","") if mid: asyncio.create_task(space_post("relay", f"/api/messages/{mid}/ack", {"agent": agent_name})) # Check kanban for assigned tasks — include task body so agent knows what to do kanban = await space_get("kanban", "/api/tasks", {"agent":agent_name,"status":"todo"}) if isinstance(kanban, list) and kanban: tasks = [f"[{t.get('priority','?')}] id={t.get('id','')} title={t.get('title','')} | {t.get('body','')[:200]}" for t in kanban[:5]] context_parts.append("YOUR OPEN TASKS (act on these):\n" + "\n".join(tasks)) if not context_parts and trigger_type == "heartbeat": # Nothing to do agent_status[agent_name]["running"] = False agent_status[agent_name]["last_result"] = "idle" push_live({"type":"idle","agent":agent_name}) return # For manual/UI triggers with no specific content, give agent clear instruction if trigger_type in ("manual",) and not content: content = f"You have been manually triggered. Check your open tasks above and execute them. If a task says to create a file, use vault_write then vault_exec." 
full_context = "\n\n".join(context_parts) if context_parts else "Routine heartbeat check." trace = await react_loop(agent, trigger_type, full_context) tc = agent_status.get(agent_name, {}).get("tick_count", 0) + 1 agent_status[agent_name] = {**agent_status.get(agent_name,{}), "running": False, "tick_count": tc, "last_result": trace["result"][:100], "last_ok": trace["ok"], "last_ms": trace.get("ms",0)} except Exception as e: log.error(f"agent_tick {agent_name}: {e}") agent_status[agent_name] = {**agent_status.get(agent_name,{}), "running": False, "last_result": f"error: {e}", "last_ok": False} push_live({"type":"error","agent":agent_name,"message":str(e)}) def register_agent_jobs(agent: dict): """Register APScheduler jobs for an agent.""" name = agent["name"] # Remove existing jobs for this agent for job in scheduler.get_jobs(): if job.id.startswith(f"hb_{name}"): job.remove() if not agent.get("enabled", True): return # Heartbeat interval = agent.get("heartbeat_seconds", 0) if interval and interval > 0: from apscheduler.triggers.interval import IntervalTrigger scheduler.add_job( agent_tick, IntervalTrigger(seconds=max(interval, 30)), args=[name, "heartbeat", ""], id=f"hb_{name}_interval", replace_existing=True, max_instances=1, misfire_grace_time=60) log.info(f"Registered heartbeat for {name} every {interval}s") def register_schedule_job(entry: dict): """Register a timetable entry as APScheduler job.""" eid = entry["id"] agent = entry.get("agent","") if not entry.get("enabled", True): return trigger_content = entry.get("prompt","Scheduled task: " + entry.get("title","")) job_id = f"sch_{eid}" for job in scheduler.get_jobs(): if job.id == job_id: job.remove() if entry.get("recurrence") == "once": dt_str = entry.get("datetime","") if dt_str: try: dt = datetime.fromisoformat(dt_str) scheduler.add_job(agent_tick, DateTrigger(run_date=dt), args=[agent, "scheduled", trigger_content], id=job_id, replace_existing=True, max_instances=1) except: pass else: # weekly / daily 
cron day = entry.get("day", 0) # 0=Mon hour = entry.get("hour", 9) minute = entry.get("minute", 0) day_map = {0:"mon",1:"tue",2:"wed",3:"thu",4:"fri",5:"sat",6:"sun"} if entry.get("recurrence") == "daily": cron = CronTrigger(hour=hour, minute=minute) else: cron = CronTrigger(day_of_week=day_map.get(day,"mon"), hour=hour, minute=minute) scheduler.add_job(agent_tick, cron, args=[agent, "scheduled", trigger_content], id=job_id, replace_existing=True, max_instances=1) def reload_all_jobs(): agents = load_json(AGENTS_FILE, []) schedule = load_json(SCHEDULE_FILE, []) for a in agents: register_agent_jobs(a) for e in schedule: register_schedule_job(e) # ── Default data ─────────────────────────────────────────────────── DEFAULT_AGENTS = [ {"name":"researcher","persona":"You are a deep research specialist and self-improvement engine. RESEARCH WORKFLOW: 1) kanban_move(doing), 2) slot_reserve(est_minutes=8, priority=3) for complex queries, 3) memory_search across all tiers, 4) synthesize findings, 5) memory_store(semantic tier, importance=7+), 6) relay_send to christof with digest, 7) slot_release, 8) kanban_move(done). SELF-IMPROVEMENT DUTY (weekly or when triggered): call self_reflect() — read your own traces, find failure patterns, propose persona improvements. PATTERN DETECTION: After completing research, check if you found the same insight 3+ times across memories. If yes: vault_write('code/utils/.py', utility_code), then relay_send('christof', 'Skill candidate: '). DEEP RESEARCH means: use memory_search multiple times with different queries, cross-reference findings, never stop at first result.","enabled":True,"heartbeat_seconds":0,"cost_mode":"best","max_react_steps":8,"color":"#0ea5e9","tags":["research","analysis","self-improvement"]}, {"name":"coder","persona":"You are a senior software engineer with RTX 5090 access. 
WORKFLOW for any coding task: 1) kanban_move(id, status=doing), 2) slot_reserve(task_id=id, est_minutes=10, priority=2) — if QUEUED, wait or proceed with local_cpu for small tasks, 3) vault_write(path=code/filename.py, content=), 4) vault_exec(runtime=python3, code='exec(open(\"code/filename.py\").read())', cwd=code) to verify it runs, 5) slot_release(slot_id=), 6) kanban_move(id, status=done, slot_id=, react_steps=), 7) finish with result. Always write COMPLETE working code, never placeholders.","enabled":True,"heartbeat_seconds":0,"cost_mode":"best","max_react_steps":7,"color":"#2ed573","tags":["coding","execution"]}, {"name":"planner","persona":"You are a strategic planner. Break goals into kanban tasks with kanban_create(title, body, agent, est_minutes, deps=[dep_task_id]). After creating a task for another agent: ALWAYS call trigger_agent(agent=target, content=task_body) to wake them immediately — do not just delegate and hope. Track progress with kanban_list. If a task has dependencies, set deps=[id1,id2] so it stays blocked until deps are done. Report sprint status to christof weekly.","enabled":True,"heartbeat_seconds":0,"cost_mode":"balanced","max_react_steps":6,"color":"#ff9500","tags":["planning","coordination"]}, {"name":"monitor","persona":"You are a system watchdog. Check kanban for stuck/failed tasks, check relay for urgent messages, report to christof. When you find a critical task stuck in todo: relay_send to the assigned agent reminding them. Keep reports short and factual.","enabled":True,"heartbeat_seconds":300,"cost_mode":"cheap","max_react_steps":5,"color":"#ff6b9d","tags":["monitoring","alerts"]}, {"name":"christof","persona":"You are Christof's personal AI coordinator at ki-fusion-labs.de. Summarize daily progress from kanban and memory. Flag blockers immediately. Create tasks for things that need doing. 
Write concisely in German or English as appropriate.","enabled":True,"heartbeat_seconds":0,"cost_mode":"best","max_react_steps":6,"color":"#ff6b00","tags":["personal","coordinator"]}, ] DEFAULT_SCHEDULE = [ {"id":"s1","agent":"monitor","title":"Morning system check","recurrence":"daily","hour":8,"minute":0,"day":0,"prompt":"Run a full system health check: check relay for urgent messages, scan kanban for blocked/failed tasks, report to christof.","enabled":True,"color":"#ff6b9d"}, {"id":"s2","agent":"researcher","title":"Daily AI research digest","recurrence":"daily","hour":9,"minute":0,"day":0,"prompt":"Search memory and knowledge for recent AI topics. Find the top 3 relevant developments for ki-fusion-labs projects. Store summary in memory (semantic tier). Send digest to christof via relay.","enabled":True,"color":"#0ea5e9"}, {"id":"s3","agent":"planner","title":"Sprint planning","recurrence":"weekly","hour":9,"minute":30,"day":0,"prompt":"Review all todo tasks in kanban. Prioritize by urgency and dependencies. Create a sprint plan for the week. Send summary to christof via relay.","enabled":True,"color":"#ff9500"}, {"id":"s4","agent":"coder","title":"Code quality check","recurrence":"weekly","hour":10,"minute":0,"day":2,"prompt":"List files in vault/code. Run basic linting on Python files. Report any issues to kanban as tasks. Store quality report in vault/reports.","enabled":True,"color":"#2ed573"}, {"id":"s5","agent":"monitor","title":"Evening task review","recurrence":"daily","hour":18,"minute":0,"day":0,"prompt":"Review today's kanban activity: tasks completed, blocked, or failed. Send end-of-day summary to christof. Archive completed tasks.","enabled":True,"color":"#ff6b9d"}, {"id":"s6","agent":"planner","title":"Weekly retrospective","recurrence":"weekly","hour":16,"minute":0,"day":4,"prompt":"Review the week: what was completed, what is blocked, what needs attention next week. Generate a markdown retrospective report and store in vault/reports. 
Send highlights to christof.","enabled":True,"color":"#ff9500"}, ] def seed(): if not AGENTS_FILE.exists(): save_json(AGENTS_FILE, DEFAULT_AGENTS) if not SCHEDULE_FILE.exists(): save_json(SCHEDULE_FILE, DEFAULT_SCHEDULE) if not ACTIVITY_FILE.exists(): save_json(ACTIVITY_FILE, []) seed() # ── FastAPI ─────────────────────────────────────────────────────── app = FastAPI(title="PULSE — Agent Nervous System") @app.on_event("startup") async def startup(): scheduler.start() reload_all_jobs() log.info("PULSE scheduler started") @app.on_event("shutdown") async def shutdown(): scheduler.shutdown(wait=False) def jresp(d, s=200): return JSONResponse(content=d, status_code=s) # ── Agent API ───────────────────────────────────────────────────── @app.get("/api/agents") async def list_agents(): agents = load_json(AGENTS_FILE, []) return jresp([{**a, "status": agent_status.get(a["name"],{"running":False,"tick_count":0})} for a in agents]) @app.post("/api/agents") async def upsert_agent(request: Request): data = await request.json() name = data.get("name","").strip().lower() if not name: raise HTTPException(400, "name required") agents = load_json(AGENTS_FILE, []) existing = next((i for i,a in enumerate(agents) if a["name"]==name), None) agent = {**(agents[existing] if existing is not None else {}), **data, "name": name} if existing is not None: agents[existing] = agent else: agents.append(agent) save_json(AGENTS_FILE, agents) register_agent_jobs(agent) return jresp({"status":"saved","agent":agent}) @app.delete("/api/agents/{name}") async def delete_agent(name: str): agents = load_json(AGENTS_FILE, []) agents = [a for a in agents if a["name"] != name] save_json(AGENTS_FILE, agents) for job in scheduler.get_jobs(): if job.id.startswith(f"hb_{name}"): job.remove() return jresp({"status":"deleted"}) @app.post("/api/agents/{name}/run") async def trigger_agent(name: str, request: Request): data = await request.json() trigger = data.get("trigger","manual") content = 
data.get("content","Manual trigger from PULSE UI") asyncio.create_task(agent_tick(name, trigger, content)) return jresp({"status":"triggered","agent":name}) @app.post("/api/trigger/{agent_name}") async def trigger_agent_shortcut(agent_name: str, request: Request): """Shortcut trigger — called by other agents after delegation. No auth needed.""" body = await request.json() if request.headers.get("content-type","").startswith("application/json") else {} content = body.get("content", f"Triggered by delegation from {body.get('from','system')}") agents_cfg = load_json(AGENTS_FILE, []) by_name = {a["name"]: a for a in agents_cfg} if agent_name not in by_name: return jresp({"error": f"unknown agent: {agent_name}"}, 404) asyncio.create_task(agent_tick(agent_name, "delegation", content)) log.info(f"[TRIGGER] {agent_name} woken by delegation: {content[:60]}") return jresp({"triggered": agent_name, "ts": datetime.now(timezone.utc).isoformat()}) @app.post("/api/reflect/{agent_name}") async def self_reflect(agent_name: str, request: Request): """Self-reflection delegated to agent-loop. 
PULSE triggers the cycle, loop orchestrates.""" try: import urllib.request as ureq body = json.dumps({"triggered_by": f"pulse:{agent_name}"}).encode() req = ureq.Request(f"{LOOP_URL}/api/cycle", data=body, headers={"Content-Type": "application/json"}, method="POST") with ureq.urlopen(req, timeout=5) as r: resp = json.loads(r.read()) emit_trace(agent_name, "self_reflect", {"delegated_to": "agent-loop"}) return jresp({"status": "delegated_to_loop", "agent": agent_name, "loop_response": resp, "note": "Self-improvement cycle started in agent-loop"}) except Exception as e: return jresp({"status": "loop_unreachable", "agent": agent_name, "error": str(e), "note": "agent-loop is not reachable — check LOOP_URL secret"}) @app.get("/api/reflect/proposals") async def reflection_proposals(): """Pending persona proposals — read from agent-loop.""" try: import urllib.request as ureq with ureq.urlopen(f"{LOOP_URL}/api/proposals?state=pending&limit=20", timeout=4) as r: data = json.loads(r.read()) return jresp(data) except Exception as e: return jresp({"error": str(e), "note": "Proposals now live in agent-loop"}) @app.post("/api/reflect/apply/{agent_name}") async def apply_reflection(agent_name: str, request: Request): """Approve a proposal — delegated to agent-loop.""" body = await request.json() proposal_id = body.get("proposal_id", "") if not proposal_id: return jresp({"error": "proposal_id required — get it from /api/reflect/proposals"}, 400) try: import urllib.request as ureq approve_body = json.dumps({"approved_by": "pulse"}).encode() req = ureq.Request(f"{LOOP_URL}/api/proposals/{proposal_id}/approve", data=approve_body, headers={"Content-Type":"application/json"}, method="POST") with ureq.urlopen(req, timeout=5) as r: resp = json.loads(r.read()) return jresp(resp) except Exception as e: return jresp({"error": str(e)}, 500) @app.get("/api/agents/{name}/traces") async def agent_traces(name: str, limit: int = 10): traces = [] for p in sorted((BASE/"traces").glob("*.json"), 
reverse=True)[:50]: try: t = json.loads(p.read_text()) if t.get("agent") == name: traces.append(t) if len(traces) >= limit: break except: pass return jresp(traces) @app.get("/api/agents/status/all") async def all_status(): return jresp(agent_status) # ── Schedule API ────────────────────────────────────────────────── @app.get("/api/schedule") async def list_schedule(): return jresp(load_json(SCHEDULE_FILE, [])) @app.post("/api/schedule") async def upsert_schedule(request: Request): data = await request.json() if not data.get("id"): data["id"] = uuid.uuid4().hex[:8] entries = load_json(SCHEDULE_FILE, []) existing = next((i for i,e in enumerate(entries) if e["id"]==data["id"]), None) if existing is not None: entries[existing] = data else: entries.append(data) save_json(SCHEDULE_FILE, entries) register_schedule_job(data) return jresp({"status":"saved","entry":data}) @app.delete("/api/schedule/{eid}") async def delete_schedule(eid: str): entries = load_json(SCHEDULE_FILE, []) entries = [e for e in entries if e["id"] != eid] save_json(SCHEDULE_FILE, entries) for job in scheduler.get_jobs(): if job.id == f"sch_{eid}": job.remove() return jresp({"status":"deleted"}) @app.post("/api/schedule/{eid}/run") async def run_now(eid: str): entries = load_json(SCHEDULE_FILE, []) entry = next((e for e in entries if e["id"]==eid), None) if not entry: raise HTTPException(404) asyncio.create_task(agent_tick(entry.get("agent",""), "manual_schedule", entry.get("prompt","scheduled"))) return jresp({"status":"triggered"}) # ── Activity + SSE ──────────────────────────────────────────────── @app.get("/api/activity") async def activity(limit: int = 50): return jresp(load_json(ACTIVITY_FILE, [])[:limit]) @app.get("/api/stats") async def api_stats_pulse(): """Basic stats for external health checks.""" agents = load_json(AGENTS_FILE, []) running = sum(1 for a in agents if agent_status.get(a["name"],{}).get("running")) ticks = sum(agent_status.get(a["name"],{}).get("tick_count",0) for a in 
agents) return jresp({ "ok": True, "version": "2.0.0", "agents": len(agents), "running": running, "total_ticks": ticks, "smolagents": SMOLAGENTS_OK, }) @app.get("/api/live") async def live_feed(): q = asyncio.Queue() live_queues.append(q) async def stream(): try: # Send recent activity on connect recent = load_json(ACTIVITY_FILE, [])[:5] for ev in reversed(recent): yield f"data: {json.dumps(ev)}\n\n" yield f"data: {json.dumps({'type':'connected','spaces':list(SPACES.keys())})}\n\n" while True: try: payload = await asyncio.wait_for(q.get(), timeout=25) yield f"data: {payload}\n\n" except asyncio.TimeoutError: yield f"data: {json.dumps({'type':'ping','ts':int(time.time())})}\n\n" finally: live_queues.remove(q) return StreamingResponse(stream(), media_type="text/event-stream", headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no"}) @app.get("/api/spaces/health") async def spaces_health(): results = {} async def check(name, url): # forge doesn't expose /api/stats — fall back to root health_path = "/api/stats" if name != "forge" else "/" try: async with httpx.AsyncClient(timeout=5) as c: r = await c.get(url + health_path) results[name] = {"ok": r.status_code < 400, "status": r.status_code, "url": url} except Exception as e: results[name] = {"ok": False, "error": str(e)[:60], "url": url} await asyncio.gather(*[check(n,u) for n,u in SPACES.items()]) return jresp(results) @app.get("/api/traces") async def all_traces(limit: int = 20): traces = [] for p in sorted((BASE/"traces").glob("*.json"), reverse=True)[:limit]: try: traces.append(json.loads(p.read_text())) except: pass return jresp(traces) @app.get("/api/llm/status") async def llm_status(): """Show which LLM providers are configured and their cooldown state.""" now = time.time() providers = [ { "name": "nexus", "label": "NEXUS (ki-fusion-labs RTX 5090)", "configured": True, "url": SPACES["nexus"], "priority": 1, "ok": _provider_ok("nexus"), "last_fail": _provider_failures.get("nexus"), "cooldown_remaining": 
max(0, int(_PROVIDER_COOLDOWN - (now - _provider_failures.get("nexus", 0)))), }, { "name": "anthropic", "label": "Anthropic claude-haiku", "configured": bool(ANTHROPIC_KEY), "priority": 2, "ok": _provider_ok("anthropic"), "last_fail": _provider_failures.get("anthropic"), "cooldown_remaining": max(0, int(_PROVIDER_COOLDOWN - (now - _provider_failures.get("anthropic", 0)))), }, { "name": "hf_inference", "label": f"HF Inference ({FALLBACK_HF_MODEL})", "configured": bool(HF_TOKEN), "priority": 3, "ok": _provider_ok("hf_inference"), "last_fail": _provider_failures.get("hf_inference"), "cooldown_remaining": max(0, int(_PROVIDER_COOLDOWN - (now - _provider_failures.get("hf_inference", 0)))), }, { "name": "local_cpu", "label": "NEXUS local_cpu (Qwen 0.5B, always available)", "configured": True, "priority": 4, "ok": True, # always try this last "last_fail": None, "cooldown_remaining": 0, "note": "slow ~10-30s, last resort", }, ] return jresp({"providers": providers, "cooldown_seconds": _PROVIDER_COOLDOWN}) @app.post("/api/llm/test") async def llm_test(request: Request): """Fire a minimal test call through the full chain and return which provider answered.""" data = await request.json() probe = data.get("prompt", "Reply with exactly: PULSE_OK") try: result = await call_llm( [{"role": "user", "content": probe}], system="You are a test probe. 
Follow instructions exactly.", max_tokens=20) return jresp({"ok": True, "response": result[:200]}) except Exception as e: return jresp({"ok": False, "error": str(e)}, 500) # ── MCP ────────────────────────────────────────────────────────── MCP_TOOLS = [ {"name":"pulse_trigger","description":"Trigger an agent to run its ReAct loop", "inputSchema":{"type":"object","required":["agent"],"properties":{ "agent":{"type":"string"},"content":{"type":"string"},"trigger":{"type":"string"}}}}, {"name":"pulse_schedule","description":"Add or update a schedule entry", "inputSchema":{"type":"object","required":["agent","title"],"properties":{ "agent":{"type":"string"},"title":{"type":"string"}, "recurrence":{"type":"string","enum":["daily","weekly","once"]}, "hour":{"type":"integer"},"minute":{"type":"integer"},"day":{"type":"integer"}, "prompt":{"type":"string"},"enabled":{"type":"boolean"}}}}, {"name":"pulse_status","description":"Get status of all agents", "inputSchema":{"type":"object","properties":{}}}, ] async def mcp_call(name, args): if name == "pulse_trigger": asyncio.create_task(agent_tick(args["agent"], args.get("trigger","mcp"), args.get("content",""))) return json.dumps({"triggered": args["agent"]}) if name == "pulse_schedule": if not args.get("id"): args["id"] = uuid.uuid4().hex[:8] entries = load_json(SCHEDULE_FILE, []) entries.append(args); save_json(SCHEDULE_FILE, entries) register_schedule_job(args) return json.dumps({"scheduled": args["id"]}) if name == "pulse_status": return json.dumps(agent_status) return json.dumps({"error": f"unknown: {name}"}) @app.get("/mcp/sse") async def mcp_sse(): async def stream(): init = {"jsonrpc":"2.0","method":"notifications/initialized", "params":{"serverInfo":{"name":"pulse","version":"1.0"},"capabilities":{"tools":{}}}} yield f"data: {json.dumps(init)}\n\n" await asyncio.sleep(0.1) yield f"data: {json.dumps({'jsonrpc':'2.0','method':'notifications/tools/list_changed','params':{}})}\n\n" while True: await asyncio.sleep(25) yield 
f"data: {json.dumps({'jsonrpc':'2.0','method':'ping'})}\n\n" return StreamingResponse(stream(), media_type="text/event-stream", headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no"}) @app.post("/mcp") async def mcp_rpc(request: Request): body = await request.json(); method = body.get("method",""); rid = body.get("id",1) if method == "initialize": return jresp({"jsonrpc":"2.0","id":rid,"result":{"serverInfo":{"name":"pulse","version":"1.0"},"capabilities":{"tools":{}}}}) if method == "tools/list": return jresp({"jsonrpc":"2.0","id":rid,"result":{"tools":MCP_TOOLS}}) if method == "tools/call": p = body.get("params",{}); res = await mcp_call(p.get("name",""), p.get("arguments",{})) return jresp({"jsonrpc":"2.0","id":rid,"result":{"content":[{"type":"text","text":res}]}}) return jresp({"jsonrpc":"2.0","id":rid,"error":{"code":-32601,"message":"not found"}}) @app.get("/", response_class=HTMLResponse) async def ui(): return HTMLResponse(content=SPA, media_type="text/html; charset=utf-8") SPA = r""" PULSE — Agent Nervous System PULSE Agent Nervous System — ki-fusion-labs.de 0AGENTS 0RUNNING 0SCHEDULED 0TICKS TODAY 📅 Timetable 🤖 Agents ⚡ Live 🌐 Spaces Active Agents Connected Spaces ‹ Week › Today + Schedule UTC + New Agent RECENT TRACES TRACE ✕ MODAL ✕ """