""" NOC Agent — Inference Script ============================= Runs all 3 incident tasks sequentially, one Docker container per task. Prints a final summary table after all tasks complete. Required environment variables: API_BASE_URL LLM API endpoint (e.g. https://router.huggingface.co/v1) MODEL_NAME Model identifier (e.g. Qwen/Qwen2.5-72B-Instruct) HF_TOKEN API key for the above endpoint IMAGE_NAME Docker image name (e.g. noc_agent) Optional: TASK_NAME Run a single task instead of all 3 cpu_overload | memory_leak | network_congestion Stdout format (mandatory per task): [START] task= env=noc_agent model= [STEP] step= action= reward=<0.00> done= error= [END] success= steps= score=<0.000> rewards= """ from __future__ import annotations import asyncio import os import sys import textwrap from pathlib import Path from typing import List, Optional from dotenv import load_dotenv from openai import OpenAI load_dotenv(Path(__file__).parent / ".env") # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1" MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct" API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN") IMAGE_NAME = os.getenv("IMAGE_NAME", "openenv-noc_agent") BENCHMARK = os.getenv("NOC_AGENT_BENCHMARK", "noc_agent") # If set, run only that task; otherwise run all three in order (easy → hard) _SINGLE_TASK: Optional[str] = os.getenv("NOC_AGENT_TASK") ALL_TASKS: list[str] = [ "network_congestion", # easy "memory_leak", # medium "cpu_overload", # hard ] MAX_STEPS: int = 30 TEMPERATURE: float = 0.2 MAX_TOKENS: int = 20 SUCCESS_SCORE_THRESHOLD: float = 0.5 MAX_TOTAL_REWARD: float = 25.0 VALID_ACTIONS: list[str] = [ "do_nothing", "restart_service", "throttle_cpu", "clear_cache", "reroute_traffic", "scale_up", ] # --------------------------------------------------------------------------- # Mandatory stdout logging # --------------------------------------------------------------------------- def log_start(task: str, env: str, model: str) -> None: print(f"[START] task={task} env={env} model={model}", flush=True) def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None: print( f"[STEP] step={step} action={action} reward={reward:.2f} " f"done={str(done).lower()} error={error or 'null'}", flush=True, ) def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None: rewards_str = ",".join(f"{r:.2f}" for r in rewards) print( f"[END] success={str(success).lower()} steps={steps} " f"score={score:.3f} rewards={rewards_str}", flush=True, ) # --------------------------------------------------------------------------- # LLM helpers # --------------------------------------------------------------------------- SYSTEM_PROMPT = textwrap.dedent("""\ You are an autonomous NOC (Network Operations Centre) engineer. You observe live system telemetry from a Linux server under an active incident. Your job: choose the single best remediation action to resolve the incident as quickly as possible. Available actions (output EXACTLY one, lowercase, underscore-separated): do_nothing — wait and observe restart_service — restart the degraded service process throttle_cpu — reduce CPU allocation of runaway processes clear_cache — flush application/OS cache to reclaim memory reroute_traffic — redirect traffic to alternate network paths scale_up — add compute/bandwidth capacity Rules: - Output ONLY the action name. No explanation, no punctuation, no other text. - Match actions to the dominant symptom: high CPU → throttle_cpu, scale_up high memory / OOM → clear_cache, restart_service high latency / loss → reroute_traffic, scale_up service down → restart_service - Avoid do_nothing unless all metrics are near-healthy. """).strip() def _build_user_prompt( step: int, incident: str, metrics: dict, last_reward: float, history: List[str], ) -> str: history_block = "\n".join(history[-5:]) if history else " (none)" return textwrap.dedent(f"""\ Step: {step} Incident type: {incident.replace('_', ' ').upper()} Current system metrics: CPU usage : {metrics['cpu_usage']:.1%} Memory usage : {metrics['memory_usage']:.1%} Latency : {metrics.get('latency_ms', 0):.0f} ms Packet loss : {metrics['packet_loss']:.1%} Service health: {metrics['service_healthy']:.1%} Error rate : {metrics['error_rate']:.1%} Last reward: {last_reward:+.3f} Recent actions: {history_block} Choose your action: """).strip() def _parse_action(raw: str) -> str: cleaned = raw.strip().lower().replace("-", "_").split()[0] if raw.strip() else "" if cleaned in VALID_ACTIONS: return cleaned for action in VALID_ACTIONS: if action.startswith(cleaned) or cleaned in action: return action print(f"[DEBUG] Unrecognised action {raw!r} — defaulting to do_nothing", flush=True) return "do_nothing" def get_llm_action( client: OpenAI, step: int, incident: str, metrics: dict, last_reward: float, history: List[str], ) -> str: prompt = _build_user_prompt(step, incident, metrics, last_reward, history) try: completion = client.chat.completions.create( model=MODEL_NAME, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ], temperature=TEMPERATURE, max_tokens=MAX_TOKENS, stream=False, ) raw = (completion.choices[0].message.content or "").strip() return _parse_action(raw) except Exception as exc: print(f"[DEBUG] LLM call failed at step {step}: {exc}", flush=True) return "do_nothing" # --------------------------------------------------------------------------- # Episode runner # --------------------------------------------------------------------------- async def run_episode(task: str, client: OpenAI) -> dict: """ Run one full episode for the given task. Spins up a fresh Docker container, runs the agent, tears it down. Returns a result dict for the summary table. """ from noc_agent.client import NocAgentEnvClient from noc_agent.models import ActionType, NOCAction rewards: List[float] = [] history: List[str] = [] steps_taken: int = 0 score: float = 0.0 success: bool = False log_start(task=task, env=BENCHMARK, model=MODEL_NAME) env = await NocAgentEnvClient.from_docker_image(IMAGE_NAME) try: # Pin exact incident type via reset kwargs (server forwards to reset(incident_type=...)) result = await env.reset(incident_type=task) incident_type: str = result.observation.incident_type obs = result.observation metrics = obs.metrics.model_dump() metrics["latency_ms"] = metrics.pop("latency", 0) * 500 last_reward = 0.0 for step in range(1, MAX_STEPS + 1): if result.done: break action_str = get_llm_action(client, step, incident_type, metrics, last_reward, history) action_type = ActionType(action_str) result = await env.step(NOCAction(action_type=action_type)) obs = result.observation reward = result.reward or 0.0 done = result.done metrics = obs.metrics.model_dump() metrics["latency_ms"] = metrics.pop("latency", 0) * 500 last_reward = reward rewards.append(reward) steps_taken = step history.append(f"Step {step}: {action_str} → reward {reward:+.3f}") log_step(step=step, action=action_str, reward=reward, done=done, error=None) if done: break raw_score = sum(rewards) / MAX_TOTAL_REWARD if rewards else 0.0 score = min(max(raw_score, 0.0), 1.0) success = score >= SUCCESS_SCORE_THRESHOLD except Exception as exc: print(f"[DEBUG] Episode error ({task}): {exc}", flush=True) finally: try: await env.close() except Exception as e: print(f"[DEBUG] env.close() error: {e}", flush=True) log_end(success=success, steps=steps_taken, score=score, rewards=rewards) return {"task": task, "success": success, "steps": steps_taken, "score": score} # --------------------------------------------------------------------------- # Main — run all tasks sequentially, print summary # --------------------------------------------------------------------------- async def main() -> None: # Ensure noc_agent package is importable sys.path.insert(0, str(Path(__file__).parent)) tasks = [_SINGLE_TASK] if _SINGLE_TASK else ALL_TASKS client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) results: list[dict] = [] for task in tasks: print(f"\n{'='*60}", flush=True) print(f"[INFO] Starting task: {task}", flush=True) print(f"{'='*60}", flush=True) result = await run_episode(task=task, client=client) results.append(result) # Summary table print(f"\n{'='*60}", flush=True) print("[SUMMARY]", flush=True) print(f"{'Task':<25} {'Success':<10} {'Steps':<8} {'Score'}", flush=True) print(f"{'-'*25} {'-'*10} {'-'*8} {'-'*6}", flush=True) for r in results: print( f"{r['task']:<25} {str(r['success']).lower():<10} {r['steps']:<8} {r['score']:.3f}", flush=True, ) overall = sum(r["score"] for r in results) / len(results) if results else 0.0 print(f"\n[SUMMARY] overall_score={overall:.3f}", flush=True) print(f"{'='*60}", flush=True) if __name__ == "__main__": asyncio.run(main())