Spaces:

Vector11187u
/

NEXON

Sleeping

App Files Community

ashishMenon05 committed on Apr 11

Commit

fd5d7f9

1 Parent(s): 3168d77

fix: full resubmission patch - fix [STEP] format, add close(), expose system_state, fix /state endpoint, improve reward variance

Browse files

Files changed (4) hide show

backend/api/routes/openenv.py +5 -3
backend/core/environment.py +9 -3
backend/core/reward_engine.py +114 -86
inference.py +164 -112

backend/api/routes/openenv.py CHANGED Viewed

@@ -150,11 +150,13 @@ async def step_env(action: NexusAction):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
-@router.get("/state", response_model=NexusState)
 def get_state():
     state = episode_manager.env.state()
-    if not state:
-        raise HTTPException(status_code=400, detail="No active episode")
     return state
 @router.get("/telemetry")

     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+@router.get("/state")
 def get_state():
+    """Return the current episode state as a JSON-serialisable payload.
+
+    Objects exposing ``model_dump`` (pydantic models) are converted to plain
+    dicts; any other value returned by ``env.state()`` is passed through
+    unchanged. No ``response_model`` is declared on the route, so the two
+    payload shapes are not validated against a common schema.
+    """
     state = episode_manager.env.state()
+    # state() is expected to always return something here: either a NexusState
+    # pydantic object or an idle dict (TODO confirm against NexusEnvironment.state()).
+    if hasattr(state, "model_dump"):
+        return state.model_dump()
     return state
 @router.get("/telemetry")

backend/core/environment.py CHANGED Viewed

@@ -71,7 +71,7 @@ class NexusEnvironment:
         obs = NexusObservation(
             partner_message="",
             tool_results=[],
-            system_state={},
             investigation_stage="investigating",
             round=1,
             available_tools=available_tools,
@@ -143,7 +143,7 @@ class NexusEnvironment:
         obs = NexusObservation(
             partner_message=action.message,
             tool_results=tool_results_objs,
-            system_state={"total_tools_run": len(ep.tool_calls_made)},
             investigation_stage=ep.investigation_stage,
             round=ep.current_round,
             available_tools=SSH_TOOLS if settings.EXECUTION_MODE == "ssh" else SIMULATED_TOOLS,
@@ -156,5 +156,11 @@ class NexusEnvironment:
     def state(self):
         if not self.active_episode:
-            return None
         return self.active_episode.to_pydantic()

         obs = NexusObservation(
             partner_message="",
             tool_results=[],
+            system_state=self.active_episode.system_state,  # Expose real state so agent sees initial conditions
             investigation_stage="investigating",
             round=1,
             available_tools=available_tools,
         obs = NexusObservation(
             partner_message=action.message,
             tool_results=tool_results_objs,
+            system_state=ep.system_state,  # Return real mutated state so agent sees the effect of its actions
             investigation_stage=ep.investigation_stage,
             round=ep.current_round,
             available_tools=SSH_TOOLS if settings.EXECUTION_MODE == "ssh" else SIMULATED_TOOLS,
     def state(self):
+        """Return the active episode's state, or an idle placeholder.
+
+        Returns:
+            ``self.active_episode.to_pydantic()`` when an episode is active;
+            otherwise a plain dict with ``status`` and ``message`` keys.
+            Callers therefore have to handle both a model and a dict.
+        """
         if not self.active_episode:
+            # Return a valid default state so the /state endpoint always responds
+            return {"status": "idle", "message": "No active episode. Call /reset to start."}
         return self.active_episode.to_pydantic()
+    async def close(self):
+        """Drop the active episode and scenario. Required by OpenEnv spec."""
+        # Both references are cleared together; nothing is awaited here.
+        self.active_episode = self.active_scenario = None

backend/core/reward_engine.py CHANGED Viewed

@@ -3,115 +3,143 @@ import logging
 logger = logging.getLogger("nexus.reward_engine")
 def compute_reward(message: str, tool_calls: list, tool_results: list, episode_state, scenario: dict) -> tuple[float, dict]:
     breakdown = {}
     msg_lower = message.lower()
     ep = episode_state
     sc = scenario
-    # 1. HYPOTHESIS SPECIFICITY (0.0-0.20)
-    specificity_indicators = ["shows", "value", "config", "log", "found", "confirmed",
-                               "set to", "equals", "returns", "indicates", "trace", "root cause"]
-    breakdown['specificity'] = min(0.20,
-        sum(0.025 for word in specificity_indicators if word in msg_lower)
-    )
-    # 2. TOOL EXECUTION SUCCESS (0.0-0.25)
     tool_score = 0.0
     if tool_calls:
-        new_tools = 0
         for t in tool_calls:
             sig = f"{t.tool_name}:{str(t.params)}"
             if sig not in ep.previous_tool_calls:
-                new_tools += 1
-        # Reward for using different tool categories
-        tool_categories = set()
-        for tc in tool_calls:
-            if tc.tool_name in ["read_logs", "check_config", "query_database", "check_service_status"]:
-                tool_categories.add("investigation")
-            elif tc.tool_name in ["update_config", "restart_service"]:
-                tool_categories.add("fix_action")
-            elif tc.tool_name in ["propose_fix", "verify_fix"]:
-                tool_categories.add("resolution")
-        tool_score = min(0.25, len(tool_categories) * 0.08 + new_tools * 0.05)
-    breakdown['tool_usage'] = tool_score
-    # 3. TOOL RESULT QUALITY (0.0-0.15)
     result_score = 0.0
     for tr in tool_results:
-        result_text = tr.get('result', '').lower() if isinstance(tr, dict) else str(tr).lower()
-        # Positive signals in tool results
-        if any(kw in result_text for kw in ['error', 'fail', 'degraded', 'anomaly', 'threshold']):
-            result_score += 0.03  # Found something useful
-        if any(kw in result_text for kw in ['rate_limit', 'nginx', 'config', 'timeout', 'connection']):
-            result_score += 0.02  # Found relevant clue
-        if 'success' in result_text or 'running' in result_text or 'healthy' in result_text:
-            result_score += 0.01  # Status info
-    breakdown['result_quality'] = min(0.15, result_score)
-    # 4. CLUE DISCOVERY (0.0-0.20)
-    clue_score = 0.0
-    if hasattr(ep, 'clues_found') and ep.clues_found:
-        clue_score = min(0.20, len(ep.clues_found) * 0.05)
-    breakdown['clue_discovery'] = clue_score
-    # 5. INVESTIGATION STAGE PROGRESS (0.0-0.15)
-    stage_score = 0.0
-    if hasattr(ep, 'investigation_stage'):
-        stage_map = {'investigating': 0.02, 'narrowing': 0.08, 'hypothesizing': 0.12, 'found': 0.15, 'verified': 0.15}
-        stage_score = stage_map.get(ep.investigation_stage, 0.02)
-    breakdown['stage_progress'] = stage_score
-    # 6. SEMANTIC SIMILARITY TO ROOT CAUSE (0.0-0.10)
     similarity_score = 0.0
     try:
-        root_cause_desc = scenario.get('root_cause', {}).get('description', '')
-        if root_cause_desc:
             msg_emb = get_embedding(message)
             rc_emb = get_embedding(root_cause_desc)
-            sim = cos_sim(msg_emb, rc_emb)
-            # Only reward if embedding is not fallback (has meaningful variance)
-            if len(msg_emb) == 384 and sum(msg_emb) != 0:
-                similarity_score = min(0.10, sim * 0.15)
-    except:
         pass
-    breakdown['semantic_similarity'] = similarity_score
-    # 7. NOVELTY BONUS (0.0-0.05)
-    novelty_score = 0.0
-    if hasattr(ep, 'all_messages') and ep.all_messages:
-        try:
             msg_emb = get_embedding(message)
-            max_sim = 0
             for prev in ep.all_messages[-3:]:
                 prev_emb = get_embedding(prev)
                 sim = cos_sim(msg_emb, prev_emb)
-                max_sim = max(max_sim, sim)
-            novelty_score = max(0.0, 0.05 * (1 - max_sim))
-        except:
-            novelty_score = 0.03
-    else:
-        novelty_score = 0.05
-    breakdown['novelty'] = novelty_score
-    # PENALTIES
     penalty = 0.0
-    msg_len = len(message.split())
-    if msg_len < 5:
-        penalty += 0.10  # Too terse
-    if len(message) > 2000:
-        penalty += 0.05  # Too verbose without action
-    if breakdown['novelty'] < 0.01:
-        penalty += 0.10  # Circular/duplicate message
     total = sum(breakdown.values()) - penalty
     final_score = round(max(0.0, min(1.0, total)), 4)
     ep.reward_history.append(final_score)
-    ep.cumulative_reward += final_score
     return final_score, breakdown

 logger = logging.getLogger("nexus.reward_engine")
+# Root-cause keywords per difficulty — pre-defined for fast matching.
+# Keys are the episode's ``difficulty`` value; each hint is matched as a plain
+# substring against lower-cased text (the agent message and tool results), so
+# hints must themselves be lower-case.
+DIFFICULTY_ROOT_CAUSE_HINTS = {
+    "easy":   ["rate_limit", "nginx", "rate limit", "429", "proxy", "throttle"],
+    "medium": ["approval", "process", "workflow", "sla", "escalation", "manual"],
+    "hard":   ["postgres", "connection pool", "long_running_query", "max_connections", "deadlock", "timeout"],
+}
 def compute_reward(message: str, tool_calls: list, tool_results: list, episode_state, scenario: dict) -> tuple[float, dict]:
+    """Score a single agent turn and record the result on the episode.
+
+    Seven components are written into ``breakdown`` (hypothesis quality, tool
+    usage, result quality, clue discovery, stage progress, semantic similarity,
+    novelty). Penalties are subtracted from their sum and the total is clamped
+    to [0.0, 1.0] and rounded to 4 decimals.
+
+    Args:
+        message: The agent's free-text reasoning for this turn.
+        tool_calls: Objects exposing ``tool_name`` and ``params``.
+        tool_results: Dicts with a ``"result"`` entry; anything else is scored
+            on its ``str()`` form.
+        episode_state: Mutated in place — the score is appended to
+            ``reward_history`` and added to ``cumulative_reward``.
+        scenario: Read for ``root_cause.description`` and ``red_herrings``.
+
+    Returns:
+        ``(final_score, breakdown)``; penalties are not part of ``breakdown``.
+    """
     breakdown = {}
     msg_lower = message.lower()
     ep = episode_state
     sc = scenario
+    difficulty = getattr(ep, "difficulty", "easy")
+    # 1. HYPOTHESIS QUALITY — Reward specificity and domain alignment (0.0-0.20)
+    #    Check if message mentions domain-specific terms relevant to this difficulty
+    domain_hints = DIFFICULTY_ROOT_CAUSE_HINTS.get(difficulty, [])
+    domain_hits = sum(1 for hint in domain_hints if hint in msg_lower)
+    # General specificity — mentions numbers, config keys, service names
+    generic_specificity = sum(0.01 for word in ["set to", "equals", "config", "found", "confirmed", "root cause",
+                                                  "value", "log", "trace", "indicates", "returns"] if word in msg_lower)
+    breakdown["hypothesis_quality"] = min(0.20, domain_hits * 0.04 + generic_specificity)
+    # 2. TOOL USAGE QUALITY — Correct tools, no repeating same call (0.0-0.25)
     tool_score = 0.0
     if tool_calls:
+        tool_categories = set()
+        new_calls = 0
         for t in tool_calls:
             sig = f"{t.tool_name}:{str(t.params)}"
             if sig not in ep.previous_tool_calls:
+                new_calls += 1
+            if t.tool_name in ["read_logs", "check_config", "query_database", "check_service_status", "run_diagnostic"]:
+                tool_categories.add("investigate")
+            elif t.tool_name in ["update_config", "restart_service"]:
+                tool_categories.add("fix")
+            elif t.tool_name in ["propose_fix", "verify_fix", "submit_resolution"]:
+                tool_categories.add("resolve")
+        # Reward for covering investigation before jumping to fixes
+        stage_coverage = len(tool_categories)
+        tool_score = min(0.25, stage_coverage * 0.07 + new_calls * 0.04)
+    breakdown["tool_usage"] = tool_score
+    # 3. TOOL RESULT QUALITY — Did the tools find actionable info? (0.0-0.15)
     result_score = 0.0
     for tr in tool_results:
+        result_text = tr.get("result", "").lower() if isinstance(tr, dict) else str(tr).lower()
+        if any(kw in result_text for kw in ["error", "fail", "degraded", "anomaly", "threshold", "critical"]):
+            result_score += 0.04  # Found a symptom
+        if any(hint in result_text for hint in domain_hints):
+            result_score += 0.05  # Found a domain-specific clue
+        if "success" in result_text or "fixed" in result_text:
+            result_score += 0.02  # Fix confirmed by tool
+    breakdown["result_quality"] = min(0.15, result_score)
+    # 4. CLUE ACCUMULATION — Discovering new clues (0.0-0.15)
+    #    0.04 per clue, so the 0.15 cap is reached at 4 clues; further clues add
+    #    nothing, which limits reward hacking via tool spam
+    if hasattr(ep, "clues_found") and ep.clues_found:
+        breakdown["clue_discovery"] = min(0.15, len(ep.clues_found) * 0.04)
+    else:
+        breakdown["clue_discovery"] = 0.0
+    # 5. INVESTIGATION STAGE PROGRESSION — Rewards forward momentum (0.0-0.15)
+    stage_map = {
+        "investigating": 0.03,
+        "narrowing":     0.08,
+        "hypothesizing": 0.12,
+        "found":         0.15,
+        "verified":      0.15,
+    }
+    breakdown["stage_progress"] = stage_map.get(getattr(ep, "investigation_stage", "investigating"), 0.03)
+    # 6. SEMANTIC SIMILARITY TO ROOT CAUSE — Only if embeddings are available (0.0-0.15)
+    #    More weight than before — this is the real quality signal
     similarity_score = 0.0
     try:
+        root_cause_desc = sc.get("root_cause", {}).get("description", "")
+        if root_cause_desc and message.strip():
             msg_emb = get_embedding(message)
             rc_emb = get_embedding(root_cause_desc)
+            # Only reward if not using the zero-variance fallback embedding
+            if len(msg_emb) == 384 and abs(sum(msg_emb)) > 0.001:
+                sim = cos_sim(msg_emb, rc_emb)
+                # Cosine similarity can be negative; clamp so this component
+                # never drops below its documented 0.0 floor
+                similarity_score = max(0.0, min(0.15, sim * 0.20))
+    except Exception:
+        # Best-effort: a failing embedding backend must not break scoring,
+        # but leave a trace so the failure is diagnosable
+        logger.debug("semantic similarity skipped", exc_info=True)
+    breakdown["semantic_similarity"] = similarity_score
+    # 7. NOVELTY — Penalize circular/repetitive reasoning (0.0-0.05)
+    novelty_score = 0.05  # Start assuming novel
+    try:
+        # NOTE(review): assumes ep.all_messages does not yet contain the current
+        # message — otherwise max_sim is ~1.0 on every turn. Confirm with caller.
+        if hasattr(ep, "all_messages") and len(ep.all_messages) > 1:
             msg_emb = get_embedding(message)
+            max_sim = 0.0
             for prev in ep.all_messages[-3:]:
                 prev_emb = get_embedding(prev)
                 sim = cos_sim(msg_emb, prev_emb)
+                if sim > max_sim:
+                    max_sim = sim
+            novelty_score = max(0.0, 0.05 * (1.0 - max_sim))
+    except Exception:
+        logger.debug("novelty check failed; using neutral score", exc_info=True)
+        novelty_score = 0.03
+    breakdown["novelty"] = novelty_score
+    # ── PENALTIES ─────────────────────────────────────────────────────────────
     penalty = 0.0
+    # Too terse: no useful reasoning
+    if len(message.split()) < 8:
+        penalty += 0.10
+    # Too verbose without any tool calls: wall of text, no action
+    if len(message) > 1200 and not tool_calls:
+        penalty += 0.05
+    # Circular reasoning: nearly identical to a recent message
+    if breakdown["novelty"] < 0.005:
+        penalty += 0.12
+    # Wrong domain: confidently blaming the wrong service
+    # Check if agent blames a red-herring service mentioned in scenario
+    # NOTE(review): the negation test is a raw substring check, so any "not"
+    # anywhere in the message (including inside "cannot" or "notification")
+    # suppresses this penalty.
+    red_herrings = sc.get("red_herrings", [])
+    if red_herrings:
+        for rh in red_herrings:
+            rh_lower = str(rh).lower()
+            if rh_lower in msg_lower and "not" not in msg_lower:
+                penalty += 0.05  # Fell for the red herring
+                break
     total = sum(breakdown.values()) - penalty
     final_score = round(max(0.0, min(1.0, total)), 4)
     ep.reward_history.append(final_score)
+    ep.cumulative_reward = round(ep.cumulative_reward + final_score, 4)
     return final_score, breakdown

inference.py CHANGED Viewed

@@ -5,7 +5,6 @@ NEXUS Inference Script — OpenEnv Competition Submission
 import os
 import sys
-import asyncio
 import re
 from pathlib import Path
@@ -13,140 +12,193 @@ ROOT = Path(__file__).resolve().parent
 sys.path.insert(0, str(ROOT))
 sys.path.insert(0, str(ROOT / "backend"))
-from dotenv import load_dotenv
-if (ROOT / ".env").exists():
-    load_dotenv(ROOT / ".env", override=True)
-elif (ROOT / "backend" / ".env").exists():
-    load_dotenv(ROOT / "backend" / ".env", override=True)
-# Fallback for defaults, will NOT override the .env we just loaded
-load_dotenv(ROOT / "default.env", override=False)
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
-MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-if "API_BASE_URL" not in os.environ or not os.environ["API_BASE_URL"]:
-    os.environ["API_BASE_URL"] = API_BASE_URL
-if "API_KEY" not in os.environ or not os.environ["API_KEY"]:
-    os.environ["API_KEY"] = "none"
-# The client should NOT be initialized here at the module level.
-# If the evaluator imports this file before patching os.environ, it will permanently bind to fallbacks.
 from backend.core.environment import NexusEnvironment
 from backend.api.schemas.action import NexusAction, ToolCall
-def parse_tool_calls(text: str) -> list:
-    tool_calls = []
-    for match in re.finditer(r"TOOL:\s*([a-zA-Z0-9_]+)\(([^)]*)\)", text):
-        name = match.group(1)
-        args_s = match.group(2)
-        params = {}
-        for kv in re.finditer(r"(\w+)=['\"]?([^,'\"]+)['\"]?", args_s):
-            params[kv.group(1)] = kv.group(2)
-        tool_calls.append(ToolCall(tool_name=name, params=params))
-    return tool_calls
 TASKS = [
-    {"name": "software-incident", "difficulty": "easy"},
-    {"name": "business-process-failure", "difficulty": "medium"},
-    {"name": "cascade-system-failure", "difficulty": "hard"},
 ]
 SYSTEM_PROMPT = (
     "You are an expert incident investigator. "
-    "Format tool calls as: TOOL: tool_name(param='value') "
     "Available tools: read_logs, check_config, query_database, check_service_status, "
-    "propose_fix, verify_fix"
 )
-MAX_STEPS = int(os.environ.get("MAX_STEPS", "8"))
 def _print(line: str):
     print(line, flush=True)
-async def run():
-    # Initialize client dynamically at runtime to correctly capture evaluator's patched os.environ
-    from openai import OpenAI
-    client = OpenAI(base_url=os.environ["API_BASE_URL"], api_key=os.environ["API_KEY"])
-    try:
-        env = NexusEnvironment()
-        for task in TASKS:
-            _print(f"[START] task={task['name']} env=nexus-incident-investigation model={MODEL_NAME}")
             try:
-                obs = await env.reset(task=task["name"], seed=42)
-            except Exception as e:
-                _print(f"[STEP] step=1 error=\"reset failed: {str(e)[:100]}\"")
-                continue
-            messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-            done = False
-            step_n = 0
-            rewards = []
-            while not done and step_n < MAX_STEPS:
-                step_n += 1
-                user_content = (
-                    f"Scenario: {obs.scenario_description}\n"
-                    f"Context: {obs.scenario_context}\n"
-                    f"Round {obs.round}. Investigate and call tools."
-                )
-                messages.append({"role": "user", "content": user_content})
-                action_text = ""
-                try:
-                    resp = client.chat.completions.create(
-                        model=MODEL_NAME,
-                        messages=messages,
-                        max_tokens=300,
-                        temperature=0.7,
-                        timeout=120.0
-                    )
-                    action_text = resp.choices[0].message.content or ""
-                except Exception as e:
-                    _print(f"[STEP] step={step_n} error=\"{str(e)[:100]}\"")
-                    break
-                messages.append({"role": "assistant", "content": action_text})
-                tool_calls = parse_tool_calls(action_text)
-                action = NexusAction(
-                    agent_id="agent_a",
-                    message=action_text,
-                    tool_calls=tool_calls,
-                    confidence=0.8
-                )
-                try:
-                    obs, reward, done, info = await env.step(action)
-                except Exception as e:
-                    _print(f"[STEP] step={step_n} error=\"step failed: {str(e)[:100]}\"")
-                    break
-                rewards.append(reward)
-                clean = action_text.replace("\n", " ")[:200]
-                _print(
-                    f'[STEP] step={step_n} action="{clean}" '
-                    f'reward={reward:.2f} done={str(done).lower()} error=null'
                 )
-            final_score = info.get("final_score", rewards[-1] if rewards else 0.0) if 'info' in dir() else 0.0
-            success = final_score >= 0.5
-            rewards_str = ",".join(f"{r:.2f}" for r in rewards)
             _print(
-                f"[END] success={str(success).lower()} steps={step_n} "
-                f"score={final_score:.3f} rewards={rewards_str}"
             )
-    except Exception as e:
-        _print(f"[ERROR] {str(e)}")
-        sys.exit(1)
 if __name__ == "__main__":
-    asyncio.run(run())

 import os
 import sys
 import re
 from pathlib import Path
 sys.path.insert(0, str(ROOT))
 sys.path.insert(0, str(ROOT / "backend"))
+# ── Environment Variables (spec-required) ──────────────────────────────────────
+# These are read once, at import time: values patched into os.environ after
+# this module has been imported are not picked up.
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME   = os.getenv("MODEL_NAME",   "meta-llama/Llama-3.1-8B-Instruct")
+HF_TOKEN     = os.getenv("HF_TOKEN")
+# Fail fast: importing this module without HF_TOKEN set raises immediately.
+if HF_TOKEN is None:
+    raise ValueError("HF_TOKEN environment variable is required")
+# An explicit API_KEY wins; otherwise the HF token doubles as the API key.
+API_KEY = os.getenv("API_KEY", HF_TOKEN)
+# Import AFTER path setup
+from openai import OpenAI  # sync client — matches spec example exactly
 from backend.core.environment import NexusEnvironment
 from backend.api.schemas.action import NexusAction, ToolCall
+# ── Config ─────────────────────────────────────────────────────────────────────
+MAX_STEPS = int(os.environ.get("MAX_STEPS", "8"))
 TASKS = [
+    {"name": "software-incident",         "difficulty": "easy"},
+    {"name": "business-process-failure",  "difficulty": "medium"},
+    {"name": "cascade-system-failure",    "difficulty": "hard"},
 ]
 SYSTEM_PROMPT = (
     "You are an expert incident investigator. "
+    "Your goal is to identify the root cause of system incidents and apply the correct fix. "
+    "You have access to these tools — call them by writing: TOOL: tool_name(param='value')\n"
     "Available tools: read_logs, check_config, query_database, check_service_status, "
+    "update_config, restart_service, propose_fix, verify_fix, submit_resolution\n\n"
+    "Strategy:\n"
+    "1. Use read_logs and check_service_status to gather evidence.\n"
+    "2. Use update_config or restart_service to apply your fix.\n"
+    "3. Use verify_fix to confirm the fix worked.\n"
+    "4. Call submit_resolution with root_cause_service, root_cause_description, and fix_applied.\n"
+    "After each tool result, update your hypothesis. The system state shown to you reflects real changes."
 )
+# ── Helpers ────────────────────────────────────────────────────────────────────
 def _print(line: str):
     print(line, flush=True)
+def _safe_action(text: str) -> str:
+    """Flatten *text* to one line for the [STEP] action field (emitted unquoted).
+
+    Newlines become spaces, carriage returns are dropped, surrounding
+    whitespace is trimmed and the result is cut to 300 characters.
+    """
+    single_line = text.translate({ord("\n"): " ", ord("\r"): None})
+    trimmed = single_line.strip()
+    return trimmed[:300]
+def _safe_error(error: str) -> str:
+    """Format an error for the [STEP] error field — raw single-line text, or null.
+
+    Newlines become spaces and carriage returns are dropped (matching
+    ``_safe_action``) so the value can never break the one-line record. Empty,
+    ``None`` or whitespace-only input yields the literal ``null``.
+    """
+    if not error:
+        return "null"
+    cleaned = error.replace("\n", " ").replace("\r", "").strip()[:200]
+    # Whitespace-only input would otherwise emit an empty "error=" field
+    return cleaned or "null"
+def parse_tool_calls(text: str) -> list[ToolCall]:
+    """Extract every ``TOOL: name(key='value', ...)`` invocation from *text*.
+
+    Argument values are read up to the next comma or quote and stripped of
+    surrounding whitespace; calls are returned in the order they appear.
+    """
+    call_pattern = r"TOOL:\s*([a-zA-Z0-9_]+)\(([^)]*)\)"
+    arg_pattern = r"(\w+)=['\"]?([^,'\"]+)['\"]?"
+    parsed = []
+    for call in re.finditer(call_pattern, text):
+        tool, raw_args = call.groups()
+        arguments = {
+            key: value.strip()
+            for key, value in re.findall(arg_pattern, raw_args)
+        }
+        parsed.append(ToolCall(tool_name=tool, params=arguments))
+    return parsed
+def build_user_content(obs) -> str:
+    """Render the current observation as the next user prompt for the model.
+
+    Scenario, context and round are always included; system state, the last
+    tool results and the five most recent clues are appended only when the
+    observation carries a non-empty value for them.
+    """
+    lines = [
+        f"Scenario: {obs.scenario_description}",
+        f"Context: {obs.scenario_context}",
+        f"Round: {obs.round}",
+    ]
+    # Current system state, so the agent sees the effect of earlier actions
+    state = getattr(obs, "system_state", None)
+    if state:
+        lines.append(f"Current system state: {state}")
+    # Output of the tools called in the previous step
+    results = getattr(obs, "tool_results", None)
+    if results:
+        rendered = "; ".join(f"{item.tool_name}: {item.result}" for item in results)
+        lines.append(f"Tool results: {rendered}")
+    # Only the latest five clues, to keep the prompt short
+    clues = getattr(obs, "clues_found", None)
+    if clues:
+        lines.append(f"Clues found: {', '.join(clues[-5:])}")
+    lines.append("Investigate and call tools to find and fix the root cause.")
+    return "\n".join(lines)
+# ── Main Inference Loop ────────────────────────────────────────────────────────
+def _format_rewards(rewards: list) -> str:
+    """Render the rewards list for an [END] line; "0.00" when no step was rewarded."""
+    return ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.00"
+def _run_episode(client, env, loop, task: dict) -> None:
+    """Run one task end-to-end, emitting its [START], [STEP] and [END] lines.
+
+    Every coroutine of the episode is driven on the caller-supplied *loop*, so
+    the environment sees the same event loop for reset and all steps.
+    """
+    _print(f"[START] task={task['name']} env=nexus-incident-investigation model={MODEL_NAME}")
+    # Reset environment
+    try:
+        obs = loop.run_until_complete(env.reset(task=task["name"], seed=42))
+    except Exception as e:
+        err = _safe_error(f"reset failed: {str(e)}")
+        _print(f"[STEP] step=1 action=reset_attempted reward=0.00 done=true error={err}")
+        _print("[END] success=false steps=1 rewards=0.00")
+        return
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    done = False
+    step_n = 0
+    rewards = []
+    info = {}
+    while not done and step_n < MAX_STEPS:
+        step_n += 1
+        # Build user message from observation (including system state feedback)
+        messages.append({"role": "user", "content": build_user_content(obs)})
+        # Call LLM
+        try:
+            resp = client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=messages,
+                max_tokens=400,
+                temperature=0.5,
+                timeout=120.0
+            )
+            action_text = resp.choices[0].message.content or ""
+        except Exception as e:
+            _print(f"[STEP] step={step_n} action=llm_call_failed reward=0.00 done=true error={_safe_error(str(e))}")
+            _print(f"[END] success=false steps={step_n} rewards={_format_rewards(rewards)}")
+            return
+        messages.append({"role": "assistant", "content": action_text})
+        # Parse tool calls from LLM response
+        action = NexusAction(
+            agent_id="agent_a",
+            message=action_text,
+            tool_calls=parse_tool_calls(action_text),
+            confidence=0.8
+        )
+        # Step the environment
+        try:
+            obs, reward, done, info = loop.run_until_complete(env.step(action))
+        except Exception as e:
+            _print(f"[STEP] step={step_n} action={_safe_action(action_text)} reward=0.00 done=true error={_safe_error(str(e))}")
+            _print(f"[END] success=false steps={step_n} rewards={_format_rewards(rewards)}")
+            return
+        rewards.append(reward)
+        # Emit [STEP] — NO quotes around action or error values
+        _print(
+            f"[STEP] step={step_n} action={_safe_action(action_text)} "
+            f"reward={reward:.2f} done={str(done).lower()} error=null"
+        )
+    # Normal loop completion — emit [END]
+    final_score = info.get("final_score", rewards[-1]) if rewards else 0.0
+    success = final_score >= 0.5
+    _print(f"[END] success={str(success).lower()} steps={step_n} rewards={_format_rewards(rewards)}")
+def run():
+    """Run every task in TASKS against a single NexusEnvironment.
+
+    One event loop is created up front and reused for reset/step/close:
+    ``asyncio.run`` builds and tears down a fresh loop on every call, which
+    would invalidate any loop-bound state the environment keeps between calls.
+    The environment is closed in a ``finally`` so cleanup also happens when an
+    episode raises.
+    """
+    import asyncio
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    env = NexusEnvironment()
+    loop = asyncio.new_event_loop()
+    try:
+        for task in TASKS:
+            _run_episode(client, env, loop, task)
+    finally:
+        # Always close; cleanup is best-effort, but report failures on stderr
+        # (stdout is reserved for the [START]/[STEP]/[END] records)
+        try:
+            loop.run_until_complete(env.close())
+        except Exception as e:
+            print(f"env.close() failed: {e}", file=sys.stderr, flush=True)
+        loop.close()
 if __name__ == "__main__":
+    try:
+        run()
+    except Exception as e:
+        # Even on fatal error, emit a valid [END] if possible
+        print("[END] success=false steps=0 rewards=0.00", flush=True)
+        # Report the cause on stderr so stdout carries only protocol records
+        print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr, flush=True)
+        sys.exit(1)