Spaces:

Naseer-010
/

DIME

Configuration error

App Files Files Community

DIME / inference.py

Naseer-010

added reward function normalization, hidden eval set

15673b8 23 days ago

raw

history blame contribute delete

33.6 kB

	#!/usr/bin/env python3
	"""
	LLM Agent Inference Loop for Distributed Infrastructure Environment (DIME).

	Evaluates the autonomous reasoning capabilities of LLMs for SRE tasks.
	Features:
	- Dual-mode inference: local GPU (Transformers) and remote endpoint (OpenAI API)
	- Structured JSON-lines logging per episode + legacy console/CSV output
	- Robust CoT reasoning extraction with multi-format fallback
	- Strict action reconstructor to guarantee 0 backend crashes
	- CLI argument support for model, mode, tasks, log directory

	Usage:
	python inference.py # defaults
	python inference.py --mode local --model Qwen/Qwen3-8B
	python inference.py --mode endpoint --tasks traffic_spike node_failure
	python inference.py --log-dir /path/to/logs
	"""

	import argparse
	import ast
	import csv
	import json
	import math
	import os
	import re
	import sys
	import time
	import traceback
	from contextlib import contextmanager
	from datetime import datetime, timezone
	from pathlib import Path
	from typing import List, Optional, Tuple

	import requests
	from dotenv import load_dotenv

	from agents.triage import CANONICAL_TRIAGE_TREE

	load_dotenv()

	# ---------------------------------------------------------------------------
	# Configuration (env vars with CLI override)
	# ---------------------------------------------------------------------------
	_DEFAULT_API_BASE = "https://router.huggingface.co/v1"
	_DEFAULT_MODEL = "Qwen/Qwen3-8B"
	_DEFAULT_ENV_URL = "http://localhost:8000"

	ALL_TASKS = [
	"traffic_spike",
	"node_failure",
	"cascading_failure",
	"flash_crowd",
	"level_5_alibaba_trace",
	"thundering_herd",
	"zombie_node",
	"memory_leak_slow_burn",
	"split_brain_io_bottleneck",
	"black_swan_az_failure",
	"retry_storm",
	"hot_shard_skew",
	"connection_pool_deadlock",
	"autoscaler_flapping_trap",
	]
	MAX_RETRIES = 4
	BENCHMARK = "distributed_infra_env"

	# Global states for models and clients (lazy-loaded)
	_tokenizer = None
	_model = None
	_client = None
	_direct_env = None

	SYSTEM_PROMPT = f"""You are an expert Site Reliability Engineer (SRE) managing a highly volatile Kubernetes cluster.
	You receive telemetry as JSON and must respond with a SINGLE kubectl command to prevent cascading failure.

	CLUSTER ARCHITECTURE & PHYSICS:
	- Node 0 is the stateful DATABASE (SPOF). Nodes 1-7 are stateless APP WORKERS.
	- NEW METRICS: You now see 'mem_utilizations' (RAM), 'io_wait' (Disk), 'p99_latency' (Tail risk), and 'error_budget'.
	- MEMORY CLIFF: If ANY node hits > 0.98 mem_utilization, it suffers an instant OOM Kill.
	- COLD START: Scaling up takes 3 steps to boot.
	- ERROR BUDGET: Throttling traffic saves the DB but burns your finite Error Budget.

	Available commands:
	- kubectl delete pod node-<ID> → restart a node (clears memory leaks and deadlocks)
	- kubectl scale deployment frontend --replicas=10 → scale up (takes 3 steps to boot)
	- kubectl exec -it istio-proxy -- traffic shift --from=<ID> --to=<ID> → reroute traffic away from a bad node
	- kubectl throttle ingress --rate=<float> → drop traffic (0.0 to 1.0). Burns error budget!
	- kubectl logs node-<ID> → investigate telemetry timeout
	- no_op → do nothing

	{CANONICAL_TRIAGE_TREE}

	Respond using the following STRICT format. You must include the XML reasoning tags:
	<reasoning>Diagnose the telemetry. Identify which of the 10 Triage rules applies.</reasoning>
	<action>
	{{"command": "your_kubectl_command_or_no_op_here"}}
	</action>"""


	# ---------------------------------------------------------------------------
	# Structured JSON-lines Logger
	# ---------------------------------------------------------------------------
	class StructuredLogger:
	"""Writes one JSON object per line to a .jsonl file for each episode."""

	def __init__(self, log_dir: Path, model_name: str):
	self.log_dir = log_dir
	self.model_name = model_name
	self._fh = None
	self._path = None

	def start_episode(self, task_id: str) -> Path:
	self.close()
	safe_model = self.model_name.replace("/", "_").replace(".", "_")
	ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
	subdir = self.log_dir / safe_model
	subdir.mkdir(parents=True, exist_ok=True)
	self._path = subdir / f"{task_id}_{ts}.jsonl"
	self._fh = open(self._path, "w", encoding="utf-8", buffering=1)
	self._write_record(
	event="episode_start",
	task_id=task_id,
	model=self.model_name,
	timestamp=ts,
	)
	return self._path

	def log_step(
	self,
	step: int,
	raw_llm_output: str,
	parsed_action: dict,
	backend_action: dict,
	reward: float,
	done: bool,
	task_score: float,
	obs_snapshot: Optional[dict] = None,
	error: Optional[str] = None,
	reasoning: str = "",
	):
	record = {
	"event": "step",
	"step": step,
	"reasoning": reasoning[:500],
	"raw_llm_output_length": len(raw_llm_output),
	"parsed_action": parsed_action,
	"backend_action": backend_action,
	"reward": round(reward, 4),
	"done": done,
	"task_score": round(task_score, 4),
	}
	if error:
	record["error"] = error
	if obs_snapshot:
	# Store compact summary of observation
	record["obs"] = {
	"latency_ms": obs_snapshot.get("latency_ms"),
	"failed_nodes": obs_snapshot.get("failed_nodes"),
	"request_rate": obs_snapshot.get("request_rate"),
	"error_budget": obs_snapshot.get("error_budget"),
	}
	self._write_record(**record)

	def end_episode(
	self,
	task_id: str,
	success: bool,
	total_steps: int,
	task_score: float,
	rewards: List[float],
	):
	self._write_record(
	event="episode_end",
	task_id=task_id,
	success=success,
	total_steps=total_steps,
	task_score=round(task_score, 4),
	reward_mean=round(sum(rewards) / max(len(rewards), 1), 4),
	reward_min=round(min(rewards) if rewards else 0.0, 4),
	reward_max=round(max(rewards) if rewards else 0.0, 4),
	reward_std=round(_std(rewards), 4),
	)
	self.close()

	def close(self):
	if self._fh is not None:
	self._fh.close()
	self._fh = None

	def _write_record(self, **kwargs):
	if self._fh is not None:
	self._fh.write(json.dumps(kwargs, default=str) + "\n")


	def _std(xs: List[float]) -> float:
	if len(xs) < 2:
	return 0.0
	mu = sum(xs) / len(xs)
	return (sum((x - mu) 2 for x in xs) / len(xs)) 0.5


	# ---------------------------------------------------------------------------
	# CSV Logging (legacy, kept for backward compatibility)
	# ---------------------------------------------------------------------------
	def _init_csv(path: Path):
	if not path.exists():
	with open(path, "w", newline="", encoding="utf-8") as f:
	csv.writer(f).writerow(
	[
	"model",
	"task_id",
	"step",
	"action_taken",
	"reasoning",
	"reward",
	"cumulative_score",
	"done",
	"error",
	]
	)


	def _log_csv(
	path: Path,
	model: str,
	task_id: str,
	step: int,
	action: str,
	reasoning: str,
	reward: float,
	score: float,
	done: bool,
	error: Optional[str],
	):
	with open(path, "a", newline="", encoding="utf-8") as f:
	csv.writer(f).writerow(
	[model, task_id, step, action, reasoning, reward, score, done, error]
	)


	# ---------------------------------------------------------------------------
	# Terminal Logging (legacy console output + tee to file)
	# ---------------------------------------------------------------------------
	class TeeStream:
	def __init__(self, primary, log_file) -> None:
	self.primary = primary
	self.log_file = log_file
	self.encoding = getattr(primary, "encoding", "utf-8")
	self.errors = getattr(primary, "errors", "replace")

	def write(self, data: str) -> int:
	self.primary.write(data)
	self.log_file.write(data)
	return len(data)

	def flush(self) -> None:
	self.primary.flush()
	self.log_file.flush()

	def isatty(self) -> bool:
	return self.primary.isatty()


	@contextmanager
	def tee_output(log_path: Path):
	log_path.parent.mkdir(parents=True, exist_ok=True)
	with log_path.open("w", encoding="utf-8", buffering=1) as log_file:
	original_stdout = sys.stdout
	original_stderr = sys.stderr
	sys.stdout = TeeStream(original_stdout, log_file)
	sys.stderr = TeeStream(original_stderr, log_file)
	try:
	yield
	finally:
	sys.stdout = original_stdout
	sys.stderr = original_stderr


	def log_start(task: str, env: str, model: str, mode: str) -> None:
	print(
	f"\n[START] task={task} env={env} model={model} mode={mode.upper()}",
	flush=True,
	)


	def log_step(
	step: int, action: str, reward: float, done: bool, error: Optional[str]
	) -> None:
	error_val = error if error else "null"
	done_val = str(done).lower()
	print(
	f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
	flush=True,
	)


	def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
	rewards_str = ",".join(f"{r:.2f}" for r in rewards)
	print(
	f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
	flush=True,
	)


	# ---------------------------------------------------------------------------
	# Model / Client Loading
	# ---------------------------------------------------------------------------
	def _get_api_key() -> Optional[str]:
	return (
	os.environ.get("API_KEY")
	or os.environ.get("OPENAI_API_KEY")
	or os.environ.get("HF_TOKEN")
	)


	def get_client(api_base: str, api_key: str):
	"""Initialize the OpenAI client for endpoint inference."""
	global _client
	if _client is None:
	from openai import OpenAI

	print(f"\n[INFO] Connecting to endpoint {api_base}...", flush=True)
	_client = OpenAI(base_url=api_base, api_key=api_key)
	return _client


	def load_local_model(model_name: str):
	"""Load local models via Hugging Face Transformers to GPU."""
	global _tokenizer, _model
	if _model is None:
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	print(
	f"\n[INFO] Loading local model {model_name} via Transformers to GPU...",
	flush=True,
	)
	_tokenizer = AutoTokenizer.from_pretrained(model_name)
	_model = AutoModelForCausalLM.from_pretrained(
	model_name, torch_dtype=torch.bfloat16, device_map="auto"
	)
	return _tokenizer, _model


	# ---------------------------------------------------------------------------
	# LLM Response Parser
	# ---------------------------------------------------------------------------
	def parse_llm_response(text: str) -> Tuple[dict, str]:
	"""
	Robustly extracts JSON action and reasoning from LLM output.

	Handles: <think>/<reasoning> blocks, markdown code fences, loose JSON,
	double-escaped strings, and missing XML tags.
	"""
	reasoning = "No reasoning provided."
	action_dict: dict = {"action_type": "no_op", "kubectl_command": "no_op"}

	# 1. Extract <think> block as primary reasoning
	think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
	if think_match:
	reasoning = think_match.group(1).strip()

	# Strip think block before parsing action tags
	text_stripped = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

	# 2. Extract <reasoning> tag as fallback
	if reasoning == "No reasoning provided.":
	res_match = re.search(
	r"<reasoning>\s(.?)\s*</reasoning>",
	text_stripped,
	re.IGNORECASE \| re.DOTALL,
	)
	if res_match:
	reasoning = res_match.group(1).strip()

	# 3. Extract Action block
	act_match = re.search(
	r"<action>\s(.?)\s*</action>", text_stripped, re.IGNORECASE \| re.DOTALL
	)
	json_text = act_match.group(1) if act_match else text_stripped

	# 4. Extract JSON from content (handles markdown code fences)
	# Strip markdown code fences
	json_text = re.sub(r"```[a-zA-Z]*", "", json_text)
	json_text = json_text.replace("```", "").strip()

	# Find outermost JSON braces
	start_idx = json_text.find("{")
	end_idx = json_text.rfind("}")
	if start_idx != -1 and end_idx != -1 and end_idx >= start_idx:
	json_text = json_text[start_idx : end_idx + 1]
	else:
	# Try extracting a bare kubectl command
	cmd_match = re.search(r"(kubectl\s+\S+.*?)(?:\n\|$)", json_text, re.IGNORECASE)
	if cmd_match:
	json_text = json.dumps({"command": cmd_match.group(1).strip()})
	else:
	json_text = '{"command": "no_op"}'

	# 5. Parse JSON with multiple fallbacks
	try:
	action_dict = json.loads(json_text)
	except json.JSONDecodeError:
	# Try fixing common LLM JSON errors
	try:
	# Handle single quotes
	fixed = json_text.replace("'", '"')
	action_dict = json.loads(fixed)
	except json.JSONDecodeError:
	try:
	action_dict = ast.literal_eval(json_text)
	except Exception:
	# Extract command from raw text as last resort
	cmd_match = re.search(
	r"(kubectl\s+\S+.*?)(?:\"\|'\|$)", json_text, re.IGNORECASE
	)
	if cmd_match:
	action_dict = {"command": cmd_match.group(1).strip()}
	else:
	action_dict = {"action_type": "no_op", "kubectl_command": "no_op"}

	return action_dict, reasoning


	# ---------------------------------------------------------------------------
	# Safe Backend Action Builder
	# ---------------------------------------------------------------------------
	def build_safe_backend_action(action_dict: dict) -> dict:
	"""
	STRICT RECONSTRUCTOR: Prevents FastAPI 422 Errors.
	Guarantees the backend only receives valid keys specified in server/models.py.
	"""
	if isinstance(action_dict, dict) and (
	"command" in action_dict or "raw_command" in action_dict
	):
	cmd = str(
	action_dict.get("command") or action_dict.get("raw_command") or "no_op"
	).strip()
	return {"action_type": "no_op", "raw_command": cmd}

	safe_action = {"action_type": action_dict.get("action_type", "no_op")}
	act_type = safe_action["action_type"]

	if act_type in ("restart_node", "query_logs"):
	try:
	safe_action["target"] = int(action_dict.get("target", 0))
	except (ValueError, TypeError):
	safe_action["target"] = 0

	elif act_type == "reroute_traffic":
	try:
	safe_action["from_node"] = int(action_dict.get("from_node", 0))
	safe_action["to_node"] = int(action_dict.get("to_node", 0))
	except (ValueError, TypeError):
	safe_action["action_type"] = "no_op"

	elif act_type == "throttle":
	try:
	raw_rate = float(action_dict.get("rate", 1.0))
	safe_action["rate"] = max(0.0, min(1.0, raw_rate))
	except (ValueError, TypeError):
	safe_action["rate"] = 1.0

	elif act_type not in ("scale_up", "no_op"):
	safe_action["action_type"] = "no_op"

	return safe_action


	# ---------------------------------------------------------------------------
	# LLM Decision Maker
	# ---------------------------------------------------------------------------
	def llm_decide(
	observation: dict,
	model_name: str,
	mode: str,
	api_base: str,
	api_key: Optional[str],
	) -> Tuple[dict, str, str]:
	"""
	Query the LLM for a decision based on the current observation.

	Returns:
	(action_dict, reasoning, raw_output)
	"""
	obs_str = json.dumps(observation)
	user_prompt = f"Current system state:\n{obs_str}\nRespond with the required XML and JSON format."

	messages = [
	{"role": "system", "content": SYSTEM_PROMPT},
	{"role": "user", "content": user_prompt},
	]

	for attempt in range(MAX_RETRIES):
	try:
	content = ""

	# --- LOCAL GPU INFERENCE ---
	if mode == "local":
	import torch

	tokenizer, model = load_local_model(model_name)

	try:
	prompt = tokenizer.apply_chat_template(
	messages,
	tokenize=False,
	add_generation_prompt=True,
	enable_thinking=True,
	)
	except TypeError:
	prompt = tokenizer.apply_chat_template(
	messages, tokenize=False, add_generation_prompt=True
	)

	inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

	with torch.no_grad():
	outputs = model.generate(
	**inputs,
	max_new_tokens=4096,
	do_sample=True,
	temperature=0.7,
	top_p=0.9,
	pad_token_id=tokenizer.eos_token_id,
	)

	input_length = inputs.input_ids.shape[1]
	content = tokenizer.decode(
	outputs[0][input_length:], skip_special_tokens=True
	).strip()

	# --- REMOTE ENDPOINT INFERENCE ---
	elif mode == "endpoint":
	if not api_key:
	raise ValueError(
	"API_KEY is missing for endpoint inference! "
	"Set API_KEY, OPENAI_API_KEY, or HF_TOKEN."
	)
	api_client = get_client(api_base, api_key)
	response = api_client.chat.completions.create(
	model=model_name,
	messages=messages,
	max_tokens=4000,
	temperature=0.7,
	top_p=0.9,
	)
	content = response.choices[0].message.content.strip()

	else:
	raise ValueError(
	f"Unknown inference mode: {mode}. Must be 'local' or 'endpoint'."
	)

	action_dict, reasoning = parse_llm_response(content)

	print("\n[DEBUG RAW OUTPUT]\n", content[:500], "\n", flush=True)

	if (
	"action_type" in action_dict
	or "command" in action_dict
	or "raw_command" in action_dict
	):
	return action_dict, reasoning, content

	except Exception as e:
	wait_time = 2 ** (attempt + 1)
	print(
	f"[WARNING] Inference call failed (Attempt {attempt + 1}/{MAX_RETRIES}): {str(e)[:200]}",
	flush=True,
	)
	if mode == "endpoint":
	print(f"[DEBUG] Retrying in {wait_time} seconds...", flush=True)
	time.sleep(wait_time)

	print("[ERROR] Inference repeatedly failed. Skipping turn with no_op.", flush=True)
	return (
	{"action_type": "no_op", "kubectl_command": "no_op"},
	"Inference Error - Fallback to no_op",
	"",
	)


	# ---------------------------------------------------------------------------
	# Environment Communication
	# ---------------------------------------------------------------------------
	def env_reset(env_url: str, task_id: str) -> dict:
	for attempt in range(MAX_RETRIES):
	try:
	response = requests.post(
	f"{env_url}/reset", json={"task": task_id}, timeout=15
	)
	response.raise_for_status()
	payload = response.json()
	data_block = payload.get("data", payload)
	if "observation" in data_block and isinstance(
	data_block["observation"], dict
	):
	return data_block["observation"]
	return data_block
	except Exception:
	time.sleep(2**attempt)
	raise ConnectionError(f"Failed to reset environment after {MAX_RETRIES} attempts.")


	def env_step(env_url: str, action: dict) -> dict:
	for attempt in range(MAX_RETRIES):
	try:
	response = requests.post(
	f"{env_url}/step", json={"action": action}, timeout=15
	)
	response.raise_for_status()
	return response.json()
	except Exception:
	time.sleep(2**attempt)
	raise ConnectionError(f"Failed to step environment after {MAX_RETRIES} attempts.")


	# ---------------------------------------------------------------------------
	# Direct (in-process) environment access — used when --mode local so the HTTP
	# server does not need to be running separately.
	# ---------------------------------------------------------------------------
	def _get_direct_env():
	global _direct_env
	if _direct_env is None:
	from server.environment import DistributedInfraEnvironment

	_direct_env = DistributedInfraEnvironment()
	return _direct_env


	def _infraobs_to_dict(obs) -> dict:
	keys = [
	"cpu_loads",
	"mem_utilizations",
	"queue_lengths",
	"failed_nodes",
	"latency_ms",
	"request_rate",
	"io_wait",
	"p99_latency",
	"error_budget",
	"step",
	"task_hint",
	"action_errors",
	"cloud_budget",
	"task_score",
	"done",
	"reward",
	"uptime_pct",
	]
	return {k: getattr(obs, k) for k in keys if hasattr(obs, k)}


	def env_reset_direct(task_id: str) -> dict:
	return _infraobs_to_dict(_get_direct_env().reset(task=task_id))


	def env_step_direct(action_dict: dict) -> dict:
	from server.models import InfraAction

	env = _get_direct_env()
	raw_cmd = action_dict.get("raw_command")
	if raw_cmd and raw_cmd != "no_op":
	action = InfraAction(action_type="no_op", raw_command=raw_cmd)
	else:
	act_type = action_dict.get("action_type", "no_op")
	kwargs: dict = {"action_type": act_type}
	if (
	act_type in ("restart_node", "query_logs")
	and action_dict.get("target") is not None
	):
	kwargs["target"] = int(action_dict["target"])
	elif act_type == "reroute_traffic":
	kwargs["from_node"] = int(action_dict.get("from_node", 0))
	kwargs["to_node"] = int(action_dict.get("to_node", 0))
	elif act_type == "throttle" and action_dict.get("rate") is not None:
	kwargs["rate"] = float(action_dict["rate"])
	try:
	action = InfraAction(**kwargs)
	except Exception:
	action = InfraAction(action_type="no_op")
	obs = env.step(action)
	obs_dict = _infraobs_to_dict(obs)
	return {
	"data": {
	"observation": obs_dict,
	"reward": getattr(obs, "reward", 0.0),
	"done": getattr(obs, "done", False),
	}
	}


	# ---------------------------------------------------------------------------
	# Task Runner
	# ---------------------------------------------------------------------------
	def run_task(
	task_id: str,
	*,
	model_name: str,
	mode: str,
	api_base: str,
	api_key: Optional[str],
	env_url: str,
	csv_path: Path,
	structured_logger: StructuredLogger,
	max_episode_steps: int = 100,
	use_direct: bool = False,
	) -> dict:
	log_start(task=task_id, env=BENCHMARK, model=model_name, mode=mode)
	episode_path = structured_logger.start_episode(task_id)
	print(f"[INFO] Structured log: {episode_path}", flush=True)

	try:
	obs = env_reset_direct(task_id) if use_direct else env_reset(env_url, task_id)
	except Exception as e:
	print(
	f"[FATAL ERROR] Failed to connect to environment server for task '{task_id}': {e}",
	flush=True,
	)
	log_end(success=False, steps=0, score=0.0, rewards=[])
	structured_logger.end_episode(task_id, False, 0, 0.0, [])
	return {
	"task": task_id,
	"score": 0.0,
	"avg_latency": 0.0,
	"avg_uptime": 0.0,
	"total_steps": 0,
	}

	step = 0
	rewards_list: List[float] = []
	latencies: List[float] = []
	uptimes: List[float] = []
	task_score = 0.0

	while True:
	step += 1

	# Track metrics from observation
	lat = float(obs.get("latency_ms", 0.0))
	up = float(obs.get("uptime_pct", 0.0))
	if lat > 0:
	latencies.append(lat)
	if up > 0:
	uptimes.append(up)

	# 1. Get the action dict, reasoning, and raw output
	action_dict, reasoning, raw_output = llm_decide(
	obs, model_name, mode, api_base, api_key
	)

	# 2. Stringify for logging
	action_str = json.dumps(action_dict).replace('"', "'")
	reasoning_clean = reasoning.replace("\n", " ")
	terminal_action_str = f"act={action_str} \| rsn='{reasoning_clean[:90]}...'"

	# 3. Build safe backend payload
	backend_action = build_safe_backend_action(action_dict)

	error_msg = None
	reward = 0.0
	done = False

	try:
	result = (
	env_step_direct(backend_action)
	if use_direct
	else env_step(env_url, backend_action)
	)
	data_block = result.get("data", result)

	if "observation" in data_block and isinstance(
	data_block["observation"], dict
	):
	obs = data_block["observation"]
	else:
	obs = data_block

	reward = float(data_block.get("reward", obs.get("reward", 0.0)))
	done = bool(data_block.get("done", obs.get("done", False)))
	task_score = float(obs.get("task_score", 0.0))

	except Exception as e:
	error_msg = str(e).replace("\n", " ")
	done = True

	rewards_list.append(reward)

	# Log to all outputs
	log_step(
	step=step,
	action=terminal_action_str,
	reward=reward,
	done=done,
	error=error_msg,
	)
	_log_csv(
	csv_path,
	model_name,
	task_id,
	step,
	action_str,
	reasoning,
	reward,
	task_score,
	done,
	error_msg,
	)
	structured_logger.log_step(
	step=step,
	raw_llm_output=raw_output,
	parsed_action=action_dict,
	backend_action=backend_action,
	reward=reward,
	done=done,
	task_score=task_score,
	obs_snapshot=obs if isinstance(obs, dict) else None,
	error=error_msg,
	reasoning=reasoning,
	)

	if done or step > max_episode_steps:
	success = task_score >= 0.1
	log_end(success=success, steps=step, score=task_score, rewards=rewards_list)
	structured_logger.end_episode(
	task_id, success, step, task_score, rewards_list
	)

	avg_lat = sum(latencies) / len(latencies) if latencies else 0.0
	avg_up = sum(uptimes) / len(uptimes) if uptimes else 0.0

	return {
	"task": task_id,
	"score": task_score,
	"avg_latency": avg_lat,
	"avg_uptime": avg_up,
	"total_steps": step,
	}


	# ---------------------------------------------------------------------------
	# CLI Argument Parser
	# ---------------------------------------------------------------------------
	def build_arg_parser() -> argparse.ArgumentParser:
	p = argparse.ArgumentParser(
	description="DIME LLM Agent Inference Loop",
	formatter_class=argparse.RawDescriptionHelpFormatter,
	)
	p.add_argument(
	"--mode",
	choices=["local", "endpoint"],
	default=os.environ.get("INFERENCE_MODE", "endpoint").lower(),
	help="Inference mode: local GPU or remote endpoint (default: endpoint)",
	)
	p.add_argument(
	"--model",
	default=os.environ.get("MODEL_NAME", _DEFAULT_MODEL),
	help=f"Model name/path (default: {_DEFAULT_MODEL})",
	)
	p.add_argument(
	"--api-base",
	default=os.environ.get("API_BASE_URL", _DEFAULT_API_BASE),
	help="API base URL for endpoint mode",
	)
	p.add_argument(
	"--env-url",
	default=os.environ.get("ENV_SERVER_URL", _DEFAULT_ENV_URL),
	help="Environment server URL",
	)
	p.add_argument(
	"--tasks",
	nargs="+",
	default=None,
	help="Tasks to run (default: all tasks)",
	)
	p.add_argument(
	"--max-steps",
	type=int,
	default=100,
	help="Max steps per episode (default: 100)",
	)
	p.add_argument(
	"--log-dir",
	type=str,
	default=None,
	help="Directory for structured JSON logs (default: ./logs/)",
	)
	return p


	# ---------------------------------------------------------------------------
	# Main Entry Point
	# ---------------------------------------------------------------------------
	def main():
	parser = build_arg_parser()
	args = parser.parse_args()

	model_name = args.model
	mode = args.mode
	api_base = args.api_base
	api_key = _get_api_key()
	env_url = args.env_url
	tasks = args.tasks or ALL_TASKS
	max_steps = args.max_steps

	# Setup paths
	project_root = Path(__file__).resolve().parent
	safe_model_name = model_name.replace("/", "_").replace(".", "_")
	log_file = project_root / f"logs_{safe_model_name}.txt"
	csv_file = project_root / f"metrics_{safe_model_name}.csv"
	log_dir = Path(args.log_dir) if args.log_dir else project_root / "logs"

	_init_csv(csv_file)
	structured_logger = StructuredLogger(log_dir, model_name)

	print(f"API KEY LOADED: {api_key[:10] if api_key else 'Missing or Running Local!'}")

	all_task_stats = []

	with tee_output(log_file):
	print("==================================================")
	print(f" STARTING DIME EVALUATION SESSION: {model_name}")
	print(f" MODE: {mode.upper()}")
	print(f" TASKS: {len(tasks)}")
	print(f" STRUCTURED LOGS: {log_dir}")
	print("==================================================")

	use_direct = mode == "local"
	for task_id in tasks:
	stats = run_task(
	task_id,
	model_name=model_name,
	mode=mode,
	api_base=api_base,
	api_key=api_key,
	env_url=env_url,
	csv_path=csv_file,
	structured_logger=structured_logger,
	max_episode_steps=max_steps,
	use_direct=use_direct,
	)
	all_task_stats.append(stats)

	# -----------------------------------------------------------
	# FINAL STRUCTURED SUMMARY FOR JUDGES
	# -----------------------------------------------------------
	print("\n" + "=" * 80)
	print(f"FINAL PERFORMANCE SUMMARY: {model_name}")
	print("=" * 80)
	print(
	f"{'Task Name':<25} \| {'Task Score':<12} \| {'Uptime %':<12} \| {'Avg Latency (ms)':<15}"
	)
	print("-" * 80)

	for s in all_task_stats:
	print(
	f"{s['task']:<25} \| {s['score']:<12.4f} \| {s['avg_uptime']:<12.1f} \| {s['avg_latency']:<15.1f}"
	)

	# Calculate DIME Index
	overall_score = sum(s["score"] for s in all_task_stats) / max(
	len(all_task_stats), 1
	)
	avg_system_uptime = sum(s["avg_uptime"] for s in all_task_stats) / max(
	len(all_task_stats), 1
	)
	avg_system_latency = sum(s["avg_latency"] for s in all_task_stats) / max(
	len(all_task_stats), 1
	)
	safe_lat = max(avg_system_latency, 2.0)

	dime_index = (overall_score * avg_system_uptime) / math.log(safe_lat)

	print("-" * 80)
	print(f"Overall Task Completion Score : {overall_score:.4f}")
	print(f"Mean System Uptime : {avg_system_uptime:.1f}%")
	print(f"Mean System Latency : {avg_system_latency:.1f} ms")
	print("=" * 80)
	print(f"OVERALL DIME INDEX (Higher=Better): {dime_index:.4f}")
	print("=" * 80)
	print(f"Logs saved to: {log_file}")
	print(f"Metrics saved to: {csv_file}")
	print(f"Structured logs: {log_dir}")

	structured_logger.close()


	if __name__ == "__main__":
	try:
	main()
	except Exception:
	traceback.print_exc()
	raise SystemExit(1)