""" GridMind-RL Baseline Inference Script -------------------------------------- Runs an LLM agent against all 3 tasks for N episodes each. Uses OpenAI-compatible API via API_BASE_URL / MODEL_NAME / HF_TOKEN environment variables. Usage: export MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct export HF_TOKEN=hf_xxxx python inference.py # or: python python/inference.py [--episodes 1] [--llm-every 4] [--fast-mode] """ from __future__ import annotations import argparse import json import os import sys import time from typing import Any import requests from openai import OpenAI # ── Constants ────────────────────────────────────────────────────────────── ENV_URL = os.getenv("ENV_URL", "http://localhost:7860") MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct") API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") HF_TOKEN = os.getenv("HF_TOKEN", "") DEFAULT_EPISODES = 1 DEFAULT_SEED_BASE = 1000 MAX_RETRIES = 3 # 96 steps × 15 min = 24 h (must match env.EpisodeSteps) EPISODE_STEPS = 96 LAST_STEP_INDEX = EPISODE_STEPS - 1 SYSPROMPT = """You are GridMind, an expert industrial energy management controller. You control a building's HVAC, thermal storage, batch job scheduling, and load shedding. Your goal is to minimize electricity costs while maintaining comfort and meeting grid demand-response signals. Always respond with a single valid JSON object matching the action schema. No explanation needed.""" TASK_DESCRIPTIONS = { 1: "Task 1 (Easy - Cost Minimization): Minimize total energy cost over 24 hours. No temperature or batch constraints. Use cheap off-peak periods and thermal storage.", 2: "Task 2 (Medium - Temperature Management): Minimize cost AND keep indoor temperature within 19-23°C at all times. Balance comfort vs cost.", 3: "Task 3 (Hard - Full Demand Response): Minimize cost, maintain temperature, respond to grid stress (shed when grid_stress_signal > 0.7), schedule batch jobs, minimize carbon.", } ACTION_SCHEMA_STR = """{ "hvac_power_level": , "thermal_charge_rate": , "batch_job_slot": , "load_shed_fraction": , "building_id": 0 }""" def extract_json_object(text: str) -> dict[str, Any] | None: """Parse first balanced {...} JSON object from text (handles nested braces).""" start = text.find("{") if start < 0: return None depth = 0 for i in range(start, len(text)): c = text[i] if c == "{": depth += 1 elif c == "}": depth -= 1 if depth == 0: try: return json.loads(text[start : i + 1]) except json.JSONDecodeError: return None return None # ── Environment client ─────────────────────────────────────────────────────── class GridMindEnvClient: """Simple HTTP client for the GridMind-RL Go environment server.""" def __init__(self, base_url: str = ENV_URL, timeout: int = 30): self.base = base_url.rstrip("/") self.timeout = timeout def health(self) -> bool: try: r = requests.get(f"{self.base}/health", timeout=5) return r.status_code == 200 except Exception: return False def reset(self, task_id: int = 1, seed: int = 42, num_buildings: int = 1) -> dict: payload = {"task_id": task_id, "seed": seed, "num_buildings": num_buildings} r = requests.post(f"{self.base}/reset", json=payload, timeout=self.timeout) r.raise_for_status() return r.json() def step(self, action: dict) -> dict: r = requests.post(f"{self.base}/step", json=action, timeout=self.timeout) r.raise_for_status() return r.json() def grade(self) -> dict: r = requests.get(f"{self.base}/grade", timeout=self.timeout) r.raise_for_status() return r.json() def state(self) -> dict: r = requests.get(f"{self.base}/state", timeout=self.timeout) r.raise_for_status() return r.json() # ── LLM agent ─────────────────────────────────────────────────────────────── class LLMAgent: """OpenAI-compatible LLM agent that chooses actions given observations.""" def __init__(self): self.client = OpenAI( base_url=API_BASE_URL, api_key=HF_TOKEN if HF_TOKEN else "none", ) self.model = MODEL_NAME self.fallback_mode = False def choose_action(self, obs: dict, task_id: int) -> dict: """Prompt the LLM with current observation, return parsed action dict.""" if self.fallback_mode: return self._heuristic_action(obs) task_desc = TASK_DESCRIPTIONS.get(task_id, TASK_DESCRIPTIONS[1]) prompt = f"""{task_desc} Current observation: - Indoor temperature: {obs.get('indoor_temperature', 21):.1f}°C (target: 21°C, bounds: 19-23°C) - Thermal storage level: {obs.get('thermal_storage_level', 0.5):.2f} (0=empty, 1=full) - Process demand: {obs.get('process_demand', 15):.1f} kW - Current electricity price: ${obs.get('current_price', 0.10):.4f}/kWh - Grid stress signal: {obs.get('grid_stress_signal', 0):.3f} (>0.7 = critical, shed load!) - Carbon intensity: {obs.get('carbon_intensity', 300):.0f} gCO2/kWh - Hour of day: {obs.get('hour_of_day', 12)} (0=midnight, peak prices 8-12 and 17-21) - Pending batch job deadlines: {obs.get('batch_queue', [])} - Cumulative cost so far: ${obs.get('cumulative_cost', 0):.4f} - Episode step: {obs.get('step', 0)}/{LAST_STEP_INDEX} Strategy hints: - Charge thermal storage when price < $0.08/kWh, discharge when price > $0.15/kWh - Set HVAC low during peak prices (0.3-0.4) and use storage for temperature control - Shed 30-50% load if grid_stress_signal > 0.7 - Schedule batch jobs early if deadline is close (slot 0 or 1) Respond with ONLY a JSON action: {ACTION_SCHEMA_STR}""" for attempt in range(MAX_RETRIES): try: completion = self.client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": SYSPROMPT}, {"role": "user", "content": prompt}, ], max_tokens=128, temperature=0.0, ) content = completion.choices[0].message.content.strip() parsed = extract_json_object(content) if parsed is not None: return self._clamp_action(parsed) action = json.loads(content) return self._clamp_action(action) except Exception as e: err_str = str(e) print(f" [LLM attempt {attempt+1}/{MAX_RETRIES}] error: {err_str}") if "402" in err_str or "depleted" in err_str: print(" [WARN] Hugging Face free credits depleted! Switching to local heuristic agent for the rest of the simulation.") self.fallback_mode = True return self._heuristic_action(obs) time.sleep(1) return self._heuristic_action(obs) def _clamp_action(self, action: dict) -> dict: return { "hvac_power_level": max(0.0, min(1.0, float(action.get("hvac_power_level", 0.5)))), "thermal_charge_rate": max(-1.0, min(1.0, float(action.get("thermal_charge_rate", 0.0)))), "batch_job_slot": max(0, min(4, int(action.get("batch_job_slot", 0)))), "load_shed_fraction": max(0.0, min(0.5, float(action.get("load_shed_fraction", 0.0)))), "building_id": int(action.get("building_id", 0)), } def _heuristic_action(self, obs: dict) -> dict: """Rule-based policy (deterministic given obs).""" price = obs.get("current_price", 0.10) stress = obs.get("grid_stress_signal", 0.0) temp = obs.get("indoor_temperature", 21.0) storage = obs.get("thermal_storage_level", 0.5) queue = obs.get("batch_queue", []) hvac = 0.7 if price < 0.08 else (0.3 if price > 0.15 else 0.5) if temp > 23.0: hvac = max(hvac, 0.8) elif temp < 19.0: hvac = min(hvac, 0.2) charge = 0.0 if price < 0.07 and storage < 0.8: charge = 0.5 elif price > 0.15 and storage > 0.3: charge = -0.5 shed = 0.0 if stress > 0.7: shed = 0.4 elif stress > 0.5: shed = 0.2 slot = 2 if queue and min(queue) < 8: slot = 0 return { "hvac_power_level": hvac, "thermal_charge_rate": charge, "batch_job_slot": slot, "load_shed_fraction": shed, "building_id": 0, } def _default_action(self) -> dict: return { "hvac_power_level": 0.5, "thermal_charge_rate": 0.0, "batch_job_slot": 0, "load_shed_fraction": 0.0, "building_id": 0, } # ── Episode runner ─────────────────────────────────────────────────────────── def run_episode( env_client: GridMindEnvClient, agent: LLMAgent, task_id: int, seed: int, *, fast_mode: bool, llm_every: int, max_steps: int | None, verbose: bool = False, ) -> dict[str, Any]: """Run a single episode and return grade + metadata. Prints [START], [STEPn], [END].""" reset_resp = env_client.reset(task_id=task_id, seed=seed) obs = reset_resp["observations"][0] print("[START]", flush=True) total_reward = 0.0 total_steps = 0 start_time = time.time() step_resp: dict[str, Any] = {} step_limit = EPISODE_STEPS if max_steps is None else min(max_steps, EPISODE_STEPS) llm_reuse_remaining = 0 cached_action = agent._default_action() while not step_resp.get("done", False): if total_steps >= step_limit: break if fast_mode: action = agent._heuristic_action(obs) else: if llm_reuse_remaining <= 0: cached_action = agent.choose_action(obs, task_id) llm_reuse_remaining = max(1, llm_every) action = cached_action step_resp = env_client.step(action) if step_resp is None or "observation" not in step_resp: print(f" [WARN] step {total_steps}: invalid step response", flush=True) break if not fast_mode: llm_reuse_remaining -= 1 obs = step_resp["observation"] total_reward += float(step_resp["reward"]) total_steps += 1 print(f"[STEP{total_steps}]", flush=True) if verbose and total_steps % 16 == 0: print( f" step={total_steps:02d} price=${obs['current_price']:.3f} " f"temp={obs['indoor_temperature']:.1f}°C " f"stress={obs['grid_stress_signal']:.2f} " f"cost=${obs['cumulative_cost']:.2f} " f"reward={step_resp['reward']:.3f}", flush=True, ) elapsed = time.time() - start_time grade = env_client.grade() print("[END]", flush=True) return { "task_id": task_id, "seed": seed, "total_reward": total_reward, "total_steps": total_steps, "elapsed_sec": elapsed, "score": grade.get("score", 0.0), "sub_scores": grade.get("sub_scores", {}), "exploit_detected": grade.get("exploit_detected", False), } # ── Main ───────────────────────────────────────────────────────────────────── def main() -> None: parser = argparse.ArgumentParser(description="GridMind-RL baseline inference") parser.add_argument("--episodes", type=int, default=DEFAULT_EPISODES) parser.add_argument("--env-url", type=str, default=ENV_URL) parser.add_argument("--verbose", action="store_true") parser.add_argument("--output", type=str, default="baseline_scores.json") parser.add_argument( "--fast-mode", action="store_true", help="Heuristic policy only (no LLM calls; fastest, fully reproducible).", ) parser.add_argument( "--llm-every", type=int, default=4, metavar="N", help="Reuse the same LLM action for N consecutive steps (default: 4).", ) parser.add_argument( "--max-steps", type=int, default=None, metavar="N", help="Stop after N steps (default: full episode). Grade uses partial episode.", ) args = parser.parse_args() print("=" * 60) print("GridMind-RL Baseline Inference") print(f" Model: {MODEL_NAME}") print(f" API: {API_BASE_URL}") print(f" Env: {args.env_url}") print(f" Episodes per task: {args.episodes}") print(f" Fast mode: {args.fast_mode} | LLM every: {args.llm_every} steps") print("=" * 60) env_client = GridMindEnvClient(base_url=args.env_url) print("\nWaiting for environment server...") for attempt in range(30): if env_client.health(): print(" [OK] Environment server is healthy") break time.sleep(2) if attempt == 29: print(" [FAIL] Environment server not reachable. Exiting.") sys.exit(1) agent = LLMAgent() all_results: list[dict[str, Any]] = [] for task_id in [1, 2, 3]: print(f"\n-- Task {task_id}: {TASK_DESCRIPTIONS[task_id][:60]}...") task_scores: list[float] = [] for ep in range(args.episodes): seed = DEFAULT_SEED_BASE + task_id * 100 + ep print(f" Episode {ep+1}/{args.episodes} (seed={seed})") result = run_episode( env_client, agent, task_id=task_id, seed=seed, fast_mode=args.fast_mode, llm_every=args.llm_every, max_steps=args.max_steps, verbose=args.verbose, ) task_scores.append(float(result["score"])) all_results.append(result) print( f" → score={result['score']:.4f} | reward={result['total_reward']:.3f} | " f"{result['elapsed_sec']:.1f}s | steps={result['total_steps']}" ) avg_score = sum(task_scores) / len(task_scores) print(f" Task {task_id} average score: {avg_score:.4f}") print("\n" + "=" * 60) print("BASELINE SCORES SUMMARY") print("=" * 60) print(f"{'Task':<10} {'Model':<30} {'Score':<10} {'Episodes':<10}") print("-" * 60) task_avgs: dict[int, float] = {} for task_id in [1, 2, 3]: scores = [float(r["score"]) for r in all_results if r["task_id"] == task_id] avg = sum(scores) / len(scores) if scores else 0.0 task_avgs[task_id] = avg print(f"Task {task_id:<6} {MODEL_NAME:<30} {avg:<10.4f} {len(scores)}") print("-" * 60) overall = sum(task_avgs.values()) / len(task_avgs) print(f"{'Overall':<10} {'':<30} {overall:<10.4f}") output = { "model": MODEL_NAME, "api_base": API_BASE_URL, "episodes_per_task": args.episodes, "seed_base": DEFAULT_SEED_BASE, "fast_mode": args.fast_mode, "llm_every": args.llm_every, "max_steps": args.max_steps, "task_averages": {str(k): v for k, v in task_avgs.items()}, "overall_average": overall, "all_results": all_results, } with open(args.output, "w", encoding="utf-8") as f: json.dump(output, f, indent=2) print(f"\n[OK] Results saved to {args.output}") if __name__ == "__main__": main()