"""
ACRE inference script for OpenEnv submission evaluation.

Environment variables:
- API_BASE_URL: LLM API endpoint injected by evaluator
- MODEL_NAME: model identifier (default allowed)
- API_KEY: API token for the OpenAI-compatible proxy endpoint
- ENV_URL: running ACRE server base URL (falls back to
  http://localhost:7860 when unset or empty)
- LOCAL_IMAGE_NAME: present for evaluator compatibility (optional)
- USE_LLM: set to "0" to disable LLM action selection

STRICT stdout format (do not change):
[START] task=<task_id>
[STEP] action=<int>
[END] task=<task_id> score=<float, 4 decimals>

Everything else (warnings, errors, the final summary) goes to stderr.
"""

from __future__ import annotations

import json
import os
import re
import sys
import time
from typing import Dict, List, Optional, Tuple

import requests
from openai import OpenAI

MODEL_NAME = os.getenv("MODEL_NAME") or "gpt-4o-mini"
# Phase-2 validator expects API_KEY through provided proxy.
API_KEY = os.getenv("API_KEY")
ENV_URL: str = os.getenv("ENV_URL", "http://localhost:7860")
LOCAL_IMAGE_NAME: str | None = os.getenv("LOCAL_IMAGE_NAME")

TASKS: List[str] = ["rename_variables", "remove_dead_code", "full_refactor"]

ACTION_MEANINGS: Dict[int, str] = {
    0: "rename_variable",
    1: "remove_dead_code",
    2: "simplify_loop",
    3: "optimize_condition",
    4: "inline_function",
}

SYSTEM_PROMPT = """\
You are an RL agent that refactors Python code. Choose one action per step.

Actions:
  0  rename_variable   - rename generic names (x, tmp, i) to descriptive ones
  1  remove_dead_code  - remove unreachable stmts, if False blocks, unused vars
  2  simplify_loop     - convert append-loops to list comprehensions
  3  optimize_condition- simplify 'not not x', 'if True/False', 'x==True'
  4  inline_function   - inline simple single-return module-level functions

Respond ONLY with valid JSON (no markdown):
{"action": <0-4>, "reason": ""}"""

SAFE_FALLBACK_SCORES: Dict[str, float] = {
    "easy": 0.0,
    "medium": 0.0,
    "hard": 0.0,
    "final": 0.0,
}

# Result key used for each task; any task not listed is reported as "hard".
_TASK_TIERS: Dict[str, str] = {
    "rename_variables": "easy",
    "remove_dead_code": "medium",
}

# Number of actions attempted per task.
_MAX_EPISODE_STEPS = 5
# Timeout (seconds) applied to every HTTP call made to the environment.
_REQUEST_TIMEOUT = 5
_ENV_UNREACHABLE_WARNING = "Warning: Could not reach environment"

# Patterns used by the heuristic policy and by the LLM reply parser.
_GENERIC_NAME = re.compile(r"\b(x|tmp|i)\b")
_IF_FALSE = re.compile(r"\bif\s+False\b")
_IF_TRUE = re.compile(r"\bif\s+True\b")
_JSON_OBJECT = re.compile(r"\{.*\}", flags=re.DOTALL)


def _safe_scores() -> Dict[str, float]:
    """Return a fresh copy of the all-zero score table."""
    return dict(SAFE_FALLBACK_SCORES)


def _env_url() -> str:
    """Return the environment base URL without a trailing slash."""
    # Never crash due to missing env var.
    return str(ENV_URL or "http://localhost:7860").rstrip("/")


def _coerce_int(value: object, default: int) -> int:
    """Return ``int(value)``, or *default* when the value cannot be converted."""
    try:
        return int(value)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        return default


def _request_json(method: str, path: str, payload: dict | None = None) -> dict:
    """Call the environment and return its JSON object.

    Best-effort by design: on any failure a warning is written to stderr and
    an empty dict is returned. A response whose JSON body is not an object
    also yields an empty dict, so callers can always use ``.get``.
    """
    url = f"{_env_url()}{path}"
    try:
        if method == "POST":
            response = requests.post(url, json=payload or {}, timeout=_REQUEST_TIMEOUT)
        else:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        body = response.json()
    except Exception:
        print(_ENV_UNREACHABLE_WARNING, file=sys.stderr)
        return {}
    return body if isinstance(body, dict) else {}


def _post(path: str, payload: dict | None = None) -> dict:
    """POST *payload* (an empty object when omitted) to *path*."""
    return _request_json("POST", path, payload)


def _get(path: str) -> dict:
    """GET *path* from the environment."""
    return _request_json("GET", path)


def reset_env(task_id: str) -> dict:
    """Reset the remote environment for *task_id*."""
    return _post("/reset", {"task_id": task_id})


def step_env(action: int) -> dict:
    """Apply *action* in the remote environment and return the step result."""
    return _post("/step", {"action": action})


def get_state() -> dict:
    """Fetch the current state of the remote environment."""
    return _get("/state")


def grade(task_id: str, code: str) -> float:
    """Ask the environment to grade *code* for *task_id*.

    Returns the ``score`` field of the response as a float, or 0.0 (with a
    warning on stderr) when the request or the conversion fails.
    """
    try:
        response = requests.post(
            f"{_env_url()}/tasks/{task_id}/grade",
            json={"code": code},
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return float(response.json().get("score", 0.0))
    except Exception:
        print(_ENV_UNREACHABLE_WARNING, file=sys.stderr)
        return 0.0


def _heuristic_action(state: dict, task_id: str) -> Tuple[int, str]:
    """Pick an action from textual patterns in the current code.

    Used when the LLM is disabled, unavailable, or returns an unusable reply.
    The priority order of the checks depends on *task_id*.
    """
    raw_code = state.get("current_code", "")
    code = "" if raw_code is None else str(raw_code)
    # A missing/null step counter is treated as step 0 instead of raising.
    step_i = _coerce_int(state.get("episode_steps", 0), 0)

    has_generic = _GENERIC_NAME.search(code) is not None
    has_if_false = _IF_FALSE.search(code) is not None
    has_if_true = _IF_TRUE.search(code) is not None
    has_append_loop = ".append(" in code and "for " in code
    has_double_not = "not not" in code
    has_add_call = "add(" in code
    has_dead_code = has_if_false or "unused" in code

    if task_id == "rename_variables":
        if has_generic:
            return 0, "heuristic: remove generic names first"
        if has_dead_code:
            return 1, "heuristic: remove dead code"
        if has_append_loop:
            return 2, "heuristic: simplify loop"
        if has_if_true or has_double_not:
            return 3, "heuristic: optimize conditions"
        return 4, "heuristic: inline simple function"

    if task_id == "remove_dead_code":
        if has_dead_code:
            return 1, "heuristic: remove dead code patterns"
        if has_append_loop:
            return 2, "heuristic: convert append-loop"
        if has_if_true or has_double_not:
            return 3, "heuristic: simplify conditions"
        if has_generic:
            return 0, "heuristic: clean generic names"
        return 4, "heuristic: inline helper"

    # Any other task (e.g. "full_refactor").
    if has_generic:
        return 0, "heuristic: rename generic variables"
    if has_append_loop:
        return 2, "heuristic: simplify loop into listcomp"
    if has_if_false or has_if_true or has_double_not:
        return 3, "heuristic: optimize boolean branches"
    if has_add_call:
        return 4, "heuristic: inline add() call"
    if step_i >= 2:
        return 1, "heuristic: remove remaining dead code"
    return 3, "heuristic: condition optimization as safe default"


def choose_action(client: Optional[OpenAI], state: dict, task_id: str) -> Tuple[int, str]:
    """Choose the next refactoring action for *state*.

    Args:
        client: OpenAI-compatible client, or None to force the heuristic.
        state: environment state; the keys read are ``current_code``,
            ``episode_steps``, ``max_steps`` and ``complexity``.
        task_id: identifier of the task being solved.

    Returns:
        ``(action, reason)`` where ``action`` is a key of ``ACTION_MEANINGS``.
        Any LLM failure (request error, malformed reply, out-of-range action)
        falls back to the heuristic policy; this function does not raise for
        malformed state values.
    """
    # Enable LLM by default when credentials are present.
    use_llm = bool(API_KEY) and os.getenv("USE_LLM", "1") == "1"
    if not use_llm or client is None:
        return _heuristic_action(state, task_id)

    try:
        # Built inside the try so that odd state values cannot escape as an
        # exception and abort the whole run.
        steps_remaining = _coerce_int(state.get("max_steps", 5), 5) - _coerce_int(
            state.get("episode_steps", 0), 0
        )
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": (
                    f"Task: {task_id}\n"
                    f"Steps remaining: {steps_remaining}\n"
                    f"Complexity: {state.get('complexity', 0)}\n\n"
                    f"Current code:\n```python\n{state.get('current_code', '')}\n```\n\n"
                    "Choose the best action."
                ),
            },
        ]
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.0,
            max_tokens=120,
        )
        raw = (response.choices[0].message.content or "").strip()
        # Tolerate prose or markdown fences around the JSON object.
        match = _JSON_OBJECT.search(raw)
        if match is None:
            return _heuristic_action(state, task_id)
        parsed = json.loads(match.group(0))
        action = int(parsed.get("action", -1))
        reason = str(parsed.get("reason", ""))
        if action in ACTION_MEANINGS:
            return action, reason or "llm-selected action"
        return _heuristic_action(state, task_id)
    except Exception:
        # Deliberate best-effort: never let the LLM path break an episode.
        return _heuristic_action(state, task_id)


def _build_openai_client() -> Optional[OpenAI]:
    """
    Build OpenAI-compatible client using hackathon-required proxy env vars.
    Falls back safely (returns None) when vars are absent in local runs.
    """
    base_url = os.getenv("API_BASE_URL")
    api_key = os.getenv("API_KEY")
    if not base_url or not api_key:
        return None
    try:
        return OpenAI(base_url=base_url, api_key=api_key)
    except Exception:
        return None


def _touch_proxy(client: Optional[OpenAI]) -> None:
    """
    Ensure at least one request is sent through the provided proxy in Phase-2.
    """
    if client is None:
        return
    try:
        client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": "Return exactly: ok"}],
            temperature=0.0,
            max_tokens=2,
        )
    except Exception:
        # Keep inference resilient even if proxy is temporarily unavailable.
        return


def run_episode(client: Optional[OpenAI], task_id: str, episode_num: int) -> float:
    """Run one episode over HTTP and return the graded score.

    Stops early when the step result reports ``done``, ``terminated`` or
    ``truncated``. ``episode_num`` is accepted for interface compatibility
    and is not used.
    """
    reset_env(task_id)
    state = get_state()

    # STRICT logging format required by evaluator.
    print(f"[START] task={task_id}", flush=True)

    for _ in range(_MAX_EPISODE_STEPS):
        action, _reason = choose_action(client, state, task_id)
        result = step_env(action)
        state = get_state()

        # STRICT logging format required by evaluator.
        print(f"[STEP] action={int(action)}", flush=True)

        if result.get("done") or result.get("terminated") or result.get("truncated"):
            break

    final_state = get_state()
    task_score = grade(task_id, final_state.get("current_code", ""))

    # STRICT logging format required by evaluator.
    print(f"[END] task={task_id} score={task_score:.4f}", flush=True)
    return task_score


def _load_local_env() -> Tuple[Optional[object], Optional[object]]:
    """Return ``(env, registry)`` for in-process execution, or ``(None, None)``.

    The project modules are only importable when running inside the server;
    anywhere else the caller is expected to fall back to HTTP.
    """
    try:
        from acre.tasks.task_registry import TaskRegistry
        from openenv_interface import OpenEnvRefactorEnv
    except Exception:
        return None, None
    try:
        registry = TaskRegistry()
        return OpenEnvRefactorEnv(registry=registry), registry
    except Exception as exc:
        print(f"Warning: local environment unavailable ({exc})", file=sys.stderr)
        return None, None


def _record_score(results: Dict[str, float], scores: List[float], task_id: str, score: float) -> None:
    """Store *score* under the task's tier and refresh the running mean."""
    scores.append(score)
    results[_TASK_TIERS.get(task_id, "hard")] = score
    results["final"] = float(sum(scores) / len(scores)) if scores else 0.0


def _run_tasks_over_http(client: Optional[OpenAI]) -> Dict[str, float]:
    """Run every task against the HTTP environment and return the score table."""
    # Network safety: quick health probe before running.
    try:
        probe = requests.get(f"{_env_url()}/health", timeout=_REQUEST_TIMEOUT)
        probe.raise_for_status()
    except Exception:
        print(_ENV_UNREACHABLE_WARNING, file=sys.stderr)
        return _safe_scores()

    results = _safe_scores()
    scores: List[float] = []
    for task_id in TASKS:
        print(f"[START] task={task_id}", flush=True)
        reset_env(task_id)
        # NOTE(review): unlike run_episode, this loop always takes the full
        # number of steps and does not inspect the step result for "done".
        for _ in range(_MAX_EPISODE_STEPS):
            state = get_state()
            action, _reason = choose_action(client, state, task_id)
            print(f"[STEP] action={int(action)}", flush=True)
            step_env(action)
        final_state = get_state()
        score = float(grade(task_id, final_state.get("current_code", "")))
        print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
        _record_score(results, scores, task_id, score)
    return results


def _run_tasks_locally(client: Optional[OpenAI], env, registry) -> Dict[str, float]:
    """Run every task on the in-process environment (fast + no network recursion)."""
    results = _safe_scores()
    scores: List[float] = []
    for task_id in TASKS:
        print(f"[START] task={task_id}", flush=True)
        env.reset(seed=0, task_id=task_id)
        for _ in range(_MAX_EPISODE_STEPS):
            st = env.state()
            state_payload = {
                "current_code": str(st.current_code),
                "episode_steps": int(st.episode_steps),
                "max_steps": int(st.max_steps),
                "complexity": float(st.complexity),
            }
            action, _reason = choose_action(client, state_payload, task_id)
            action = int(action)
            print(f"[STEP] action={int(action)}", flush=True)
            env.step(action)
        st = env.state()
        task = registry.get_task(task_id)
        score = float(task.grade_against_expected(st.current_code)) if task is not None else 0.0
        print(f"[END] task={task_id} score={float(score):.4f}", flush=True)
        _record_score(results, scores, task_id, score)
    return results


def run_all_tasks() -> Dict[str, float]:
    """
    Run all three tasks and return deterministic scores.
    This is used by the FastAPI server to show live demo results on the Space.

    Returns a dict with the keys ``easy``, ``medium``, ``hard`` and ``final``
    (the mean of the per-task scores). Any unexpected error is reported on
    stderr and yields the all-zero table instead of raising.
    """
    try:
        # Prefer local in-process execution when running inside the server
        # (no ENV_URL needed). Otherwise fall back to HTTP.
        env, registry = _load_local_env()

        client = _build_openai_client()
        _touch_proxy(client)

        if env is None or registry is None:
            return _run_tasks_over_http(client)
        return _run_tasks_locally(client, env, registry)
    except Exception as e:
        print(f"ERROR: {str(e)}", file=sys.stderr)
        return _safe_scores()


def main() -> None:
    """Run all tasks and print the score summary to stderr."""
    # Never crash. Always produce output.
    result = run_all_tasks()
    print(f"Easy: {float(result.get('easy', 0.0)):.4f}", file=sys.stderr)
    print(f"Medium: {float(result.get('medium', 0.0)):.4f}", file=sys.stderr)
    print(f"Hard: {float(result.get('hard', 0.0)):.4f}", file=sys.stderr)
    print(f"Final: {float(result.get('final', 0.0)):.4f}", file=sys.stderr)
    return None


if __name__ == "__main__":
    try:
        # Go through main() so the score summary is actually emitted; it is
        # written to stderr and leaves the strict stdout stream untouched.
        main()
    except Exception as e:
        print(f"Fatal error: {e}", file=sys.stderr)