"""
Baseline inference script for the Compiler Pass Ordering RL Environment.

Runs an LLM agent (via OpenAI-compatible API) against all 3 tasks and
produces a reproducible baseline score report.

Usage:
    export OPENAI_API_KEY=your_key_here
    export OPENAI_BASE_URL=https://api.openai.com/v1  # optional, defaults to OpenAI
    python baseline_agent.py --base-url http://localhost:8000

Optional environment variables:
    OPENAI_MODEL       default value for --model (falls back to "gpt-4o-mini")
    BASELINE_EPISODES  default value for --episodes (falls back to 5)

Requirements:
    pip install openai
    (server must be running: uvicorn server.app:app --host 0.0.0.0 --port 8000)
"""

import argparse
import json
import os
import random
import re
import time

from openai import OpenAI

from compiler_opt_env import CompilerOptAction, CompilerOptEnv
from compiler_opt_env.models import PASS_NAMES, TASK_EASY, TASK_MEDIUM, TASK_HARD

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
EPISODES = int(os.getenv("BASELINE_EPISODES", "5"))  # episodes per task
MAX_RETRIES = 3

TASK_NAMES = {TASK_EASY: "Easy", TASK_MEDIUM: "Medium", TASK_HARD: "Hard"}

# Matches the first flat JSON object in a reply; used when the model wraps
# its answer in markdown fences or adds prose around it.
_JSON_OBJECT_RE = re.compile(r"\{[^{}]*\}")

SYSTEM_PROMPT = """You are an expert compiler engineer. You are controlling a compiler optimization pipeline.
At each step you must choose ONE optimization pass to apply to the program's Intermediate Representation (IR) to minimize its estimated runtime cost.

Available passes (use the integer ID):
0: dead_code_elimination — removes unreachable/unused code
1: constant_folding — evaluates constant expressions at compile time
2: loop_unrolling — expands loop bodies to reduce iteration overhead
3: function_inlining — replaces function calls with function body
4: vectorization — uses SIMD instructions for parallel computation
5: loop_invariant_motion — moves loop-invariant code outside the loop
6: strength_reduction — replaces expensive ops with cheaper equivalents
7: common_subexpr_elimination — eliminates redundant computations
8: tail_call_optimization — converts tail recursion to iteration
9: branch_prediction_hints — adds CPU branch prediction metadata
10: register_allocation — optimizes register usage
11: instruction_scheduling — reorders instructions to avoid pipeline stalls
12: memory_coalescing — combines memory accesses for cache efficiency
13: alias_analysis — determines which pointers can alias (enables others)
14: interprocedural_analysis — cross-function analysis (enables inlining)

IMPORTANT: Some passes are much more effective when specific prerequisite passes have been applied first. For example, vectorization is nearly useless without alias_analysis and dead_code_elimination applied first. Think carefully about ordering — applying enabler passes early unlocks large gains later.

You must respond with ONLY a JSON object: {"pass_id": <integer>}
No explanation, no markdown, just the JSON."""


def _mean(values) -> float:
    """Return the arithmetic mean of *values*, or 0.0 when it is empty."""
    values = list(values)
    return sum(values) / len(values) if values else 0.0


def _parse_pass_id(raw) -> int:
    """Extract the integer ``pass_id`` from an LLM reply.

    The reply is first parsed as JSON as-is. If that fails, the first
    ``{...}`` object found in the text is parsed instead, so replies wrapped
    in markdown fences or surrounded by prose are still accepted.

    Args:
        raw: Message content returned by the chat completion; may be ``None``.

    Returns:
        The value of the ``pass_id`` key converted with ``int()``.

    Raises:
        json.JSONDecodeError: No parseable JSON object was found.
        KeyError: The JSON object has no ``pass_id`` key.
        TypeError: The parsed JSON is not an object, or ``pass_id`` is not
            convertible to ``int``.
        ValueError: ``pass_id`` is a string that is not an integer.
    """
    text = (raw or "").strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        match = _JSON_OBJECT_RE.search(text)
        if match is None:
            raise
        parsed = json.loads(match.group(0))
    return int(parsed["pass_id"])


def build_user_prompt(obs) -> str:
    """Render the per-step user message describing the current observation.

    Args:
        obs: Environment observation. The attributes read here are
            ``passes_applied``, ``passes_available``, ``program_type``,
            ``estimated_cost``, ``baseline_cost``, ``improvement_pct``,
            ``step_count``, ``max_steps``, ``synergy_state`` (indexed by pass
            id) and ``task_description``.

    Returns:
        The prompt text sent as the ``user`` message.
    """
    applied_names = [PASS_NAMES[p] for p in obs.passes_applied]
    available_names = {p: PASS_NAMES[p] for p in obs.passes_available}
    synergy = {p: round(obs.synergy_state[p], 2) for p in obs.passes_available}
    return f"""Current program state:
- Program type: {obs.program_type}
- Estimated cost: {obs.estimated_cost:.1f} (baseline: {obs.baseline_cost:.1f})
- Cost reduction so far: {obs.improvement_pct:.1f}%
- Steps used: {obs.step_count} / {obs.max_steps}
- Passes applied so far (in order): {applied_names if applied_names else 'none'}
- Available passes: {json.dumps(available_names)}
- Synergy state (effectiveness multipliers): {synergy}

Task: {obs.task_description}

Which pass should be applied next? Respond with only: {{"pass_id": <integer>}}"""


def run_llm_episode(env, openai_client: OpenAI, task_id: int, model: str = MODEL) -> dict:
    """Run one episode with the LLM agent and return the episode result dict.

    Each step sends a fresh system + user message pair (no conversation
    history) and asks the model for a pass id. If every attempt fails, or the
    model names a pass that is not currently available, a random available
    pass is used instead so the episode always progresses.

    Args:
        env: Synchronous environment client exposing ``reset()`` and ``step()``.
        openai_client: Client used for the chat completion calls.
        task_id: Task identifier forwarded with every action.
        model: Chat model name; defaults to the module-level ``MODEL``.

    Returns:
        Dict with keys ``task_id``, ``improvement_pct``, ``grader_score``,
        ``steps_used``, ``passes_applied``, ``total_reward`` and
        ``program_type``.
    """
    result = env.reset()
    # reset() may return either a wrapper with .observation or the observation itself.
    obs = getattr(result, "observation", result)
    episode_rewards = []

    while not obs.done:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(obs)},
        ]

        # Call LLM with retries
        pass_id = None
        for attempt in range(MAX_RETRIES):
            try:
                response = openai_client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=0.2,
                    max_tokens=50,
                )
                pass_id = _parse_pass_id(response.choices[0].message.content)
            except Exception as e:
                # Deliberately broad: API/network failures and malformed
                # replies are all handled by retrying, then falling back.
                print(f" [retry {attempt+1}] LLM parse error: {e}")
                if attempt + 1 < MAX_RETRIES:
                    time.sleep(1)
                continue

            if pass_id not in obs.passes_available:
                print(f" [warn] LLM chose unavailable pass {pass_id}, picking random")
                pass_id = random.choice(obs.passes_available)
            break

        if pass_id is None:
            pass_id = random.choice(obs.passes_available)
            print(f" [fallback] Using random pass: {PASS_NAMES[pass_id]}")

        step_result = env.step(CompilerOptAction(pass_id=pass_id, task_id=task_id))
        obs = step_result.observation
        # A missing reward is treated as 0.0 for both accounting and display.
        reward = step_result.reward or 0.0
        episode_rewards.append(reward)
        print(f" Step {obs.step_count}: {PASS_NAMES[pass_id]:35s} "
              f"→ improvement={obs.improvement_pct:.1f}% "
              f"reward={reward:.4f}")

    return {
        "task_id": task_id,
        "improvement_pct": obs.improvement_pct,
        "grader_score": obs.grader_score,
        "steps_used": obs.step_count,
        "passes_applied": [PASS_NAMES[p] for p in obs.passes_applied],
        "total_reward": sum(episode_rewards),
        "program_type": obs.program_type,
    }


def main():
    """Run the baseline over all three tasks, print a report and save it as JSON.

    Raises:
        ValueError: ``OPENAI_API_KEY`` is not set.
        SystemExit: Invalid command-line arguments (e.g. ``--episodes`` < 1).
    """
    parser = argparse.ArgumentParser(description="Compiler Opt Env — LLM Baseline Agent")
    parser.add_argument("--base-url", default="http://localhost:8000",
                        help="Environment server URL")
    parser.add_argument("--episodes", type=int, default=EPISODES,
                        help="Episodes per task")
    parser.add_argument("--model", default=MODEL,
                        help="OpenAI model name")
    args = parser.parse_args()

    # Averages below divide by the episode count, so zero/negative is invalid.
    if args.episodes < 1:
        parser.error("--episodes must be at least 1")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    openai_client = OpenAI(
        api_key=api_key,
        base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    )

    print(f"\n{'='*65}")
    print(f" Compiler Pass Ordering — LLM Baseline ({args.model})")
    print(f" Server: {args.base_url} | Episodes per task: {args.episodes}")
    print(f"{'='*65}\n")

    task_ids = [TASK_EASY, TASK_MEDIUM, TASK_HARD]
    all_results = []

    with CompilerOptEnv(base_url=args.base_url).sync() as env:
        for task_id in task_ids:
            print(f"\n--- Task {task_id} ({TASK_NAMES[task_id]}) ---")
            task_results = []

            for ep in range(args.episodes):
                print(f" Episode {ep+1}/{args.episodes}:")
                # Forward --model so the CLI flag actually selects the model.
                result = run_llm_episode(env, openai_client, task_id, model=args.model)
                task_results.append(result)
                print(f" → Grader score: {(result['grader_score'] or 0):.3f} "
                      f"Improvement: {result['improvement_pct']:.1f}%\n")

            avg_score = _mean(r['grader_score'] or 0 for r in task_results)
            avg_improv = _mean(r['improvement_pct'] for r in task_results)
            all_results.extend(task_results)
            print(f" Task {task_id} average — score: {avg_score:.3f} improvement: {avg_improv:.1f}%")

    # ---------------------------------------------------------------------------
    # Summary report
    # ---------------------------------------------------------------------------
    print(f"\n{'='*65}")
    print(" BASELINE SCORE REPORT")
    print(f"{'='*65}")
    print(f" Model: {args.model}")
    print(f" Episodes per task: {args.episodes}\n")

    # Per-task aggregates are computed once and reused for the JSON summary.
    by_task = {}
    for task_id in task_ids:
        task_r = [r for r in all_results if r['task_id'] == task_id]
        scores = [r['grader_score'] or 0 for r in task_r]
        improvs = [r['improvement_pct'] for r in task_r]
        by_task[str(task_id)] = {
            "avg_score": _mean(scores),
            "avg_improvement_pct": _mean(improvs),
        }
        print(f" Task {task_id} ({TASK_NAMES[task_id]:6s}): "
              f"avg_score={by_task[str(task_id)]['avg_score']:.3f} "
              f"avg_improvement={by_task[str(task_id)]['avg_improvement_pct']:.1f}% "
              f"best={max(scores):.3f}")

    overall = _mean(r['grader_score'] or 0 for r in all_results)
    print(f"\n Overall average score: {overall:.3f} / 1.000")
    print(f"{'='*65}\n")

    # Save results to JSON
    output_path = "baseline_results.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({
            "model": args.model,
            "episodes": args.episodes,
            "results": all_results,
            "summary": {
                "overall_avg_score": overall,
                "by_task": by_task,
            },
        }, f, indent=2)
    print(f"Full results saved to: {output_path}")


if __name__ == "__main__":
    main()