Spaces:
Sleeping
Sleeping
| """ | |
| Baseline inference script for the Compiler Pass Ordering RL Environment. | |
| Runs an LLM agent (via OpenAI-compatible API) against all 3 tasks and | |
| produces a reproducible baseline score report. | |
| Usage: | |
| export OPENAI_API_KEY=your_key_here | |
| export OPENAI_BASE_URL=https://api.openai.com/v1 # optional, defaults to OpenAI | |
| python baseline_agent.py --base-url http://localhost:8000 | |
| Requirements: | |
| pip install openai | |
| (server must be running: uvicorn server.app:app --host 0.0.0.0 --port 8000) | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import time | |
| from openai import OpenAI | |
| from compiler_opt_env import CompilerOptAction, CompilerOptEnv | |
| from compiler_opt_env.models import PASS_NAMES, TASK_EASY, TASK_MEDIUM, TASK_HARD | |
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Model name; overridable via OPENAI_MODEL env var.
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
# Number of episodes to run per task; overridable via BASELINE_EPISODES.
EPISODES = int(os.getenv("BASELINE_EPISODES", "5"))  # episodes per task
# LLM call attempts per step before falling back to a random pass.
MAX_RETRIES = 3
# Human-readable labels for the three task difficulty levels.
TASK_NAMES = {TASK_EASY: "Easy", TASK_MEDIUM: "Medium", TASK_HARD: "Hard"}
# System prompt sent with every LLM request. It lists all 15 pass IDs, hints at
# prerequisite/ordering effects, and instructs the model to answer with a bare
# JSON object {"pass_id": <int>} — which run_llm_episode parses via json.loads.
SYSTEM_PROMPT = """You are an expert compiler engineer. You are controlling a compiler
optimization pipeline. At each step you must choose ONE optimization pass to apply
to the program's Intermediate Representation (IR) to minimize its estimated runtime cost.
Available passes (use the integer ID):
0: dead_code_elimination — removes unreachable/unused code
1: constant_folding — evaluates constant expressions at compile time
2: loop_unrolling — expands loop bodies to reduce iteration overhead
3: function_inlining — replaces function calls with function body
4: vectorization — uses SIMD instructions for parallel computation
5: loop_invariant_motion — moves loop-invariant code outside the loop
6: strength_reduction — replaces expensive ops with cheaper equivalents
7: common_subexpr_elimination — eliminates redundant computations
8: tail_call_optimization — converts tail recursion to iteration
9: branch_prediction_hints — adds CPU branch prediction metadata
10: register_allocation — optimizes register usage
11: instruction_scheduling — reorders instructions to avoid pipeline stalls
12: memory_coalescing — combines memory accesses for cache efficiency
13: alias_analysis — determines which pointers can alias (enables others)
14: interprocedural_analysis — cross-function analysis (enables inlining)
IMPORTANT: Some passes are much more effective when specific prerequisite passes
have been applied first. For example, vectorization is nearly useless without
alias_analysis and dead_code_elimination applied first. Think carefully about
ordering — applying enabler passes early unlocks large gains later.
You must respond with ONLY a JSON object: {"pass_id": <integer 0-14>}
No explanation, no markdown, just the JSON."""
def build_user_prompt(obs) -> str:
    """Render the current environment observation as the per-step user message.

    Summarizes cost, progress, applied/available passes, and the synergy
    multipliers, then asks for a single JSON {"pass_id": <integer>} reply.
    """
    applied = [PASS_NAMES[p] for p in obs.passes_applied]
    available = {p: PASS_NAMES[p] for p in obs.passes_available}
    # Effectiveness multiplier for each currently-available pass.
    synergy = {p: round(obs.synergy_state[p], 2) for p in obs.passes_available}
    return f"""Current program state:
- Program type: {obs.program_type}
- Estimated cost: {obs.estimated_cost:.1f} (baseline: {obs.baseline_cost:.1f})
- Cost reduction so far: {obs.improvement_pct:.1f}%
- Steps used: {obs.step_count} / {obs.max_steps}
- Passes applied so far (in order): {applied or 'none'}
- Available passes: {json.dumps(available)}
- Synergy state (effectiveness multipliers): {synergy}
Task: {obs.task_description}
Which pass should be applied next? Respond with only: {{"pass_id": <integer>}}"""
def run_llm_episode(env, openai_client: OpenAI, task_id: int) -> dict:
    """Run one episode with the LLM agent. Returns episode result dict.

    Each step: prompt the model for a pass ID (with retries), fall back to a
    random available pass on failure, apply it via env.step, and log progress.

    Args:
        env: synchronous environment client (reset/step) — project type.
        openai_client: configured OpenAI-compatible client.
        task_id: which task to play (TASK_EASY / TASK_MEDIUM / TASK_HARD).

    Returns:
        Dict with improvement_pct, grader_score, steps_used, passes_applied,
        total_reward, and program_type from the final observation.
    """
    result = env.reset()
    # Some client versions return the observation directly, others wrap it.
    obs = result.observation if hasattr(result, 'observation') else result
    episode_rewards = []
    while not obs.done:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_user_prompt(obs)},
        ]
        # Call LLM with retries; pass_id stays None if every attempt fails.
        pass_id = None
        for attempt in range(MAX_RETRIES):
            try:
                response = openai_client.chat.completions.create(
                    model=MODEL,
                    messages=messages,
                    temperature=0.2,
                    max_tokens=50,
                )
                raw = response.choices[0].message.content.strip()
                parsed = json.loads(raw)
                pass_id = int(parsed["pass_id"])
                if pass_id not in obs.passes_available:
                    # Valid JSON but an illegal choice — substitute a legal one.
                    print(f" [warn] LLM chose unavailable pass {pass_id}, picking random")
                    pass_id = random.choice(obs.passes_available)
                break
            except Exception as e:
                # Covers API errors, non-JSON replies, and missing/bad "pass_id".
                print(f" [retry {attempt+1}] LLM parse error: {e}")
                time.sleep(1)
        if pass_id is None:
            pass_id = random.choice(obs.passes_available)
            print(f" [fallback] Using random pass: {PASS_NAMES[pass_id]}")
        step_result = env.step(CompilerOptAction(pass_id=pass_id, task_id=task_id))
        obs = step_result.observation
        # reward may be None (e.g. non-terminal steps); normalize once so both
        # the accumulator and the log line below are safe. (Previously the log
        # formatted step_result.reward directly and crashed on None.)
        reward = step_result.reward or 0.0
        episode_rewards.append(reward)
        print(f" Step {obs.step_count}: {PASS_NAMES[pass_id]:35s} "
              f"→ improvement={obs.improvement_pct:.1f}% "
              f"reward={reward:.4f}")
    return {
        "task_id": task_id,
        "improvement_pct": obs.improvement_pct,
        "grader_score": obs.grader_score,
        "steps_used": obs.step_count,
        "passes_applied": [PASS_NAMES[p] for p in obs.passes_applied],
        "total_reward": sum(episode_rewards),
        "program_type": obs.program_type,
    }
def main():
    """Entry point: run the LLM baseline over all three tasks and report scores.

    Parses CLI flags, runs `--episodes` episodes per task, prints a per-task
    and overall summary, and writes the full results to baseline_results.json.
    """
    parser = argparse.ArgumentParser(description="Compiler Opt Env — LLM Baseline Agent")
    parser.add_argument("--base-url", default="http://localhost:8000", help="Environment server URL")
    parser.add_argument("--episodes", type=int, default=EPISODES, help="Episodes per task")
    parser.add_argument("--model", default=MODEL, help="OpenAI model name")
    args = parser.parse_args()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")
    openai_client = OpenAI(
        api_key=api_key,
        base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    )

    def _mean(values):
        # Simple arithmetic mean; callers guarantee non-empty input.
        return sum(values) / len(values)

    banner = '=' * 65
    task_order = [TASK_EASY, TASK_MEDIUM, TASK_HARD]

    print(f"\n{banner}")
    print(f" Compiler Pass Ordering — LLM Baseline ({args.model})")
    print(f" Server: {args.base_url} | Episodes per task: {args.episodes}")
    print(f"{banner}\n")

    all_results = []
    with CompilerOptEnv(base_url=args.base_url).sync() as env:
        for task_id in task_order:
            print(f"\n--- Task {task_id} ({TASK_NAMES[task_id]}) ---")
            task_results = []
            for ep in range(args.episodes):
                print(f" Episode {ep+1}/{args.episodes}:")
                outcome = run_llm_episode(env, openai_client, task_id)
                task_results.append(outcome)
                print(f" → Grader score: {outcome['grader_score']:.3f} "
                      f"Improvement: {outcome['improvement_pct']:.1f}%\n")
            avg_score = _mean([r['grader_score'] or 0 for r in task_results])
            avg_improv = _mean([r['improvement_pct'] for r in task_results])
            all_results.extend(task_results)
            print(f" Task {task_id} average — score: {avg_score:.3f} improvement: {avg_improv:.1f}%")

    # -----------------------------------------------------------------------
    # Summary report
    # -----------------------------------------------------------------------
    print(f"\n{banner}")
    print(" BASELINE SCORE REPORT")
    print(f"{banner}")
    print(f" Model: {args.model}")
    print(f" Episodes per task: {args.episodes}\n")
    by_task = {}
    for task_id in task_order:
        rows = [r for r in all_results if r['task_id'] == task_id]
        scores = [r['grader_score'] or 0 for r in rows]
        improvs = [r['improvement_pct'] for r in rows]
        by_task[str(task_id)] = {
            "avg_score": _mean(scores),
            "avg_improvement_pct": _mean(improvs),
        }
        print(f" Task {task_id} ({TASK_NAMES[task_id]:6s}): "
              f"avg_score={_mean(scores):.3f} "
              f"avg_improvement={_mean(improvs):.1f}% "
              f"best={max(scores):.3f}")
    overall = _mean([r['grader_score'] or 0 for r in all_results])
    print(f"\n Overall average score: {overall:.3f} / 1.000")
    print(f"{banner}\n")

    # Persist the full run so the baseline is reproducible/comparable.
    output_path = "baseline_results.json"
    with open(output_path, "w") as f:
        json.dump({
            "model": args.model,
            "episodes": args.episodes,
            "results": all_results,
            "summary": {
                "overall_avg_score": overall,
                "by_task": by_task,
            },
        }, f, indent=2)
    print(f"Full results saved to: {output_path}")
# Script entry point guard: only run the baseline when executed directly.
if __name__ == "__main__":
    main()