# Meta-Hackathon-main / baseline_agent.py
# Uploaded by Parth3841 via huggingface_hub (commit 7c2f148, verified)
"""
Baseline inference script for the Compiler Pass Ordering RL Environment.
Runs an LLM agent (via OpenAI-compatible API) against all 3 tasks and
produces a reproducible baseline score report.
Usage:
export OPENAI_API_KEY=your_key_here
export OPENAI_BASE_URL=https://api.openai.com/v1 # optional, defaults to OpenAI
python baseline_agent.py --base-url http://localhost:8000
Requirements:
pip install openai
(server must be running: uvicorn server.app:app --host 0.0.0.0 --port 8000)
"""
import argparse
import json
import os
import time
from openai import OpenAI
from compiler_opt_env import CompilerOptAction, CompilerOptEnv
from compiler_opt_env.models import PASS_NAMES, TASK_EASY, TASK_MEDIUM, TASK_HARD
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Model name for the OpenAI-compatible endpoint; override via OPENAI_MODEL.
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
# How many episodes to run per task; override via BASELINE_EPISODES.
EPISODES = int(os.getenv("BASELINE_EPISODES", "5"))  # episodes per task
# Max LLM call attempts per step before falling back to a random pass.
MAX_RETRIES = 3
# Human-readable labels for the three task-difficulty IDs.
TASK_NAMES = {TASK_EASY: "Easy", TASK_MEDIUM: "Medium", TASK_HARD: "Hard"}
# System prompt sent on every step: lists the pass catalogue (IDs 0-14),
# explains pass-ordering synergies, and demands a JSON-only reply.
# NOTE: runtime string consumed by the LLM — do not reword or reformat.
SYSTEM_PROMPT = """You are an expert compiler engineer. You are controlling a compiler
optimization pipeline. At each step you must choose ONE optimization pass to apply
to the program's Intermediate Representation (IR) to minimize its estimated runtime cost.
Available passes (use the integer ID):
0: dead_code_elimination — removes unreachable/unused code
1: constant_folding — evaluates constant expressions at compile time
2: loop_unrolling — expands loop bodies to reduce iteration overhead
3: function_inlining — replaces function calls with function body
4: vectorization — uses SIMD instructions for parallel computation
5: loop_invariant_motion — moves loop-invariant code outside the loop
6: strength_reduction — replaces expensive ops with cheaper equivalents
7: common_subexpr_elimination — eliminates redundant computations
8: tail_call_optimization — converts tail recursion to iteration
9: branch_prediction_hints — adds CPU branch prediction metadata
10: register_allocation — optimizes register usage
11: instruction_scheduling — reorders instructions to avoid pipeline stalls
12: memory_coalescing — combines memory accesses for cache efficiency
13: alias_analysis — determines which pointers can alias (enables others)
14: interprocedural_analysis — cross-function analysis (enables inlining)
IMPORTANT: Some passes are much more effective when specific prerequisite passes
have been applied first. For example, vectorization is nearly useless without
alias_analysis and dead_code_elimination applied first. Think carefully about
ordering — applying enabler passes early unlocks large gains later.
You must respond with ONLY a JSON object: {"pass_id": <integer 0-14>}
No explanation, no markdown, just the JSON."""
def build_user_prompt(obs) -> str:
    """Render the current observation as the per-step user message.

    Mirrors the observation fields the model needs in order to pick the
    next pass: costs, progress, applied/available passes, the synergy
    multipliers, and the task description.
    """
    applied = [PASS_NAMES[p] for p in obs.passes_applied]
    available = {p: PASS_NAMES[p] for p in obs.passes_available}
    synergy = dict(zip(obs.passes_available,
                       [round(obs.synergy_state[p], 2) for p in obs.passes_available]))
    lines = [
        "Current program state:",
        f"- Program type: {obs.program_type}",
        f"- Estimated cost: {obs.estimated_cost:.1f} (baseline: {obs.baseline_cost:.1f})",
        f"- Cost reduction so far: {obs.improvement_pct:.1f}%",
        f"- Steps used: {obs.step_count} / {obs.max_steps}",
        f"- Passes applied so far (in order): {applied if applied else 'none'}",
        f"- Available passes: {json.dumps(available)}",
        f"- Synergy state (effectiveness multipliers): {synergy}",
        f"Task: {obs.task_description}",
        'Which pass should be applied next? Respond with only: {"pass_id": <integer>}',
    ]
    return "\n".join(lines)
def run_llm_episode(env, openai_client: OpenAI, task_id: int) -> dict:
    """Run one episode with the LLM agent. Returns episode result dict.

    Each step: build a prompt from the current observation, ask the model
    for a pass id (up to MAX_RETRIES attempts), fall back to a random
    available pass on persistent failure, then step the environment.

    Args:
        env: Synchronous CompilerOptEnv handle (supports reset/step).
        openai_client: OpenAI-compatible client for chat completions.
        task_id: Task identifier (TASK_EASY / TASK_MEDIUM / TASK_HARD).

    Returns:
        Dict with improvement_pct, grader_score, steps_used, the ordered
        pass names applied, total_reward, and program_type.
    """
    import random  # stdlib; imported once here instead of inside the loop

    result = env.reset()
    # Some client versions return the observation bare, others wrap it.
    obs = result.observation if hasattr(result, 'observation') else result
    episode_rewards = []
    while not obs.done:
        user_msg = build_user_prompt(obs)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_msg},
        ]
        # Call LLM with retries; any API/parse error sleeps briefly and retries.
        pass_id = None
        for attempt in range(MAX_RETRIES):
            try:
                response = openai_client.chat.completions.create(
                    model=MODEL,
                    messages=messages,
                    temperature=0.2,
                    max_tokens=50,
                )
                raw = response.choices[0].message.content.strip()
                parsed = json.loads(raw)
                pass_id = int(parsed["pass_id"])
                if pass_id not in obs.passes_available:
                    print(f" [warn] LLM chose unavailable pass {pass_id}, picking random")
                    pass_id = random.choice(obs.passes_available)
                break
            except Exception as e:
                print(f" [retry {attempt+1}] LLM parse error: {e}")
                time.sleep(1)
        if pass_id is None:
            # All retries failed — fall back to a uniformly random legal pass.
            pass_id = random.choice(obs.passes_available)
            print(f" [fallback] Using random pass: {PASS_NAMES[pass_id]}")
        step_result = env.step(CompilerOptAction(pass_id=pass_id, task_id=task_id))
        obs = step_result.observation
        # Guard: reward may be None; the original print would then crash on :.4f.
        reward = step_result.reward or 0.0
        episode_rewards.append(reward)
        print(f" Step {obs.step_count}: {PASS_NAMES[pass_id]:35s} "
              f"→ improvement={obs.improvement_pct:.1f}% "
              f"reward={reward:.4f}")
    return {
        "task_id": task_id,
        "improvement_pct": obs.improvement_pct,
        "grader_score": obs.grader_score,
        "steps_used": obs.step_count,
        "passes_applied": [PASS_NAMES[p] for p in obs.passes_applied],
        "total_reward": sum(episode_rewards),
        "program_type": obs.program_type,
    }
def main():
    """CLI entry point: run the LLM baseline on all three tasks.

    Prints a per-episode trace, a per-task average, and a final score
    report, then writes the full results to baseline_results.json.

    Raises:
        ValueError: if OPENAI_API_KEY is not set.
        SystemExit: via parser.error when --episodes is not positive.
    """
    parser = argparse.ArgumentParser(description="Compiler Opt Env — LLM Baseline Agent")
    parser.add_argument("--base-url", default="http://localhost:8000", help="Environment server URL")
    parser.add_argument("--episodes", type=int, default=EPISODES, help="Episodes per task")
    parser.add_argument("--model", default=MODEL, help="OpenAI model name")
    args = parser.parse_args()
    if args.episodes < 1:
        # Zero episodes would divide by zero in the averages below.
        parser.error("--episodes must be a positive integer")
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")
    openai_client = OpenAI(
        api_key=api_key,
        base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    )
    print(f"\n{'='*65}")
    print(f" Compiler Pass Ordering — LLM Baseline ({args.model})")
    print(f" Server: {args.base_url} | Episodes per task: {args.episodes}")
    print(f"{'='*65}\n")
    all_results = []
    with CompilerOptEnv(base_url=args.base_url).sync() as env:
        for task_id in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
            print(f"\n--- Task {task_id} ({TASK_NAMES[task_id]}) ---")
            task_results = []
            for ep in range(args.episodes):
                print(f" Episode {ep+1}/{args.episodes}:")
                result = run_llm_episode(env, openai_client, task_id)
                task_results.append(result)
                print(f" → Grader score: {result['grader_score']:.3f} "
                      f"Improvement: {result['improvement_pct']:.1f}%\n")
            avg_score = sum(r['grader_score'] or 0 for r in task_results) / len(task_results)
            avg_improv = sum(r['improvement_pct'] for r in task_results) / len(task_results)
            all_results.extend(task_results)
            print(f" Task {task_id} average — score: {avg_score:.3f} improvement: {avg_improv:.1f}%")
    # -----------------------------------------------------------------------
    # Summary report — aggregate each task once, reuse for print and JSON.
    # -----------------------------------------------------------------------
    by_task = {}
    for task_id in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
        task_r = [r for r in all_results if r['task_id'] == task_id]
        scores = [r['grader_score'] or 0 for r in task_r]
        improvs = [r['improvement_pct'] for r in task_r]
        by_task[task_id] = {
            "avg_score": sum(scores) / len(scores),
            "avg_improvement_pct": sum(improvs) / len(improvs),
            "best": max(scores),
        }
    print(f"\n{'='*65}")
    print(" BASELINE SCORE REPORT")
    print(f"{'='*65}")
    print(f" Model: {args.model}")
    print(f" Episodes per task: {args.episodes}\n")
    for task_id in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
        stats = by_task[task_id]
        print(f" Task {task_id} ({TASK_NAMES[task_id]:6s}): "
              f"avg_score={stats['avg_score']:.3f} "
              f"avg_improvement={stats['avg_improvement_pct']:.1f}% "
              f"best={stats['best']:.3f}")
    overall = sum(r['grader_score'] or 0 for r in all_results) / len(all_results)
    print(f"\n Overall average score: {overall:.3f} / 1.000")
    print(f"{'='*65}\n")
    # Save results to JSON (same schema as before; "best" is report-only).
    output_path = "baseline_results.json"
    with open(output_path, "w") as f:
        json.dump({
            "model": args.model,
            "episodes": args.episodes,
            "results": all_results,
            "summary": {
                "overall_avg_score": overall,
                "by_task": {
                    str(tid): {
                        "avg_score": by_task[tid]["avg_score"],
                        "avg_improvement_pct": by_task[tid]["avg_improvement_pct"],
                    }
                    for tid in [TASK_EASY, TASK_MEDIUM, TASK_HARD]
                }
            }
        }, f, indent=2)
    print(f"Full results saved to: {output_path}")
# Script entry point: run the baseline when executed directly.
if __name__ == "__main__":
    main()