File size: 10,067 Bytes
7c2f148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""
Baseline inference script for the Compiler Pass Ordering RL Environment.

Runs an LLM agent (via OpenAI-compatible API) against all 3 tasks and
produces a reproducible baseline score report.

Usage:
    export OPENAI_API_KEY=your_key_here
    export OPENAI_BASE_URL=https://api.openai.com/v1   # optional, defaults to OpenAI
    python baseline_agent.py --base-url http://localhost:8000

Requirements:
    pip install openai
    (server must be running: uvicorn server.app:app --host 0.0.0.0 --port 8000)
"""

import argparse
import json
import os
import random
import time

from openai import OpenAI

from compiler_opt_env import CompilerOptAction, CompilerOptEnv
from compiler_opt_env.models import PASS_NAMES, TASK_EASY, TASK_MEDIUM, TASK_HARD

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Chat model to query; override with the OPENAI_MODEL environment variable.
MODEL       = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
# Episodes to run per task; override with BASELINE_EPISODES.
EPISODES    = int(os.getenv("BASELINE_EPISODES", "5"))   # episodes per task
# LLM attempts per step before falling back to a random available pass.
MAX_RETRIES = 3

# Human-readable labels for the task-id constants, used in console output.
TASK_NAMES = {TASK_EASY: "Easy", TASK_MEDIUM: "Medium", TASK_HARD: "Hard"}

# System prompt sent on every step: lists the 15 passes by integer id and
# instructs the model to reply with a bare {"pass_id": <int>} JSON object.
SYSTEM_PROMPT = """You are an expert compiler engineer. You are controlling a compiler
optimization pipeline. At each step you must choose ONE optimization pass to apply
to the program's Intermediate Representation (IR) to minimize its estimated runtime cost.

Available passes (use the integer ID):
  0: dead_code_elimination      — removes unreachable/unused code
  1: constant_folding           — evaluates constant expressions at compile time
  2: loop_unrolling             — expands loop bodies to reduce iteration overhead
  3: function_inlining          — replaces function calls with function body
  4: vectorization              — uses SIMD instructions for parallel computation
  5: loop_invariant_motion      — moves loop-invariant code outside the loop
  6: strength_reduction         — replaces expensive ops with cheaper equivalents
  7: common_subexpr_elimination — eliminates redundant computations
  8: tail_call_optimization     — converts tail recursion to iteration
  9: branch_prediction_hints    — adds CPU branch prediction metadata
 10: register_allocation        — optimizes register usage
 11: instruction_scheduling     — reorders instructions to avoid pipeline stalls
 12: memory_coalescing          — combines memory accesses for cache efficiency
 13: alias_analysis             — determines which pointers can alias (enables others)
 14: interprocedural_analysis   — cross-function analysis (enables inlining)

IMPORTANT: Some passes are much more effective when specific prerequisite passes
have been applied first. For example, vectorization is nearly useless without
alias_analysis and dead_code_elimination applied first. Think carefully about
ordering — applying enabler passes early unlocks large gains later.

You must respond with ONLY a JSON object: {"pass_id": <integer 0-14>}
No explanation, no markdown, just the JSON."""


def build_user_prompt(obs) -> str:
    """Format the current observation into the per-step user message.

    Reports the cost metrics, pass history, remaining pass menu, and synergy
    multipliers from *obs* so the LLM can choose the next pass to apply.
    """
    history = [PASS_NAMES[p] for p in obs.passes_applied]
    menu_json = json.dumps({p: PASS_NAMES[p] for p in obs.passes_available})
    synergies = {p: round(obs.synergy_state[p], 2) for p in obs.passes_available}

    return f"""Current program state:
- Program type: {obs.program_type}
- Estimated cost: {obs.estimated_cost:.1f} (baseline: {obs.baseline_cost:.1f})
- Cost reduction so far: {obs.improvement_pct:.1f}%
- Steps used: {obs.step_count} / {obs.max_steps}
- Passes applied so far (in order): {history or 'none'}
- Available passes: {menu_json}
- Synergy state (effectiveness multipliers): {synergies}

Task: {obs.task_description}

Which pass should be applied next? Respond with only: {{"pass_id": <integer>}}"""


def run_llm_episode(env, openai_client: OpenAI, task_id: int) -> dict:
    """Run one episode with the LLM agent.

    At each step the LLM is shown the current observation and asked for a
    pass id.  Unparseable replies are retried up to MAX_RETRIES times, and
    unavailable or still-missing choices fall back to a random available
    pass so the episode always runs to completion.

    Args:
        env: Synchronous environment handle exposing reset() and step().
        openai_client: OpenAI-compatible chat-completions client.
        task_id: Task to play (TASK_EASY / TASK_MEDIUM / TASK_HARD).

    Returns:
        Episode summary dict: task id, improvement %, grader score, steps
        used, ordered pass names, total reward, and program type.
    """
    result = env.reset()
    # Some client versions wrap the observation in a result object; unwrap.
    obs = result.observation if hasattr(result, 'observation') else result

    episode_rewards = []

    while not obs.done:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": build_user_prompt(obs)},
        ]

        # Query the LLM, retrying on transport or JSON-parse errors.
        pass_id = None
        for attempt in range(MAX_RETRIES):
            try:
                response = openai_client.chat.completions.create(
                    model=MODEL,
                    messages=messages,
                    temperature=0.2,
                    max_tokens=50,
                )
                raw = response.choices[0].message.content.strip()
                parsed = json.loads(raw)
                pass_id = int(parsed["pass_id"])
                if pass_id not in obs.passes_available:
                    print(f"  [warn] LLM chose unavailable pass {pass_id}, picking random")
                    pass_id = random.choice(obs.passes_available)
                break
            except Exception as e:
                print(f"  [retry {attempt+1}] LLM parse error: {e}")
                time.sleep(1)

        if pass_id is None:
            # All retries failed — keep the episode going with a random pass.
            pass_id = random.choice(obs.passes_available)
            print(f"  [fallback] Using random pass: {PASS_NAMES[pass_id]}")

        step_result = env.step(CompilerOptAction(pass_id=pass_id, task_id=task_id))
        obs = step_result.observation
        # Guard against a null reward from the server; the guarded value is
        # used both for accumulation and display.  (Formatting
        # step_result.reward directly raised TypeError when it was None.)
        reward = step_result.reward or 0.0
        episode_rewards.append(reward)

        print(f"    Step {obs.step_count}: {PASS_NAMES[pass_id]:35s} "
              f"→ improvement={obs.improvement_pct:.1f}%  "
              f"reward={reward:.4f}")

    return {
        "task_id":        task_id,
        "improvement_pct": obs.improvement_pct,
        "grader_score":   obs.grader_score,
        "steps_used":     obs.step_count,
        "passes_applied": [PASS_NAMES[p] for p in obs.passes_applied],
        "total_reward":   sum(episode_rewards),
        "program_type":   obs.program_type,
    }


def _task_stats(results: list, task_id: int) -> dict:
    """Aggregate grader scores and improvement percentages for one task."""
    task_r = [r for r in results if r['task_id'] == task_id]
    scores = [r['grader_score'] or 0 for r in task_r]
    improvs = [r['improvement_pct'] for r in task_r]
    return {
        "avg_score": sum(scores) / len(scores),
        "avg_improvement_pct": sum(improvs) / len(improvs),
        "best_score": max(scores),
    }


def main():
    """CLI entry point: run the LLM baseline over all tasks and report scores.

    Reads OPENAI_API_KEY (required) plus OPENAI_BASE_URL / OPENAI_MODEL /
    BASELINE_EPISODES (optional) from the environment, runs the agent for
    each task, prints a summary report, and writes the full results to
    baseline_results.json.

    Raises:
        ValueError: if OPENAI_API_KEY is not set.
        SystemExit: via parser.error when --episodes is not positive.
    """
    parser = argparse.ArgumentParser(description="Compiler Opt Env — LLM Baseline Agent")
    parser.add_argument("--base-url",  default="http://localhost:8000", help="Environment server URL")
    parser.add_argument("--episodes",  type=int, default=EPISODES,      help="Episodes per task")
    parser.add_argument("--model",     default=MODEL,                   help="OpenAI model name")
    args = parser.parse_args()

    # Guard: --episodes 0 previously crashed with ZeroDivisionError in the
    # averages; reject it with a proper CLI error instead.
    if args.episodes < 1:
        parser.error("--episodes must be a positive integer")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    openai_client = OpenAI(
        api_key=api_key,
        base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"),
    )

    print(f"\n{'='*65}")
    print(f"  Compiler Pass Ordering — LLM Baseline ({args.model})")
    print(f"  Server: {args.base_url}  |  Episodes per task: {args.episodes}")
    print(f"{'='*65}\n")

    all_results = []

    with CompilerOptEnv(base_url=args.base_url).sync() as env:
        for task_id in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
            print(f"\n--- Task {task_id} ({TASK_NAMES[task_id]}) ---")
            task_results = []

            for ep in range(args.episodes):
                print(f"  Episode {ep+1}/{args.episodes}:")
                result = run_llm_episode(env, openai_client, task_id)
                task_results.append(result)
                print(f"  → Grader score: {result['grader_score']:.3f}  "
                      f"Improvement: {result['improvement_pct']:.1f}%\n")

            all_results.extend(task_results)
            stats = _task_stats(task_results, task_id)
            print(f"  Task {task_id} average — score: {stats['avg_score']:.3f}  improvement: {stats['avg_improvement_pct']:.1f}%")

    # ---------------------------------------------------------------------------
    # Summary report
    # ---------------------------------------------------------------------------
    print(f"\n{'='*65}")
    print("  BASELINE SCORE REPORT")
    print(f"{'='*65}")
    print(f"  Model: {args.model}")
    print(f"  Episodes per task: {args.episodes}\n")

    # Compute per-task aggregates once; reused for both the console report
    # and the JSON summary (previously duplicated inline three times).
    by_task = {tid: _task_stats(all_results, tid)
               for tid in [TASK_EASY, TASK_MEDIUM, TASK_HARD]}

    for task_id in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
        stats = by_task[task_id]
        print(f"  Task {task_id} ({TASK_NAMES[task_id]:6s}): "
              f"avg_score={stats['avg_score']:.3f}  "
              f"avg_improvement={stats['avg_improvement_pct']:.1f}%  "
              f"best={stats['best_score']:.3f}")

    overall = sum(r['grader_score'] or 0 for r in all_results) / len(all_results)
    print(f"\n  Overall average score: {overall:.3f} / 1.000")
    print(f"{'='*65}\n")

    # Persist raw per-episode results plus the aggregate summary.
    output_path = "baseline_results.json"
    with open(output_path, "w") as f:
        json.dump({
            "model":    args.model,
            "episodes": args.episodes,
            "results":  all_results,
            "summary": {
                "overall_avg_score": overall,
                "by_task": {
                    str(tid): {
                        "avg_score": by_task[tid]["avg_score"],
                        "avg_improvement_pct": by_task[tid]["avg_improvement_pct"],
                    }
                    for tid in [TASK_EASY, TASK_MEDIUM, TASK_HARD]
                }
            }
        }, f, indent=2)

    print(f"Full results saved to: {output_path}")


if __name__ == "__main__":
    main()