File size: 3,161 Bytes
2182d10
 
 
 
62f081e
2182d10
 
 
 
 
1d48da1
2182d10
 
1d48da1
 
2182d10
 
 
 
 
 
78c01fe
2182d10
 
78c01fe
2182d10
 
3d38d37
 
 
 
0263e79
2182d10
3d38d37
2182d10
3d38d37
 
 
2182d10
3d38d37
 
 
 
 
 
 
 
 
 
0263e79
3d38d37
 
 
 
 
2182d10
3d38d37
0263e79
2182d10
3d38d37
2182d10
3d38d37
 
0263e79
3d38d37
 
 
 
0263e79
 
 
 
 
 
3d38d37
0263e79
2182d10
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import sys
import traceback
from openai import OpenAI
from server.environment import CodeReviewEnv

# -------------------------------------------------------------------
# Configuration & Environment Variables
# -------------------------------------------------------------------
# Endpoint and model name can be overridden through the environment; the
# fallbacks below are used when the corresponding variable is not set.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4.1-mini")

# The token has no fallback: it is passed to the client as its API key, so a
# missing value is rejected immediately at import time.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN environment variable is required")

# -------------------------------------------------------------------
# Main Inference Loop
# -------------------------------------------------------------------
def main():
    """Run one code-review episode per difficulty level and log it to stdout.

    For each of ``"easy"``, ``"medium"`` and ``"hard"`` a fresh
    ``CodeReviewEnv`` is created and stepped until it reports ``done``,
    asking the chat model named by ``MODEL_NAME`` for one action per step.
    Three kinds of lines are printed, each flushed immediately:

    * ``[START]`` once per difficulty, before the environment is reset;
    * ``[STEP]`` after every environment step, or a single line with
      ``action=error`` when an exception aborts the episode;
    * ``[END]`` always, carrying the success flag, the step count, the
      normalised score and the comma-separated per-step rewards.

    Exceptions raised while resetting/stepping the environment or calling
    the model are caught and reported; the remaining difficulties still run.
    """
    # Highest total reward a single episode can earn in the Code Review
    # Environment (0.8 comment + 1.0 request_changes); used to scale the
    # summed rewards into the [0, 1] score.
    max_episode_reward = 1.8

    system_prompt = (
        "You are a precise code reviewer. Your ONLY allowed outputs are: "
        "'COMMENT <line_number> <text>', 'APPROVE', or 'REQUEST_CHANGES'."
    )

    # Initialize OpenAI Client
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

    for diff in ("easy", "medium", "hard"):
        env = CodeReviewEnv(difficulty=diff)

        # [START] Output
        print(f"[START] task={env.task_name} env={env.benchmark_name} model={MODEL_NAME}", flush=True)

        success = False

        try:
            obs = env.reset()
            done = False

            while not done:
                # Ask the model for the next review action given the current observation.
                response = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": obs},
                    ],
                    max_tokens=100,
                )

                # The API types `content` as optional; treat a missing body
                # like an empty reply instead of crashing on `.strip()`.
                content = response.choices[0].message.content or ""
                # Collapse to a single line so the [STEP] record stays on one line.
                action_str = content.strip().replace("\n", " ")

                obs, reward_str, done, error = env.step(action_str)

                error_str = error if error else "null"
                done_str = "true" if done else "false"

                # [STEP] Output
                print(f"[STEP] step={env.steps_taken} action={action_str} reward={reward_str} done={done_str} error={error_str}", flush=True)

            # Only reached when the loop ended because the environment reported done.
            success = True

        except Exception as e:
            # Some exceptions carry no message; fall back to the type name so
            # the error field is never blank.
            error_msg = (str(e) or type(e).__name__).replace("\n", " ")
            print(f"[STEP] step={env.steps_taken} action=error reward=0.00 done=true error={error_msg}", flush=True)
        finally:
            # [END] Output MUST ALWAYS be emitted, even on exceptions
            success_str = "true" if success else "false"

            # Normalise once so both the score and the reward list tolerate a
            # missing/empty rewards collection.
            rewards = env.rewards or []
            score = max(0.0, min(sum(rewards) / max_episode_reward, 1.0))
            score_str = f"{score:.3f}"

            rewards_str = ",".join(f"{r:.2f}" for r in rewards)
            print(f"[END] success={success_str} steps={env.steps_taken} score={score_str} rewards={rewards_str}", flush=True)

# Run the inference loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()