#!/usr/bin/env python3
"""
Standalone baseline inference script.
Uses OpenAI gpt-4o-mini to review Python code across 3 difficulty levels.
Saves results to baseline_scores.json.
"""

import os
import json
import requests
from openai import OpenAI

# Initialize OpenAI client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

client = OpenAI(api_key=api_key)

# Server endpoint
BASE_URL = "http://localhost:8000"
TASKS = ["easy", "medium", "hard"]

def reset_task(task_id: str) -> dict:
    """Reset the environment for a given task_id and return the first observation."""
    response = requests.post(
        f"{BASE_URL}/reset",
        json={"task_id": task_id},
        timeout=60
    )
    response.raise_for_status()
    return response.json()

def step_task(action: dict) -> dict:
    """Submit an action to the environment and return the resulting observation."""
    response = requests.post(
        f"{BASE_URL}/step",
        json={"action": action},
        timeout=60
    )
    response.raise_for_status()
    return response.json()

def review_code(code_snippet: str) -> dict:
    """Use GPT-4o-mini to review code snippet."""
    prompt = f"""Review this Python code. Reply as JSON with keys: review (str), bug_type (syntax/logic/security/none), line_number (int), confidence (float)

Code:
{code_snippet}"""
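
    # Illustrative shape of the reply the prompt asks for (values are made up):
    #   {"review": "Loop bound is off by one", "bug_type": "logic",
    #    "line_number": 7, "confidence": 0.85}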

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
    
    content = response.choices[0].message.content
    
    # Try to extract JSON from response
    try:
        # First try direct JSON parsing
        result = json.loads(content)
    except json.JSONDecodeError:
        # Try to find JSON in the response text
        start = content.find('{')
        end = content.rfind('}') + 1
        if start != -1 and end > start:
            result = json.loads(content[start:end])
        else:
            raise ValueError(f"Could not parse JSON from response: {content}")
    
    return result

def run_baseline():
    """Run baseline inference on all tasks."""
    results = {
        "scores": {},
        "details": {}
    }
    
    for task_id in TASKS:
        print(f"\n{'='*60}")
        print(f"Running task: {task_id}")
        print('='*60)
        
        # Reset environment
        obs = reset_task(task_id)
        code_snippet = obs.get("code_snippet", "")
        print(f"Code snippet:\n{code_snippet}\n")
        
        # Get review from GPT-4o-mini
        print("Calling GPT-4o-mini for review...")
        review_result = review_code(code_snippet)
        print(f"Review result: {review_result}")
        
        # Prepare action; guard against missing or non-numeric fields in the model output
        try:
            line_number = int(review_result.get("line_number", -1))
            confidence = float(review_result.get("confidence", 0.0))
        except (TypeError, ValueError):
            line_number, confidence = -1, 0.0
        action = {
            "review": review_result.get("review", ""),
            "bug_type": review_result.get("bug_type", "none"),
            "line_number": line_number,
            "confidence": confidence
        }
        
        # Submit action to environment
        print(f"Submitting action: {action}")
        step_obs = step_task(action)
        
        # Extract score from observation
        # The step response should have reward/score information
        score = step_obs.get("cumulative_reward", 0.0)
        feedback = step_obs.get("previous_feedback", "")
        
        print(f"Score: {score}")
        print(f"Feedback: {feedback}")
        
        results["scores"][task_id] = score
        results["details"][task_id] = {
            "action": action,
            "feedback": feedback,
            "score": score
        }
    
    # Calculate average
    scores = list(results["scores"].values())
    average = sum(scores) / len(scores) if scores else 0.0
    results["average"] = round(average, 4)
    
    # Print summary
    print(f"\n{'='*60}")
    print("BASELINE RESULTS")
    print('='*60)
    for task_id in TASKS:
        print(f"{task_id:10s}: {results['scores'][task_id]:.4f}")
    print(f"{'Average':10s}: {results['average']:.4f}")
    print('='*60 + "\n")
    
    # Save to file
    with open("baseline_scores.json", "w") as f:
        json.dump(results, f, indent=2)
    
    print("Results saved to baseline_scores.json")
    return results

if __name__ == "__main__":
    run_baseline()