#!/usr/bin/env python3
"""
Standalone baseline inference script.
Uses OpenAI gpt-4o-mini to review Python code across 3 difficulty levels.
Saves results to baseline_scores.json.
"""
import os
import json

import requests
from openai import OpenAI

# Initialize OpenAI client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
client = OpenAI(api_key=api_key)

# Server endpoint
BASE_URL = "http://localhost:8000"
TASKS = ["easy", "medium", "hard"]


def reset_task(task_id: str) -> dict:
    """Reset environment for a given task_id."""
    response = requests.post(
        f"{BASE_URL}/reset",
        json={"task_id": task_id}
    )
    response.raise_for_status()
    return response.json()
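# Note: the /reset observation is assumed to include at least a "code_snippet"
# field; run_baseline() below relies on that key.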


def step_task(action: dict) -> dict:
    """Submit action to environment and get observation."""
    response = requests.post(
        f"{BASE_URL}/step",
        json={"action": action}
    )
    response.raise_for_status()
    return response.json()


def review_code(code_snippet: str) -> dict:
    """Use GPT-4o-mini to review code snippet."""
    prompt = f"""Review this Python code. Reply as JSON with keys: review (str), bug_type (syntax/logic/security/none), line_number (int), confidence (float)
Code:
{code_snippet}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
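    # Optional alternative (assumes a recent openai SDK): passing
    # response_format={"type": "json_object"} to chat.completions.create enables
    # JSON mode, which would make the brace-search fallback below unnecessary.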
    content = response.choices[0].message.content
    # Try to extract JSON from response
    try:
        # First try direct JSON parsing
        result = json.loads(content)
    except json.JSONDecodeError:
        # Try to find JSON in the response text
        start = content.find('{')
        end = content.rfind('}') + 1
        if start != -1 and end > start:
            result = json.loads(content[start:end])
        else:
            raise ValueError(f"Could not parse JSON from response: {content}")
    return result
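# Illustrative example of the dict review_code() is expected to return
# (values are made up, not from a real run):
# {"review": "Loop bound is off by one", "bug_type": "logic",
#  "line_number": 3, "confidence": 0.8}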


def run_baseline():
    """Run baseline inference on all tasks."""
    results = {
        "scores": {},
        "details": {}
    }

    for task_id in TASKS:
        print(f"\n{'='*60}")
        print(f"Running task: {task_id}")
        print('='*60)

        # Reset environment
        obs = reset_task(task_id)
        code_snippet = obs.get("code_snippet", "")
        print(f"Code snippet:\n{code_snippet}\n")

        # Get review from GPT-4o-mini
        print("Calling GPT-4o-mini for review...")
        review_result = review_code(code_snippet)
        print(f"Review result: {review_result}")

        # Prepare action
        action = {
            "review": review_result.get("review", ""),
            "bug_type": review_result.get("bug_type", "none"),
            "line_number": int(review_result.get("line_number", -1)),
            "confidence": float(review_result.get("confidence", 0.0))
        }

        # Submit action to environment
        print(f"Submitting action: {action}")
        step_obs = step_task(action)

        # Extract score from observation
        # The step response should have reward/score information
        score = step_obs.get("cumulative_reward", 0.0)
        feedback = step_obs.get("previous_feedback", "")
        print(f"Score: {score}")
        print(f"Feedback: {feedback}")

        results["scores"][task_id] = score
        results["details"][task_id] = {
            "action": action,
            "feedback": feedback,
            "score": score
        }

    # Calculate average
    scores = list(results["scores"].values())
    average = sum(scores) / len(scores) if scores else 0.0
    results["average"] = round(average, 4)

    # Print summary
    print(f"\n{'='*60}")
    print("BASELINE RESULTS")
    print('='*60)
    for task_id in TASKS:
        print(f"{task_id:10s}: {results['scores'][task_id]:.4f}")
    print(f"{'Average':10s}: {results['average']:.4f}")
    print('='*60 + "\n")

    # Save to file
    with open("baseline_scores.json", "w") as f:
        json.dump(results, f, indent=2)
    print("Results saved to baseline_scores.json")

    return results
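# baseline_scores.json ends up shaped roughly like this (illustrative values,
# not from a real run):
# {
#   "scores": {"easy": 1.0, "medium": 0.5, "hard": 0.0},
#   "details": {"easy": {"action": {...}, "feedback": "...", "score": 1.0}, ...},
#   "average": 0.5
# }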


if __name__ == "__main__":
    run_baseline()