import os
import json
from openai import OpenAI
from .models import ExecutionResult, TaskInfo


def force_valid_reward(value) -> float:
    """Hard guarantee: the reward is strictly inside (0, 1); it is never exactly 0 or 1."""
    try:
        r = float(value)
    except Exception:
        return 0.5
    # Clamp to [0.01, 0.99] so that .2f formatting can never round the reward to 0.00 or 1.00.
    if r <= 0.01:
        return 0.01
    if r >= 0.99:
        return 0.99
    return r
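
# Illustrative values, following directly from the clamps above: force_valid_reward("oops")
# returns 0.5 (unparseable input), force_valid_reward(1.2) returns 0.99, and
# force_valid_reward(0.42) returns 0.42 unchanged.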


def safe_reward(reward) -> float:
    """Clamp a possibly-None reward to the open interval (0, 1) via force_valid_reward."""
    if reward is None:
        reward = 0.5
    return force_valid_reward(reward)


def normalize_reward(passed: int, total: int) -> float:
    """Map a passed/total test count to a reward in the open interval (0, 1)."""
    if total == 0:
        return 0.5
    raw = passed / total
    return force_valid_reward(raw)
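
# Illustrative values: normalize_reward(0, 0) returns the neutral 0.5, normalize_reward(3, 4)
# returns 0.75, and normalize_reward(10, 10) returns 0.99 (clamped just below 1.0).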


# Cache judge verdicts per proposed fix so repeated evaluations do not re-call the API.
_LLM_CACHE = {}
_JUDGE_DISABLED_WARNED = False


def get_llm_quality_score(proposed_fix: str) -> dict:
    """Ask an LLM judge to score a proposed fix; fall back to neutral 0.5 scores on any failure."""
    global _JUDGE_DISABLED_WARNED
    if proposed_fix in _LLM_CACHE:
        return _LLM_CACHE[proposed_fix]
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        if not _JUDGE_DISABLED_WARNED:
            print("LLM judge disabled: OPENAI_API_KEY not set. Using neutral fallback scores.")
            _JUDGE_DISABLED_WARNED = True
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback
    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a code judge. Evaluate the provided Python code on a scale of "
                        "0.0 to 1.0 for three metrics: code_quality, security, and correctness. "
                        "Respond with JSON format strictly matching: "
                        '{"code_quality": 0.0, "security": 0.0, "correctness": 0.0}'
                    ),
                },
                {"role": "user", "content": proposed_fix},
            ],
            response_format={"type": "json_object"},
        )
        result = json.loads(response.choices[0].message.content)
        _LLM_CACHE[proposed_fix] = result
        return result
    except Exception as e:
        print(f"LLM judge error: {e}")
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback


def calculate_reward_components(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> dict:
    """Compute the individual reward components before they are weighted."""
    # The submission "compiles" only if execution produced no runtime errors.
    compile_score = 1.0 if not exec_result.runtime_errors else 0.0
    test_ratio = 0.0
    if exec_result.test_total > 0:
        test_ratio = exec_result.test_passed / exec_result.test_total
    # Efficiency is credited only when every test passes; it decays linearly as runtime
    # exceeds the task's optimal time and reaches 0.0 at 3x the optimal time.
    efficiency = 0.0
    if test_ratio == 1.0:
        if exec_result.execution_time_seconds <= task_info.optimal_time_seconds:
            efficiency = 1.0
        else:
            ratio = exec_result.execution_time_seconds / max(0.001, task_info.optimal_time_seconds)
            efficiency = max(0.0, 1.0 - (ratio - 1.0) / 2.0)
    llm_scores = get_llm_quality_score(proposed_fix)
    return {
        "compile_score": compile_score,
        "test_ratio": test_ratio,
        "efficiency": efficiency,
        "llm_correctness": float(llm_scores.get("correctness", 0.5)),
        "llm_security": float(llm_scores.get("security", 0.5)),
        "llm_quality": float(llm_scores.get("code_quality", 0.5)),
    }


def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> tuple[float, dict]:
    """Combine the weighted components, compile bonus, and complexity penalty into a raw reward."""
    comps = calculate_reward_components(exec_result, task_info, proposed_fix)
    base_reward = (
        0.15 * comps["compile_score"] +
        0.35 * comps["test_ratio"] +
        0.30 * comps["efficiency"] +  # Increased from 0.15 to push optimization
        0.10 * comps["llm_correctness"] +
        0.05 * comps["llm_security"] +
        0.05 * comps["llm_quality"]
    )
    # Compile bonus: encourage the first milestone of error-free execution.
    if comps["compile_score"] > 0.0:
        base_reward += 0.05
    # Harsh complexity penalty: if all tests pass but runtime exceeds 5x the optimal time,
    # penalize heavily.
    if exec_result.test_passed == exec_result.test_total and exec_result.test_total > 0:
        if exec_result.execution_time_seconds > task_info.optimal_time_seconds * 5:
            base_reward -= 0.30
    return base_reward, comps
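

# Worked example of the weighting in calculate_reward(): with every component at 1.0 the
# weighted sum is 0.15 + 0.35 + 0.30 + 0.10 + 0.05 + 0.05 = 1.00, and the +0.05 compile
# bonus lifts the raw reward to 1.05; grade() therefore clamps the result through
# safe_reward(), which caps it just below 1.0.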
def grade(*args, **kwargs) -> float:
    """Top-level grading entry point; always returns a reward strictly inside (0, 1)."""
    try:
        if len(args) == 3:
            # Clamp the raw reward so the compile bonus or complexity penalty cannot push
            # it to or past 0.0 or 1.0.
            return safe_reward(calculate_reward(args[0], args[1], args[2])[0])
        return 0.5
    except Exception:
        return 0.5
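

if __name__ == "__main__":
    # Minimal smoke-test sketch of the grading pipeline. The field names below mirror how
    # ExecutionResult and TaskInfo are read above; whether their constructors accept these
    # exact keyword arguments is an assumption, so treat this block as illustrative only.
    # Run it with `python -m <package>.<this_module>` so the relative import resolves.
    # The LLM judge is contacted only if OPENAI_API_KEY is set; otherwise the neutral 0.5
    # fallback scores are used.
    demo_exec = ExecutionResult(
        runtime_errors=[],
        test_passed=4,
        test_total=5,
        execution_time_seconds=1.2,
    )
    demo_task = TaskInfo(optimal_time_seconds=1.0)
    demo_fix = "def add(a, b):\n    return a + b\n"
    reward, components = calculate_reward(demo_exec, demo_task, demo_fix)
    print(f"reward={safe_reward(reward):.2f} components={components}")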