import math
import os
import json
from openai import OpenAI
from .models import ExecutionResult, TaskInfo

def force_valid_reward(value) -> float:
    """Clamp a reward into [0.01, 0.99] so a .2f display never rounds to 0.00 or 1.00."""
    try:
        r = float(value)
    except (TypeError, ValueError):
        return 0.5

    # NaN compares false against both bounds and would slip through the clamps
    # below unchanged; map it to the neutral fallback instead.
    if math.isnan(r):
        return 0.5

    # HARD GUARANTEE: prevent .2f formatting from rounding to 1.00 or 0.00
    if r <= 0.01:
        return 0.01
    if r >= 0.99:
        return 0.99

    return r

def safe_reward(reward) -> float:
    """Clamp reward to open interval (0, 1) via force_valid_reward."""
    if reward is None:
        reward = 0.5
    return force_valid_reward(reward)
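# Illustrative values for the clamps above (derived from the code, not assertions):
#   force_valid_reward(1.0)    -> 0.99   (would otherwise print as 1.00 at .2f)
#   force_valid_reward(-3)     -> 0.01
#   force_valid_reward("oops") -> 0.5    (unparseable input falls back to neutral)
#   safe_reward(None)          -> 0.5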

def normalize_reward(passed: int, total: int) -> float:
    """Map a passed/total test count onto the clamped reward range; neutral 0.5 when no tests ran."""
    if total == 0:
        return 0.5
    raw = passed / total
    return force_valid_reward(raw)
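# Example: normalize_reward(3, 4) -> 0.75, while a perfect normalize_reward(4, 4)
# is clamped to 0.99 and the degenerate normalize_reward(0, 0) stays neutral at 0.5.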

_LLM_CACHE = {}
_JUDGE_DISABLED_WARNED = False

def get_llm_quality_score(proposed_fix: str) -> dict:
    """Ask an LLM judge to score a fix on code_quality, security, and correctness (each 0.0-1.0), with caching and a neutral fallback."""
    global _JUDGE_DISABLED_WARNED
    if proposed_fix in _LLM_CACHE:
        return _LLM_CACHE[proposed_fix]

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        if not _JUDGE_DISABLED_WARNED:
            print("LLM judge disabled: OPENAI_API_KEY not set. Using neutral fallback scores.")
            _JUDGE_DISABLED_WARNED = True
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback

    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),
            messages=[
                {"role": "system", "content": "You are a code judge. Evaluate the provided Python code on a scale of 0.0 to 1.0 for three metrics: code_quality, security, and correctness. Respond with JSON format strictly matching: {\"code_quality\": 0.0, \"security\": 0.0, \"correctness\": 0.0}"},
                {"role": "user", "content": proposed_fix}
            ],
            response_format={"type": "json_object"}
        )
        result = json.loads(response.choices[0].message.content)
        _LLM_CACHE[proposed_fix] = result
        return result
    except Exception as e:
        print(f"LLM judge error: {e}")
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback
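# Behaviour sketch for the judge above: without OPENAI_API_KEY every fix gets
# the neutral {"code_quality": 0.5, "security": 0.5, "correctness": 0.5} score
# (the warning prints once per process), and a repeated `proposed_fix` string
# is answered from _LLM_CACHE without another API call.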

def calculate_reward_components(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> dict:
    """Compute the individual reward signals: compilation, test pass ratio, runtime efficiency, and the LLM judge scores."""
    compile_score = 1.0 if not exec_result.runtime_errors else 0.0
    
    test_ratio = 0.0
    if exec_result.test_total > 0:
        test_ratio = exec_result.test_passed / exec_result.test_total
        
    efficiency = 0.0
    if test_ratio == 1.0:
        if exec_result.execution_time_seconds <= task_info.optimal_time_seconds:
            efficiency = 1.0
        else:
            ratio = exec_result.execution_time_seconds / max(0.001, task_info.optimal_time_seconds)
            efficiency = max(0.0, 1.0 - (ratio - 1.0) / 2.0)
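            # Worked example: at 2x the optimal time, ratio = 2.0 and
            # efficiency = max(0, 1 - (2 - 1) / 2) = 0.5; at 3x or slower
            # the linear decay bottoms out at 0.0.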
            
    llm_scores = get_llm_quality_score(proposed_fix)
    
    return {
        "compile_score": compile_score,
        "test_ratio": test_ratio,
        "efficiency": efficiency,
        "llm_correctness": float(llm_scores.get("correctness", 0.5)),
        "llm_security": float(llm_scores.get("security", 0.5)),
        "llm_quality": float(llm_scores.get("code_quality", 0.5))
    }

def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> tuple[float, dict]:
    """Combine the weighted components, compile bonus, and slowness penalty into a raw (unclamped) reward; returns (reward, components)."""
    comps = calculate_reward_components(exec_result, task_info, proposed_fix)
    # Weights over the six components sum to 1.0.
    base_reward = (
        0.15 * comps["compile_score"] +
        0.35 * comps["test_ratio"] +
        0.30 * comps["efficiency"] +     # Increased from 0.15 to push optimization
        0.10 * comps["llm_correctness"] +
        0.05 * comps["llm_security"] +
        0.05 * comps["llm_quality"]
    )
    
    # Compile bonus: encourage first milestone
    if comps["compile_score"] > 0.0:
        base_reward += 0.05
        
    # Harsh complexity penalty: if runtime is > 5x optimal, penalize heavily
    if exec_result.test_passed == exec_result.test_total and exec_result.test_total > 0:
        if exec_result.execution_time_seconds > task_info.optimal_time_seconds * 5:
            base_reward -= 0.30
    
    return base_reward, comps
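# Worked example: a fix that compiles, passes all tests at optimal speed, and
# earns perfect judge scores totals 0.15 + 0.35 + 0.30 + 0.10 + 0.05 + 0.05 = 1.00,
# plus the 0.05 compile bonus: a raw 1.05, which is why grade() clamps below.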

def grade(*args, **kwargs) -> float:
    """Public grading entry point: clamps the final reward and never raises."""
    try:
        if len(args) == 3:
            reward, _ = calculate_reward(args[0], args[1], args[2])
            # Enforce the module's (0, 1) guarantee at the boundary; the compile
            # bonus and slowness penalty can push the raw value outside it.
            return safe_reward(reward)
        return 0.5
    except Exception:
        return 0.5
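
# Minimal smoke test, assuming ExecutionResult and TaskInfo are dataclasses that
# accept the fields read above as keyword arguments (hypothetical; adjust to the
# real constructors in .models):
#
#     result = ExecutionResult(runtime_errors=[], test_passed=10, test_total=10,
#                              execution_time_seconds=1.2)
#     task = TaskInfo(optimal_time_seconds=1.0)
#     reward = grade(result, task, "def fixed(): ...")
#     assert 0.01 <= reward <= 0.99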