File size: 4,102 Bytes
6e7ce30
 
 
 
 
 
 
a089c46
6e7ce30
 
 
a089c46
 
6e7ce30
 
 
 
a089c46
 
 
 
6e7ce30
 
 
a089c46
 
 
 
6e7ce30
a089c46
6e7ce30
 
 
 
a089c46
6e7ce30
a089c46
 
6e7ce30
 
 
 
a089c46
6e7ce30
 
 
a089c46
 
 
 
 
 
6e7ce30
 
 
 
 
 
 
a089c46
 
 
 
 
 
 
 
6e7ce30
a089c46
6e7ce30
 
a089c46
6e7ce30
 
 
 
a089c46
6e7ce30
a089c46
6e7ce30
a089c46
 
 
 
 
 
 
6e7ce30
 
 
 
 
a089c46
 
 
 
 
6e7ce30
 
 
 
a089c46
 
 
 
 
 
 
6e7ce30
a089c46
 
6e7ce30
a089c46
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import json
import logging
import requests
from openai import OpenAI
from environment.models import Action, Issue

# Configure logging for better visibility in Hugging Face Logs
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
# The judges will provide these via environment variables
# Base URL of an OpenAI-compatible chat-completions endpoint (Groq by default).
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
# First non-empty key wins: Groq, then Hugging Face, then OpenAI.
API_KEY = os.getenv("GROQ_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3-70b-8192")

# UPDATED: Points directly to your Space URL by default
ENV_URL = os.getenv("ENV_URL", "https://syam-sashank-codereview-env.hf.space")

# Initialize OpenAI Client
# NOTE(review): API_KEY may be None if no key env var is set; the client is
# still constructed here and any auth failure surfaces on the first request.
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

def parse_llm_response(text: str) -> Action:
    """
    Parse the LLM's raw string output into a structured Action object.

    Strips the Markdown code fences that LLMs commonly wrap JSON in, then
    validates each entry against the Issue model. On any failure, returns an
    empty (but final) Action so the grader can still run.
    """
    try:
        # Strip a surrounding Markdown fence, preferring an explicit ```json one.
        if "```json" in text:
            _, _, after_fence = text.partition("```json")
            text, _, _ = after_fence.partition("```")
        elif "```" in text:
            # Content between the first pair of bare fences.
            text = text.split("```")[1]

        payload = json.loads(text.strip())

        # Validate each entry against the Issue model before wrapping.
        return Action(issues=[Issue(**entry) for entry in payload], final=True)
    except Exception as exc:
        logger.error(f"Failed to parse LLM response: {exc}")
        # Return empty list so the grader can still run (and likely give 0.0)
        return Action(issues=[], final=True)

def run_task(task_id: str, *, timeout: float = 60.0) -> float:
    """
    Executes a single task: Reset -> LLM Inference -> Step -> Return Reward.

    Args:
        task_id: Identifier of the task to run (e.g. "easy", "medium", "hard").
        timeout: Per-request timeout in seconds for the environment HTTP calls.
                 FIX: the original requests had no timeout, so an unresponsive
                 environment would hang this script forever.

    Returns:
        The environment's reward value (an F1-based score) for this task.

    Raises:
        requests.HTTPError: if /reset or /step returns an error status.
        requests.Timeout: if the environment does not respond within `timeout`.
    """
    logger.info(f"--- Starting Task: {task_id} ---")

    # 1. Reset environment to obtain a session id and the code to review.
    resp = requests.post(
        f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=timeout
    )
    resp.raise_for_status()
    reset_data = resp.json()

    session_id = reset_data["session_id"]
    obs = reset_data["observation"]

    # 2. Build the prompt
    prompt = f"""You are a professional security and code reviewer. 
Analyze the following Python code and identify all bugs, style issues, security flaws, performance anti-patterns, and missing documentation.

Return ONLY a JSON list where each item has:
- "line": (integer)
- "category": (one of: bug, style, security, performance, documentation)
- "description": (string, max 200 chars)

Code to review:
{obs['code']}
"""

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0  # Crucial for reproducible baseline scores
        )
        raw_content = response.choices[0].message.content
    except Exception as e:
        logger.error(f"LLM Completion error: {e}")
        # Fall back to an empty issue list so the step/grade cycle still runs.
        raw_content = "[]"

    # Convert LLM text to Action object
    action = parse_llm_response(raw_content)

    # 3. Take step in the environment
    step_resp = requests.post(
        f"{ENV_URL}/step",
        json={
            "session_id": session_id,
            "action": action.dict()
        },
        timeout=timeout,
    )
    step_resp.raise_for_status()
    result_data = step_resp.json()

    # Extract the F1-based reward
    final_reward = result_data["reward"]["value"]
    logger.info(f"Result for {task_id}: Score = {final_reward:.3f}")

    return final_reward

if __name__ == "__main__":
    # The competition requires scores for at least 3 tasks
    print(f"Connecting to environment at: {ENV_URL}")

    final_scores = {}
    for task_name in ("easy", "medium", "hard"):
        try:
            final_scores[task_name] = run_task(task_name)
        except Exception as err:
            # A failed task scores zero rather than aborting the whole run.
            logger.error(f"Task {task_name} failed to execute: {err}")
            final_scores[task_name] = 0.0

    # Final Summary for the Logs
    divider = "=" * 30
    print("\n" + divider)
    print(" BASELINE PERFORMANCE REPORT ")
    print(divider)
    for task_name, task_score in final_scores.items():
        print(f"Task: {task_name:8} | Score: {task_score:.3f}")
    print(divider)