Spaces:
Running
Running
| import os | |
| import json | |
| import logging | |
| import requests | |
| from openai import OpenAI | |
| from environment.models import Action, Issue | |
# Emit INFO-level records so progress is visible in the Hugging Face logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
# Each setting is read from the environment; the second argument is the
# fallback used when the variable is not set.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
# Credential lookup order: GROQ_API_KEY, then HF_TOKEN, then OPENAI_API_KEY.
# The first non-empty value wins.
API_KEY = (
    os.environ.get("GROQ_API_KEY")
    or os.environ.get("HF_TOKEN")
    or os.environ.get("OPENAI_API_KEY")
)
MODEL_NAME = os.environ.get("MODEL_NAME", "llama3-70b-8192")
# Base URL of the review environment; defaults to the hosted Space.
ENV_URL = os.environ.get("ENV_URL", "https://syam-sashank-codereview-env.hf.space")

# Single OpenAI-compatible client reused for every completion request.
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
def parse_llm_response(text: str) -> Action:
    """Turn the raw LLM reply into a final ``Action``.

    The reply is expected to contain a JSON list of issue objects. Markdown
    code fences and prose surrounding the list are tolerated.

    Args:
        text: Raw completion text returned by the model.

    Returns:
        An ``Action`` with ``final=True``. ``issues`` holds one ``Issue`` per
        list item, or is empty when the reply cannot be parsed, so the caller
        can still submit a step to the environment.
    """
    try:
        payload = text
        # Prefer an explicit ```json fence; otherwise use the first fence.
        if "```json" in payload:
            payload = payload.split("```json", 1)[1].split("```", 1)[0]
        elif "```" in payload:
            payload = payload.split("```", 1)[1].split("```", 1)[0]
        payload = payload.strip()

        # Models often add a preamble ("Here are the issues:") or leave a
        # language tag after the fence; keep only the outermost [...] span
        # when one exists so json.loads sees just the list.
        start, end = payload.find("["), payload.rfind("]")
        if start != -1 and end > start:
            payload = payload[start:end + 1]

        data = json.loads(payload)
        if not isinstance(data, list):
            raise ValueError(f"expected a JSON list, got {type(data).__name__}")

        # Constructing Issue from each item validates it against the model.
        issues = [Issue(**item) for item in data]
        return Action(issues=issues, final=True)
    except Exception as e:
        # Deliberate best-effort: any parse or validation failure degrades to
        # an empty submission instead of aborting the task.
        logger.error("Failed to parse LLM response: %s", e)
        return Action(issues=[], final=True)
def run_task(task_id: str, timeout: float = 60.0) -> float:
    """Run one review episode against the environment and return its reward.

    The flow is: POST ``/reset`` to open a session, ask the LLM to review the
    code from the observation, then POST ``/step`` with the parsed issues.

    Args:
        task_id: Identifier sent to the environment's ``/reset`` endpoint.
        timeout: Seconds to wait on each environment HTTP call. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        The number found at ``reward.value`` in the ``/step`` response.

    Raises:
        requests.RequestException: If an environment call fails, times out,
            or returns an HTTP error status.
        KeyError: If a response is missing an expected field.
    """
    logger.info("--- Starting Task: %s ---", task_id)

    # 1. Reset environment
    resp = requests.post(
        f"{ENV_URL}/reset",
        json={"task_id": task_id},
        timeout=timeout,
    )
    resp.raise_for_status()
    reset_data = resp.json()
    session_id = reset_data["session_id"]
    obs = reset_data["observation"]

    # 2. Build the prompt
    prompt = f"""You are a professional security and code reviewer.
Analyze the following Python code and identify all bugs, style issues, security flaws, performance anti-patterns, and missing documentation.
Return ONLY a JSON list where each item has:
- "line": (integer)
- "category": (one of: bug, style, security, performance, documentation)
- "description": (string, max 200 chars)
Code to review:
{obs['code']}
"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Crucial for reproducible baseline scores
        )
        # The message content may be None; treat that as an empty issue list.
        raw_content = response.choices[0].message.content or "[]"
    except Exception as e:
        # Best-effort: an LLM failure still submits an empty review so the
        # episode is graded rather than aborted.
        logger.error("LLM Completion error: %s", e)
        raw_content = "[]"

    # Convert LLM text to Action object
    action = parse_llm_response(raw_content)

    # 3. Take step in the environment
    step_resp = requests.post(
        f"{ENV_URL}/step",
        json={"session_id": session_id, "action": action.dict()},
        timeout=timeout,
    )
    step_resp.raise_for_status()
    result_data = step_resp.json()

    # Extract the reward and coerce it to the float the signature promises.
    final_reward = float(result_data["reward"]["value"])
    logger.info("Result for %s: Score = %.3f", task_id, final_reward)
    return final_reward
if __name__ == "__main__":
    # The competition requires scores for at least 3 tasks.
    difficulties = ["easy", "medium", "hard"]
    report = {}

    print(f"Connecting to environment at: {ENV_URL}")

    for name in difficulties:
        try:
            report[name] = run_task(name)
        except Exception as err:
            # A failed task is recorded as zero so the report stays complete.
            logger.error(f"Task {name} failed to execute: {err}")
            report[name] = 0.0

    # Summary table printed for the logs.
    rule = "=" * 30
    print("\n" + rule)
    print(" BASELINE PERFORMANCE REPORT ")
    print(rule)
    for name, value in report.items():
        print(f"Task: {name:8} | Score: {value:.3f}")
    print(rule)