Spaces:

CodeReview
/

codereview-env

Running

App Files Files Community

codereview-env / inference.py

SyamSashank

Update inference.py

a089c46 verified 8 days ago

raw

history blame

4.1 kB

	import os
	import json
	import logging
	import requests
	from openai import OpenAI
	from environment.models import Action, Issue

	# Configure logging for better visibility in Hugging Face Logs
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# --- CONFIGURATION ---
	# Every setting is read from an environment variable (the judges will
	# provide these); the second argument to os.getenv is the fallback value.
	API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
	# The first non-empty credential wins, checked in this order.
	API_KEY = os.getenv("GROQ_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
	MODEL_NAME = os.getenv("MODEL_NAME", "llama3-70b-8192")

	# Base URL of the environment server (defaults to the Space URL).
	# Trailing slashes are stripped because request URLs are built as
	# f"{ENV_URL}/reset" and f"{ENV_URL}/step"; an override such as
	# "http://host:7860/" would otherwise produce "//reset".
	ENV_URL = os.getenv("ENV_URL", "https://syam-sashank-codereview-env.hf.space").rstrip("/")

	# Surface a missing credential in the logs, naming the variables this
	# script actually consults.
	if not API_KEY:
	    logger.warning(
	        "No API key found in GROQ_API_KEY, HF_TOKEN or OPENAI_API_KEY."
	    )

	# Initialize OpenAI Client
	client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	def _fenced_payload(text: str) -> str:
	    """Return the body of the Markdown code fence most likely to hold the JSON."""
	    # Splitting on the fence marker leaves fenced content at the odd indices.
	    # An unclosed final fence still lands at an odd index, so it is kept.
	    blocks = text.split("```")[1::2]
	    fallback = None
	    for block in blocks:
	        # A block tagged "json" (any letter case) is preferred wherever it
	        # appears in the response.
	        if block[:4].lower() == "json":
	            return block[4:]
	        if fallback is None:
	            # Otherwise use the first block, dropping a bare language tag
	            # (e.g. "python") that sits alone on the opening fence line.
	            first_line, newline, rest = block.partition("\n")
	            has_tag = bool(newline) and first_line.strip().isalnum()
	            fallback = rest if has_tag else block
	    return fallback


	def parse_llm_response(text: str) -> Action:
	    """
	    Parses the LLM's string output into a structured Action object.

	    Handles Markdown code blocks commonly used by LLMs: the content of a
	    fenced block is extracted before decoding, whatever language tag the
	    fence carries. The decoded JSON must be a list of objects, each of which
	    is passed as keyword arguments to Issue; a top-level object holding the
	    list under an "issues" key is also accepted.

	    Args:
	        text: Raw completion text. None or an empty string is treated as an
	            unparseable response.

	    Returns:
	        Action(issues=<parsed issues>, final=True) on success, or
	        Action(issues=[], final=True) if anything goes wrong while parsing
	        or building the issues. The failure is logged, never raised.
	    """
	    try:
	        if not text:
	            raise ValueError("empty response")

	        payload = text
	        # Clean up Markdown JSON blocks
	        if "```" in payload:
	            payload = _fenced_payload(payload)

	        data = json.loads(payload.strip())

	        # Tolerate a wrapper object such as {"issues": [...]}.
	        if isinstance(data, dict) and "issues" in data:
	            data = data["issues"]
	        if not isinstance(data, list):
	            raise ValueError(
	                f"expected a JSON list of issues, got {type(data).__name__}"
	            )

	        # Validate items against the Issue model
	        issues = [Issue(**item) for item in data]
	        return Action(issues=issues, final=True)
	    except Exception as e:
	        # Deliberately broad: any parsing or validation failure must degrade
	        # to an empty submission rather than abort the task.
	        logger.error(f"Failed to parse LLM response: {e}")
	        # Return empty list so the grader can still run (and likely give 0.0)
	        return Action(issues=[], final=True)

	def run_task(task_id: str, timeout: float = 60.0) -> float:
	    """
	    Executes a single task: Reset -> LLM Inference -> Step -> Return Reward.

	    Args:
	        task_id: Identifier sent to the environment's /reset endpoint.
	        timeout: Seconds to wait for each HTTP call to the environment
	            before giving up. Without it, requests waits indefinitely on an
	            unresponsive server.

	    Returns:
	        The value found at ["reward"]["value"] in the /step response.

	    Raises:
	        requests.HTTPError: If /reset or /step answers with an error status.
	        requests.Timeout: If the environment does not answer within timeout.
	        KeyError: If a response lacks "session_id", "observation", "code"
	            or "reward"/"value".
	    """
	    logger.info(f"--- Starting Task: {task_id} ---")

	    # 1. Reset environment
	    resp = requests.post(
	        f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=timeout
	    )
	    resp.raise_for_status()
	    reset_data = resp.json()

	    session_id = reset_data["session_id"]
	    obs = reset_data["observation"]

	    # 2. Build the prompt
	    prompt = f"""You are a professional security and code reviewer.
	Analyze the following Python code and identify all bugs, style issues, security flaws, performance anti-patterns, and missing documentation.

	Return ONLY a JSON list where each item has:
	- "line": (integer)
	- "category": (one of: bug, style, security, performance, documentation)
	- "description": (string, max 200 chars)

	Code to review:
	{obs['code']}
	"""

	    try:
	        response = client.chat.completions.create(
	            model=MODEL_NAME,
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0.0,  # Crucial for reproducible baseline scores
	        )
	        # The message content can be None; fall back to an empty JSON list
	        # so the parser receives a string.
	        raw_content = response.choices[0].message.content or "[]"
	    except Exception as e:
	        # Best effort: an LLM failure becomes an empty submission so the
	        # environment step below still runs.
	        logger.error(f"LLM Completion error: {e}")
	        raw_content = "[]"

	    # Convert LLM text to Action object
	    action = parse_llm_response(raw_content)

	    # 3. Take step in the environment
	    # NOTE(review): presumably Action is a pydantic model. Prefer
	    # model_dump() when it exists (pydantic v2 deprecates .dict());
	    # otherwise keep the original .dict() call.
	    if hasattr(action, "model_dump"):
	        action_payload = action.model_dump()
	    else:
	        action_payload = action.dict()

	    step_resp = requests.post(
	        f"{ENV_URL}/step",
	        json={"session_id": session_id, "action": action_payload},
	        timeout=timeout,
	    )
	    step_resp.raise_for_status()
	    result_data = step_resp.json()

	    # Extract the F1-based reward
	    final_reward = result_data["reward"]["value"]
	    logger.info(f"Result for {task_id}: Score = {final_reward:.3f}")

	    return final_reward

	def main() -> None:
	    """
	    Run every baseline task and print a score summary.

	    Each task is run through run_task(). A task that raises is logged and
	    recorded with a score of 0.0, so one failure does not stop the
	    remaining tasks or the final report.
	    """
	    # The competition requires scores for at least 3 tasks
	    task_list = ["easy", "medium", "hard"]
	    final_scores = {}

	    print(f"Connecting to environment at: {ENV_URL}")

	    for task in task_list:
	        try:
	            final_scores[task] = run_task(task)
	        except Exception as e:
	            # Top-level boundary: record the failure and keep going.
	            logger.error(f"Task {task} failed to execute: {e}")
	            final_scores[task] = 0.0

	    # Final Summary for the Logs
	    separator = "=" * 30
	    print("\n" + separator)
	    print(" BASELINE PERFORMANCE REPORT ")
	    print(separator)
	    for task, score in final_scores.items():
	        # Plain "|" separator: "\|" is an invalid escape sequence that
	        # prints a stray backslash and triggers a warning on recent
	        # Python versions.
	        print(f"Task: {task:8} | Score: {score:.3f}")
	    print(separator)


	if __name__ == "__main__":
	    main()