SyamSashank commited on
Commit
a089c46
·
verified ·
1 Parent(s): 3e0331f

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +65 -34
inference.py CHANGED
@@ -5,37 +5,51 @@ import requests
5
  from openai import OpenAI
6
  from environment.models import Action, Issue
7
 
8
- # Better logging instead of quiet failures
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
 
 
12
  API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
13
  API_KEY = os.getenv("GROQ_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
14
  MODEL_NAME = os.getenv("MODEL_NAME", "llama3-70b-8192")
15
- ENV_URL = os.getenv("ENV_URL", "http://localhost:7860") # Set this for HF Spaces
16
 
 
 
 
 
17
  client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
18
 
19
  def parse_llm_response(text: str) -> Action:
20
- """Parse LLM output into an Action. Expects JSON list of issues."""
 
 
 
21
  try:
22
- # Extract JSON from markdown blocks
23
  if "```json" in text:
24
  text = text.split("```json")[1].split("```")[0]
25
  elif "```" in text:
26
  text = text.split("```")[1].split("```")[0]
27
-
28
  data = json.loads(text.strip())
 
 
29
  issues = [Issue(**item) for item in data]
30
  return Action(issues=issues, final=True)
31
  except Exception as e:
32
  logger.error(f"Failed to parse LLM response: {e}")
33
- # Return an empty list indicating the model failed to find issues properly
34
  return Action(issues=[], final=True)
35
 
36
  def run_task(task_id: str) -> float:
37
- # 1. Reset environment to get initial observation and session_id
38
- logger.info(f"Running task: {task_id}")
 
 
 
 
39
  resp = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id})
40
  resp.raise_for_status()
41
  reset_data = resp.json()
@@ -43,49 +57,66 @@ def run_task(task_id: str) -> float:
43
  session_id = reset_data["session_id"]
44
  obs = reset_data["observation"]
45
 
46
- # 2. Build prompt using the code from the observation
47
- prompt = f"""You are a code reviewer. Analyze the following Python code and list all issues (bugs, style, security, performance, documentation).
48
- Return a JSON list where each item has: "line" (int), "category" (one of: bug, style, security, performance, documentation), "description" (string).
49
- Example: [{{"line": 5, "category": "bug", "description": "Division by zero"}}]
 
 
 
 
50
 
51
- Code:
52
  {obs['code']}
53
  """
 
54
  try:
55
  response = client.chat.completions.create(
56
  model=MODEL_NAME,
57
  messages=[{"role": "user", "content": prompt}],
58
- temperature=0.0 # Reproducibility
59
  )
60
- raw = response.choices[0].message.content
61
- logger.debug(f"Raw Output: {raw}")
62
  except Exception as e:
63
- logger.error(f"OpenAI completion error: {e}")
64
- raw = "[]"
65
-
66
- action = parse_llm_response(raw)
67
-
68
- # 3. Take step using the session_id
 
69
  step_resp = requests.post(f"{ENV_URL}/step", json={
70
  "session_id": session_id,
71
  "action": action.dict()
72
  })
73
  step_resp.raise_for_status()
74
- data = step_resp.json()
 
 
 
 
75
 
76
- final_reward = data["reward"]["value"]
77
- logger.info(f"Task {task_id}: Final Score = {final_reward:.3f}")
78
  return final_reward
79
 
80
  if __name__ == "__main__":
81
- scores = {}
82
- for task in ["easy", "medium", "hard"]:
 
 
 
 
 
83
  try:
84
- scores[task] = run_task(task)
 
85
  except Exception as e:
86
- logger.error(f"Task execution failed ({task}): {e}")
87
- scores[task] = 0.0
88
-
89
- print("\n=== Baseline Results ===")
90
- for task, score in scores.items():
91
- print(f"{task}: {score:.3f}")
 
 
 
 
 
from openai import OpenAI
from environment.models import Action, Issue

# Configure logging for better visibility in Hugging Face Logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
# The judges provide these via environment variables; sensible defaults below.
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
# First available key wins: Groq, then Hugging Face, then OpenAI.
# NOTE(review): if none of the three is set, API_KEY is None and the client
# will only fail later, at request time -- confirm a key is always provided.
API_KEY = os.getenv("GROQ_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
MODEL_NAME = os.getenv("MODEL_NAME", "llama3-70b-8192")

# Environment server URL; points directly to the deployed Space by default.
ENV_URL = os.getenv("ENV_URL", "https://syam-sashank-codereview-env.hf.space")

# OpenAI-compatible client aimed at whichever backend API_BASE_URL selects.
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
23
 
24
  def parse_llm_response(text: str) -> Action:
25
+ """
26
+ Parses the LLM's string output into a structured Action object.
27
+ Handles Markdown code blocks commonly used by LLMs.
28
+ """
29
  try:
30
+ # Clean up Markdown JSON blocks
31
  if "```json" in text:
32
  text = text.split("```json")[1].split("```")[0]
33
  elif "```" in text:
34
  text = text.split("```")[1].split("```")[0]
35
+
36
  data = json.loads(text.strip())
37
+
38
+ # Validate items against the Issue model
39
  issues = [Issue(**item) for item in data]
40
  return Action(issues=issues, final=True)
41
  except Exception as e:
42
  logger.error(f"Failed to parse LLM response: {e}")
43
+ # Return empty list so the grader can still run (and likely give 0.0)
44
  return Action(issues=[], final=True)
45
 
46
def run_task(task_id: str) -> float:
    """
    Execute a single task end-to-end: Reset -> LLM Inference -> Step -> Reward.

    Args:
        task_id: Identifier of the task to run (e.g. "easy").

    Returns:
        The reward value reported by the environment's /step response.

    Raises:
        requests.HTTPError: if /reset or /step returns an HTTP error status.
        requests.RequestException: on network failure or timeout.
    """
    logger.info(f"--- Starting Task: {task_id} ---")

    # 1. Reset the environment to obtain a session and the code to review.
    # FIX: requests has no default timeout; without one, a stalled server
    # would hang this run forever. 60s is generous for a Space cold start.
    resp = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=60)
    resp.raise_for_status()
    reset_data = resp.json()
    session_id = reset_data["session_id"]
    obs = reset_data["observation"]

    # 2. Build the review prompt around the observed code.
    prompt = f"""You are a professional security and code reviewer.
Analyze the following Python code and identify all bugs, style issues, security flaws, performance anti-patterns, and missing documentation.

Return ONLY a JSON list where each item has:
- "line": (integer)
- "category": (one of: bug, style, security, performance, documentation)
- "description": (string, max 200 chars)

Code to review:
{obs['code']}
"""

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0  # Crucial for reproducible baseline scores
        )
        raw_content = response.choices[0].message.content
    except Exception as e:
        # Best-effort fallback: an empty JSON list parses to an empty Action.
        logger.error(f"LLM Completion error: {e}")
        raw_content = "[]"

    # Convert the LLM text into a structured Action.
    action = parse_llm_response(raw_content)

    # 3. Submit the action for grading (same timeout rationale as /reset).
    step_resp = requests.post(
        f"{ENV_URL}/step",
        json={"session_id": session_id, "action": action.dict()},
        timeout=60,
    )
    step_resp.raise_for_status()
    result_data = step_resp.json()

    # Reward value (presumably F1-based, per the upstream comment -- confirm).
    final_reward = result_data["reward"]["value"]
    logger.info(f"Result for {task_id}: Score = {final_reward:.3f}")

    return final_reward
100
 
101
if __name__ == "__main__":
    # The competition requires scores for at least 3 tasks.
    tasks = ["easy", "medium", "hard"]
    results = {}

    print(f"Connecting to environment at: {ENV_URL}")

    # Run every task; a failure scores 0.0 instead of aborting the batch.
    for task in tasks:
        try:
            results[task] = run_task(task)
        except Exception as e:
            logger.error(f"Task {task} failed to execute: {e}")
            results[task] = 0.0

    # Final summary for the logs.
    separator = "=" * 30
    print("\n" + separator)
    print(" BASELINE PERFORMANCE REPORT ")
    print(separator)
    for task, score in results.items():
        print(f"Task: {task:8} | Score: {score:.3f}")
    print(separator)