Spaces:

adityanaikhpt
/

codearena

Sleeping

App Files Files Community

adityanaikhpt committed about 1 month ago

Commit

a4db07d

verified ·

1 Parent(s): da8d00d

Deploy: server/llm_judge.py

Browse files

Files changed (1) hide show

server/llm_judge.py +57 -0

server/llm_judge.py ADDED Viewed

	@@ -0,0 +1,57 @@

import hashlib
import json
import logging
import math
import os

from openai import OpenAI
logger = logging.getLogger(__name__)

# Neutral score used for any metric the judge did not (or could not) provide.
_DEFAULT_SCORE = 0.5
_METRICS = ("correctness", "security", "code_quality")

# Upper bound on memoized verdicts so a long-running server cannot grow the
# cache without limit; the oldest entry is evicted first.
_JUDGE_CACHE_MAX_ENTRIES = 1024
_JUDGE_CACHE = {}


def _coerce_score(value, default=_DEFAULT_SCORE):
    """Return *value* as a float clamped to [0.0, 1.0], or *default* if unusable."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    # json.loads accepts NaN/Infinity literals; neither is a meaningful score.
    if not math.isfinite(score):
        return default
    return min(1.0, max(0.0, score))


def llm_judge(buggy_code: str, proposed_fix: str, task_category: str) -> dict:
    """Score a proposed fix with an LLM judge.

    The judge model is asked for three scores between 0.0 and 1.0. Successful
    verdicts are memoized per (model, task category, buggy code, proposed fix)
    so repeated submissions do not trigger another API call.

    Configuration is read from the environment on every call:
    ``OPENAI_API_KEY`` (or ``API_KEY``), ``API_BASE_URL`` (optional) and
    ``JUDGE_MODEL`` (defaults to ``"gpt-4o-mini"``).

    Args:
        buggy_code: The original code containing the bug.
        proposed_fix: The candidate replacement code to evaluate.
        task_category: Label describing the kind of task, passed to the judge.

    Returns:
        A new dict with float values for the keys ``"correctness"``,
        ``"security"`` and ``"code_quality"``, each within [0.0, 1.0]. If the
        judge cannot be called or its reply cannot be parsed, every score is
        0.5. A metric that is missing or malformed in an otherwise valid
        reply is individually replaced by 0.5.

    This function does not raise on judge failures; they are logged instead.
    """
    model = os.environ.get("JUDGE_MODEL", "gpt-4o-mini")

    # Key on everything that influences the verdict. Hashing only the fix
    # would hand back a stale score when the same fix is submitted against
    # different buggy code or a different category. JSON-encoding the list
    # keeps field boundaries unambiguous.
    key_material = json.dumps([model, task_category, buggy_code, proposed_fix])
    cache_key = hashlib.sha256(key_material.encode("utf-8")).hexdigest()

    cached = _JUDGE_CACHE.get(cache_key)
    if cached is not None:
        # Hand out a copy so callers mutating the result cannot corrupt the cache.
        return dict(cached)

    try:
        client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY") or os.environ.get("API_KEY"),
            base_url=os.environ.get("API_BASE_URL") or None,
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a code judge. Evaluate the provided Python code on a "
                        "scale of 0.0 to 1.0 for three metrics: code_quality, security, "
                        "and correctness. Respond with JSON format strictly matching: "
                        "{\"code_quality\": 0.0, \"security\": 0.0, \"correctness\": 0.0}"
                    ),
                },
                {
                    "role": "user",
                    "content": (
                        f"Task category: {task_category}\n\n"
                        f"Buggy code:\n{buggy_code}\n\n"
                        f"Proposed fix:\n{proposed_fix}"
                    ),
                },
            ],
            response_format={"type": "json_object"},
        )
        raw = json.loads(response.choices[0].message.content)
        if not isinstance(raw, dict):
            raise ValueError(f"judge returned {type(raw).__name__}, expected object")
    except Exception as exc:
        # Best-effort boundary: judging must never break the caller. The
        # fallback is deliberately NOT cached so a transient outage does not
        # pin neutral scores to this submission for the life of the process.
        logger.warning("LLM judge error: %s", exc)
        return {metric: _DEFAULT_SCORE for metric in _METRICS}

    result = {metric: _coerce_score(raw.get(metric)) for metric in _METRICS}

    if len(_JUDGE_CACHE) >= _JUDGE_CACHE_MAX_ENTRIES:
        # Dicts preserve insertion order, so the first key is the oldest entry.
        _JUDGE_CACHE.pop(next(iter(_JUDGE_CACHE)), None)
    _JUDGE_CACHE[cache_key] = result
    return dict(result)