adityanaikhpt committed on
Commit
a4db07d
·
verified ·
1 Parent(s): da8d00d

Deploy: server/llm_judge.py

Browse files
Files changed (1) hide show
  1. server/llm_judge.py +57 -0
server/llm_judge.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import os
4
+
5
+ from openai import OpenAI
6
+
7
+
8
# Process-wide memo of successful judge verdicts, keyed by a digest of the
# full request (task category, buggy code, proposed fix).
_JUDGE_CACHE = {}

# Upper bound on cached verdicts so a long-running server cannot grow the
# cache without limit; the oldest entry is evicted first.
_JUDGE_CACHE_MAX_ENTRIES = 1024

# Metric names in the order they appear in the returned dict.
_SCORE_KEYS = ("correctness", "security", "code_quality")

# Score reported for a metric when the judge is unavailable or its answer
# for that metric is unusable.
_NEUTRAL_SCORE = 0.5


def _coerce_score(value, default=_NEUTRAL_SCORE):
    """Return *value* as a float clamped to [0.0, 1.0], or *default* if unusable."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return default
    if score != score:  # NaN is the only float that is not equal to itself
        return default
    return min(1.0, max(0.0, score))


def llm_judge(buggy_code: str, proposed_fix: str, task_category: str) -> dict:
    """Score a proposed fix with an LLM judge.

    Sends the task category, the buggy code and the proposed fix to a chat
    completion endpoint and parses the JSON reply into three scores.

    Configuration is read from the environment on every uncached call:
    ``OPENAI_API_KEY`` (or ``API_KEY``), ``API_BASE_URL`` and ``JUDGE_MODEL``
    (default ``"gpt-4o-mini"``).

    Args:
        buggy_code: Source code containing the defect.
        proposed_fix: Candidate replacement code to evaluate.
        task_category: Label describing the kind of task, passed to the judge.

    Returns:
        A new dict with float values in [0.0, 1.0] under the keys
        ``"correctness"``, ``"security"`` and ``"code_quality"``. A metric
        that is missing or malformed in the reply is reported as 0.5. If the
        request or parsing fails entirely, all three are 0.5.

    Successful verdicts are memoised per (task_category, buggy_code,
    proposed_fix). Failures are never cached, so a transient outage does not
    pin neutral scores for the rest of the process lifetime.
    """
    # Key on every input that reaches the judge: the same fix text can be
    # right for one bug and wrong for another. json.dumps gives an
    # unambiguous, ASCII-safe serialisation (no separator collisions, and
    # lone surrogates are escaped instead of breaking .encode()).
    request_repr = json.dumps([task_category, buggy_code, proposed_fix])
    cache_key = hashlib.sha256(request_repr.encode("utf-8")).hexdigest()

    cached = _JUDGE_CACHE.get(cache_key)
    if cached is not None:
        # Hand out a copy so callers cannot mutate the cached verdict.
        return dict(cached)

    try:
        client = OpenAI(
            api_key=os.environ.get("OPENAI_API_KEY") or os.environ.get("API_KEY"),
            base_url=os.environ.get("API_BASE_URL") or None,
        )
        response = client.chat.completions.create(
            model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a code judge. Evaluate the provided Python code on a "
                        "scale of 0.0 to 1.0 for three metrics: code_quality, security, "
                        "and correctness. Respond with JSON format strictly matching: "
                        "{\"code_quality\": 0.0, \"security\": 0.0, \"correctness\": 0.0}"
                    ),
                },
                {
                    "role": "user",
                    "content": (
                        f"Task category: {task_category}\n\n"
                        f"Buggy code:\n{buggy_code}\n\n"
                        f"Proposed fix:\n{proposed_fix}"
                    ),
                },
            ],
            response_format={"type": "json_object"},
        )
        raw = json.loads(response.choices[0].message.content)
        if not isinstance(raw, dict):
            raise ValueError("judge reply is not a JSON object")
        result = {key: _coerce_score(raw.get(key)) for key in _SCORE_KEYS}
    except Exception as exc:
        # Deliberate best-effort boundary: any failure (network, auth,
        # malformed reply) degrades to neutral scores instead of crashing
        # the caller. The fallback is returned fresh and NOT cached.
        print(f"LLM judge error: {exc}")
        return {key: _NEUTRAL_SCORE for key in _SCORE_KEYS}

    if len(_JUDGE_CACHE) >= _JUDGE_CACHE_MAX_ENTRIES:
        # dicts preserve insertion order, so the first key is the oldest.
        _JUDGE_CACHE.pop(next(iter(_JUDGE_CACHE)), None)
    _JUDGE_CACHE[cache_key] = result
    return dict(result)