TheAarvee05 commited on
Commit
c030db3
·
verified ·
1 Parent(s): 8c807bd

Upload evaluation/llm_grader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. evaluation/llm_grader.py +100 -0
evaluation/llm_grader.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ evaluation/llm_grader.py — LLM-as-judge grader for qualitative scoring.
3
+
4
+ Scores the agent's REASONING quality on top of the programmatic score.
5
+ Uses a rubric to evaluate whether the agent correctly diagnosed the root cause.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ import json
10
+ import os
11
+ from typing import List
12
+
13
+ from openai import OpenAI
14
+
15
# Grading rubric sent verbatim as the system prompt to the judge model.
# The final two lines define the reply contract ("Return ONLY a JSON object")
# that grade_trajectory relies on when parsing the judge's answer — do not
# edit this text without updating the response parsing accordingly.
RUBRIC: str = """
You are evaluating an AI agent's performance on a Meta Ads attribution recovery task.

Score the agent's trajectory from 0.0 to 1.0 on the following rubric:

1.0 — Agent correctly identified ALL root causes (wrong attribution window, pixel signal loss,
budget misallocation) and applied the right fixes in a logical order with clear reasoning.

0.75 — Agent identified the primary issue and fixed it, but missed secondary issues or
applied fixes in a suboptimal order.

0.50 — Agent showed partial understanding of the problem and applied some correct actions,
but reasoning was vague or steps were redundant.

0.25 — Agent took some valid actions but clearly did not understand the root causes.
Mixed correct and incorrect reasoning.

0.0 — Agent failed to diagnose any issue correctly. Applied irrelevant or harmful actions.

Return ONLY a JSON object:
{"score": 0.0, "rationale": "one paragraph explanation"}
"""
37
+
38
+
39
+ class LLMGrader:
40
+ def __init__(self, model: str | None = None):
41
+ api_key = os.environ.get("HF_TOKEN")
42
+ if not api_key:
43
+ raise EnvironmentError("HF_TOKEN not set")
44
+ base_url = os.environ.get("API_BASE_URL")
45
+ if not base_url:
46
+ raise EnvironmentError("API_BASE_URL not set")
47
+ self.client = OpenAI(api_key=api_key, base_url=base_url)
48
+ self.model = model or os.environ.get("MODEL_NAME")
49
+ if not self.model:
50
+ raise EnvironmentError("MODEL_NAME not set")
51
+ if self.model != "Qwen/Qwen2.5-72B-Instruct":
52
+ raise EnvironmentError("MODEL_NAME must be 'Qwen/Qwen2.5-72B-Instruct'")
53
+
54
+ def grade_trajectory(
55
+ self,
56
+ task_id: str,
57
+ history: List[dict],
58
+ initial_context: str,
59
+ final_context: str,
60
+ ) -> dict:
61
+ """Score the agent's full trajectory."""
62
+
63
+ steps_text = "\n".join(
64
+ f"Step {s['step']}: action={s['action']}, reward={s['reward']:.4f}, effects={s['effects']}"
65
+ for s in history
66
+ )
67
+
68
+ prompt = f"""
69
+ Task: {task_id}
70
+
71
+ INITIAL STATE:
72
+ {initial_context}
73
+
74
+ AGENT TRAJECTORY:
75
+ {steps_text}
76
+
77
+ FINAL STATE:
78
+ {final_context}
79
+
80
+ Please evaluate the agent's performance using the rubric.
81
+ """
82
+ response = self.client.chat.completions.create(
83
+ model=self.model,
84
+ messages=[
85
+ {"role": "system", "content": RUBRIC},
86
+ {"role": "user", "content": prompt},
87
+ ],
88
+ temperature=0.0,
89
+ max_tokens=400,
90
+ )
91
+ raw = response.choices[0].message.content.strip()
92
+ if raw.startswith("```"):
93
+ raw = raw.split("```")[1]
94
+ if raw.startswith("json"):
95
+ raw = raw[4:]
96
+ raw = raw.strip()
97
+ try:
98
+ return json.loads(raw)
99
+ except Exception:
100
+ return {"score": 0.0, "rationale": "Parse error"}