# Uploaded via huggingface_hub by TheAarvee05 (commit c030db3, verified).
"""
evaluation/llm_grader.py β€” LLM-as-judge grader for qualitative scoring.
Scores the agent's REASONING quality on top of the programmatic score.
Uses a rubric to evaluate whether the agent correctly diagnosed the root cause.
"""
from __future__ import annotations
import json
import os
from typing import List
from openai import OpenAI
# System prompt sent verbatim to the judge model. Defines a five-point scale
# (0.0/0.25/0.5/0.75/1.0) for reasoning quality and requires a JSON-only reply,
# which grade_trajectory parses downstream. Do not edit casually: any wording
# change alters grading behavior across runs.
RUBRIC = """
You are evaluating an AI agent's performance on a Meta Ads attribution recovery task.
Score the agent's trajectory from 0.0 to 1.0 on the following rubric:
1.0 β€” Agent correctly identified ALL root causes (wrong attribution window, pixel signal loss,
budget misallocation) and applied the right fixes in a logical order with clear reasoning.
0.75 β€” Agent identified the primary issue and fixed it, but missed secondary issues or
applied fixes in a suboptimal order.
0.50 β€” Agent showed partial understanding of the problem and applied some correct actions,
but reasoning was vague or steps were redundant.
0.25 β€” Agent took some valid actions but clearly did not understand the root causes.
Mixed correct and incorrect reasoning.
0.0 β€” Agent failed to diagnose any issue correctly. Applied irrelevant or harmful actions.
Return ONLY a JSON object:
{"score": 0.0, "rationale": "one paragraph explanation"}
"""
class LLMGrader:
    """LLM-as-judge grader for qualitative trajectory scoring.

    Sends the agent's trajectory plus the module-level RUBRIC to an
    OpenAI-compatible chat endpoint and parses the judge's JSON verdict.

    Environment:
        HF_TOKEN      -- API key for the endpoint (required).
        API_BASE_URL  -- base URL of the OpenAI-compatible endpoint (required).
        MODEL_NAME    -- judge model name; pinned to Qwen/Qwen2.5-72B-Instruct
                         so scores stay comparable across runs.
    """

    def __init__(self, model: str | None = None):
        """Build the chat client from environment variables.

        Args:
            model: optional override for MODEL_NAME (must still be the
                pinned judge model).

        Raises:
            EnvironmentError: if any required variable is missing, or the
                resolved model is not the pinned judge model.
        """
        api_key = os.environ.get("HF_TOKEN")
        if not api_key:
            raise EnvironmentError("HF_TOKEN not set")
        base_url = os.environ.get("API_BASE_URL")
        if not base_url:
            raise EnvironmentError("API_BASE_URL not set")
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.model = model or os.environ.get("MODEL_NAME")
        if not self.model:
            raise EnvironmentError("MODEL_NAME not set")
        # Pin the judge so grades are comparable across evaluation runs.
        if self.model != "Qwen/Qwen2.5-72B-Instruct":
            raise EnvironmentError("MODEL_NAME must be 'Qwen/Qwen2.5-72B-Instruct'")

    @staticmethod
    def _parse_response(raw: str | None) -> dict:
        """Parse the judge's reply into {"score": float, "rationale": str}.

        Strips an optional ``` / ```json code fence, validates that the
        payload is a JSON object, coerces the score to float, and clamps it
        into [0.0, 1.0]. Any malformed reply (None content, bad JSON,
        non-dict payload, non-numeric score) yields a zero-score fallback
        instead of raising, so one bad judge reply cannot abort a run.
        """
        if not raw:
            # The API may legally return None/empty content; the original
            # code crashed here with AttributeError on .strip().
            return {"score": 0.0, "rationale": "Empty response"}
        text = raw.strip()
        if text.startswith("```"):
            parts = text.split("```")
            # Guard against a bare "```" reply (original indexed blindly).
            text = parts[1] if len(parts) > 1 else ""
            if text.startswith("json"):
                text = text[4:]
            text = text.strip()
        try:
            parsed = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return {"score": 0.0, "rationale": "Parse error"}
        if not isinstance(parsed, dict):
            return {"score": 0.0, "rationale": "Parse error"}
        try:
            score = float(parsed.get("score", 0.0))
        except (TypeError, ValueError):
            score = 0.0
        # Clamp into the rubric's range so a stray judge value can't skew
        # aggregate metrics.
        score = min(1.0, max(0.0, score))
        return {"score": score, "rationale": str(parsed.get("rationale", ""))}

    def grade_trajectory(
        self,
        task_id: str,
        history: List[dict],
        initial_context: str,
        final_context: str,
    ) -> dict:
        """Score the agent's full trajectory.

        Args:
            task_id: identifier of the task being graded.
            history: per-step dicts with keys 'step', 'action', 'reward'
                (numeric) and 'effects'.
            initial_context: textual snapshot of the starting state.
            final_context: textual snapshot of the ending state.

        Returns:
            {"score": float in [0, 1], "rationale": str}; a zero-score
            fallback when the judge reply cannot be parsed.
        """
        steps_text = "\n".join(
            f"Step {s['step']}: action={s['action']}, reward={s['reward']:.4f}, effects={s['effects']}"
            for s in history
        )
        prompt = f"""
Task: {task_id}
INITIAL STATE:
{initial_context}
AGENT TRAJECTORY:
{steps_text}
FINAL STATE:
{final_context}
Please evaluate the agent's performance using the rubric.
"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": RUBRIC},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,  # deterministic judging
            max_tokens=400,
        )
        return self._parse_response(response.choices[0].message.content)