Spaces:

agent-zero
/

meta-ads-attribution-env

Running

App Files Files Community

TheAarvee05 committed 6 days ago

Commit

c030db3

verified ·

1 Parent(s): 8c807bd

Upload evaluation/llm_grader.py with huggingface_hub

Browse files

Files changed (1) hide show

evaluation/llm_grader.py +100 -0

evaluation/llm_grader.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+evaluation/llm_grader.py — LLM-as-judge grader for qualitative scoring.
+Scores the agent's REASONING quality on top of the programmatic score.
+Uses a rubric to evaluate whether the agent correctly diagnosed the root cause.
+"""
+from __future__ import annotations
+import json
+import os
+from typing import List
+from openai import OpenAI
+# System prompt for the judge model: a five-level rubric (0.0 to 1.0) followed by
+# the required reply format. LLMGrader.grade_trajectory sends this text as the
+# "system" message and parses the reply as the JSON object described at the end.
+# The text is part of the grader's behaviour: editing it changes how trajectories
+# are scored.
+RUBRIC = """
+You are evaluating an AI agent's performance on a Meta Ads attribution recovery task.
+Score the agent's trajectory from 0.0 to 1.0 on the following rubric:
+1.0 — Agent correctly identified ALL root causes (wrong attribution window, pixel signal loss,
+       budget misallocation) and applied the right fixes in a logical order with clear reasoning.
+0.75 — Agent identified the primary issue and fixed it, but missed secondary issues or
+        applied fixes in a suboptimal order.
+0.50 — Agent showed partial understanding of the problem and applied some correct actions,
+        but reasoning was vague or steps were redundant.
+0.25 — Agent took some valid actions but clearly did not understand the root causes.
+        Mixed correct and incorrect reasoning.
+0.0  — Agent failed to diagnose any issue correctly. Applied irrelevant or harmful actions.
+Return ONLY a JSON object:
+{"score": 0.0, "rationale": "one paragraph explanation"}
+"""
+class LLMGrader:
+    def __init__(self, model: str | None = None):
+        api_key = os.environ.get("HF_TOKEN")
+        if not api_key:
+            raise EnvironmentError("HF_TOKEN not set")
+        base_url = os.environ.get("API_BASE_URL")
+        if not base_url:
+            raise EnvironmentError("API_BASE_URL not set")
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.model = model or os.environ.get("MODEL_NAME")
+        if not self.model:
+            raise EnvironmentError("MODEL_NAME not set")
+        if self.model != "Qwen/Qwen2.5-72B-Instruct":
+            raise EnvironmentError("MODEL_NAME must be 'Qwen/Qwen2.5-72B-Instruct'")
+    def grade_trajectory(
+        self,
+        task_id: str,
+        history: List[dict],
+        initial_context: str,
+        final_context: str,
+    ) -> dict:
+        """Score the agent's full trajectory."""
+        steps_text = "\n".join(
+            f"Step {s['step']}: action={s['action']}, reward={s['reward']:.4f}, effects={s['effects']}"
+            for s in history
+        )
+        prompt = f"""
+Task: {task_id}
+INITIAL STATE:
+{initial_context}
+AGENT TRAJECTORY:
+{steps_text}
+FINAL STATE:
+{final_context}
+Please evaluate the agent's performance using the rubric.
+"""
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": RUBRIC},
+                {"role": "user",   "content": prompt},
+            ],
+            temperature=0.0,
+            max_tokens=400,
+        )
+        raw = response.choices[0].message.content.strip()
+        if raw.startswith("```"):
+            raw = raw.split("```")[1]
+            if raw.startswith("json"):
+                raw = raw[4:]
+        raw = raw.strip()
+        try:
+            return json.loads(raw)
+        except Exception:
+            return {"score": 0.0, "rationale": "Parse error"}