GAIA_v6 / app.py
sajjadpsavoji
update
e05ac72
# file.py
import os
import re
import json
import math
import csv
import time
import gradio as gr
import requests
import inspect
import pandas as pd
from ast import literal_eval
from dotenv import load_dotenv
from pathlib import Path
from typing import Optional, Tuple, Dict, Any, List
from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel
from tools import (
ReverseTextTool,
ExtractTextFromImageTool,
AnalyzeCSVTool,
AnalyzeExcelTool,
DateCalculatorTool,
DownloadFileTool
)
# Try to load environment variables
try:
load_dotenv()
print("Loaded environment variables from .env file")
except Exception as e:
print(f"Note: Could not load .env file - {e}")
# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
DEFAULT_GOLD_CSV = os.environ.get("GAIA_GOLD_CSV", "answers.csv")
# --- Helpers for FINAL ANSWER parsing / normalization ---
FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER\s*:\s*(.*)", re.IGNORECASE)
def extract_final_answer(text: str) -> str:
"""
Extract whatever comes after 'FINAL ANSWER:' on the last occurrence.
Falls back to entire text if pattern not found.
"""
if not isinstance(text, str):
return ""
matches = FINAL_ANSWER_RE.findall(text)
if matches:
return matches[-1].strip()
return (text or "").strip()
def is_number(s: str) -> bool:
try:
float(s)
return True
except Exception:
return False
def try_parse_number(s: str) -> Optional[float]:
try:
return float(s)
except Exception:
return None
def split_csv_like(s: str) -> List[str]:
parts = [p.strip() for p in re.split(r",", s)]
parts = [p for p in parts if p != ""]
return parts
def normalize_final(s: str) -> str:
"""
GAIA-style normalization:
- lowercase
- strip articles 'a', 'an', 'the'
- strip $, % if any
- collapse spaces, trim punctuation
"""
s = (s or "").strip().lower()
s = s.replace("%", "").replace("$", "")
s = re.sub(r"[^\w\s,.\-]+", "", s)
s = re.sub(r"\s+", " ", s).strip()
s = re.sub(r"^(a|an|the)\s+", "", s)
return s
def list_like_equal(a: str, b: str) -> bool:
la = [normalize_final(p) for p in split_csv_like(a)]
lb = [normalize_final(p) for p in split_csv_like(b)]
return sorted(la) == sorted(lb)
def numeric_close(a: str, b: str, rel_tol=1e-9, abs_tol=1e-6) -> bool:
na, nb = try_parse_number(a), try_parse_number(b)
if na is None or nb is None:
return False
return math.isclose(na, nb, rel_tol=rel_tol, abs_tol=abs_tol)
def fast_heuristic_match(pred: str, gold: str) -> bool:
pn = normalize_final(pred)
gn = normalize_final(gold)
if pn == gn:
return True
if numeric_close(pn, gn):
return True
if ("," in pred) or ("," in gold):
if list_like_equal(pred, gold):
return True
return False
def quick_format_fix(answer: str, question: str) -> str:
"""
Deterministic, judge-friendly cleanup. We DO NOT use gold here.
- Remove leading articles for strings
- Strip currency & percent unless explicitly requested by question
- Remove thousands commas in numbers
- Trim trailing punctuation
- Normalize whitespace
- Unify separators to comma for list-like strings
"""
if not isinstance(answer, str):
return answer
s = answer.strip()
# remove code fences around final answer if any
s = re.sub(r"^```.*?\n", "", s, flags=re.DOTALL)
s = s.replace("```", "").strip()
# normalize whitespace
s = re.sub(r"\s+", " ", s).strip()
# drop trailing period if looks like a sentence end
s = re.sub(r"[.。]+$", "", s)
# if list-like but uses semicolons or slashes, convert to commas
if ";" in s or "/" in s:
s = re.sub(r"[;/]+", ",", s)
s = re.sub(r"\s*,\s*", ", ", s) # pretty spacing
# remove leading articles for string-y answers
s = re.sub(r"^(?i)(a|an|the)\s+", "", s)
# remove thousands commas in numbers like 1,234 -> 1234 (but keep commas that separate lists)
# crude heuristic: if the whole answer is numeric-with-commas and no other commas
if "," in s and not re.search(r".*,.*", s): # only one comma group
if re.fullmatch(r"\d{1,3}(,\d{3})+(\.\d+)?", s):
s = s.replace(",", "")
# remove currency unless explicitly requested
if "$" in s and not re.search(r"(?i)\b(dollar|usd|\$)\b.*(include|keep|use)|include\s*\$", question):
s = s.replace("$", "")
# percent sign rules: keep only if question appears to require it explicitly
needs_percent = bool(re.search(r"(?i)\b(percent|%)\b.*(include|with|as sign)|include\s*%", question))
if "%" in s and not needs_percent:
s = s.replace("%", "")
return s.strip()
# --- Gold Answers Loader ---
class GoldAnswers:
"""
Loads answers.csv like your example and indexes by task_id.
- content column includes "... Final answer : X"
- metadata column includes "{'task_id': '...'}"
"""
def __init__(self, path: str = DEFAULT_GOLD_CSV):
self.by_task_id: Dict[str, str] = {}
self.load(path)
def load(self, path: str):
p = Path(path)
if not p.exists():
print(f"[GoldAnswers] Warning: {path} not found. Local judge will skip.")
return
with p.open("r", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
content = row.get("content", "")
metadata_str = row.get("metadata", "")
# extract gold final answer from 'content'
gold_full = extract_final_answer(content)
gold_full = re.sub(r"^final answer\s*:\s*", "", gold_full, flags=re.IGNORECASE).strip()
task_id = None
try:
md = literal_eval(metadata_str) if metadata_str else {}
task_id = md.get("task_id")
except Exception:
pass
if task_id and gold_full:
self.by_task_id[task_id] = gold_full
print(f"[GoldAnswers] Loaded {len(self.by_task_id)} gold answers from {path}.")
# --- Judge Agent (smolagents CodeAgent) ---
JUDGE_SYSTEM = (
"You are a strict grader for short answers. "
"Follow these GAIA rules: answers must be exact, concise, and obey units/format rules. "
"However, accept semantically equivalent forms (e.g., pluralization or minor punctuation) "
"and unordered lists if order is not required by the question. "
"For numeric answers, small rounding differences are acceptable. "
"Return ONLY a compact JSON object with keys: is_correct (true/false), score (0..1), justification (short). "
"Do not include any additional text outside the JSON."
)
def build_judge_prompt(question: str, predicted: str, gold: str) -> str:
return f"""
You are grading whether the predicted answer matches the gold answer for this GAIA-style item.
Question:
{question}
Predicted answer:
{predicted}
Gold answer:
{gold}
Evaluate correctness according to GAIA formatting rules and semantics.
Output strictly this JSON:
{{
"is_correct": true|false,
"score": number between 0 and 1,
"justification": "≤ 2 short sentences; no chain-of-thought"
}}
"""
class JudgeAgent:
"""
A smolagents CodeAgent used purely for grading. We call .run(prompt) to avoid any
direct use of model.generate signatures — this mirrors the GAIA agent path.
"""
def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
self.verbose = verbose
self.agent = CodeAgent(
tools=[],
model=base_model,
add_base_tools=False,
planning_interval=0,
verbosity_level=2 if verbose else 0,
additional_authorized_imports=[]
)
def judge(self, question: str, predicted: str, gold: str) -> Dict[str, Any]:
# Fast heuristics first
if fast_heuristic_match(predicted, gold):
return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
try:
raw = self.agent.run(prompt)
text = (raw or "").strip()
m = re.search(r"\{.*\}", text, flags=re.DOTALL)
payload = json.loads(m.group(0) if m else text)
is_correct = bool(payload.get("is_correct", False))
score = float(payload.get("score", 0.0))
justification = str(payload.get("justification", "")).strip()[:300]
return {"is_correct": is_correct, "score": score, "justification": justification}
except Exception as e:
return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
# --- GAIA Agent Definition ---
class GAIAAgent:
def __init__(self, verbose=False):
self.verbose = verbose
print("Initializing GAIA Agent...")
# Get API key
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
# Initialize model with gpt-4o-mini for cost efficiency
model_id = os.environ.get("OPENAI_MODEL_ID", "gpt-4o-mini")
print(f"Using OpenAI model: {model_id}")
model = OpenAIServerModel(
model_id=model_id,
api_key=api_key,
temperature=0.1
)
duck_search_tool = DuckDuckGoSearchTool()
self.tools = [
duck_search_tool,
ReverseTextTool(),
ExtractTextFromImageTool(),
AnalyzeCSVTool(),
AnalyzeExcelTool(),
DateCalculatorTool(),
DownloadFileTool()
]
additional_imports = [
"PyPDF2", "pdf2image", "PIL", "nltk", "sklearn",
"networkx", "matplotlib", "seaborn", "scipy", "time"
]
self.agent = CodeAgent(
tools=self.tools,
model=model,
add_base_tools=True,
planning_interval=3,
verbosity_level=2 if self.verbose else 0,
additional_authorized_imports=additional_imports
)
print("GAIA Agent initialized and ready")
def _is_reversed_text(self, text):
return (
text.startswith(".") or
".rewsna eht sa" in text or
"esrever" in text or
"sdrawkcab" in text
)
def _base_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
# Let retries slightly relax the search budget
search_budget_line = (
"- Limit to 1-2 web searches per question.\n"
if not allow_extra_searches else
"- You may use up to 3-4 web searches if needed.\n"
)
return f"""
You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
Question: {question}
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
- Use web search sparingly and only when absolutely necessary.
{search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
- Focus on answering directly with what you already know when possible.
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
- Always add a delay of 2-3 seconds between web searches using time.sleep() to avoid rate limiting.
Remember to structure your response in Python code format using the final_answer() function.
"""
def _reversed_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
search_budget_line = (
"- Limit to 1-2 web searches per question.\n"
if not allow_extra_searches else
"- You may use up to 3-4 web searches if needed.\n"
)
return f"""
You are a general AI assistant. I will ask you a question.
This question appears to be in reversed text. Here is the reversed version for clarity:
{question[::-1]}
Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
- Use web search sparingly and only when absolutely necessary.
{search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
- Do not import libraries that aren't available - stick to basic Python and the tools provided.
- Focus on answering directly with what you already know when possible.
- If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
- Always add a delay of 2-3 seconds between web searches using time.sleep() to avoid rate limiting.
Remember to structure your response in Python code format using the final_answer() function.
"""
def __call__(self, question: str, allow_extra_searches: bool = False) -> str:
if self.verbose:
msg = f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}"
print(msg)
prompt = (
self._reversed_prompt(question, allow_extra_searches)
if self._is_reversed_text(question)
else self._base_prompt(question, allow_extra_searches)
)
try:
answer = self.agent.run(prompt)
if self.verbose:
print(f"Generated answer: {answer}")
return answer
except Exception as e:
error_msg = f"Error processing question: {e}"
if self.verbose:
print(error_msg)
return error_msg
def refine(self, question: str, prev_answer: str, judge_feedback: str, attempt_no: int) -> str:
"""
Reflection-based reattempt without using gold.
"""
if self.verbose:
print(f"Refining (attempt {attempt_no}) based on judge note: {judge_feedback}")
allow_extra = attempt_no >= 2 # relax search budget after first retry
base = self._base_prompt(question, allow_extra_searches=allow_extra)
refinement_addendum = f"""
Your previous FINAL ANSWER was:
{prev_answer}
A strict judge said this answer was incorrect for the following reason(s) (be concise): {judge_feedback}
Re-evaluate the question carefully. Consider possible formatting issues (units, articles, thousands commas), list ordering (only if the question requires a specific order), and rounding.
Produce a NEW final answer. Do not repeat the previous final answer if you think it was wrong.
"""
try:
answer = self.agent.run(base + refinement_addendum)
if self.verbose:
print(f"Refined answer: {answer}")
return answer
except Exception as e:
err = f"Error refining: {e}"
if self.verbose:
print(err)
return err
# --- Singletons for judge/gold ---
gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
_judge_agent_singleton: Optional[JudgeAgent] = None
# --- Runner & Submitter (with judge integration) ---
def _ensure_judge(model: OpenAIServerModel) -> JudgeAgent:
global _judge_agent_singleton
if _judge_agent_singleton is None:
_judge_agent_singleton = JudgeAgent(base_model=model, verbose=False)
return _judge_agent_singleton
def run_and_submit_all(sample_size: int = 0, max_retries: int = 1, use_local_judge_to_select: bool = True):
"""
Fetches all questions, runs the agent on them, judges locally (if gold available),
optionally reattempts on incorrect results, submits answers, and returns:
- final status string
- final results dataframe (one row per question)
- attempt log dataframe (one row per attempt)
"""
username = "Gralon"
print(f"Using username: {username}")
api_url = DEFAULT_API_URL
questions_url = f"{api_url}/questions"
submit_url = f"{api_url}/submit"
# 1. Instantiate Agent
try:
agent = GAIAAgent(verbose=True)
except Exception as e:
print(f"Error instantiating agent: {e}")
return f"Error initializing agent: {e}", None, None
# 1b. Init JudgeAgent once, reusing the SAME model instance
judge_agent = _ensure_judge(agent.agent.model)
# Derive code URL for submission
space_id = os.getenv("SPACE_ID")
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
# 2. Fetch Questions
print(f"Fetching questions from: {questions_url}")
try:
response = requests.get(questions_url, timeout=15)
response.raise_for_status()
questions_data = response.json()
if not questions_data:
print("Fetched questions list is empty.")
return "Fetched questions list is empty or invalid format.", None, None
print(f"Fetched {len(questions_data)} questions.")
except requests.exceptions.RequestException as e:
print(f"Error fetching questions: {e}")
return f"Error fetching questions: {e}", None, None
except json.JSONDecodeError as e:
print(f"Error decoding JSON response from questions endpoint: {e}")
print(f"Response text: {response.text[:500]}")
return f"Error decoding server response for questions: {e}", None, None
except Exception as e:
print(f"An unexpected error occurred fetching questions: {e}")
return f"An unexpected error occurred fetching questions: {e}", None, None
# Sampling
if sample_size > 0 and sample_size < len(questions_data):
import random
print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
questions_data = random.sample(questions_data, sample_size)
print(f"Running agent on {len(questions_data)} questions...")
results_log: List[Dict[str, Any]] = []
attempts_log: List[Dict[str, Any]] = []
answers_payload: List[Dict[str, Any]] = []
for i, item in enumerate(questions_data):
task_id = item.get("task_id")
question_text = item.get("question")
if not task_id or question_text is None:
print(f"Skipping item with missing task_id or question: {item}")
continue
gold = gold_answers.by_task_id.get(task_id)
per_question_attempts: List[Dict[str, Any]] = []
try:
print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
# -- First attempt
raw = agent(question_text, allow_extra_searches=False)
ans = extract_final_answer(raw)
fixed = quick_format_fix(ans, question_text) or ans
# judge first (on fixed)
jres = None
j_is_correct = None
j_score = None
j_note = None
if gold:
jres = judge_agent.judge(question_text, fixed, gold)
j_is_correct = jres.get("is_correct")
j_score = jres.get("score")
j_note = jres.get("justification")
per_question_attempts.append({
"Task ID": task_id,
"Attempt": 1,
"Submitted Answer (raw)": ans,
"Submitted Answer (fixed)": fixed,
"Judge Correct?": j_is_correct,
"Judge Score": j_score,
"Judge Note": j_note
})
best_answer = fixed
best_score = j_score if j_score is not None else 0.0
best_correct = j_is_correct
retries = 0
while (j_is_correct is False) and (retries < max_retries):
retries += 1
# Try reflective retry
refined_raw = agent.refine(
question=question_text,
prev_answer=fixed,
judge_feedback=j_note or "Format/content mismatch.",
attempt_no=retries
)
refined = extract_final_answer(refined_raw)
refined_fixed = quick_format_fix(refined, question_text) or refined
# Judge the refined answer
j2 = None
j2_is_correct = None
j2_score = None
j2_note = None
if gold:
j2 = judge_agent.judge(question_text, refined_fixed, gold)
j2_is_correct = j2.get("is_correct")
j2_score = j2.get("score")
j2_note = j2.get("justification")
per_question_attempts.append({
"Task ID": task_id,
"Attempt": retries + 1,
"Submitted Answer (raw)": refined,
"Submitted Answer (fixed)": refined_fixed,
"Judge Correct?": j2_is_correct,
"Judge Score": j2_score,
"Judge Note": j2_note
})
# Decide whether to keep this as best
if use_local_judge_to_select and gold and (j2_score is not None):
if (j2_score > (best_score or 0)) or (best_score is None):
best_answer, best_score, best_correct = refined_fixed, j2_score, j2_is_correct
else:
# If we don't have gold/judge, prefer the newest answer
best_answer = refined_fixed
best_score = j2_score if j2_score is not None else best_score
best_correct = j2_is_correct if j2_is_correct is not None else best_correct
# Prepare for another retry if needed
fixed = refined_fixed
j_is_correct = j2_is_correct
j_score = j2_score
j_note = j2_note
if j2_is_correct:
break
if retries < max_retries:
print("Waiting 2 seconds before next attempt...")
time.sleep(2)
# Append final choice per question
answers_payload.append({"task_id": task_id, "submitted_answer": best_answer})
results_log.append({
"Task ID": task_id,
"Question": question_text,
"Submitted Answer": best_answer,
"Gold (local)": gold if gold else "",
"Judge Correct?": best_correct,
"Judge Score": best_score,
"Judge Note": j_note
})
print(f"Finished question {i+1}")
# Add to global attempts log
attempts_log.extend(per_question_attempts)
if i < len(questions_data) - 1:
print("Waiting 2 seconds before next question...")
time.sleep(2)
except Exception as e:
print(f"Error running agent on task {task_id}: {e}")
results_log.append({
"Task ID": task_id,
"Question": question_text,
"Submitted Answer": f"AGENT ERROR: {e}",
"Gold (local)": gold_answers.by_task_id.get(task_id, ""),
"Judge Correct?": False,
"Judge Score": 0.0,
"Judge Note": "agent error"
})
if not answers_payload:
print("Agent did not produce any answers to submit.")
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log), pd.DataFrame(attempts_log)
# 4. Prepare Submission
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
print(status_update)
# 5. Submit
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
try:
response = requests.post(submit_url, json=submission_data, timeout=60)
response.raise_for_status()
result_data = response.json()
final_status = (
f"Submission Successful!\n"
f"User: {result_data.get('username')}\n"
f"Overall Score: {result_data.get('score', 'N/A')}% "
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
f"Message: {result_data.get('message', 'No message received.')}"
)
print("Submission successful.")
results_df = pd.DataFrame(results_log)
attempts_df = pd.DataFrame(attempts_log)
return final_status, results_df, attempts_df
except requests.exceptions.HTTPError as e:
error_detail = f"Server responded with status {e.response.status_code}."
try:
error_json = e.response.json()
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
except json.JSONDecodeError:
error_detail += f" Response: {e.response.text[:500]}"
status_message = f"Submission Failed: {error_detail}"
print(status_message)
results_df = pd.DataFrame(results_log)
attempts_df = pd.DataFrame(attempts_log)
return status_message, results_df, attempts_df
except requests.exceptions.Timeout:
status_message = "Submission Failed: The request timed out."
print(status_message)
results_df = pd.DataFrame(results_log)
attempts_df = pd.DataFrame(attempts_log)
return status_message, results_df, attempts_df
except requests.exceptions.RequestException as e:
status_message = f"Submission Failed: Network error - {e}"
print(status_message)
results_df = pd.DataFrame(results_log)
attempts_df = pd.DataFrame(attempts_log)
return status_message, results_df, attempts_df
except Exception as e:
status_message = f"An unexpected error occurred during submission: {e}"
print(status_message)
results_df = pd.DataFrame(results_log)
attempts_df = pd.DataFrame(attempts_log)
return status_message, results_df, attempts_df
def test_single_question(question: str, retries: int = 1) -> str:
"""Test the agent on a single question (no submission), with judge-aware retries if gold exists."""
try:
agent = GAIAAgent(verbose=True)
judge_agent = _ensure_judge(agent.agent.model)
gold = None # No task_id context here; pure test (no gold)
# First attempt
raw = agent(question)
ans = extract_final_answer(raw)
fixed = quick_format_fix(ans, question) or ans
if retries <= 0:
return fixed
# Without gold we can't know correctness; just do a reflective retry once for demo
last = fixed
note = "Possible format/content mismatch; re-evaluate."
for k in range(retries):
refined_raw = agent.refine(question, prev_answer=last, judge_feedback=note, attempt_no=k+1)
refined = extract_final_answer(refined_raw)
refined_fixed = quick_format_fix(refined, question) or refined
last = refined_fixed
return last
except Exception as e:
return f"Error: {e}"
# Optional: manual local judge tab
def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
# try task_id lookup first
gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
agent = GAIAAgent(verbose=False)
judge_agent = _ensure_judge(agent.agent.model)
res = judge_agent.judge(question, predicted, gold)
out = {
"Gold": gold,
"is_correct": res["is_correct"],
"score": res["score"],
"note": res["justification"]
}
return json.dumps(out, ensure_ascii=False, indent=2)
# --- Build Gradio Interface using Blocks ---
with gr.Blocks() as demo:
gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge (with smart retries)")
gr.Markdown(
"""
## Instructions:
1. Log in to your Hugging Face account using the button below
2. Test your agent on individual questions in the Testing tab
3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
This agent runs locally, uses an LLM judge against your answers.csv (if present),
**retries intelligently** when the judge says 'incorrect', and then submits answers to the server.
"""
)
gr.LoginButton()
with gr.Tab("Test Single Question"):
test_input = gr.Textbox(label="Enter a question to test", lines=3)
test_retries = gr.Slider(minimum=0, maximum=3, value=1, step=1, label="Retries (no gold here, heuristic only)")
test_output = gr.Textbox(label="Answer", lines=3)
test_button = gr.Button("Test Question")
test_button.click(
fn=test_single_question,
inputs=[test_input, test_retries],
outputs=test_output
)
with gr.Tab("Local Judge (manual)"):
lj_q = gr.Textbox(label="Question", lines=3)
lj_pred = gr.Textbox(label="Predicted (your FINAL ANSWER)", lines=1)
lj_gold_or_id = gr.Textbox(label="Task ID (to fetch gold) OR paste a Gold answer", lines=1)
lj_out = gr.Textbox(label="Judge Result (JSON)", lines=8)
gr.Button("Judge Now").click(local_judge_single, inputs=[lj_q, lj_pred, lj_gold_or_id], outputs=lj_out)
with gr.Tab("Full Evaluation"):
with gr.Row():
sample_size = gr.Slider(
minimum=0,
maximum=20,
value=0,
step=1,
label="Sample Size (0 for all questions)",
info="Set a number to limit how many questions to process (reduces costs)"
)
max_retries = gr.Slider(
minimum=0,
maximum=3,
value=1,
step=1,
label="Max judge-driven retries per question",
info="0 = no retries; 1-3 = progressively more effort"
)
use_local = gr.Checkbox(
value=True,
label="Use local judge (gold) to pick best attempt when available",
info="If unchecked, we submit the last attempt instead."
)
run_button = gr.Button("Run Evaluation, Judge Locally, Retry & Submit")
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
results_table = gr.DataFrame(label="Final Results (per question)", wrap=True)
attempts_table = gr.DataFrame(label="Attempt Log (expanded)", wrap=True)
run_button.click(
fn=run_and_submit_all,
inputs=[sample_size, max_retries, use_local],
outputs=[status_output, results_table, attempts_table]
)
if __name__ == "__main__":
print("\n" + "-"*30 + " GAIA Agent Starting " + "-"*30)
# Check for API key
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
print("WARNING: OpenAI API key not found. Please set OPENAI_API_KEY environment variable.")
else:
print("OpenAI API key found.")
# Check environment variables
space_host = os.getenv("SPACE_HOST")
space_id = os.getenv("SPACE_ID")
if space_host:
print(f"✅ Running in Hugging Face Space: {space_host}")
print(f" Runtime URL: https://{space_host}.hf.space")
else:
print("ℹ️ Running locally")
if space_id:
print(f"✅ Space ID: {space_id}")
print(f" Repo URL: https://huggingface.co/spaces/{space_id}")
print(f" Code URL: https://huggingface.co/spaces/{space_id}/tree/main")
print("-"*78 + "\n")
if Path(DEFAULT_GOLD_CSV).exists():
print(f"Local gold answers found at: {DEFAULT_GOLD_CSV}")
else:
print(f"No local gold CSV found at: {DEFAULT_GOLD_CSV} (judge will skip gold for unknown tasks)")
print("Launching Gradio Interface...")
demo.launch(debug=True)