# GAIA benchmark evaluation script (Ollama + smolagents).
# Refactored run_gaia_evaluation to integrate LiteLLMModel and update agent initialization.
import os
import requests
from dotenv import load_dotenv
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
# Load environment variables (e.g. HF_USERNAME) from a local .env file, if present.
load_dotenv()
# Scoring service for the Hugging Face Agents course (unit 4 / GAIA benchmark).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# System prompt for the agent. The GAIA scorer uses exact string matching, so the
# formatting rules below (no units, no articles, lowercase strings, expanded
# abbreviations) directly determine whether an answer is counted as correct.
# NOTE: this string is runtime behavior — do not reword it casually.
INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
CRITICAL FORMATTING RULES:
- Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string
- Be extremely precise with spelling and formatting - the evaluation uses exact matching
- For strings: no extra spaces, no punctuation unless part of the answer, lowercase
- For numbers: just the number, no units, no commas, no currency symbols
- Provide ONLY the answer as your final response, nothing else
- Expand abbreviations like 'St.' to 'Saint' in city names
You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
def run_gaia_evaluation():
    """Fetch GAIA benchmark questions, answer them with a local Ollama-backed
    agent, and submit the answers to the course scoring service.

    Reads HF_USERNAME from the environment (loaded via dotenv). Requires a
    local Ollama server at http://localhost:11434 serving the gemma3 model.
    Prints progress and the final score; returns None.
    """
    print("🚀 GAIA Benchmark Evaluation with Ollama")
    print("=" * 60)
    username = os.getenv("HF_USERNAME")
    if not username:
        print("❌ Please set HF_USERNAME environment variable")
        return
    print(f"👤 User: {username}")
    model = LiteLLMModel(
        model_id="ollama_chat/gemma3",
        api_base="http://localhost:11434",
        num_ctx=8192,
        temperature=0.1,  # Low temperature for more deterministic answers
    )
    agent = CodeAgent(
        tools=[DuckDuckGoSearchTool()],
        model=model,
        instructions=INSTRUCTIONS,
        max_steps=10,
    )
    # Fetch the question set; abort early if the scoring service is unreachable.
    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
        resp.raise_for_status()
        data = resp.json()
        # The endpoint may return either a bare list or {"questions": [...]}.
        questions = data if isinstance(data, list) else data.get("questions", [])
        print(f"📋 Loaded {len(questions)} questions")
    except requests.RequestException as e:
        print(f"❌ Error fetching questions: {e}")
        return
    if not questions:
        # Nothing to answer — don't submit an empty payload.
        print("❌ No questions received; nothing to submit.")
        return
    results = []
    for i, q in enumerate(questions):
        task_id = q.get("task_id")
        text = q.get("question", "")
        if task_id is None:
            # Skip malformed entries rather than crashing mid-run on KeyError.
            print(f"⚠️ Skipping question {i+1}: missing task_id")
            continue
        print(f"\n❓ Question {i+1}: {text}")
        # One failing question (model timeout, tool error, etc.) must not
        # abort the whole run and lose every answer collected so far.
        try:
            result = agent.run(text, reset=True)
            result_str = str(result).strip()
        except Exception as e:
            result_str = ""
            print(f"⚠️ Agent failed on question {i+1}: {e}")
        # Take the last line as the answer (since agent should provide only the answer)
        out = result_str.splitlines()[-1] if result_str else "AGENT ERROR: No response."
        if out.startswith("{"):
            # A raw JSON/dict dump means the agent never produced a final answer.
            out = "AGENT ERROR: No final answer."
        out = out.strip().rstrip(".")
        results.append({"task_id": task_id, "submitted_answer": out})
        print(f"✅ Answer: '{out}'")
        print(f"📝 Preview: {result_str[:200]}...")
    # Submit answers automatically
    payload = {
        "username": username,
        "agent_code": "ollama-gemma3-with-tools",
        "answers": results,
    }
    try:
        post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
        post.raise_for_status()
        res = post.json()
        print("\n" + "=" * 60)
        print("🏆 GAIA BENCHMARK RESULTS")
        print("=" * 60)
        print(f"👤 User: {res.get('username', username)}")
        # Field names vary between service versions; try both spellings.
        print(f"📊 Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
        print(f"✅ Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
        print(f"💬 Message: {res.get('message', 'N/A')}")
        print("=" * 60)
    except requests.RequestException as e:
        print(f"❌ Error submitting: {e}")
        # Fall back to a local tally of answers that didn't error out.
        done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
        print(f"Completed locally: {done}/{len(results)}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    run_gaia_evaluation()