Spaces:
Runtime error
Runtime error
File size: 4,216 Bytes
10e9b7d eccf8e4 4e9aecc 9ec3f06 4e9aecc c195ce7 3db6293 0ab201b 9ec3f06 0ab201b bd08449 9ec3f06 0ab201b 9ec3f06 0ab201b 9ec3f06 0ab201b 3c4371f 9ec3f06 eccf8e4 0ab201b 9ec3f06 0ab201b 9ec3f06 0ab201b 9ec3f06 0ab201b e80aab9 0ab201b 9ec3f06 0ab201b 9ec3f06 0ab201b 9ec3f06 e80aab9 9ec3f06 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import os
import requests
from dotenv import load_dotenv
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
load_dotenv()
# Base URL of the scoring service; the "/questions" and "/submit" endpoints
# used by the evaluation run are appended to it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Prompt handed to the agent as its instructions. It is assembled one rule per
# entry and joined with newlines; the wording is kept verbatim because the
# prompt itself states that answers are scored by exact matching.
INSTRUCTIONS = "\n".join(
    (
        "You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.",
        "CRITICAL FORMATTING RULES:",
        "- Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings",
        "- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise",
        "- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise",
        "- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string",
        "- Be extremely precise with spelling and formatting - the evaluation uses exact matching",
        "- For strings: no extra spaces, no punctuation unless part of the answer, lowercase",
        "- For numbers: just the number, no units, no commas, no currency symbols",
        "- Provide ONLY the answer as your final response, nothing else",
        "- Expand abbreviations like 'St.' to 'Saint' in city names",
        "You have access to a web search tool to help you find accurate information. Use it when you need to look up facts.",
    )
)
def _fetch_questions():
    """Download the question list from the scoring service.

    Returns:
        The list of question entries, or ``None`` when the request failed or
        the response body could not be decoded as JSON (the error is printed).
    """
    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError also covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException.
        print(f"❌ Error fetching questions: {e}")
        return None
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # Tolerate a wrapped payload of the form {"questions": [...]}.
        return data.get("questions") or []
    return []


def _extract_final_answer(raw_text):
    """Pick the answer to submit out of the agent's stripped text output.

    Args:
        raw_text: The agent result converted to ``str`` and stripped; an
            empty string means the agent produced nothing.

    Returns:
        The last line of ``raw_text`` without surrounding whitespace or
        trailing periods, or an ``"AGENT ERROR..."`` marker when there is no
        usable answer.
    """
    # The last line is taken as the answer, since the agent is instructed to
    # finish with the answer alone.
    answer = raw_text.splitlines()[-1] if raw_text else "AGENT ERROR: No response."
    # A last line opening with "{" is treated as "no final answer"
    # (presumably a raw structured dump rather than an answer).
    if answer.startswith("{"):
        answer = "AGENT ERROR: No final answer."
    return answer.strip().rstrip(".")


def _submit_answers(username, results):
    """POST the collected answers to the scoring service and print the report.

    Args:
        username: Value sent as the ``username`` field of the submission.
        results: List of ``{"task_id": ..., "submitted_answer": ...}`` dicts.
    """
    payload = {
        "username": username,
        "agent_code": "ollama-gemma3-with-tools",
        "answers": results,
    }
    try:
        post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
        post.raise_for_status()
        res = post.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error submitting: {e}")
        return
    print("\n" + "=" * 60)
    print("📊 GAIA BENCHMARK RESULTS")
    print("=" * 60)
    print(f"👤 User: {res.get('username', username)}")
    print(f"🎯 Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
    print(f"✅ Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
    print(f"💬 Message: {res.get('message', 'N/A')}")
    print("=" * 60)


def run_gaia_evaluation() -> None:
    """Run the agent over every benchmark question and submit the answers.

    Reads the user name from the ``HF_USERNAME`` environment variable, builds
    a ``CodeAgent`` backed by the ``ollama_chat/gemma3`` model served at
    ``http://localhost:11434`` with a DuckDuckGo search tool, fetches the
    questions from the scoring service, answers them one by one and posts the
    collected answers back. Progress and results are printed to stdout.

    A failure on a single question is recorded as an ``"AGENT ERROR..."``
    answer instead of aborting the run, so answers already collected are
    still submitted.

    Returns:
        None. The function returns early (after printing a message) when
        ``HF_USERNAME`` is unset or empty, when the questions cannot be
        fetched, or when there is nothing to submit.
    """
    print("🚀 GAIA Benchmark Evaluation with Ollama")
    print("=" * 60)
    username = os.getenv("HF_USERNAME")
    if not username:
        print("❌ Please set HF_USERNAME environment variable")
        return
    print(f"👤 User: {username}")

    model = LiteLLMModel(
        model_id="ollama_chat/gemma3",
        api_base="http://localhost:11434",
        num_ctx=8192,
        temperature=0.1,  # Low temperature for more deterministic answers
    )
    agent = CodeAgent(
        tools=[DuckDuckGoSearchTool()],
        model=model,
        instructions=INSTRUCTIONS,
        max_steps=10,
    )

    questions = _fetch_questions()
    if questions is None:
        return
    print(f"📋 Loaded {len(questions)} questions")

    results = []
    for number, q in enumerate(questions, start=1):
        task_id = q.get("task_id") if isinstance(q, dict) else None
        text = q.get("question") if isinstance(q, dict) else None
        if task_id is None or text is None:
            # Without both fields there is nothing to ask or to attribute an
            # answer to, so the entry is skipped rather than crashing the run.
            print(f"❌ Skipping malformed question entry {number}")
            continue
        print(f"\n❓ Question {number}: {text}")
        try:
            result = agent.run(text, reset=True)
        except Exception as e:
            # Per-question boundary: the agent, model backend or search tool
            # may fail in ways this script cannot enumerate.
            print(f"❌ Agent failed on this question: {e}")
            result_str = ""
            out = f"AGENT ERROR: {type(e).__name__}"
        else:
            # A None result means "no response", not the literal answer "None".
            result_str = "" if result is None else str(result).strip()
            out = _extract_final_answer(result_str)
        results.append({"task_id": task_id, "submitted_answer": out})
        print(f"✅ Answer: '{out}'")
        print(f"📝 Preview: {result_str[:200]}...")

    if not results:
        print("❌ No answers to submit")
        return

    # Submit answers automatically
    _submit_answers(username, results)

    done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
    print(f"Completed locally: {done}/{len(results)}")
# Script entry point: start the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_gaia_evaluation()
|