File size: 4,216 Bytes
10e9b7d
eccf8e4
4e9aecc
9ec3f06
4e9aecc
 
c195ce7
3db6293
0ab201b
 
 
 
 
 
 
 
 
 
 
9ec3f06
0ab201b
 
bd08449
9ec3f06
 
0ab201b
9ec3f06
 
0ab201b
9ec3f06
 
0ab201b
3c4371f
9ec3f06
 
 
 
 
 
 
 
 
 
 
 
 
 
eccf8e4
0ab201b
 
 
 
 
 
9ec3f06
 
0ab201b
 
 
 
 
 
 
9ec3f06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ab201b
 
9ec3f06
0ab201b
 
e80aab9
0ab201b
 
 
9ec3f06
 
 
 
 
 
 
 
0ab201b
9ec3f06
0ab201b
9ec3f06
 
e80aab9
9ec3f06
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import requests
from dotenv import load_dotenv
from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel

# Populate the process environment from a local .env file (python-dotenv)
# before HF_USERNAME is read with os.getenv further down.
load_dotenv()

# Base URL of the scoring service; the "/questions" and "/submit" endpoints
# used by run_gaia_evaluation are built from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Prompt passed to the CodeAgent through its `instructions=` argument. It spells
# out strict answer-formatting rules because, as the prompt itself states, the
# evaluation uses exact matching. This text is runtime behavior: edit with care.
INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.

CRITICAL FORMATTING RULES:
- Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string
- Be extremely precise with spelling and formatting - the evaluation uses exact matching
- For strings: no extra spaces, no punctuation unless part of the answer, lowercase
- For numbers: just the number, no units, no commas, no currency symbols
- Provide ONLY the answer as your final response, nothing else
- Expand abbreviations like 'St.' to 'Saint' in city names

You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""

def _extract_final_answer(result):
    """Reduce a raw agent result to the one-line answer string to submit.

    The last non-empty line of the stringified result is taken as the answer
    (the prompt asks the agent to finish with only the answer). A missing or
    empty result, or a last line that looks like a raw JSON/dict dump, is
    replaced by an "AGENT ERROR: ..." marker. Surrounding whitespace and a
    trailing period are removed in every case.
    """
    # A None result means the agent produced nothing; do not submit "None".
    text = "" if result is None else str(result).strip()
    answer = text.splitlines()[-1] if text else "AGENT ERROR: No response."
    if answer.startswith("{"):
        answer = "AGENT ERROR: No final answer."
    return answer.strip().rstrip(".")


def run_gaia_evaluation():
    """Run the agent over every question from the scoring API and submit answers.

    Steps: read HF_USERNAME from the environment, build an Ollama-backed
    CodeAgent with a DuckDuckGo search tool, fetch the question list from
    ``{DEFAULT_API_URL}/questions``, answer each question, then POST all
    answers to ``{DEFAULT_API_URL}/submit`` and print the returned summary.

    Failures are reported on stdout rather than raised: a missing username,
    a failed fetch, or an empty question list end the run early; a failure
    while answering one question is recorded as an "AGENT ERROR" answer so
    the remaining questions are still attempted and submitted.

    Returns:
        None.
    """
    print("πŸš€ GAIA Benchmark Evaluation with Ollama")
    print("=" * 60)

    username = os.getenv("HF_USERNAME")
    if not username:
        print("❌ Please set HF_USERNAME environment variable")
        return
    print(f"πŸ‘€ User: {username}")

    model = LiteLLMModel(
        model_id="ollama_chat/gemma3",
        api_base="http://localhost:11434",
        num_ctx=8192,
        temperature=0.1,  # Low temperature for more deterministic answers
    )

    agent = CodeAgent(
        tools=[DuckDuckGoSearchTool()],
        model=model,
        instructions=INSTRUCTIONS,
        max_steps=10,
    )

    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
        resp.raise_for_status()
        data = resp.json()
        # The endpoint may return a bare list or an object wrapping it.
        questions = data if isinstance(data, list) else data.get("questions", [])
        print(f"πŸ“‹ Loaded {len(questions)} questions")
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"❌ Error fetching questions: {e}")
        return

    if not questions:
        # Nothing to answer, so there is nothing worth submitting.
        print("No questions received; nothing to submit.")
        return

    results = []
    for i, q in enumerate(questions):
        task_id = q.get("task_id") if isinstance(q, dict) else None
        text = q.get("question") if isinstance(q, dict) else None
        if task_id is None or text is None:
            # An answer without a task_id cannot be scored; skip the entry.
            print(f"Skipping malformed question entry {i+1}: {q!r}")
            continue
        print(f"\n❓ Question {i+1}: {text}")

        try:
            result = agent.run(text, reset=True)
        except Exception as e:
            # Per-question boundary: a model/tool failure on one question must
            # not discard the answers already collected for the others.
            print(f"Agent failed on task {task_id}: {e}")
            results.append(
                {"task_id": task_id, "submitted_answer": "AGENT ERROR: Agent run failed"}
            )
            continue

        out = _extract_final_answer(result)
        results.append({"task_id": task_id, "submitted_answer": out})
        print(f"βœ… Answer: '{out}'")
        print(f"πŸ“ Preview: {str(result).strip()[:200]}...")

    # Submit answers automatically
    payload = {
        "username": username,
        "agent_code": "ollama-gemma3-with-tools",
        "answers": results,
    }
    try:
        post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
        post.raise_for_status()
        res = post.json()
        print("\n" + "=" * 60)
        print("πŸ† GAIA BENCHMARK RESULTS")
        print("=" * 60)
        print(f"πŸ‘€ User: {res.get('username', username)}")
        print(f"πŸ“Š Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
        print(f"βœ… Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
        print(f"πŸ’¬ Message: {res.get('message', 'N/A')}")
        print("=" * 60)
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error submitting: {e}")
        # Fall back to a local tally of answers that are not error markers.
        done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
        print(f"Completed locally: {done}/{len(results)}")

# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_gaia_evaluation()