AlexDGenu committed on
Commit
9ec3f06
·
1 Parent(s): 0ab201b

Refactor run_gaia_evaluation to integrate LiteLLMModel and update agent initialization.

Browse files
Files changed (1) hide show
  1. app.py +52 -157
app.py CHANGED
@@ -1,18 +1,11 @@
1
  import os
2
- import gradio as gr
3
  import requests
4
- import pandas as pd
5
  from dotenv import load_dotenv
 
6
 
7
- from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool
8
-
9
- # Load environment variables (including OPENAI_API_KEY)
10
  load_dotenv()
11
 
12
- # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
15
-
16
  INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
17
 
18
  CRITICAL FORMATTING RULES:
@@ -24,62 +17,34 @@ CRITICAL FORMATTING RULES:
24
  - For strings: no extra spaces, no punctuation unless part of the answer, lowercase
25
  - For numbers: just the number, no units, no commas, no currency symbols
26
  - Provide ONLY the answer as your final response, nothing else
 
27
 
28
  You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
29
 
30
- # --- Smol Agent Definition ---
31
- class SmolAgent:
32
- def __init__(self):
33
- print("Initializing SmolAgent with OpenAI model...")
34
- if not OPENAI_API_KEY:
35
- raise ValueError("OPENAI_API_KEY not found. Please set it in your environment.")
36
-
37
- # Initialize the OpenAI-backed model
38
- self.model = OpenAIServerModel(
39
- model_id="gpt-4o-mini", # or "gpt-4", "gpt-3.5-turbo", etc.
40
- api_base="https://api.openai.com/v1",
41
- api_key=OPENAI_API_KEY,
42
- )
43
-
44
- # Initialize the agent with tools and instructions
45
- self.agent = CodeAgent(
46
- tools=[DuckDuckGoSearchTool()],
47
- model=self.model,
48
- instructions=INSTRUCTIONS,
49
- max_steps=7,
50
- )
51
- print("SmolAgent initialized with CodeAgent and DuckDuckGoSearchTool.")
52
-
53
- def __call__(self, question: str) -> str:
54
- print(f"\nπŸͺ Running on question:\n{question}\n")
55
- try:
56
- answer = self.agent.run(question)
57
- print(f"βœ… Agent's final answer: {answer}")
58
- return str(answer)
59
- except Exception as e:
60
- import traceback
61
- traceback.print_exc()
62
- error_message = f"AGENT ERROR: {e}"
63
- print(f"❌ {error_message}")
64
- return error_message
65
-
66
- def run_gaia_evaluation(username: str):
67
- """Run the complete GAIA evaluation and submit results"""
68
- print("πŸš€ GAIA Benchmark Evaluation with ChatGPT")
69
  print("=" * 60)
70
-
 
71
  if not username:
72
- return "❌ Please provide a username"
73
-
74
  print(f"πŸ‘€ User: {username}")
75
-
76
- # Initialize the agent
77
- try:
78
- agent = SmolAgent()
79
- except Exception as e:
80
- return f"❌ Failed to initialize agent: {e}"
81
 
82
- # Fetch questions
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
  resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
85
  resp.raise_for_status()
@@ -87,121 +52,51 @@ def run_gaia_evaluation(username: str):
87
  questions = data if isinstance(data, list) else data.get("questions", [])
88
  print(f"πŸ“‹ Loaded {len(questions)} questions")
89
  except requests.RequestException as e:
90
- return f"❌ Error fetching questions: {e}"
 
91
 
92
- # Process questions
93
  results = []
94
- progress_log = []
95
-
96
  for i, q in enumerate(questions):
97
  task_id = q["task_id"]
98
  text = q["question"]
99
- progress_log.append(f"❓ Question {i+1}: {text}")
100
  print(f"\n❓ Question {i+1}: {text}")
101
 
102
- try:
103
- result = agent(text)
104
- result_str = str(result).strip()
105
-
106
- # Take the last line as the answer
107
- out = result_str.splitlines()[-1] if result_str else "AGENT ERROR: No response."
108
-
109
- if out.startswith("{"):
110
- out = "AGENT ERROR: No final answer."
111
-
112
- out = out.strip().rstrip(".")
113
- results.append({"task_id": task_id, "submitted_answer": out})
114
-
115
- progress_log.append(f"βœ… Answer: '{out}'")
116
- print(f"βœ… Answer: '{out}'")
117
-
118
- except Exception as e:
119
- error_msg = f"AGENT ERROR: {e}"
120
- results.append({"task_id": task_id, "submitted_answer": error_msg})
121
- progress_log.append(f"❌ Error: {error_msg}")
122
- print(f"❌ Error: {error_msg}")
123
-
124
- # Submit results
125
  payload = {
126
  "username": username,
127
- "agent_code": "chatgpt-gpt4o-mini-with-tools",
128
  "answers": results,
129
  }
130
-
131
  try:
132
- print("πŸ“€ Submitting to GAIA leaderboard...")
133
  post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
134
  post.raise_for_status()
135
  res = post.json()
136
-
137
- # Format results for display
138
- result_summary = f"""
139
- πŸ† GAIA BENCHMARK RESULTS
140
- {'=' * 60}
141
- πŸ‘€ User: {res.get('username', username)}
142
- πŸ“Š Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%
143
- βœ… Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}
144
- πŸ’¬ Message: {res.get('message', 'N/A')}
145
- {'=' * 60}
146
- """
147
-
148
- # Combine progress log with final results
149
- full_log = "\n".join(progress_log) + "\n" + result_summary
150
- return full_log
151
-
152
  except requests.RequestException as e:
153
- error_msg = f"❌ Error submitting: {e}"
154
  done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
155
- local_summary = f"πŸ“‹ Completed locally: {done}/{len(results)}"
156
- return "\n".join(progress_log) + "\n" + error_msg + "\n" + local_summary
157
-
158
- # --- Gradio Interface ---
159
- def create_interface():
160
- with gr.Blocks(title="GAIA Benchmark with ChatGPT", theme=gr.themes.Soft()) as demo:
161
- gr.Markdown("# πŸš€ GAIA Benchmark Evaluation with ChatGPT")
162
- gr.Markdown("This app runs the GAIA benchmark using ChatGPT (GPT-4o-mini) with web search capabilities.")
163
-
164
- with gr.Row():
165
- with gr.Column(scale=1):
166
- username_input = gr.Textbox(
167
- label="Hugging Face Username",
168
- placeholder="Enter your HF username",
169
- info="This will be used for the GAIA leaderboard submission"
170
- )
171
-
172
- run_button = gr.Button("πŸš€ Run GAIA Evaluation", variant="primary", size="lg")
173
-
174
- with gr.Column(scale=2):
175
- output_area = gr.Textbox(
176
- label="Results & Progress",
177
- lines=20,
178
- max_lines=50,
179
- interactive=False
180
- )
181
-
182
- # Event handler
183
- run_button.click(
184
- fn=run_gaia_evaluation,
185
- inputs=[username_input],
186
- outputs=[output_area]
187
- )
188
-
189
- gr.Markdown("""
190
- ### How it works:
191
- 1. Enter your Hugging Face username
192
- 2. Click "Run GAIA Evaluation"
193
- 3. The agent will process all 20 GAIA questions using ChatGPT + web search
194
- 4. Results will be automatically submitted to the GAIA leaderboard
195
- 5. Your score will be displayed here
196
-
197
- ### Requirements:
198
- - Set `OPENAI_API_KEY` in your environment variables
199
- - Valid Hugging Face username for leaderboard submission
200
- """)
201
-
202
- return demo
203
-
204
- # --- Main execution ---
205
  if __name__ == "__main__":
206
- demo = create_interface()
207
- demo.launch()
 
1
  import os
 
2
  import requests
 
3
  from dotenv import load_dotenv
4
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
5
 
 
 
 
6
  load_dotenv()
7
 
 
8
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
9
  INSTRUCTIONS = """You are a general AI assistant. I will ask you a question. Report your thoughts, and then provide your final answer.
10
 
11
  CRITICAL FORMATTING RULES:
 
17
  - For strings: no extra spaces, no punctuation unless part of the answer, lowercase
18
  - For numbers: just the number, no units, no commas, no currency symbols
19
  - Provide ONLY the answer as your final response, nothing else
20
+ - Expand abbreviations like 'St.' to 'Saint' in city names
21
 
22
  You have access to a web search tool to help you find accurate information. Use it when you need to look up facts."""
23
 
24
+ def run_gaia_evaluation():
25
+ print("πŸš€ GAIA Benchmark Evaluation with Ollama")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  print("=" * 60)
27
+
28
+ username = os.getenv("HF_USERNAME")
29
  if not username:
30
+ print("❌ Please set HF_USERNAME environment variable")
31
+ return
32
  print(f"πŸ‘€ User: {username}")
 
 
 
 
 
 
33
 
34
+ model = LiteLLMModel(
35
+ model_id="ollama_chat/gemma3",
36
+ api_base="http://localhost:11434",
37
+ num_ctx=8192,
38
+ temperature=0.1, # Low temperature for more deterministic answers
39
+ )
40
+
41
+ agent = CodeAgent(
42
+ tools=[DuckDuckGoSearchTool()],
43
+ model=model,
44
+ instructions=INSTRUCTIONS,
45
+ max_steps=10,
46
+ )
47
+
48
  try:
49
  resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
50
  resp.raise_for_status()
 
52
  questions = data if isinstance(data, list) else data.get("questions", [])
53
  print(f"πŸ“‹ Loaded {len(questions)} questions")
54
  except requests.RequestException as e:
55
+ print(f"❌ Error fetching questions: {e}")
56
+ return
57
 
 
58
  results = []
 
 
59
  for i, q in enumerate(questions):
60
  task_id = q["task_id"]
61
  text = q["question"]
 
62
  print(f"\n❓ Question {i+1}: {text}")
63
 
64
+ result = agent.run(text, reset=True)
65
+ result_str = str(result).strip()
66
+
67
+ # Take the last line as the answer (since agent should provide only the answer)
68
+ out = result_str.splitlines()[-1] if result_str else "AGENT ERROR: No response."
69
+
70
+ if out.startswith("{"):
71
+ out = "AGENT ERROR: No final answer."
72
+
73
+ out = out.strip().rstrip(".")
74
+ results.append({"task_id": task_id, "submitted_answer": out})
75
+ print(f"βœ… Answer: '{out}'")
76
+ print(f"πŸ“ Preview: {result_str[:200]}...")
77
+
78
+ # Submit answers automatically
 
 
 
 
 
 
 
 
79
  payload = {
80
  "username": username,
81
+ "agent_code": "ollama-gemma3-with-tools",
82
  "answers": results,
83
  }
 
84
  try:
 
85
  post = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=60)
86
  post.raise_for_status()
87
  res = post.json()
88
+ print("\n" + "=" * 60)
89
+ print("πŸ† GAIA BENCHMARK RESULTS")
90
+ print("=" * 60)
91
+ print(f"πŸ‘€ User: {res.get('username', username)}")
92
+ print(f"πŸ“Š Overall Score: {res.get('score', res.get('overall_score', 'N/A'))}%")
93
+ print(f"βœ… Correct: {res.get('correct_count', res.get('num_correct', 'N/A'))}/{len(results)}")
94
+ print(f"πŸ’¬ Message: {res.get('message', 'N/A')}")
95
+ print("=" * 60)
 
 
 
 
 
 
 
 
96
  except requests.RequestException as e:
97
+ print(f"❌ Error submitting: {e}")
98
  done = sum(1 for r in results if not r["submitted_answer"].startswith("AGENT ERROR"))
99
+ print(f"Completed locally: {done}/{len(results)}")
100
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  if __name__ == "__main__":
102
+ run_gaia_evaluation()