Mehedi2 committed on
Commit
0a876aa
·
verified ·
1 Parent(s): 288ad5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -168
app.py CHANGED
@@ -7,8 +7,13 @@ import time
7
 
8
  # Your OpenRouter API key
9
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")
10
- GAIA_API_BASE = "https://gaia-benchmark-hf.fly.dev" # official GAIA API base
11
 
 
 
 
 
 
 
12
  class OpenRouterLLM:
13
  def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
14
  self.api_key = api_key
@@ -16,8 +21,6 @@ class OpenRouterLLM:
16
  self.base_url = "https://openrouter.ai/api/v1/chat/completions"
17
 
18
  def __call__(self, prompt: str, max_tokens: int = 1000, temperature: float = 0.1) -> str:
19
- """Make API call to OpenRouter"""
20
-
21
  if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
22
  return "Error: Invalid OpenRouter API key"
23
 
@@ -36,14 +39,11 @@ class OpenRouterLLM:
36
  "content": """You are a helpful AI assistant designed to answer questions accurately and concisely.
37
  For GAIA evaluation, provide EXACT answers without explanation unless asked.
38
  - For math questions, give just the number
39
- - For yes/no questions, give just "Yes" or "No"
40
  - For factual questions, give just the fact
41
- - Be precise and direct."""
42
  },
43
- {
44
- "role": "user",
45
- "content": prompt
46
- }
47
  ],
48
  "temperature": temperature,
49
  "max_tokens": max_tokens,
@@ -52,8 +52,8 @@ For GAIA evaluation, provide EXACT answers without explanation unless asked.
52
 
53
  try:
54
  response = requests.post(
55
- self.base_url,
56
- headers=headers,
57
  json=payload,
58
  timeout=30
59
  )
@@ -62,265 +62,185 @@ For GAIA evaluation, provide EXACT answers without explanation unless asked.
62
  return f"API Error: {response.status_code}"
63
 
64
  result = response.json()
65
-
66
  if "choices" in result and len(result["choices"]) > 0:
67
  return result["choices"][0]["message"]["content"].strip()
68
  else:
69
  return "Error: No response content received"
70
-
71
  except Exception as e:
72
  return f"Error: {str(e)}"
73
 
 
 
 
 
74
  class GAIAAgent:
75
  def __init__(self, api_key: str):
76
  self.llm = OpenRouterLLM(api_key=api_key)
77
  self.api_key = api_key
78
 
79
  def run_agent(self, prompt: str) -> str:
80
- """
81
- Main function for GAIA evaluation
82
- This is what GAIA calls to get answers
83
- """
84
  try:
85
- # Process the question to get a direct answer
86
  enhanced_prompt = f"""
87
  Question: {prompt}
88
-
89
- Analyze this question carefully and provide the exact answer. Do not include explanations, reasoning, or extra text unless specifically asked for reasoning.
90
-
91
- Examples of good responses:
92
- - Math question "What is 15 + 27?" → Answer: "42"
93
- - Yes/No question "Is Paris the capital of France?" → Answer: "Yes"
94
- - Factual question "What is the capital of Japan?" → Answer: "Tokyo"
95
-
96
  Your answer:"""
97
-
98
  response = self.llm(enhanced_prompt, max_tokens=500, temperature=0.1)
99
-
100
- # Clean up the response to get just the answer
101
- answer = self.clean_answer(response)
102
- return answer
103
-
104
  except Exception as e:
105
  return f"Error: {str(e)}"
106
 
107
  def clean_answer(self, response: str) -> str:
108
- """Clean the response to extract just the answer"""
109
  response = response.strip()
110
-
111
- # Remove common prefixes
112
  prefixes_to_remove = [
113
- "Answer:", "The answer is:", "Response:", "Result:",
114
  "Final answer:", "Solution:", "A:", "Answer is:"
115
  ]
116
-
117
  for prefix in prefixes_to_remove:
118
  if response.lower().startswith(prefix.lower()):
119
  response = response[len(prefix):].strip()
120
-
121
- # Remove quotes if they wrap the entire answer
122
  if response.startswith('"') and response.endswith('"'):
123
  response = response[1:-1]
124
-
125
  return response
126
 
127
  def get_questions(self) -> List[Dict]:
128
- """Get questions from GAIA API"""
129
  try:
130
- response = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
131
- if response.status_code == 200:
132
- return response.json()
133
- else:
134
- return []
135
- except:
136
  return []
137
 
138
  def get_random_question(self) -> Dict:
139
- """Get a random question from GAIA API"""
140
  try:
141
- response = requests.get(f"{GAIA_API_BASE}/random-question", timeout=30)
142
- if response.status_code == 200:
143
- return response.json()
144
- else:
145
- return {}
146
- except:
147
  return {}
148
 
 
 
 
 
 
 
 
 
 
 
 
149
  def submit_answers(self, username: str, agent_code: str, answers: List[Dict]) -> Dict:
150
- """Submit answers to GAIA for scoring"""
151
  try:
152
  payload = {
153
  "username": username,
154
  "agent_code": agent_code,
155
  "answers": answers
156
  }
157
-
158
- response = requests.post(
159
- f"{GAIA_API_BASE}/submit",
160
- json=payload,
161
- timeout=60
162
- )
163
-
164
- if response.status_code == 200:
165
- return response.json()
166
- else:
167
- return {"error": f"Submission failed: {response.status_code}"}
168
-
169
  except Exception as e:
170
  return {"error": f"Submission error: {str(e)}"}
171
 
172
- # Initialize the agent
 
 
 
173
  agent = GAIAAgent(api_key=OPENROUTER_API_KEY)
174
 
175
  def run_agent(prompt: str) -> str:
176
- """Main function that GAIA will call"""
177
  return agent.run_agent(prompt)
178
 
179
  def test_single_question():
180
- """Test the agent with a single question"""
181
  question = agent.get_random_question()
182
  if question:
183
- answer = run_agent(question.get("Question", ""))
184
- return f"Question: {question.get('Question', '')}\nAnswer: {answer}"
 
185
  return "Failed to get question"
186
 
187
  def run_full_evaluation(username: str, progress=gr.Progress()):
188
- """Run full GAIA evaluation"""
189
  if not username:
190
  return "Please provide your Hugging Face username"
191
-
192
  if not OPENROUTER_API_KEY:
193
  return "Please configure your OpenRouter API key"
194
 
195
- progress(0.1, desc="Getting questions...")
196
-
197
- # Get all questions
198
  questions = agent.get_questions()
199
  if not questions:
200
  return "Failed to retrieve questions from GAIA API"
201
 
202
- progress(0.2, desc=f"Processing {len(questions)} questions...")
203
-
204
- # Process each question
205
  answers = []
206
  for i, question in enumerate(questions):
207
- progress(0.2 + (0.7 * i / len(questions)), desc=f"Processing question {i+1}/{len(questions)}")
208
-
209
  task_id = question.get("task_id", "")
210
- question_text = question.get("Question", "")
211
-
212
- if question_text:
213
- answer = run_agent(question_text)
214
- answers.append({
215
- "task_id": task_id,
216
- "submitted_answer": answer
217
- })
218
-
219
- # Small delay to avoid rate limiting
220
- time.sleep(0.5)
221
 
222
  progress(0.9, desc="Submitting answers...")
223
-
224
- # Submit answers
225
  agent_code = f"https://huggingface.co/spaces/{username}/Gaia-Test-Agent/tree/main"
226
  result = agent.submit_answers(username, agent_code, answers)
227
 
228
- progress(1.0, desc="Complete!")
229
-
230
  if "error" in result:
231
  return f"Submission failed: {result['error']}"
232
- else:
233
- score = result.get("score", 0)
234
- return f"Evaluation complete!\nScore: {score}%\nAnswers submitted: {len(answers)}\nCheck the leaderboard for your ranking!"
235
 
236
- # Create Gradio interface
 
 
237
  def create_gradio_app():
238
  with gr.Blocks(title="GAIA Test Agent", theme=gr.themes.Soft()) as app:
239
-
240
  gr.HTML("""
241
- <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
242
  <h1>GAIA Test Agent</h1>
243
  <p>AI Agent for GAIA Benchmark Evaluation</p>
244
  </div>
245
  """)
246
 
247
  with gr.Tab("Single Question Test"):
248
- test_btn = gr.Button("Test Random Question", variant="primary")
249
- test_output = gr.Textbox(
250
- label="Test Result",
251
- lines=10,
252
- placeholder="Test results will appear here..."
253
- )
254
-
255
- test_btn.click(
256
- fn=test_single_question,
257
- outputs=[test_output]
258
- )
259
 
260
  with gr.Tab("Full Evaluation"):
261
- gr.Markdown("### Run Full GAIA Evaluation")
262
-
263
- username_input = gr.Textbox(
264
- label="Hugging Face Username",
265
- placeholder="Enter your HF username",
266
- info="This will be used for the leaderboard"
267
- )
268
-
269
  eval_btn = gr.Button("Run Full Evaluation", variant="primary")
270
- eval_output = gr.Textbox(
271
- label="Evaluation Results",
272
- lines=15,
273
- placeholder="Evaluation results will appear here..."
274
- )
275
-
276
- eval_btn.click(
277
- fn=run_full_evaluation,
278
- inputs=[username_input],
279
- outputs=[eval_output],
280
- show_progress=True
281
- )
282
 
283
- with gr.Tab("Manual Testing"):
284
- gr.Markdown("### Test Individual Questions")
285
-
286
- manual_input = gr.Textbox(
287
- label="Enter Question",
288
- placeholder="Type a question to test...",
289
- lines=3
290
- )
291
-
292
- manual_btn = gr.Button("Get Answer", variant="primary")
293
- manual_output = gr.Textbox(
294
- label="Answer",
295
- lines=5,
296
- placeholder="Answer will appear here..."
297
- )
298
-
299
- manual_btn.click(
300
- fn=run_agent,
301
- inputs=[manual_input],
302
- outputs=[manual_output]
303
- )
304
 
305
  gr.Markdown("""
306
- ### How to Use:
307
- 1. **Single Question Test**: Test your agent with one random question from GAIA
308
- 2. **Full Evaluation**: Run the complete evaluation and submit to leaderboard
309
- 3. **Manual Testing**: Test your agent with custom questions
310
-
311
- ### Requirements:
312
- - Set your OpenRouter API key in Space secrets as `OPENROUTER_API_KEY`
313
- - Keep your Space public for leaderboard verification
314
- - Your HF username will appear on the leaderboard
315
  """)
316
-
317
  return app
318
 
319
- # Launch the app
320
  if __name__ == "__main__":
321
  app = create_gradio_app()
322
-
323
  if os.getenv("SPACE_ID"):
324
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
325
  else:
326
- app.launch(share=True, show_api=False)
 
7
 
8
  # Your OpenRouter API key
9
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")
 
10
 
11
+ # ✅ Replace with the official GAIA leaderboard API base
12
+ GAIA_API_BASE = "https://gaia-leaderboard.fly.dev"
13
+
14
+ # -----------------------------
15
+ # OpenRouter LLM Wrapper
16
+ # -----------------------------
17
  class OpenRouterLLM:
18
  def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
19
  self.api_key = api_key
 
21
  self.base_url = "https://openrouter.ai/api/v1/chat/completions"
22
 
23
  def __call__(self, prompt: str, max_tokens: int = 1000, temperature: float = 0.1) -> str:
 
 
24
  if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
25
  return "Error: Invalid OpenRouter API key"
26
 
 
39
  "content": """You are a helpful AI assistant designed to answer questions accurately and concisely.
40
  For GAIA evaluation, provide EXACT answers without explanation unless asked.
41
  - For math questions, give just the number
42
+ - For yes/no questions, give just "Yes" or "No"
43
  - For factual questions, give just the fact
44
+ Be precise and direct."""
45
  },
46
+ {"role": "user", "content": prompt}
 
 
 
47
  ],
48
  "temperature": temperature,
49
  "max_tokens": max_tokens,
 
52
 
53
  try:
54
  response = requests.post(
55
+ self.base_url,
56
+ headers=headers,
57
  json=payload,
58
  timeout=30
59
  )
 
62
  return f"API Error: {response.status_code}"
63
 
64
  result = response.json()
 
65
  if "choices" in result and len(result["choices"]) > 0:
66
  return result["choices"][0]["message"]["content"].strip()
67
  else:
68
  return "Error: No response content received"
 
69
  except Exception as e:
70
  return f"Error: {str(e)}"
71
 
72
+
73
+ # -----------------------------
74
+ # GAIA Agent
75
+ # -----------------------------
76
  class GAIAAgent:
77
  def __init__(self, api_key: str):
78
  self.llm = OpenRouterLLM(api_key=api_key)
79
  self.api_key = api_key
80
 
81
  def run_agent(self, prompt: str) -> str:
 
 
 
 
82
  try:
 
83
  enhanced_prompt = f"""
84
  Question: {prompt}
85
+ Analyze this question carefully and provide the exact answer. Do not include explanations, reasoning, or extra text.
 
 
 
 
 
 
 
86
  Your answer:"""
 
87
  response = self.llm(enhanced_prompt, max_tokens=500, temperature=0.1)
88
+ return self.clean_answer(response)
 
 
 
 
89
  except Exception as e:
90
  return f"Error: {str(e)}"
91
 
92
  def clean_answer(self, response: str) -> str:
 
93
  response = response.strip()
 
 
94
  prefixes_to_remove = [
95
+ "Answer:", "The answer is:", "Response:", "Result:",
96
  "Final answer:", "Solution:", "A:", "Answer is:"
97
  ]
 
98
  for prefix in prefixes_to_remove:
99
  if response.lower().startswith(prefix.lower()):
100
  response = response[len(prefix):].strip()
 
 
101
  if response.startswith('"') and response.endswith('"'):
102
  response = response[1:-1]
 
103
  return response
104
 
105
  def get_questions(self) -> List[Dict]:
 
106
  try:
107
+ resp = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
108
+ if resp.status_code == 200:
109
+ return resp.json()
110
+ return []
111
+ except Exception as e:
112
+ print("Error fetching questions:", e)
113
  return []
114
 
115
  def get_random_question(self) -> Dict:
 
116
  try:
117
+ resp = requests.get(f"{GAIA_API_BASE}/random-question", timeout=30)
118
+ if resp.status_code == 200:
119
+ return resp.json()
120
+ return {}
121
+ except Exception as e:
122
+ print("Error fetching random question:", e)
123
  return {}
124
 
125
+ def get_file(self, task_id: str) -> bytes:
126
+ """Download file for a specific task"""
127
+ try:
128
+ resp = requests.get(f"{GAIA_API_BASE}/files/{task_id}", timeout=60)
129
+ if resp.status_code == 200:
130
+ return resp.content
131
+ return None
132
+ except Exception as e:
133
+ print("Error fetching file:", e)
134
+ return None
135
+
136
  def submit_answers(self, username: str, agent_code: str, answers: List[Dict]) -> Dict:
 
137
  try:
138
  payload = {
139
  "username": username,
140
  "agent_code": agent_code,
141
  "answers": answers
142
  }
143
+ resp = requests.post(f"{GAIA_API_BASE}/submit", json=payload, timeout=60)
144
+ if resp.status_code == 200:
145
+ return resp.json()
146
+ return {"error": f"Submission failed: {resp.status_code}"}
 
 
 
 
 
 
 
 
147
  except Exception as e:
148
  return {"error": f"Submission error: {str(e)}"}
149
 
150
+
151
+ # -----------------------------
152
+ # Helper Functions
153
+ # -----------------------------
154
  agent = GAIAAgent(api_key=OPENROUTER_API_KEY)
155
 
156
  def run_agent(prompt: str) -> str:
 
157
  return agent.run_agent(prompt)
158
 
159
  def test_single_question():
 
160
  question = agent.get_random_question()
161
  if question:
162
+ q_text = question.get("Question", "")
163
+ answer = run_agent(q_text)
164
+ return f"Q: {q_text}\nA: {answer}"
165
  return "Failed to get question"
166
 
167
  def run_full_evaluation(username: str, progress=gr.Progress()):
 
168
  if not username:
169
  return "Please provide your Hugging Face username"
 
170
  if not OPENROUTER_API_KEY:
171
  return "Please configure your OpenRouter API key"
172
 
173
+ progress(0.1, desc="Fetching questions...")
 
 
174
  questions = agent.get_questions()
175
  if not questions:
176
  return "Failed to retrieve questions from GAIA API"
177
 
 
 
 
178
  answers = []
179
  for i, question in enumerate(questions):
180
+ progress(0.2 + (0.7 * i / len(questions)), desc=f"Processing {i+1}/{len(questions)}")
 
181
  task_id = question.get("task_id", "")
182
+ q_text = question.get("Question", "")
183
+ if q_text:
184
+ ans = run_agent(q_text)
185
+ answers.append({"task_id": task_id, "submitted_answer": ans})
186
+ time.sleep(0.5) # avoid hammering
 
 
 
 
 
 
187
 
188
  progress(0.9, desc="Submitting answers...")
 
 
189
  agent_code = f"https://huggingface.co/spaces/{username}/Gaia-Test-Agent/tree/main"
190
  result = agent.submit_answers(username, agent_code, answers)
191
 
192
+ progress(1.0, desc="Done")
 
193
  if "error" in result:
194
  return f"Submission failed: {result['error']}"
195
+ score = result.get("score", 0)
196
+ return f"Evaluation complete!\nScore: {score}%\nAnswers submitted: {len(answers)}"
197
+
198
 
199
+ # -----------------------------
200
+ # Gradio App
201
+ # -----------------------------
202
  def create_gradio_app():
203
  with gr.Blocks(title="GAIA Test Agent", theme=gr.themes.Soft()) as app:
 
204
  gr.HTML("""
205
+ <div style="text-align:center;background:linear-gradient(90deg,#667eea,#764ba2);color:white;padding:20px;border-radius:10px;margin-bottom:20px;">
206
  <h1>GAIA Test Agent</h1>
207
  <p>AI Agent for GAIA Benchmark Evaluation</p>
208
  </div>
209
  """)
210
 
211
  with gr.Tab("Single Question Test"):
212
+ btn = gr.Button("Test Random Question", variant="primary")
213
+ out = gr.Textbox(label="Result", lines=8)
214
+ btn.click(fn=test_single_question, outputs=[out])
 
 
 
 
 
 
 
 
215
 
216
  with gr.Tab("Full Evaluation"):
217
+ username = gr.Textbox(label="HF Username", placeholder="Enter your username")
 
 
 
 
 
 
 
218
  eval_btn = gr.Button("Run Full Evaluation", variant="primary")
219
+ eval_out = gr.Textbox(label="Results", lines=12)
220
+ eval_btn.click(fn=run_full_evaluation, inputs=[username], outputs=[eval_out], show_progress=True)
 
 
 
 
 
 
 
 
 
 
221
 
222
+ with gr.Tab("Manual Test"):
223
+ q_in = gr.Textbox(label="Enter Question", lines=3)
224
+ q_btn = gr.Button("Get Answer", variant="primary")
225
+ q_out = gr.Textbox(label="Answer", lines=4)
226
+ q_btn.click(fn=run_agent, inputs=[q_in], outputs=[q_out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
 
228
  gr.Markdown("""
229
+ ### Instructions
230
+ - **Single Question Test**: Try a random GAIA question.
231
+ - **Full Evaluation**: Run through the 20 evaluation questions and submit.
232
+ - **Manual Test**: Try custom questions.
233
+
234
+ Requirements:
235
+ - Set your `OPENROUTER_API_KEY` in Space secrets.
236
+ - Keep your Space public for leaderboard verification.
 
237
  """)
 
238
  return app
239
 
240
+
241
  if __name__ == "__main__":
242
  app = create_gradio_app()
 
243
  if os.getenv("SPACE_ID"):
244
  app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
245
  else:
246
+ app.launch(share=True, show_api=False)