lethaq committed on
Commit
c0961ba
·
verified ·
1 Parent(s): 8021035

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -379
app.py CHANGED
@@ -1,444 +1,128 @@
1
- """Enhanced Agent Evaluation Runner with improved capabilities"""
 
2
  import os
3
- import re
4
  import time
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
8
- import google.generativeai as genai
9
  from dotenv import load_dotenv
10
- from urllib.parse import urlparse, parse_qs
11
- import json
12
- from agent import Agent
13
- agent = Agent()
14
 
15
-
16
-
17
-
18
- # Load environment variables
19
  load_dotenv()
20
 
21
- # Configure Gemini
22
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
23
-
24
- # Constants
25
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
26
 
27
- class EnhancedAgent:
28
- """An enhanced agent using Google Gemini with improved capabilities."""
29
-
30
- def __init__(self):
31
- print("EnhancedAgent initialized.")
32
- # Prefer gemini-2.0-flash; fall back to gemini-1.5-pro if it cannot be created
33
- try:
34
- self.model = genai.GenerativeModel('gemini-2.0-flash')
35
- except:
36
- self.model = genai.GenerativeModel('gemini-1.5-pro')
37
-
38
- # Rate limiting
39
- self.last_request_time = 0
40
- self.min_request_interval = 1.0 # 1 second between requests
41
-
42
- def _rate_limit(self):
43
- """Simple rate limiting to avoid quota issues."""
44
- current_time = time.time()
45
- time_since_last = current_time - self.last_request_time
46
- if time_since_last < self.min_request_interval:
47
- time.sleep(self.min_request_interval - time_since_last)
48
- self.last_request_time = time.time()
49
-
50
- def _extract_youtube_info(self, question: str) -> str:
51
- """Extract information about YouTube videos mentioned in questions."""
52
- youtube_patterns = [
53
- r'youtube\.com/watch\?v=([a-zA-Z0-9_-]+)',
54
- r'youtu\.be/([a-zA-Z0-9_-]+)'
55
- ]
56
-
57
- for pattern in youtube_patterns:
58
- match = re.search(pattern, question)
59
- if match:
60
- video_id = match.group(1)
61
- return f"YouTube video ID: {video_id}. Note: Cannot access video content directly, but can make educated guesses based on context."
62
- return ""
63
-
64
- def _analyze_question_type(self, question: str) -> str:
65
- """Analyze the type of question and provide specific guidance."""
66
- question_lower = question.lower()
67
-
68
- # Different question types and their handling strategies
69
- if any(word in question_lower for word in ['youtube', 'video', 'watch']):
70
- return "VIDEO_ANALYSIS"
71
- elif any(word in question_lower for word in ['excel', 'spreadsheet', 'file', 'csv']):
72
- return "FILE_ANALYSIS"
73
- elif any(word in question_lower for word in ['how many', 'count', 'number of']):
74
- return "COUNTING"
75
- elif any(word in question_lower for word in ['who', 'what', 'where', 'when']):
76
- return "FACTUAL"
77
- elif any(word in question_lower for word in ['calculate', 'compute', 'math']):
78
- return "CALCULATION"
79
- elif any(word in question_lower for word in ['list', 'name', 'identify']):
80
- return "LIST"
81
- else:
82
- return "GENERAL"
83
-
84
- def _get_enhanced_prompt(self, question: str, question_type: str) -> str:
85
- """Generate an enhanced system prompt based on question type."""
86
-
87
- base_prompt = """You are an expert assistant with broad knowledge across many domains including:
88
- - Music, entertainment, and media
89
- - Sports statistics and history
90
- - Science and mathematics
91
- - Geography and world facts
92
- - Technology and computing
93
- - Literature and culture
94
-
95
- CRITICAL INSTRUCTIONS:
96
- 1. Always provide your best educated guess even if you're not 100% certain
97
- 2. For numerical answers, provide ONLY the number (no commas, currency symbols, or units unless specified)
98
- 3. For names/words, provide the exact spelling
99
- 4. For lists, use comma-separated format
100
- 5. End with: FINAL ANSWER: [your concise answer]
101
-
102
- """
103
-
104
- if question_type == "VIDEO_ANALYSIS":
105
- base_prompt += """
106
- For video-related questions:
107
- - If you cannot access the video content, make educated guesses based on:
108
- - Video title/URL context
109
- - Common knowledge about the topic
110
- - Typical content patterns
111
- - Provide your best estimate rather than saying "cannot access"
112
- """
113
- elif question_type == "FILE_ANALYSIS":
114
- base_prompt += """
115
- For file-related questions:
116
- - If you cannot access files directly, make reasonable assumptions
117
- - Use general knowledge about typical data in such contexts
118
- - Provide educated estimates based on the question context
119
- """
120
- elif question_type == "COUNTING":
121
- base_prompt += """
122
- For counting questions:
123
- - Provide specific numbers when possible
124
- - If exact count unknown, provide reasonable estimates
125
- - Consider historical data and typical ranges
126
- """
127
- elif question_type == "FACTUAL":
128
- base_prompt += """
129
- For factual questions:
130
- - Use your knowledge base to provide accurate information
131
- - If multiple possibilities exist, choose the most likely one
132
- - Be specific with names, dates, and details
133
- """
134
-
135
- return base_prompt
136
-
137
- def _make_api_call_with_retry(self, prompt: str, max_retries: int = 3) -> str:
138
- """Make API call with retry logic and error handling."""
139
-
140
- for attempt in range(max_retries):
141
- try:
142
- self._rate_limit() # Apply rate limiting
143
-
144
- # Generate response using Gemini
145
- response = self.model.generate_content(
146
- prompt,
147
- generation_config=genai.types.GenerationConfig(
148
- temperature=0.1, # Lower temperature for more consistent answers
149
- max_output_tokens=1000,
150
- )
151
- )
152
-
153
- if response.text:
154
- return response.text
155
- else:
156
- raise Exception("Empty response from API")
157
-
158
- except Exception as e:
159
- error_msg = str(e).lower()
160
-
161
- if "quota" in error_msg or "429" in error_msg:
162
- if attempt < max_retries - 1:
163
- wait_time = (2 ** attempt) * 5 # Exponential backoff
164
- print(f"Quota exceeded, waiting {wait_time} seconds...")
165
- time.sleep(wait_time)
166
- continue
167
- else:
168
- return "Error: API quota exceeded"
169
- elif "safety" in error_msg:
170
- return "Error: Content safety filter triggered"
171
- else:
172
- if attempt < max_retries - 1:
173
- time.sleep(2) # Wait before retry
174
- continue
175
- else:
176
- return f"Error: {str(e)}"
177
-
178
- return "Error: Max retries exceeded"
179
-
180
- def __call__(self, question: str) -> str:
181
- """Process a question and return an answer."""
182
- print(f"Agent processing: {question[:100]}...")
183
-
184
- # Analyze question type
185
- question_type = self._analyze_question_type(question)
186
- print(f"Question type identified: {question_type}")
187
-
188
- # Extract additional context
189
- youtube_info = self._extract_youtube_info(question)
190
-
191
- # Build enhanced prompt
192
- system_prompt = self._get_enhanced_prompt(question, question_type)
193
-
194
- # Add context if available
195
- context = ""
196
- if youtube_info:
197
- context += f"\nContext: {youtube_info}\n"
198
-
199
- # Combine everything
200
- full_prompt = f"{system_prompt}\n{context}\nQuestion: {question}\n\nProvide your best answer:"
201
-
202
- # Make API call with retry
203
- response = self._make_api_call_with_retry(full_prompt)
204
-
205
- # Extract final answer
206
- return self._extract_final_answer(response, question_type)
207
-
208
- def _extract_final_answer(self, response: str, question_type: str) -> str:
209
- """Extract the final answer from the response."""
210
- if response.startswith("Error:"):
211
- return response
212
-
213
- # Look for FINAL ANSWER: pattern
214
- final_answer_match = re.search(r'FINAL ANSWER:\s*(.+?)(?:\n|$)', response, re.IGNORECASE)
215
- if final_answer_match:
216
- answer = final_answer_match.group(1).strip()
217
- return self._clean_answer(answer, question_type)
218
-
219
- # Fallback: extract from end of response
220
- lines = response.strip().split('\n')
221
- for line in reversed(lines):
222
- line = line.strip()
223
- if line and len(line) < 200: # Reasonable answer length
224
- return self._clean_answer(line, question_type)
225
-
226
- # Last resort: return first part of response
227
- return self._clean_answer(response[:100], question_type)
228
-
229
- def _clean_answer(self, answer: str, question_type: str) -> str:
230
- """Clean and format the final answer."""
231
- answer = answer.strip()
232
-
233
- # Remove common prefixes
234
- prefixes_to_remove = [
235
- "the answer is", "answer:", "final answer:",
236
- "result:", "solution:", "therefore",
237
- "in conclusion", "to summarize"
238
- ]
239
-
240
- for prefix in prefixes_to_remove:
241
- if answer.lower().startswith(prefix):
242
- answer = answer[len(prefix):].strip()
243
-
244
- # Clean punctuation from the end
245
- answer = answer.rstrip('.,;:!')
246
-
247
- # For counting questions, ensure we return just the number
248
- if question_type == "COUNTING":
249
- number_match = re.search(r'\b(\d+(?:,\d{3})*(?:\.\d+)?)\b', answer)
250
- if number_match:
251
- return number_match.group(1).replace(',', '')
252
-
253
- return answer
254
-
255
  def run_and_submit_all(profile: gr.OAuthProfile | None):
256
- agent = Agent()
257
  """
258
- Fetches all questions, runs the EnhancedAgent on them, submits all answers,
259
  and displays the results.
260
  """
261
- # Check if user is logged in
262
- if profile:
263
- username = f"{profile.username}"
264
- print(f"User logged in: {username}")
265
- else:
266
- print("User not logged in.")
267
  return "Please Login to Hugging Face with the button.", None
 
268
 
269
- # Get space info
270
- space_id = os.getenv("SPACE_ID")
271
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Unknown"
272
-
273
- api_url = DEFAULT_API_URL
274
- questions_url = f"{api_url}/questions"
275
- submit_url = f"{api_url}/submit"
276
 
277
- # 1. Initialize Agent
278
- try:
279
- agent = EnhancedAgent()
280
- except Exception as e:
281
- print(f"Error initializing agent: {e}")
282
- return f"Error initializing agent: {e}", None
283
 
284
- # 2. Fetch Questions
285
- print(f"Fetching questions from: {questions_url}")
286
  try:
287
- response = requests.get(questions_url, timeout=15)
288
- response.raise_for_status()
289
- questions_data = response.json()
290
-
291
  if not questions_data:
292
  return "No questions received from server.", None
293
-
294
- print(f"Fetched {len(questions_data)} questions.")
295
-
296
  except Exception as e:
297
- print(f"Error fetching questions: {e}")
298
  return f"Error fetching questions: {e}", None
299
- submitted_answer = agent(question_text)
300
 
301
- # 3. Process Questions
302
- results_log = []
303
  answers_payload = []
304
-
305
- print(f"Processing {len(questions_data)} questions...")
306
-
307
- for i, item in enumerate(questions_data):
308
- task_id = item.get("task_id")
309
  question_text = item.get("question")
310
-
311
  if not task_id or question_text is None:
312
- print(f"Skipping invalid item: {item}")
313
  continue
314
-
315
- print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
316
-
317
  try:
318
- # Get answer from agent
319
  submitted_answer = agent(question_text)
320
-
321
- # Store results
322
  answers_payload.append({
323
- "task_id": task_id,
324
  "submitted_answer": submitted_answer
325
  })
326
-
327
  results_log.append({
328
  "Task ID": task_id,
329
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
330
  "Submitted Answer": submitted_answer
331
  })
332
-
333
- # Small delay between questions to avoid rate limiting
334
  time.sleep(0.5)
335
-
336
  except Exception as e:
337
- error_msg = f"ERROR: {str(e)}"
338
- print(f"Error processing task {task_id}: {e}")
339
-
340
  results_log.append({
341
  "Task ID": task_id,
342
- "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
343
- "Submitted Answer": error_msg
344
  })
345
 
346
  if not answers_payload:
347
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
348
 
349
- # 4. Submit Results
350
  submission_data = {
351
  "username": username.strip(),
352
  "agent_code": agent_code,
353
  "answers": answers_payload
354
  }
355
-
356
- print(f"Submitting {len(answers_payload)} answers...")
357
-
358
  try:
359
- response = requests.post(submit_url, json=submission_data, timeout=60)
360
- response.raise_for_status()
361
- result_data = response.json()
362
-
363
- # Format success message
364
- final_status = (
365
  f"✅ Submission Successful!\n"
366
- f"User: {result_data.get('username')}\n"
367
- f"Score: {result_data.get('score', 'N/A')}% "
368
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
369
- f"Message: {result_data.get('message', 'No additional message.')}"
370
  )
371
-
372
- print("Submission successful!")
373
- results_df = pd.DataFrame(results_log)
374
- return final_status, results_df
375
-
376
  except Exception as e:
377
- error_msg = f"❌ Submission Failed: {str(e)}"
378
- print(error_msg)
379
- results_df = pd.DataFrame(results_log)
380
- return error_msg, results_df
381
 
382
- # Build Gradio Interface
383
- with gr.Blocks(title="Enhanced Agent Evaluation") as demo:
384
- gr.Markdown("# Enhanced Agent Evaluation Runner")
385
  gr.Markdown("""
386
  **Instructions:**
387
- 1. Make sure you have set up your `GOOGLE_API_KEY` in the environment variables
388
- 2. Log in to your Hugging Face account using the button below
389
- 3. Click 'Run Evaluation & Submit All Answers' to start the evaluation
390
-
391
- **Enhanced Features:**
392
- - Improved question analysis and categorization
393
- - Better handling of different question types
394
- - Rate limiting to avoid API quota issues
395
- - Retry logic for failed requests
396
- - Enhanced prompting for better accuracy
397
  """)
398
-
399
  gr.LoginButton()
400
-
401
- run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
402
-
403
- status_output = gr.Textbox(
404
- label="Status / Results",
405
- lines=6,
406
- interactive=False
407
- )
408
-
409
- results_table = gr.DataFrame(
410
- label="Questions and Answers",
411
- wrap=True
412
- )
413
 
414
- run_button.click(
415
- fn=run_and_submit_all,
416
- outputs=[status_output, results_table]
417
- )
 
418
 
419
  if __name__ == "__main__":
420
- print("=" * 50)
421
- print("🚀 Starting Enhanced Agent Evaluation Runner")
422
- print("=" * 50)
423
-
424
- # Check environment variables
425
- if not os.getenv("GOOGLE_API_KEY"):
426
- print("⚠️ WARNING: GOOGLE_API_KEY not found in environment variables!")
427
- print(" Please set your Google API key to use Gemini.")
428
- else:
429
- print("✅ GOOGLE_API_KEY found")
430
-
431
- space_host = os.getenv("SPACE_HOST")
432
- space_id = os.getenv("SPACE_ID")
433
-
434
- if space_host:
435
- print(f"✅ Running on Hugging Face Space")
436
- print(f" URL: https://{space_host}.hf.space")
437
-
438
- if space_id:
439
- print(f"✅ Space ID: {space_id}")
440
-
441
- print("=" * 50)
442
-
443
  demo.launch(debug=True, share=False)
444
-
 
1
+ ```python
2
+ """Enhanced Agent Evaluation Runner with simplified Agent integration"""
3
  import os
 
4
  import time
5
  import gradio as gr
6
  import requests
7
  import pandas as pd
 
8
  from dotenv import load_dotenv
9
+ from agent import Agent # 引入你自己写的简易 agent.py
 
 
 
10
 
11
+ # 加载 .env 中的 GOOGLE_API_KEY(agent.py 会使用)
 
 
 
12
  load_dotenv()
13
 
14
+ # 常量
 
 
 
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def run_and_submit_all(profile: gr.OAuthProfile | None):
 
18
  """
19
+ Fetches all questions, runs the Agent on them, submits all answers,
20
  and displays the results.
21
  """
22
+ # 登录检查
23
+ if not profile:
 
 
 
 
24
  return "Please Login to Hugging Face with the button.", None
25
+ username = profile.username
26
 
27
+ # 初始化你的简易 Agent
28
+ agent = Agent()
 
 
 
 
 
29
 
30
+ # 组装提交相关 URL
31
+ space_id = os.getenv("SPACE_ID")
32
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Unknown"
33
+ questions_url = f"{DEFAULT_API_URL}/questions"
34
+ submit_url = f"{DEFAULT_API_URL}/submit"
 
35
 
36
+ # 1. 拉取题目
 
37
  try:
38
+ resp = requests.get(questions_url, timeout=20)
39
+ resp.raise_for_status()
40
+ questions_data = resp.json()
 
41
  if not questions_data:
42
  return "No questions received from server.", None
 
 
 
43
  except Exception as e:
 
44
  return f"Error fetching questions: {e}", None
 
45
 
46
+ # 2. 遍历题目并调用 Agent 获取答案
47
+ results_log = []
48
  answers_payload = []
49
+
50
+ for item in questions_data:
51
+ task_id = item.get("task_id")
 
 
52
  question_text = item.get("question")
 
53
  if not task_id or question_text is None:
 
54
  continue
55
+
 
 
56
  try:
57
+ # 调用你在 agent.py 中定义的 Agent
58
  submitted_answer = agent(question_text)
59
+
 
60
  answers_payload.append({
61
+ "task_id": task_id,
62
  "submitted_answer": submitted_answer
63
  })
 
64
  results_log.append({
65
  "Task ID": task_id,
66
+ "Question": question_text,
67
  "Submitted Answer": submitted_answer
68
  })
69
+
70
+ # 避免 API 速率限制
71
  time.sleep(0.5)
 
72
  except Exception as e:
73
+ err = f"ERROR: {e}"
 
 
74
  results_log.append({
75
  "Task ID": task_id,
76
+ "Question": question_text,
77
+ "Submitted Answer": err
78
  })
79
 
80
  if not answers_payload:
81
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
82
 
83
+ # 3. 提交答案
84
  submission_data = {
85
  "username": username.strip(),
86
  "agent_code": agent_code,
87
  "answers": answers_payload
88
  }
 
 
 
89
  try:
90
+ post = requests.post(submit_url, json=submission_data, timeout=60)
91
+ post.raise_for_status()
92
+ data = post.json()
93
+ status = (
 
 
94
  f"✅ Submission Successful!\n"
95
+ f"User: {data.get('username')}\n"
96
+ f"Score: {data.get('score','N/A')}% "
97
+ f"({data.get('correct_count','?')}/{data.get('total_attempted','?')})\n"
98
+ f"Message: {data.get('message','No additional message.')}"
99
  )
100
+ return status, pd.DataFrame(results_log)
 
 
 
 
101
  except Exception as e:
102
+ return f"❌ Submission Failed: {e}", pd.DataFrame(results_log)
 
 
 
103
 
104
+ # --- Gradio 界面 ---
105
+ with gr.Blocks(title="Simplified GAIA Agent Evaluation") as demo:
106
+ gr.Markdown("# Simplified GAIA Agent Evaluation Runner")
107
  gr.Markdown("""
108
  **Instructions:**
109
+ 1. Set your `GOOGLE_API_KEY` in the environment variables.
110
+ 2. Log in to your Hugging Face account using the button below.
111
+ 3. Click **Run Evaluation & Submit All Answers** to start.
112
+
113
+ This runner uses:
114
+ - A custom `agent.py` for answering GAIA questions.
115
+ - Gradio for UI.
116
+ - HTTP requests to fetch & submit answers.
 
 
117
  """)
 
118
  gr.LoginButton()
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ run_btn = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
121
+ status_out = gr.Textbox(label="Status / Results", lines=6, interactive=False)
122
+ table_out = gr.DataFrame(label="Questions and Answers", wrap=True)
123
+
124
+ run_btn.click(fn=run_and_submit_all, outputs=[status_out, table_out])
125
 
126
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  demo.launch(debug=True, share=False)
128
+ ```