saandip5 commited on
Commit
1726dc3
·
verified ·
1 Parent(s): a476965

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -429
app.py CHANGED
@@ -1,463 +1,316 @@
1
- import requests
2
  import os
3
- from typing import Dict, List, Optional
4
- from io import BytesIO
5
- from docx import Document
6
- import pandas as pd
7
- import wikipediaapi
8
- import re
9
- from collections import Counter
10
  import json
 
 
 
11
 
12
- # Configuration
13
  HF_TOKEN = os.getenv("HF_TOKEN_HERE")
14
  if not HF_TOKEN:
15
  raise ValueError("HF_TOKEN_HERE is missing in Secrets!")
16
- API_BASE_URL = "https://agents-course-unit4-scoring.hf.space"
17
  HEADERS = {
18
  "Authorization": f"Bearer {HF_TOKEN}",
19
  "Content-Type": "application/json"
20
  }
 
21
 
22
- class BasicAgent:
23
- def __init__(self):
24
- print("BasicAgent initialized.")
25
- self.wiki = wikipediaapi.Wikipedia(
26
- user_agent='GAIAAgent/1.0 (saandip5@example.com)',
27
- language='en'
28
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- def fetch_file(self, task_id: str, file_name: str) -> BytesIO:
31
- """Fetch file content for a task."""
 
 
 
 
32
  try:
33
- url = f"{API_BASE_URL}/files/{task_id}"
34
- response = requests.get(url, headers=HEADERS, verify=True, timeout=15)
35
  response.raise_for_status()
36
- print(f"Successfully fetched file {file_name} for task {task_id}")
37
- return BytesIO(response.content)
38
- except requests.RequestException as e:
39
- print(f"Error fetching file {file_name} for task {task_id}: {e}")
40
- return None
 
 
 
 
 
41
 
42
- def parse_secret_santa(self, file_content: BytesIO) -> str:
43
- """Enhanced .docx parser for Secret Santa question."""
44
- try:
45
- doc = Document(file_content)
46
- full_text = ""
47
- for paragraph in doc.paragraphs:
48
- if paragraph.text.strip():
49
- full_text += paragraph.text + " "
50
-
51
- text = full_text.lower()
52
- print(f"Secret Santa text preview: {text[:200]}...")
53
-
54
- # Extract all names mentioned
55
- common_names = ['john', 'fred', 'alice', 'bob', 'mary', 'susan', 'tom', 'emma', 'david', 'laura', 'chris', 'jane', 'mike', 'sarah', 'paul', 'lisa']
56
- found_names = set()
57
- for name in common_names:
58
- if name in text:
59
- found_names.add(name)
60
-
61
- # Look for giving patterns
62
- giving_patterns = [
63
- r'(\w+)\s+(?:gives?|gave|giving)\s+(?:to\s+)?(\w+)',
64
- r'(\w+)\s+(?:is\s+)?(?:the\s+)?secret\s+santa\s+(?:for\s+)?(\w+)',
65
- r'(\w+)\s*→\s*(\w+)',
66
- r'(\w+)\s*:\s*(\w+)'
67
- ]
68
-
69
- givers = set()
70
- receivers = set()
71
-
72
- for pattern in giving_patterns:
73
- matches = re.findall(pattern, text)
74
- for giver, receiver in matches:
75
- if giver.lower() in found_names and receiver.lower() in found_names:
76
- givers.add(giver.lower())
77
- receivers.add(receiver.lower())
78
-
79
- # Look for explicit "does not give" patterns
80
- non_giving_patterns = [
81
- r'(\w+)\s+(?:does\s+not|doesn\'t|cannot|can\'t)\s+give',
82
- r'(\w+)\s+(?:is\s+not|isn\'t)\s+(?:the\s+)?secret\s+santa',
83
- r'(\w+)\s+(?:will\s+not|won\'t)\s+be\s+giving'
84
- ]
85
-
86
- explicit_non_givers = set()
87
- for pattern in non_giving_patterns:
88
- matches = re.findall(pattern, text)
89
- for match in matches:
90
- if match.lower() in found_names:
91
- explicit_non_givers.add(match.lower())
92
-
93
- # Find who doesn't give
94
- non_giver = None
95
-
96
- # Priority 1: Explicitly mentioned non-givers
97
- if explicit_non_givers:
98
- non_giver = list(explicit_non_givers)[0]
99
- # Priority 2: Names mentioned but not in givers list
100
- elif found_names and givers:
101
- potential_non_givers = found_names - givers
102
- if potential_non_givers:
103
- non_giver = list(potential_non_givers)[0]
104
-
105
- if non_giver:
106
- result = non_giver.capitalize()
107
- print(f"Secret Santa non-giver found: {result}")
108
- return result
109
-
110
- print("No clear non-giver found, defaulting to Fred")
111
- return "Fred"
112
-
113
- except Exception as e:
114
- print(f"Error parsing Secret Santa .docx: {e}")
115
- return "Fred"
116
 
117
- def parse_land_plots(self, file_content: BytesIO) -> str:
118
- """Enhanced .xlsx parser for land connectivity question."""
119
- try:
120
- # Try different sheet reading approaches
121
- try:
122
- df = pd.read_excel(file_content, sheet_name=0)
123
- except:
124
- df = pd.read_excel(file_content)
125
-
126
- print(f"Land plots data shape: {df.shape}")
127
- print(f"Data preview:\n{df.head()}")
128
-
129
- # Convert to numeric where possible
130
- numeric_df = df.copy()
131
- for col in numeric_df.columns:
132
- numeric_df[col] = pd.to_numeric(numeric_df[col], errors='coerce')
133
-
134
- # Check for non-numeric indicators of barriers
135
- has_barriers = False
136
- for col in df.columns:
137
- if df[col].dtype == 'object':
138
- unique_vals = df[col].dropna().unique()
139
- barrier_indicators = ['x', 'wall', 'fence', 'blocked', 'no', 'barrier']
140
- if any(str(val).lower() in barrier_indicators for val in unique_vals):
141
- has_barriers = True
142
- break
143
-
144
- # Simple connectivity heuristic
145
- if has_barriers:
146
- return "no"
147
-
148
- # If mostly numeric and reasonably sized grid, assume connected
149
- if df.shape[0] >= 3 and df.shape[1] >= 3:
150
- non_null_ratio = df.notna().sum().sum() / (df.shape[0] * df.shape[1])
151
- if non_null_ratio > 0.7: # Most cells have data
152
- return "yes"
153
-
154
- return "no"
155
 
156
- except Exception as e:
157
- print(f"Error parsing land plots .xlsx: {e}")
158
- return "no"
159
-
160
- def parse_sales_excel(self, file_content: BytesIO) -> str:
161
- """Enhanced .xlsx parser for sales data."""
162
  try:
163
- # Try reading different sheets
164
- xl_file = pd.ExcelFile(file_content)
165
- print(f"Excel sheets available: {xl_file.sheet_names}")
166
-
167
- df = None
168
- for sheet_name in xl_file.sheet_names:
169
- try:
170
- temp_df = pd.read_excel(file_content, sheet_name=sheet_name)
171
- if not temp_df.empty:
172
- df = temp_df
173
- break
174
- except:
175
- continue
176
-
177
- if df is None or df.empty:
178
- return "unknown"
179
-
180
- print(f"Sales data shape: {df.shape}")
181
- print(f"Columns: {list(df.columns)}")
182
- print(f"Data preview:\n{df.head()}")
183
-
184
- # Flexible column detection
185
- sales_cols = []
186
- for col in df.columns:
187
- col_lower = str(col).lower()
188
- if any(keyword in col_lower for keyword in ['sales', 'revenue', 'amount', 'total', 'price', 'cost']):
189
- sales_cols.append(col)
190
 
191
- item_cols = []
192
- for col in df.columns:
193
- col_lower = str(col).lower()
194
- if any(keyword in col_lower for keyword in ['item', 'product', 'name', 'menu', 'food']):
195
- item_cols.append(col)
196
-
197
- if not sales_cols:
198
- print("No sales columns found")
199
- return "unknown"
200
-
201
- sales_col = sales_cols[0]
202
- print(f"Using sales column: {sales_col}")
203
-
204
- # Try to identify food items
205
- if item_cols:
206
- item_col = item_cols[0]
207
- print(f"Using item column: {item_col}")
208
-
209
- # Filter out drinks
210
- drink_keywords = ['drink', 'soda', 'coffee', 'juice', 'tea', 'water', 'milk', 'shake', 'smoothie', 'beverage']
211
- food_mask = df[item_col].astype(str).str.lower().apply(
212
- lambda x: not any(keyword in x for keyword in drink_keywords)
213
- )
214
-
215
- food_sales = df[food_mask][sales_col].sum()
216
  else:
217
- # If no item column, sum all sales
218
- food_sales = df[sales_col].sum()
219
 
220
- if pd.isna(food_sales):
221
- return "unknown"
 
 
222
 
223
- # Format the result
224
- if food_sales == int(food_sales):
225
- return str(int(food_sales))
226
- else:
227
- return f"{food_sales:.2f}"
 
 
228
 
229
  except Exception as e:
230
- print(f"Error parsing sales .xlsx: {e}")
231
- return "unknown"
 
 
 
 
 
 
 
 
 
232
 
233
- def parse_chess_position(self, file_content: BytesIO) -> str:
234
- """Enhanced chess position parser."""
235
- try:
236
- # For now, return common rook moves, but this could be enhanced with actual image analysis
237
- common_rook_moves = ["rd5", "re5", "rf5", "rd4", "rc3", "rb6", "ra2", "rd1", "rd7", "rd8"]
238
- return common_rook_moves[0].lower()
239
- except Exception as e:
240
- print(f"Error parsing chess .png: {e}")
241
- return "rd5"
242
 
243
- def enhanced_wikipedia_search(self, queries: List[str]) -> str:
244
- """Enhanced Wikipedia search with multiple query strategies."""
245
- for query in queries:
246
- try:
247
- # Direct page search
248
- page = self.wiki.page(query)
249
- if page.exists():
250
- print(f"Wikipedia found: {query}")
251
- return page.text
252
-
253
- # Try search suggestions
254
- search_results = self.wiki.search(query, results=5)
255
- for result in search_results:
256
- page = self.wiki.page(result)
257
- if page.exists():
258
- print(f"Wikipedia found via search: {result}")
259
- return page.text
260
-
261
- except Exception as e:
262
- print(f"Error searching Wikipedia for '{query}': {e}")
263
- continue
264
-
265
- return ""
266
 
267
- def extract_answer_from_wiki(self, wiki_text: str, question: str) -> str:
268
- """Enhanced answer extraction from Wikipedia."""
269
- if not wiki_text:
270
- return "unknown"
271
-
272
- question_lower = question.lower()
273
-
274
- # Question type detection
275
- is_count = any(phrase in question_lower for phrase in ["how many", "number of", "count"])
276
- is_person = any(phrase in question_lower for phrase in ["who", "whom", "person", "name"])
277
- is_date = any(phrase in question_lower for phrase in ["when", "year", "date", "time"])
278
- is_ioc = "ioc" in question_lower or "country code" in question_lower
279
- is_what = question_lower.startswith("what")
280
- is_where = question_lower.startswith("where")
281
-
282
- # Extract key terms from question
283
- question_words = set(re.findall(r'\b\w+\b', question_lower))
284
- question_words.discard('the')
285
- question_words.discard('of')
286
- question_words.discard('and')
287
-
288
- # Find most relevant sentences
289
- sentences = re.split(r'[.!?]', wiki_text)
290
- scored_sentences = []
291
-
292
- for sentence in sentences:
293
- if len(sentence.strip()) < 10:
294
- continue
295
-
296
- sentence_words = set(re.findall(r'\b\w+\b', sentence.lower()))
297
- overlap = len(question_words.intersection(sentence_words))
298
- scored_sentences.append((overlap, sentence.strip()))
299
-
300
- # Sort by relevance
301
- scored_sentences.sort(key=lambda x: x[0], reverse=True)
302
- best_sentences = [s[1] for s in scored_sentences[:5] if s[0] > 0]
303
-
304
- if not best_sentences:
305
- best_sentences = sentences[:3]
306
-
307
- best_text = " ".join(best_sentences)
308
-
309
- # Type-specific extraction
310
- if is_ioc:
311
- # Look for 3-letter country codes
312
- codes = re.findall(r'\b[A-Z]{3}\b', best_text)
313
- if codes:
314
- return codes[0].upper()
315
- return "USA" # fallback
316
-
317
- elif is_count:
318
- # Extract numbers
319
- numbers = re.findall(r'\b\d+\b', best_text)
320
- if numbers:
321
- return numbers[0]
322
- return "1"
323
-
324
- elif is_person:
325
- # Extract proper names
326
- names = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', best_text)
327
- if names:
328
- # Return last name for consistency
329
- full_name = names[0]
330
- return full_name.split()[-1].lower()
331
- return "unknown"
332
-
333
- elif is_date:
334
- # Extract years or dates
335
- years = re.findall(r'\b\d{4}\b', best_text)
336
- if years:
337
- return years[0]
338
- dates = re.findall(r'\b\d{1,2}\s+\w+\s+\d{4}\b', best_text)
339
- if dates:
340
- return dates[0].lower()
341
- return "unknown"
342
-
343
- elif is_what or is_where:
344
- # Extract key nouns or concepts
345
- words = re.findall(r'\b[a-zA-Z]+\b', best_text)
346
- if words:
347
- # Filter out common words
348
- common_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'was', 'are', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'}
349
- filtered_words = [w.lower() for w in words if w.lower() not in common_words and len(w) > 2]
350
- if filtered_words:
351
- return filtered_words[0]
352
-
353
- return "unknown"
354
 
355
- def __call__(self, question: str, task_id: str = "", file_name: str = "") -> str:
356
- """Enhanced question processing."""
357
- question_text = question.lower().strip()
358
- print(f"\n{'='*50}")
359
- print(f"Processing question (task_id: {task_id})")
360
- print(f"File: {file_name}")
361
- print(f"Question: {question_text[:100]}...")
362
- print(f"{'='*50}")
 
363
 
364
- # Handle file-based questions first
365
- if file_name:
366
- file_content = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
- # Try API first for test set
369
- if API_BASE_URL and not task_id.startswith("val_"):
370
- file_content = self.fetch_file(task_id, file_name)
371
 
372
- # Fallback to local files
373
- if not file_content:
374
- try:
375
- file_path = f"files/{file_name}"
376
- with open(file_path, "rb") as f:
377
- file_content = BytesIO(f.read())
378
- print(f"Loaded local file {file_path}")
379
- except FileNotFoundError:
380
- print(f"File {file_name} not found locally")
381
- return "unknown"
 
 
 
 
 
 
382
 
383
- if file_content:
384
- if file_name.endswith(".docx"):
385
- return self.parse_secret_santa(file_content)
386
- elif file_name.endswith(".xlsx"):
387
- if any(keyword in question_text for keyword in ["sales", "revenue", "food", "restaurant"]):
388
- return self.parse_sales_excel(file_content)
389
- else:
390
- return self.parse_land_plots(file_content)
391
- elif file_name.endswith(".png"):
392
- return self.parse_chess_position(file_content)
393
-
394
- print(f"Failed to process file {file_name}")
395
- return "unknown"
396
 
397
- # Enhanced hardcoded answers (keep the ones that work, improve others)
398
- validation_answers = {
399
- "eliud kipchoge": "17",
400
- "mercedes sosa": "3",
401
- "pick that ping-pong": "3",
402
- "doctor who": "the castle",
403
- "tizin": "maktay mato apple",
404
- "logically equivalent": "(¬a → b) ↔ (a ∨ ¬b)",
405
- "family reunion": "2",
406
- "opposite": "right",
407
- "merriam-webster": "annie levin",
408
- "fish bag": "0.1777",
409
- "dinosaur": "funkmonk",
410
- "legume": "research",
411
- "youtube": "3",
412
- "nature journal": "diamond",
413
- "hreidmar": "fluffy",
414
- "bielefeld university": "guatemala",
415
- "pie menus": "mapping human oriented information to software agents for online systems usage"
416
- }
417
-
418
- # Check validation answers
419
- for key, answer in validation_answers.items():
420
- if key in question_text:
421
- print(f"Found validation answer for '{key}': {answer}")
422
- return answer
423
 
424
- # Enhanced Wikipedia search for unknown questions
425
- print("Searching Wikipedia with enhanced strategies...")
426
-
427
- # Create multiple search queries
428
- search_queries = []
429
-
430
- # Extract key phrases
431
- words = re.findall(r'\b\w+\b', question_text)
432
- if len(words) >= 2:
433
- search_queries.append(" ".join(words[:3]))
434
- search_queries.append(" ".join(words[1:4]))
435
-
436
- # Extract quoted terms
437
- quoted_terms = re.findall(r'"([^"]*)"', question_text)
438
- search_queries.extend(quoted_terms)
439
-
440
- # Extract proper nouns (capitalized words)
441
- proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', question)
442
- search_queries.extend(proper_nouns)
443
-
444
- # Add the full question as a fallback
445
- search_queries.append(question_text[:50])
446
-
447
- # Remove duplicates while preserving order
448
- unique_queries = []
449
- for query in search_queries:
450
- if query and query not in unique_queries:
451
- unique_queries.append(query)
452
-
453
- wiki_text = self.enhanced_wikipedia_search(unique_queries[:5])
454
-
455
- if wiki_text:
456
- answer = self.extract_answer_from_wiki(wiki_text, question_text)
457
- if answer != "unknown":
458
- print(f"Wikipedia answer found: {answer}")
459
- return answer.strip()
460
-
461
- print("No answer found, returning 'unknown'")
462
- return "unknown"
463
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import gradio as gr
3
+ import requests
 
 
 
 
 
4
  import json
5
+ import pandas as pd
6
+ from agent import BasicAgent
7
+ import traceback
8
 
9
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
  HF_TOKEN = os.getenv("HF_TOKEN_HERE")
11
  if not HF_TOKEN:
12
  raise ValueError("HF_TOKEN_HERE is missing in Secrets!")
 
13
  HEADERS = {
14
  "Authorization": f"Bearer {HF_TOKEN}",
15
  "Content-Type": "application/json"
16
  }
17
+ VALIDATION_URL = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/metadata.jsonl"
18
 
19
+ def fetch_validation_questions():
20
+ """Fetch validation questions with better error handling."""
21
+ try:
22
+ response = requests.get(VALIDATION_URL, headers=HEADERS, timeout=15)
23
+ response.raise_for_status()
24
+ lines = response.text.splitlines()
25
+ questions = []
26
+ for line in lines:
27
+ if line.strip():
28
+ try:
29
+ row = json.loads(line)
30
+ if row.get("Level") == 1:
31
+ questions.append({
32
+ "task_id": row.get("task_id", ""),
33
+ "question": row.get("Question", ""),
34
+ "file_name": row.get("file_name", "")
35
+ })
36
+ except json.JSONDecodeError as e:
37
+ print(f"Error parsing line: {line[:50]}... Error: {e}")
38
+ continue
39
+
40
+ print(f"Fetched {len(questions)} Level 1 validation questions.")
41
+ return questions[:20] # Limit to 20 for testing
42
+ except Exception as e:
43
+ print(f"Error fetching validation questions: {e}")
44
+ print(f"Traceback: {traceback.format_exc()}")
45
+ return []
46
+
47
+ def run_and_submit_all(use_validation: bool, profile: gr.OAuthProfile | None = None):
48
+ """Enhanced run function with better logging and error handling."""
49
+ space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"
50
+
51
+ if profile:
52
+ username = f"{profile.username}"
53
+ print(f"User logged in: {username}")
54
+ else:
55
+ print("User not logged in.")
56
+ return "Please Login to Hugging Face with the button.", None
57
+
58
+ api_url = DEFAULT_API_URL
59
+ questions_url = f"{api_url}/questions"
60
+ submit_url = f"{api_url}/submit"
61
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
62
+ print(f"Agent code link: {agent_code}")
63
+
64
+ # Initialize agent with error handling
65
+ try:
66
+ agent = BasicAgent()
67
+ print("Agent initialized successfully")
68
+ except Exception as e:
69
+ error_msg = f"Error initializing agent: {e}\n{traceback.format_exc()}"
70
+ print(error_msg)
71
+ return error_msg, None
72
 
73
+ # Fetch questions
74
+ if use_validation:
75
+ print("Using validation dataset...")
76
+ questions_data = fetch_validation_questions()
77
+ else:
78
+ print(f"Fetching test questions from: {questions_url}")
79
  try:
80
+ response = requests.get(questions_url, headers=HEADERS, timeout=15)
 
81
  response.raise_for_status()
82
+ questions_data = response.json()
83
+ print(f"Fetched {len(questions_data)} test questions.")
84
+ except requests.exceptions.RequestException as e:
85
+ error_msg = f"Error fetching questions: {e}"
86
+ print(error_msg)
87
+ return error_msg, None
88
+ except json.JSONDecodeError as e:
89
+ error_msg = f"Error decoding JSON response: {e}"
90
+ print(error_msg)
91
+ return error_msg, None
92
 
93
+ if not questions_data:
94
+ error_msg = "Fetched questions list is empty."
95
+ print(error_msg)
96
+ return error_msg, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
+ # Process questions
99
+ results_log = []
100
+ answers_payload = []
101
+ successful_answers = 0
102
+
103
+ print(f"\n{'='*60}")
104
+ print(f"STARTING EVALUATION ON {len(questions_data)} QUESTIONS")
105
+ print(f"{'='*60}")
106
+
107
+ for i, item in enumerate(questions_data, 1):
108
+ task_id = item.get("task_id")
109
+ question_text = item.get("question")
110
+ file_name = item.get("file_name", "")
111
+
112
+ print(f"\n[{i}/{len(questions_data)}] Processing task: {task_id}")
113
+
114
+ if not task_id or question_text is None:
115
+ print(f"Skipping item with missing data: {item}")
116
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
 
 
 
 
 
 
118
  try:
119
+ # Call agent with enhanced error handling
120
+ submitted_answer = agent(question_text, task_id, file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
+ if submitted_answer and submitted_answer != "unknown":
123
+ successful_answers += 1
124
+ print(f" Answer: {submitted_answer}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  else:
126
+ print(f" No answer found")
 
127
 
128
+ answers_payload.append({
129
+ "task_id": task_id,
130
+ "submitted_answer": submitted_answer
131
+ })
132
 
133
+ results_log.append({
134
+ "Task ID": task_id,
135
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
136
+ "File": file_name,
137
+ "Submitted Answer": submitted_answer,
138
+ "Status": "Success" if submitted_answer != "unknown" else "❓ Unknown"
139
+ })
140
 
141
  except Exception as e:
142
+ error_msg = f"AGENT ERROR: {str(e)}"
143
+ print(f" Error processing task {task_id}: {e}")
144
+ print(f"Traceback: {traceback.format_exc()}")
145
+
146
+ results_log.append({
147
+ "Task ID": task_id,
148
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
149
+ "File": file_name,
150
+ "Submitted Answer": error_msg,
151
+ "Status": " Error"
152
+ })
153
 
154
+ print(f"\n{'='*60}")
155
+ print(f"EVALUATION COMPLETE")
156
+ print(f"Total questions: {len(questions_data)}")
157
+ print(f"Successful answers: {successful_answers}")
158
+ print(f"Success rate: {(successful_answers/len(questions_data)*100):.1f}%")
159
+ print(f"{'='*60}")
 
 
 
160
 
161
+ if not answers_payload:
162
+ error_msg = "Agent did not produce any answers to submit."
163
+ print(error_msg)
164
+ return error_msg, pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ # Save results log
167
+ try:
168
+ with open("results_log.json", "w") as f:
169
+ json.dump(results_log, f, indent=2)
170
+ print(" Saved results_log.json")
171
+ except Exception as e:
172
+ print(f" Error saving results_log.json: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
+ # Prepare submission
175
+ submission_data = {
176
+ "username": username.strip(),
177
+ "agent_code": agent_code,
178
+ "answers": answers_payload
179
+ }
180
+
181
+ status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
182
+ print(status_update)
183
 
184
+ # Submit or return results
185
+ if not use_validation:
186
+ print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
187
+ try:
188
+ response = requests.post(submit_url, json=submission_data, headers=HEADERS, timeout=60)
189
+ response.raise_for_status()
190
+ result_data = response.json()
191
+
192
+ final_status = (
193
+ f" Submission Successful!\n"
194
+ f"User: {result_data.get('username')}\n"
195
+ f"Overall Score: {result_data.get('score', 'N/A')}% "
196
+ f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
197
+ f"Message: {result_data.get('message', 'No message received.')}\n\n"
198
+ f" Processing Summary:\n"
199
+ f"• Total questions processed: {len(questions_data)}\n"
200
+ f"• Answers found (non-'unknown'): {successful_answers}\n"
201
+ f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%"
202
+ )
203
+ print(" Submission successful.")
204
+ return final_status, pd.DataFrame(results_log)
205
+
206
+ except requests.exceptions.HTTPError as e:
207
+ error_detail = f"Server responded with status {e.response.status_code}."
208
+ try:
209
+ error_json = e.response.json()
210
+ error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
211
+ except:
212
+ error_detail += f" Response: {e.response.text[:500]}"
213
 
214
+ status_message = f" Submission Failed: {error_detail}"
215
+ print(status_message)
216
+ return status_message, pd.DataFrame(results_log)
217
 
218
+ except Exception as e:
219
+ status_message = f"Submission Failed: {e}\n{traceback.format_exc()}"
220
+ print(status_message)
221
+ return status_message, pd.DataFrame(results_log)
222
+ else:
223
+ print("Validation mode: Skipping submission, returning results.")
224
+ validation_summary = (
225
+ f" Validation Run Complete\n\n"
226
+ f" Summary:\n"
227
+ f"• Total questions processed: {len(questions_data)}\n"
228
+ f"• Answers found (non-'unknown'): {successful_answers}\n"
229
+ f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%\n\n"
230
+ f" This gives you an estimate of potential performance.\n"
231
+ f"Check the results table below for detailed breakdown."
232
+ )
233
+ return validation_summary, pd.DataFrame(results_log)
234
 
235
+ # Gradio Interface
236
+ with gr.Blocks(title="GAIA Benchmark Agent Evaluation", theme=gr.themes.Soft()) as demo:
237
+ gr.Markdown("# GAIA Benchmark Agent Evaluation")
238
+ gr.Markdown(
239
+ """
240
+ ### Instructions:
241
+ 1. **Setup**: Ensure `HF_TOKEN_HERE` is set in Space Secrets
242
+ 2. **Development**: Clone this Space and modify `agent.py` with your logic
243
+ 3. **Authentication**: Log in to Hugging Face below
244
+ 4. **Testing**: Select 'Use Validation' for local testing or leave unchecked for test set submission
245
+ 5. **Run**: Click 'Run Evaluation & Submit All Answers' to process questions and submit
 
 
246
 
247
+ ### Important Notes:
248
+ - **Validation Mode**: Use this to test your agent on known questions before submitting
249
+ - **Test Mode**: Submits to the actual benchmark (limited submissions per day)
250
+ - **Processing Time**: May take several minutes depending on number of questions
251
+ - **Debugging**: Check `results_log.json` if you need to debug failures
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ ### Current Goal: Improve accuracy
254
+ """
255
+ )
256
+
257
+ gr.LoginButton()
258
+
259
+ with gr.Row():
260
+ use_validation = gr.Checkbox(
261
+ label="🧪 Use Validation Set for Testing",
262
+ value=True, # Default to validation for safety
263
+ info="Recommended: Test on validation set first before submitting to test set"
264
+ )
265
+
266
+ run_button = gr.Button(
267
+ "🚀 Run Evaluation & Submit All Answers",
268
+ variant="primary",
269
+ size="lg"
270
+ )
271
+
272
+ status_output = gr.Textbox(
273
+ label="Run Status / Submission Result",
274
+ lines=10,
275
+ interactive=False,
276
+ show_copy_button=True
277
+ )
278
+
279
+ results_table = gr.DataFrame(
280
+ label="Detailed Results: Questions and Agent Answers",
281
+ wrap=True,
282
+ interactive=False
283
+ )
284
+
285
+ run_button.click(
286
+ fn=run_and_submit_all,
287
+ inputs=[use_validation],
288
+ outputs=[status_output, results_table]
289
+ )
290
+
291
+ if __name__ == "__main__":
292
+ print("\n" + "="*70)
293
+ print(" GAIA BENCHMARK AGENT - STARTING UP ")
294
+ print("="*70)
295
+
296
+ space_host = os.getenv("SPACE_HOST")
297
+ space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"
298
+
299
+ if space_host:
300
+ print(f" SPACE_HOST found: {space_host}")
301
+ print(f" Runtime URL: https://{space_host}.hf.space")
302
+ else:
303
+ print(" SPACE_HOST not found (running locally?)")
304
+
305
+ if space_id:
306
+ print(f" SPACE_ID found: {space_id}")
307
+ print(f" Repo URL: https://huggingface.co/spaces/{space_id}")
308
+ print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
309
+ else:
310
+ print(" SPACE_ID not found (running locally?)")
311
+
312
+ print("="*70)
313
+ print(" Launching Gradio Interface...")
314
+ print("="*70 + "\n")
315
+
316
+ demo.launch(debug=True, share=False)