mangubee committed on
Commit
ff5bca5
·
1 Parent(s): 94965d6

Update JSON export with execution time and correctness flags

Browse files
Files changed (2) hide show
  1. CHANGELOG.md +67 -0
  2. app.py +71 -14
CHANGELOG.md CHANGED
@@ -261,6 +261,73 @@
261
  - ✅ Concurrent execution maintains error isolation
262
  - ⏳ Local testing with 3 questions pending
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  ### Created Files
265
 
266
  ### Deleted Files
 
261
  - ✅ Concurrent execution maintains error isolation
262
  - ⏳ Local testing with 3 questions pending
263
 
264
+ ### [PROBLEM: Evaluation Metadata Tracking - Execution Time and Correct Answers]
265
+
266
+ **Problem:** No execution time tracking to verify async performance improvement. JSON export doesn't show which questions were answered correctly, making error analysis difficult.
267
+
268
+ **Modified Files:**
269
+
270
+ - **app.py** (~60 lines added/modified)
271
+ - Added `import time` (line 8) - For execution timing
272
+ - Updated `export_results_to_json()` function signature (lines 38-113)
273
+ - Added `execution_time` parameter (optional float)
274
+ - Added `submission_response` parameter (optional dict with GAIA API response)
275
+ - Extracts correct task_ids from `submission_response["results"]` if available
276
+ - Adds execution time to metadata: `execution_time_seconds` and `execution_time_formatted` (Xm Ys)
277
+ - Adds score info to metadata: `score_percent`, `correct_count`, `total_attempted`
278
+ - Adds `"correct": true/false/null` flag to each result entry
279
+ - Updated `run_and_submit_all()` timing tracking (lines 274-452)
280
+ - Added `start_time = time.time()` at function start (line 275)
281
+ - Added `execution_time = time.time() - start_time` before all returns
282
+ - Logs execution time: "Total execution time: X.XX seconds (Xm Ys)" (line 397)
283
+ - Updated all 6 `export_results_to_json()` calls to pass `execution_time`
284
+ - Successful submission: passes both `execution_time` and `result_data` (line 417)
285
+ - Added correct answer column to results display (lines 399-413)
286
+ - Extracts correct task_ids from `result_data["results"]` if available
287
+ - Adds "Correct?" column to `results_log` with "✅ Yes" or "❌ No"
288
+ - Falls back to summary message if per-question data unavailable
289
+
290
+ **Benefits:**
291
+
292
+ - ✅ **Performance verification:** Track actual execution time to confirm async speedup (expect 60-80s vs previous 240s)
293
+ - ✅ **Correct answer identification:** JSON export shows which questions were answered correctly
294
+ - ✅ **Error analysis:** Easy to identify patterns in incorrect answers for debugging
295
+ - ✅ **Progress tracking:** Execution time metadata enables historical performance comparison
296
+ - ✅ **User visibility:** Results table shows "Correct?" column with clear visual indicators (✅/❌)
297
+
298
+ **JSON Export Format:**
299
+
300
+ ```json
301
+ {
302
+ "metadata": {
303
+ "generated": "2026-01-04 18:30:00",
304
+ "timestamp": "20260104_183000",
305
+ "total_questions": 20,
306
+ "execution_time_seconds": 78.45,
307
+ "execution_time_formatted": "1m 18s",
308
+ "score_percent": 20.0,
309
+ "correct_count": 4,
310
+ "total_attempted": 20
311
+ },
312
+ "results": [
313
+ {
314
+ "task_id": "abc123",
315
+ "question": "...",
316
+ "submitted_answer": "...",
317
+ "correct": true
318
+ }
319
+ ]
320
+ }
321
+ ```
322
+
323
+ **Verification:**
324
+
325
+ - ✅ No syntax errors in app.py
326
+ - ✅ Execution time tracking added at function start and all return points
327
+ - ✅ All export_results_to_json calls updated with new parameters
328
+ - ✅ Correct answer parsing from submission response implemented
329
+ - ⏳ Testing with real GAIA submission pending
330
+
331
  ### Created Files
332
 
333
  ### Deleted Files
app.py CHANGED
@@ -5,6 +5,7 @@ import inspect
5
  import pandas as pd
6
  import logging
7
  import json
 
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
 
10
  # Stage 1: Import GAIAAgent (LangGraph-based agent)
@@ -35,12 +36,19 @@ def check_api_keys():
35
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
36
 
37
 
38
- def export_results_to_json(results_log: list, submission_status: str) -> str:
 
39
  """Export evaluation results to JSON file for easy processing.
40
 
41
  - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
42
  - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
43
  - Format: Clean JSON with full error messages, no truncation
 
 
 
 
 
 
44
  """
45
  from datetime import datetime
46
 
@@ -58,19 +66,41 @@ def export_results_to_json(results_log: list, submission_status: str) -> str:
58
  downloads_dir = os.path.expanduser("~/Downloads")
59
  filepath = os.path.join(downloads_dir, filename)
60
 
 
 
 
 
 
 
 
 
61
  # Build JSON structure
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  export_data = {
63
- "metadata": {
64
- "generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
65
- "timestamp": timestamp,
66
- "total_questions": len(results_log)
67
- },
68
  "submission_status": submission_status,
69
  "results": [
70
  {
71
  "task_id": result.get("Task ID", "N/A"),
72
  "question": result.get("Question", "N/A"),
73
- "submitted_answer": result.get("Submitted Answer", "N/A")
 
74
  }
75
  for result in results_log
76
  ]
@@ -241,6 +271,9 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
241
  Fetches all questions, runs the BasicAgent on them, submits all answers,
242
  and displays the results.
243
  """
 
 
 
244
  # --- Determine HF Space Runtime URL and Repo URL ---
245
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
246
 
@@ -333,7 +366,8 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
333
  print("Agent did not produce any answers to submit.")
334
  status_message = "Agent did not produce any answers to submit."
335
  results_df = pd.DataFrame(results_log)
336
- export_path = export_results_to_json(results_log, status_message)
 
337
  return status_message, results_df, export_path
338
 
339
  # 4. Prepare Submission
@@ -359,9 +393,28 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
359
  f"Message: {result_data.get('message', 'No message received.')}"
360
  )
361
  print("Submission successful.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  results_df = pd.DataFrame(results_log)
363
- # Export to JSON
364
- export_path = export_results_to_json(results_log, final_status)
365
  return final_status, results_df, export_path
366
  except requests.exceptions.HTTPError as e:
367
  error_detail = f"Server responded with status {e.response.status_code}."
@@ -372,26 +425,30 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
372
  error_detail += f" Response: {e.response.text[:500]}"
373
  status_message = f"Submission Failed: {error_detail}"
374
  print(status_message)
 
375
  results_df = pd.DataFrame(results_log)
376
- export_path = export_results_to_json(results_log, status_message)
377
  return status_message, results_df, export_path
378
  except requests.exceptions.Timeout:
379
  status_message = "Submission Failed: The request timed out."
380
  print(status_message)
 
381
  results_df = pd.DataFrame(results_log)
382
- export_path = export_results_to_json(results_log, status_message)
383
  return status_message, results_df, export_path
384
  except requests.exceptions.RequestException as e:
385
  status_message = f"Submission Failed: Network error - {e}"
386
  print(status_message)
 
387
  results_df = pd.DataFrame(results_log)
388
- export_path = export_results_to_json(results_log, status_message)
389
  return status_message, results_df, export_path
390
  except Exception as e:
391
  status_message = f"An unexpected error occurred during submission: {e}"
392
  print(status_message)
 
393
  results_df = pd.DataFrame(results_log)
394
- export_path = export_results_to_json(results_log, status_message)
395
  return status_message, results_df, export_path
396
 
397
 
 
5
  import pandas as pd
6
  import logging
7
  import json
8
+ import time
9
  from concurrent.futures import ThreadPoolExecutor, as_completed
10
 
11
  # Stage 1: Import GAIAAgent (LangGraph-based agent)
 
36
  return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
37
 
38
 
39
+ def export_results_to_json(results_log: list, submission_status: str, execution_time: float = None,
40
+ submission_response: dict = None) -> str:
41
  """Export evaluation results to JSON file for easy processing.
42
 
43
  - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
44
  - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
45
  - Format: Clean JSON with full error messages, no truncation
46
+
47
+ Args:
48
+ results_log: List of question results
49
+ submission_status: Status message from submission
50
+ execution_time: Total execution time in seconds
51
+ submission_response: Response from GAIA API with correctness info
52
  """
53
  from datetime import datetime
54
 
 
66
  downloads_dir = os.path.expanduser("~/Downloads")
67
  filepath = os.path.join(downloads_dir, filename)
68
 
69
+ # Extract correctness info from submission response if available
70
+ correct_task_ids = set()
71
+ if submission_response and "results" in submission_response:
72
+ # If API provides per-question results
73
+ for item in submission_response.get("results", []):
74
+ if item.get("correct"):
75
+ correct_task_ids.add(item.get("task_id"))
76
+
77
  # Build JSON structure
78
+ metadata = {
79
+ "generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
80
+ "timestamp": timestamp,
81
+ "total_questions": len(results_log)
82
+ }
83
+
84
+ # Add execution time if available
85
+ if execution_time is not None:
86
+ metadata["execution_time_seconds"] = round(execution_time, 2)
87
+ metadata["execution_time_formatted"] = f"{int(execution_time // 60)}m {int(execution_time % 60)}s"
88
+
89
+ # Add score info if available
90
+ if submission_response:
91
+ metadata["score_percent"] = submission_response.get("score")
92
+ metadata["correct_count"] = submission_response.get("correct_count")
93
+ metadata["total_attempted"] = submission_response.get("total_attempted")
94
+
95
  export_data = {
96
+ "metadata": metadata,
 
 
 
 
97
  "submission_status": submission_status,
98
  "results": [
99
  {
100
  "task_id": result.get("Task ID", "N/A"),
101
  "question": result.get("Question", "N/A"),
102
+ "submitted_answer": result.get("Submitted Answer", "N/A"),
103
+ "correct": result.get("Task ID") in correct_task_ids if correct_task_ids else None
104
  }
105
  for result in results_log
106
  ]
 
271
  Fetches all questions, runs the BasicAgent on them, submits all answers,
272
  and displays the results.
273
  """
274
+ # Start execution timer
275
+ start_time = time.time()
276
+
277
  # --- Determine HF Space Runtime URL and Repo URL ---
278
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
279
 
 
366
  print("Agent did not produce any answers to submit.")
367
  status_message = "Agent did not produce any answers to submit."
368
  results_df = pd.DataFrame(results_log)
369
+ execution_time = time.time() - start_time
370
+ export_path = export_results_to_json(results_log, status_message, execution_time)
371
  return status_message, results_df, export_path
372
 
373
  # 4. Prepare Submission
 
393
  f"Message: {result_data.get('message', 'No message received.')}"
394
  )
395
  print("Submission successful.")
396
+ execution_time = time.time() - start_time
397
+ logger.info(f"Total execution time: {execution_time:.2f} seconds ({int(execution_time // 60)}m {int(execution_time % 60)}s)")
398
+
399
+ # Extract correct task_ids from result_data if available
400
+ correct_task_ids = set()
401
+ if "results" in result_data:
402
+ for item in result_data.get("results", []):
403
+ if item.get("correct"):
404
+ correct_task_ids.add(item.get("task_id"))
405
+
406
+ # Add "Correct?" column to results
407
+ for result in results_log:
408
+ task_id = result.get("Task ID")
409
+ if correct_task_ids:
410
+ result["Correct?"] = "✅ Yes" if task_id in correct_task_ids else "❌ No"
411
+ else:
412
+ # If no per-question data, show summary info
413
+ result["Correct?"] = f"See summary: {result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct"
414
+
415
  results_df = pd.DataFrame(results_log)
416
+ # Export to JSON with execution time and submission response
417
+ export_path = export_results_to_json(results_log, final_status, execution_time, result_data)
418
  return final_status, results_df, export_path
419
  except requests.exceptions.HTTPError as e:
420
  error_detail = f"Server responded with status {e.response.status_code}."
 
425
  error_detail += f" Response: {e.response.text[:500]}"
426
  status_message = f"Submission Failed: {error_detail}"
427
  print(status_message)
428
+ execution_time = time.time() - start_time
429
  results_df = pd.DataFrame(results_log)
430
+ export_path = export_results_to_json(results_log, status_message, execution_time)
431
  return status_message, results_df, export_path
432
  except requests.exceptions.Timeout:
433
  status_message = "Submission Failed: The request timed out."
434
  print(status_message)
435
+ execution_time = time.time() - start_time
436
  results_df = pd.DataFrame(results_log)
437
+ export_path = export_results_to_json(results_log, status_message, execution_time)
438
  return status_message, results_df, export_path
439
  except requests.exceptions.RequestException as e:
440
  status_message = f"Submission Failed: Network error - {e}"
441
  print(status_message)
442
+ execution_time = time.time() - start_time
443
  results_df = pd.DataFrame(results_log)
444
+ export_path = export_results_to_json(results_log, status_message, execution_time)
445
  return status_message, results_df, export_path
446
  except Exception as e:
447
  status_message = f"An unexpected error occurred during submission: {e}"
448
  print(status_message)
449
+ execution_time = time.time() - start_time
450
  results_df = pd.DataFrame(results_log)
451
+ export_path = export_results_to_json(results_log, status_message, execution_time)
452
  return status_message, results_df, export_path
453
 
454