Update JSON export with execution time and per-question correctness flags
Browse files- CHANGELOG.md +67 -0
- app.py +71 -14
CHANGELOG.md
CHANGED
|
@@ -261,6 +261,73 @@
|
|
| 261 |
- ✅ Concurrent execution maintains error isolation
|
| 262 |
- ⏳ Local testing with 3 questions pending
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
### Created Files
|
| 265 |
|
| 266 |
### Deleted Files
|
|
|
|
| 261 |
- ✅ Concurrent execution maintains error isolation
|
| 262 |
- ⏳ Local testing with 3 questions pending
|
| 263 |
|
| 264 |
+
### [PROBLEM: Evaluation Metadata Tracking - Execution Time and Correct Answers]
|
| 265 |
+
|
| 266 |
+
**Problem:** No execution time tracking to verify async performance improvement. JSON export doesn't show which questions were answered correctly, making error analysis difficult.
|
| 267 |
+
|
| 268 |
+
**Modified Files:**
|
| 269 |
+
|
| 270 |
+
- **app.py** (~60 lines added/modified)
|
| 271 |
+
- Added `import time` (line 8) - For execution timing
|
| 272 |
+
- Updated `export_results_to_json()` function signature (lines 38-113)
|
| 273 |
+
- Added `execution_time` parameter (optional float)
|
| 274 |
+
- Added `submission_response` parameter (optional dict with GAIA API response)
|
| 275 |
+
- Extracts correct task_ids from `submission_response["results"]` if available
|
| 276 |
+
- Adds execution time to metadata: `execution_time_seconds` and `execution_time_formatted` (Xm Ys)
|
| 277 |
+
- Adds score info to metadata: `score_percent`, `correct_count`, `total_attempted`
|
| 278 |
+
- Adds a `"correct": true/false/null` flag to each result entry (`null` when no correct task IDs could be extracted from the submission response)
|
| 279 |
+
- Updated `run_and_submit_all()` timing tracking (lines 274-435)
|
| 280 |
+
- Added `start_time = time.time()` at function start (line 275)
|
| 281 |
+
- Added `execution_time = time.time() - start_time` before all returns
|
| 282 |
+
- Logs execution time: "Total execution time: X.XX seconds (Xm Ys)" (line 397)
|
| 283 |
+
- Updated all 6 `export_results_to_json()` calls to pass `execution_time`
|
| 284 |
+
- Successful submission: passes both `execution_time` and `result_data` (line 417)
|
| 285 |
+
- Added correct answer column to results display (lines 399-413)
|
| 286 |
+
- Extracts correct task_ids from `result_data["results"]` if available
|
| 287 |
+
- Adds "Correct?" column to `results_log` with "✅ Yes" or "❌ No"
|
| 288 |
+
- Falls back to a summary message if per-question data is unavailable
|
| 289 |
+
|
| 290 |
+
**Benefits:**
|
| 291 |
+
|
| 292 |
+
- ✅ **Performance verification:** Track actual execution time to confirm async speedup (expect 60-80s vs previous 240s)
|
| 293 |
+
- ✅ **Correct answer identification:** JSON export shows which questions were answered correctly
|
| 294 |
+
- ✅ **Error analysis:** Easy to identify patterns in incorrect answers for debugging
|
| 295 |
+
- ✅ **Progress tracking:** Execution time metadata enables historical performance comparison
|
| 296 |
+
- ✅ **User visibility:** Results table shows "Correct?" column with clear visual indicators (✅/❌)
|
| 297 |
+
|
| 298 |
+
**JSON Export Format:**
|
| 299 |
+
|
| 300 |
+
```json
|
| 301 |
+
{
|
| 302 |
+
"metadata": {
|
| 303 |
+
"generated": "2026-01-04 18:30:00",
|
| 304 |
+
"timestamp": "20260104_183000",
|
| 305 |
+
"total_questions": 20,
|
| 306 |
+
"execution_time_seconds": 78.45,
|
| 307 |
+
"execution_time_formatted": "1m 18s",
|
| 308 |
+
"score_percent": 20.0,
|
| 309 |
+
"correct_count": 4,
|
| 310 |
+
"total_attempted": 20
|
| 311 |
+
},
|
| 312 |
+
"results": [
|
| 313 |
+
{
|
| 314 |
+
"task_id": "abc123",
|
| 315 |
+
"question": "...",
|
| 316 |
+
"submitted_answer": "...",
|
| 317 |
+
"correct": true
|
| 318 |
+
}
|
| 319 |
+
]
|
| 320 |
+
}
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
**Verification:**
|
| 324 |
+
|
| 325 |
+
- ✅ No syntax errors in app.py
|
| 326 |
+
- ✅ Execution time tracking added at function start and all return points
|
| 327 |
+
- ✅ All export_results_to_json calls updated with new parameters
|
| 328 |
+
- ✅ Correct answer parsing from submission response implemented
|
| 329 |
+
- ⏳ Testing with real GAIA submission pending
|
| 330 |
+
|
| 331 |
### Created Files
|
| 332 |
|
| 333 |
### Deleted Files
|
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import inspect
|
|
| 5 |
import pandas as pd
|
| 6 |
import logging
|
| 7 |
import json
|
|
|
|
| 8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
|
| 10 |
# Stage 1: Import GAIAAgent (LangGraph-based agent)
|
|
@@ -35,12 +36,19 @@ def check_api_keys():
|
|
| 35 |
return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
|
| 36 |
|
| 37 |
|
| 38 |
-
def export_results_to_json(results_log: list, submission_status: str
|
|
|
|
| 39 |
"""Export evaluation results to JSON file for easy processing.
|
| 40 |
|
| 41 |
- Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
|
| 42 |
- HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
|
| 43 |
- Format: Clean JSON with full error messages, no truncation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
from datetime import datetime
|
| 46 |
|
|
@@ -58,19 +66,41 @@ def export_results_to_json(results_log: list, submission_status: str) -> str:
|
|
| 58 |
downloads_dir = os.path.expanduser("~/Downloads")
|
| 59 |
filepath = os.path.join(downloads_dir, filename)
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
# Build JSON structure
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
export_data = {
|
| 63 |
-
"metadata":
|
| 64 |
-
"generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 65 |
-
"timestamp": timestamp,
|
| 66 |
-
"total_questions": len(results_log)
|
| 67 |
-
},
|
| 68 |
"submission_status": submission_status,
|
| 69 |
"results": [
|
| 70 |
{
|
| 71 |
"task_id": result.get("Task ID", "N/A"),
|
| 72 |
"question": result.get("Question", "N/A"),
|
| 73 |
-
"submitted_answer": result.get("Submitted Answer", "N/A")
|
|
|
|
| 74 |
}
|
| 75 |
for result in results_log
|
| 76 |
]
|
|
@@ -241,6 +271,9 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
|
|
| 241 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 242 |
and displays the results.
|
| 243 |
"""
|
|
|
|
|
|
|
|
|
|
| 244 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 245 |
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 246 |
|
|
@@ -333,7 +366,8 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
|
|
| 333 |
print("Agent did not produce any answers to submit.")
|
| 334 |
status_message = "Agent did not produce any answers to submit."
|
| 335 |
results_df = pd.DataFrame(results_log)
|
| 336 |
-
|
|
|
|
| 337 |
return status_message, results_df, export_path
|
| 338 |
|
| 339 |
# 4. Prepare Submission
|
|
@@ -359,9 +393,28 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
|
|
| 359 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 360 |
)
|
| 361 |
print("Submission successful.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
results_df = pd.DataFrame(results_log)
|
| 363 |
-
# Export to JSON
|
| 364 |
-
export_path = export_results_to_json(results_log, final_status)
|
| 365 |
return final_status, results_df, export_path
|
| 366 |
except requests.exceptions.HTTPError as e:
|
| 367 |
error_detail = f"Server responded with status {e.response.status_code}."
|
|
@@ -372,26 +425,30 @@ def run_and_submit_all(llm_provider: str, enable_fallback: bool, profile: gr.OAu
|
|
| 372 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 373 |
status_message = f"Submission Failed: {error_detail}"
|
| 374 |
print(status_message)
|
|
|
|
| 375 |
results_df = pd.DataFrame(results_log)
|
| 376 |
-
export_path = export_results_to_json(results_log, status_message)
|
| 377 |
return status_message, results_df, export_path
|
| 378 |
except requests.exceptions.Timeout:
|
| 379 |
status_message = "Submission Failed: The request timed out."
|
| 380 |
print(status_message)
|
|
|
|
| 381 |
results_df = pd.DataFrame(results_log)
|
| 382 |
-
export_path = export_results_to_json(results_log, status_message)
|
| 383 |
return status_message, results_df, export_path
|
| 384 |
except requests.exceptions.RequestException as e:
|
| 385 |
status_message = f"Submission Failed: Network error - {e}"
|
| 386 |
print(status_message)
|
|
|
|
| 387 |
results_df = pd.DataFrame(results_log)
|
| 388 |
-
export_path = export_results_to_json(results_log, status_message)
|
| 389 |
return status_message, results_df, export_path
|
| 390 |
except Exception as e:
|
| 391 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 392 |
print(status_message)
|
|
|
|
| 393 |
results_df = pd.DataFrame(results_log)
|
| 394 |
-
export_path = export_results_to_json(results_log, status_message)
|
| 395 |
return status_message, results_df, export_path
|
| 396 |
|
| 397 |
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
import logging
|
| 7 |
import json
|
| 8 |
+
import time
|
| 9 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 10 |
|
| 11 |
# Stage 1: Import GAIAAgent (LangGraph-based agent)
|
|
|
|
| 36 |
return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
|
| 37 |
|
| 38 |
|
| 39 |
+
def export_results_to_json(results_log: list, submission_status: str, execution_time: float = None,
|
| 40 |
+
submission_response: dict = None) -> str:
|
| 41 |
"""Export evaluation results to JSON file for easy processing.
|
| 42 |
|
| 43 |
- Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
|
| 44 |
- HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
|
| 45 |
- Format: Clean JSON with full error messages, no truncation
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
results_log: List of question results
|
| 49 |
+
submission_status: Status message from submission
|
| 50 |
+
execution_time: Total execution time in seconds
|
| 51 |
+
submission_response: Response from GAIA API with correctness info
|
| 52 |
"""
|
| 53 |
from datetime import datetime
|
| 54 |
|
|
|
|
| 66 |
downloads_dir = os.path.expanduser("~/Downloads")
|
| 67 |
filepath = os.path.join(downloads_dir, filename)
|
| 68 |
|
| 69 |
+
# Extract correctness info from submission response if available
|
| 70 |
+
correct_task_ids = set()
|
| 71 |
+
if submission_response and "results" in submission_response:
|
| 72 |
+
# If API provides per-question results
|
| 73 |
+
for item in submission_response.get("results", []):
|
| 74 |
+
if item.get("correct"):
|
| 75 |
+
correct_task_ids.add(item.get("task_id"))
|
| 76 |
+
|
| 77 |
# Build JSON structure
|
| 78 |
+
metadata = {
|
| 79 |
+
"generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 80 |
+
"timestamp": timestamp,
|
| 81 |
+
"total_questions": len(results_log)
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
# Add execution time if available
|
| 85 |
+
if execution_time is not None:
|
| 86 |
+
metadata["execution_time_seconds"] = round(execution_time, 2)
|
| 87 |
+
metadata["execution_time_formatted"] = f"{int(execution_time // 60)}m {int(execution_time % 60)}s"
|
| 88 |
+
|
| 89 |
+
# Add score info if available
|
| 90 |
+
if submission_response:
|
| 91 |
+
metadata["score_percent"] = submission_response.get("score")
|
| 92 |
+
metadata["correct_count"] = submission_response.get("correct_count")
|
| 93 |
+
metadata["total_attempted"] = submission_response.get("total_attempted")
|
| 94 |
+
|
| 95 |
export_data = {
|
| 96 |
+
"metadata": metadata,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
"submission_status": submission_status,
|
| 98 |
"results": [
|
| 99 |
{
|
| 100 |
"task_id": result.get("Task ID", "N/A"),
|
| 101 |
"question": result.get("Question", "N/A"),
|
| 102 |
+
"submitted_answer": result.get("Submitted Answer", "N/A"),
|
| 103 |
+
"correct": result.get("Task ID") in correct_task_ids if correct_task_ids else None
|
| 104 |
}
|
| 105 |
for result in results_log
|
| 106 |
]
|
|
|
|
| 271 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 272 |
and displays the results.
|
| 273 |
"""
|
| 274 |
+
# Start execution timer
|
| 275 |
+
start_time = time.time()
|
| 276 |
+
|
| 277 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 278 |
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 279 |
|
|
|
|
| 366 |
print("Agent did not produce any answers to submit.")
|
| 367 |
status_message = "Agent did not produce any answers to submit."
|
| 368 |
results_df = pd.DataFrame(results_log)
|
| 369 |
+
execution_time = time.time() - start_time
|
| 370 |
+
export_path = export_results_to_json(results_log, status_message, execution_time)
|
| 371 |
return status_message, results_df, export_path
|
| 372 |
|
| 373 |
# 4. Prepare Submission
|
|
|
|
| 393 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 394 |
)
|
| 395 |
print("Submission successful.")
|
| 396 |
+
execution_time = time.time() - start_time
|
| 397 |
+
logger.info(f"Total execution time: {execution_time:.2f} seconds ({int(execution_time // 60)}m {int(execution_time % 60)}s)")
|
| 398 |
+
|
| 399 |
+
# Extract correct task_ids from result_data if available
|
| 400 |
+
correct_task_ids = set()
|
| 401 |
+
if "results" in result_data:
|
| 402 |
+
for item in result_data.get("results", []):
|
| 403 |
+
if item.get("correct"):
|
| 404 |
+
correct_task_ids.add(item.get("task_id"))
|
| 405 |
+
|
| 406 |
+
# Add "Correct?" column to results
|
| 407 |
+
for result in results_log:
|
| 408 |
+
task_id = result.get("Task ID")
|
| 409 |
+
if correct_task_ids:
|
| 410 |
+
result["Correct?"] = "✅ Yes" if task_id in correct_task_ids else "❌ No"
|
| 411 |
+
else:
|
| 412 |
+
# If no per-question data, show summary info
|
| 413 |
+
result["Correct?"] = f"See summary: {result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct"
|
| 414 |
+
|
| 415 |
results_df = pd.DataFrame(results_log)
|
| 416 |
+
# Export to JSON with execution time and submission response
|
| 417 |
+
export_path = export_results_to_json(results_log, final_status, execution_time, result_data)
|
| 418 |
return final_status, results_df, export_path
|
| 419 |
except requests.exceptions.HTTPError as e:
|
| 420 |
error_detail = f"Server responded with status {e.response.status_code}."
|
|
|
|
| 425 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 426 |
status_message = f"Submission Failed: {error_detail}"
|
| 427 |
print(status_message)
|
| 428 |
+
execution_time = time.time() - start_time
|
| 429 |
results_df = pd.DataFrame(results_log)
|
| 430 |
+
export_path = export_results_to_json(results_log, status_message, execution_time)
|
| 431 |
return status_message, results_df, export_path
|
| 432 |
except requests.exceptions.Timeout:
|
| 433 |
status_message = "Submission Failed: The request timed out."
|
| 434 |
print(status_message)
|
| 435 |
+
execution_time = time.time() - start_time
|
| 436 |
results_df = pd.DataFrame(results_log)
|
| 437 |
+
export_path = export_results_to_json(results_log, status_message, execution_time)
|
| 438 |
return status_message, results_df, export_path
|
| 439 |
except requests.exceptions.RequestException as e:
|
| 440 |
status_message = f"Submission Failed: Network error - {e}"
|
| 441 |
print(status_message)
|
| 442 |
+
execution_time = time.time() - start_time
|
| 443 |
results_df = pd.DataFrame(results_log)
|
| 444 |
+
export_path = export_results_to_json(results_log, status_message, execution_time)
|
| 445 |
return status_message, results_df, export_path
|
| 446 |
except Exception as e:
|
| 447 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 448 |
print(status_message)
|
| 449 |
+
execution_time = time.time() - start_time
|
| 450 |
results_df = pd.DataFrame(results_log)
|
| 451 |
+
export_path = export_results_to_json(results_log, status_message, execution_time)
|
| 452 |
return status_message, results_df, export_path
|
| 453 |
|
| 454 |
|