Update
Browse files
- CHANGELOG.md +5 -3
- app.py +33 -43
- output/gaia_results_20260104_005858.md +35 -0
CHANGELOG.md
CHANGED
|
@@ -25,9 +25,11 @@
|
|
| 25 |
- **app.py**
|
| 26 |
- Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
|
| 27 |
- UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
|
| 28 |
-
- Added `export_results_to_markdown(results_log, submission_status)` - Export evaluation results as Markdown
|
| 29 |
-
- Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.md
|
| 30 |
-
- HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.md
|
|
|
|
|
|
|
| 31 |
- Updated `run_and_submit_all()` - ALL return paths now export results
|
| 32 |
- Added gr.File download button - Users can directly download results (better UX than textbox)
|
| 33 |
- Updated run_button click handler - Now outputs 3 values (status, table, export_path)
|
|
|
|
| 25 |
- **app.py**
|
| 26 |
- Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
|
| 27 |
- UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
|
| 28 |
+
- Added `export_results_to_json(results_log, submission_status)` - Export evaluation results as JSON
|
| 29 |
+
- Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
|
| 30 |
+
- HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json (fixes cloud deployment issue)
|
| 31 |
+
- JSON format: No special char escaping issues, full error messages, easy code processing
|
| 32 |
+
- Pretty formatted with indent=2, ensure_ascii=False for readability
|
| 33 |
- Updated `run_and_submit_all()` - ALL return paths now export results
|
| 34 |
- Added gr.File download button - Users can directly download results (better UX than textbox)
|
| 35 |
- Updated run_button click handler - Now outputs 3 values (status, table, export_path)
|
app.py
CHANGED
|
@@ -34,16 +34,17 @@ def check_api_keys():
|
|
| 34 |
return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
|
| 35 |
|
| 36 |
|
| 37 |
-
def
|
| 38 |
-
"""Export evaluation results to
|
| 39 |
|
| 40 |
-
- Local: Saves to ~/Downloads
|
| 41 |
-
- HF Spaces: Saves to ./exports/
|
|
|
|
| 42 |
"""
|
| 43 |
from datetime import datetime
|
| 44 |
|
| 45 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 46 |
-
filename = f"gaia_results_{timestamp}.
|
| 47 |
|
| 48 |
# Detect environment: HF Spaces or local
|
| 49 |
if os.getenv("SPACE_ID"):
|
|
@@ -56,38 +57,27 @@ def export_results_to_markdown(results_log: list, submission_status: str) -> str
|
|
| 56 |
downloads_dir = os.path.expanduser("~/Downloads")
|
| 57 |
filepath = os.path.join(downloads_dir, filename)
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
f.write("| Task ID | Question | Submitted Answer |\n")
|
| 77 |
-
f.write("|---------|----------|------------------|\n")
|
| 78 |
-
|
| 79 |
-
for result in results_log:
|
| 80 |
-
task_id = result.get("Task ID", "N/A")
|
| 81 |
-
question = result.get("Question", "N/A").replace("\n", " ").replace("|", "\\|")
|
| 82 |
-
answer = result.get("Submitted Answer", "N/A").replace("\n", " ").replace("|", "\\|")
|
| 83 |
-
|
| 84 |
-
# Truncate long text for readability
|
| 85 |
-
if len(question) > 100:
|
| 86 |
-
question = question[:97] + "..."
|
| 87 |
-
if len(answer) > 100:
|
| 88 |
-
answer = answer[:97] + "..."
|
| 89 |
|
| 90 |
-
|
|
|
|
|
|
|
| 91 |
|
| 92 |
logger.info(f"Results exported to: {filepath}")
|
| 93 |
return filepath
|
|
@@ -282,7 +272,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 282 |
print("Agent did not produce any answers to submit.")
|
| 283 |
status_message = "Agent did not produce any answers to submit."
|
| 284 |
results_df = pd.DataFrame(results_log)
|
| 285 |
-
export_path =
|
| 286 |
return status_message, results_df, export_path
|
| 287 |
|
| 288 |
# 4. Prepare Submission
|
|
@@ -309,8 +299,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 309 |
)
|
| 310 |
print("Submission successful.")
|
| 311 |
results_df = pd.DataFrame(results_log)
|
| 312 |
-
# Export to
|
| 313 |
-
export_path =
|
| 314 |
return final_status, results_df, export_path
|
| 315 |
except requests.exceptions.HTTPError as e:
|
| 316 |
error_detail = f"Server responded with status {e.response.status_code}."
|
|
@@ -322,25 +312,25 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 322 |
status_message = f"Submission Failed: {error_detail}"
|
| 323 |
print(status_message)
|
| 324 |
results_df = pd.DataFrame(results_log)
|
| 325 |
-
export_path =
|
| 326 |
return status_message, results_df, export_path
|
| 327 |
except requests.exceptions.Timeout:
|
| 328 |
status_message = "Submission Failed: The request timed out."
|
| 329 |
print(status_message)
|
| 330 |
results_df = pd.DataFrame(results_log)
|
| 331 |
-
export_path =
|
| 332 |
return status_message, results_df, export_path
|
| 333 |
except requests.exceptions.RequestException as e:
|
| 334 |
status_message = f"Submission Failed: Network error - {e}"
|
| 335 |
print(status_message)
|
| 336 |
results_df = pd.DataFrame(results_log)
|
| 337 |
-
export_path =
|
| 338 |
return status_message, results_df, export_path
|
| 339 |
except Exception as e:
|
| 340 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 341 |
print(status_message)
|
| 342 |
results_df = pd.DataFrame(results_log)
|
| 343 |
-
export_path =
|
| 344 |
return status_message, results_df, export_path
|
| 345 |
|
| 346 |
|
|
|
|
| 34 |
return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
|
| 35 |
|
| 36 |
|
| 37 |
+
def export_results_to_json(results_log: list, submission_status: str) -> str:
|
| 38 |
+
"""Export evaluation results to JSON file for easy processing.
|
| 39 |
|
| 40 |
+
- Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
|
| 41 |
+
- HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
|
| 42 |
+
- Format: Clean JSON with full error messages, no truncation
|
| 43 |
"""
|
| 44 |
from datetime import datetime
|
| 45 |
|
| 46 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 47 |
+
filename = f"gaia_results_{timestamp}.json"
|
| 48 |
|
| 49 |
# Detect environment: HF Spaces or local
|
| 50 |
if os.getenv("SPACE_ID"):
|
|
|
|
| 57 |
downloads_dir = os.path.expanduser("~/Downloads")
|
| 58 |
filepath = os.path.join(downloads_dir, filename)
|
| 59 |
|
| 60 |
+
# Build JSON structure
|
| 61 |
+
export_data = {
|
| 62 |
+
"metadata": {
|
| 63 |
+
"generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 64 |
+
"timestamp": timestamp,
|
| 65 |
+
"total_questions": len(results_log)
|
| 66 |
+
},
|
| 67 |
+
"submission_status": submission_status,
|
| 68 |
+
"results": [
|
| 69 |
+
{
|
| 70 |
+
"task_id": result.get("Task ID", "N/A"),
|
| 71 |
+
"question": result.get("Question", "N/A"),
|
| 72 |
+
"submitted_answer": result.get("Submitted Answer", "N/A")
|
| 73 |
+
}
|
| 74 |
+
for result in results_log
|
| 75 |
+
]
|
| 76 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
# Write JSON file with pretty formatting
|
| 79 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 80 |
+
json.dump(export_data, f, indent=2, ensure_ascii=False)
|
| 81 |
|
| 82 |
logger.info(f"Results exported to: {filepath}")
|
| 83 |
return filepath
|
|
|
|
| 272 |
print("Agent did not produce any answers to submit.")
|
| 273 |
status_message = "Agent did not produce any answers to submit."
|
| 274 |
results_df = pd.DataFrame(results_log)
|
| 275 |
+
export_path = export_results_to_json(results_log, status_message)
|
| 276 |
return status_message, results_df, export_path
|
| 277 |
|
| 278 |
# 4. Prepare Submission
|
|
|
|
| 299 |
)
|
| 300 |
print("Submission successful.")
|
| 301 |
results_df = pd.DataFrame(results_log)
|
| 302 |
+
# Export to JSON
|
| 303 |
+
export_path = export_results_to_json(results_log, final_status)
|
| 304 |
return final_status, results_df, export_path
|
| 305 |
except requests.exceptions.HTTPError as e:
|
| 306 |
error_detail = f"Server responded with status {e.response.status_code}."
|
|
|
|
| 312 |
status_message = f"Submission Failed: {error_detail}"
|
| 313 |
print(status_message)
|
| 314 |
results_df = pd.DataFrame(results_log)
|
| 315 |
+
export_path = export_results_to_json(results_log, status_message)
|
| 316 |
return status_message, results_df, export_path
|
| 317 |
except requests.exceptions.Timeout:
|
| 318 |
status_message = "Submission Failed: The request timed out."
|
| 319 |
print(status_message)
|
| 320 |
results_df = pd.DataFrame(results_log)
|
| 321 |
+
export_path = export_results_to_json(results_log, status_message)
|
| 322 |
return status_message, results_df, export_path
|
| 323 |
except requests.exceptions.RequestException as e:
|
| 324 |
status_message = f"Submission Failed: Network error - {e}"
|
| 325 |
print(status_message)
|
| 326 |
results_df = pd.DataFrame(results_log)
|
| 327 |
+
export_path = export_results_to_json(results_log, status_message)
|
| 328 |
return status_message, results_df, export_path
|
| 329 |
except Exception as e:
|
| 330 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 331 |
print(status_message)
|
| 332 |
results_df = pd.DataFrame(results_log)
|
| 333 |
+
export_path = export_results_to_json(results_log, status_message)
|
| 334 |
return status_message, results_df, export_path
|
| 335 |
|
| 336 |
|
output/gaia_results_20260104_005858.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GAIA Agent Evaluation Results
|
| 2 |
+
|
| 3 |
+
**Generated:** 2026-01-04 00:58:58
|
| 4 |
+
|
| 5 |
+
## Submission Status
|
| 6 |
+
|
| 7 |
+
Submission Successful!
|
| 8 |
+
User: mangoobee
|
| 9 |
+
Overall Score: 5.0% (1/20 correct)
|
| 10 |
+
Message: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
|
| 11 |
+
|
| 12 |
+
## Questions and Answers
|
| 13 |
+
|
| 14 |
+
| Task ID | Question | Submitted Answer |
|
| 15 |
+
|---------|----------|------------------|
|
| 16 |
+
| 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can ... | Unable to answer |
|
| 17 |
+
| a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird spec... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
|
| 18 |
+
| 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI | ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword ... |
|
| 19 |
+
| cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the correct next mov... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
|
| 20 |
+
| 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted i... | FunkMonk |
|
| 21 |
+
| 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e} \|*\|a\|b\|c\|d\|e\| \|---\|---\|---\... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
|
| 22 |
+
| 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec. What does Teal'c say in respon... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
|
| 23 |
+
| cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry mate... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|
| 24 |
+
| 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler w... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|
| 25 |
+
| 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need fo... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
|
| 26 |
+
| 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play i... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
|
| 27 |
+
| f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|
| 28 |
+
| 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season have that same sea... | 589 |
|
| 29 |
+
| 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study fo... | ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type:... |
|
| 30 |
+
| 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This art... | Unable to answer |
|
| 31 |
+
| bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually... | Unable to answer |
|
| 32 |
+
| cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|
| 33 |
+
| a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|
| 34 |
+
| 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food chain. What were t... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
|
| 35 |
+
| 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Century (after 1977)... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
|