agentbee

Sleeping

App Files Files Community

mangubee committed on Jan 4

Commit

9edb481

1 Parent(s): 2fc4228

Update

Browse files

Files changed (3) hide show

CHANGELOG.md +5 -3
app.py +33 -43
output/gaia_results_20260104_005858.md +35 -0

CHANGELOG.md CHANGED Viewed

@@ -25,9 +25,11 @@
 - **app.py**
   - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
   - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
-  - Added `export_results_to_markdown(results_log, submission_status)` - Export evaluation results with environment detection
-    - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.md
-    - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.md (fixes cloud deployment issue)
   - Updated `run_and_submit_all()` - ALL return paths now export results
   - Added gr.File download button - Users can directly download results (better UX than textbox)
   - Updated run_button click handler - Now outputs 3 values (status, table, export_path)

 - **app.py**
   - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
   - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
+  - Added `export_results_to_json(results_log, submission_status)` - Export evaluation results as JSON
+    - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
+    - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json (fixes cloud deployment issue)
+    - JSON format: No special char escaping issues, full error messages, easy code processing
+    - Pretty formatted with indent=2, ensure_ascii=False for readability
   - Updated `run_and_submit_all()` - ALL return paths now export results
   - Added gr.File download button - Users can directly download results (better UX than textbox)
   - Updated run_button click handler - Now outputs 3 values (status, table, export_path)

app.py CHANGED Viewed

@@ -34,16 +34,17 @@ def check_api_keys():
     return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
-def export_results_to_markdown(results_log: list, submission_status: str) -> str:
-    """Export evaluation results to markdown file.
-    - Local: Saves to ~/Downloads
-    - HF Spaces: Saves to ./exports/ (for Gradio file download)
     """
     from datetime import datetime
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    filename = f"gaia_results_{timestamp}.md"
     # Detect environment: HF Spaces or local
     if os.getenv("SPACE_ID"):
@@ -56,38 +57,27 @@ def export_results_to_markdown(results_log: list, submission_status: str) -> str
         downloads_dir = os.path.expanduser("~/Downloads")
         filepath = os.path.join(downloads_dir, filename)
-    with open(filepath, 'w') as f:
-        # Header
-        f.write("# GAIA Agent Evaluation Results\n\n")
-        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
-        # Submission status
-        f.write("## Submission Status\n\n")
-        f.write(f"{submission_status}\n\n")
-        # Results table
-        f.write("## Questions and Answers\n\n")
-        if not results_log:
-            f.write("*No results available*\n")
-            return filepath
-        # Create markdown table
-        f.write("| Task ID | Question | Submitted Answer |\n")
-        f.write("|---------|----------|------------------|\n")
-        for result in results_log:
-            task_id = result.get("Task ID", "N/A")
-            question = result.get("Question", "N/A").replace("\n", " ").replace("|", "\\|")
-            answer = result.get("Submitted Answer", "N/A").replace("\n", " ").replace("|", "\\|")
-            # Truncate long text for readability
-            if len(question) > 100:
-                question = question[:97] + "..."
-            if len(answer) > 100:
-                answer = answer[:97] + "..."
-            f.write(f"| {task_id} | {question} | {answer} |\n")
     logger.info(f"Results exported to: {filepath}")
     return filepath
@@ -282,7 +272,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print("Agent did not produce any answers to submit.")
         status_message = "Agent did not produce any answers to submit."
         results_df = pd.DataFrame(results_log)
-        export_path = export_results_to_markdown(results_log, status_message)
         return status_message, results_df, export_path
     # 4. Prepare Submission
@@ -309,8 +299,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         )
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
-        # Export to markdown
-        export_path = export_results_to_markdown(results_log, final_status)
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
         error_detail = f"Server responded with status {e.response.status_code}."
@@ -322,25 +312,25 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         status_message = f"Submission Failed: {error_detail}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        export_path = export_results_to_markdown(results_log, status_message)
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
         status_message = "Submission Failed: The request timed out."
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        export_path = export_results_to_markdown(results_log, status_message)
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
         status_message = f"Submission Failed: Network error - {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        export_path = export_results_to_markdown(results_log, status_message)
         return status_message, results_df, export_path
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        export_path = export_results_to_markdown(results_log, status_message)
         return status_message, results_df, export_path

     return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
+def export_results_to_json(results_log: list, submission_status: str) -> str:
+    """Export evaluation results to JSON file for easy processing.
+    - Local: Saves to ~/Downloads/gaia_results_TIMESTAMP.json
+    - HF Spaces: Saves to ./exports/gaia_results_TIMESTAMP.json
+    - Format: Clean JSON with full error messages, no truncation
     """
     from datetime import datetime
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    filename = f"gaia_results_{timestamp}.json"
     # Detect environment: HF Spaces or local
     if os.getenv("SPACE_ID"):
         downloads_dir = os.path.expanduser("~/Downloads")
         filepath = os.path.join(downloads_dir, filename)
+    # Build JSON structure
+    export_data = {
+        "metadata": {
+            "generated": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "timestamp": timestamp,
+            "total_questions": len(results_log)
+        },
+        "submission_status": submission_status,
+        "results": [
+            {
+                "task_id": result.get("Task ID", "N/A"),
+                "question": result.get("Question", "N/A"),
+                "submitted_answer": result.get("Submitted Answer", "N/A")
+            }
+            for result in results_log
+        ]
+    }
+    # Write JSON file with pretty formatting
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(export_data, f, indent=2, ensure_ascii=False)
     logger.info(f"Results exported to: {filepath}")
     return filepath
         print("Agent did not produce any answers to submit.")
         status_message = "Agent did not produce any answers to submit."
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_json(results_log, status_message)
         return status_message, results_df, export_path
     # 4. Prepare Submission
         )
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
+        # Export to JSON
+        export_path = export_results_to_json(results_log, final_status)
         return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
         error_detail = f"Server responded with status {e.response.status_code}."
         status_message = f"Submission Failed: {error_detail}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_json(results_log, status_message)
         return status_message, results_df, export_path
     except requests.exceptions.Timeout:
         status_message = "Submission Failed: The request timed out."
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_json(results_log, status_message)
         return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
         status_message = f"Submission Failed: Network error - {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_json(results_log, status_message)
         return status_message, results_df, export_path
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_json(results_log, status_message)
         return status_message, results_df, export_path

output/gaia_results_20260104_005858.md ADDED Viewed

	@@ -0,0 +1,35 @@

+# GAIA Agent Evaluation Results
+**Generated:** 2026-01-04 00:58:58
+## Submission Status
+Submission Successful!
+User: mangoobee
+Overall Score: 5.0% (1/20 correct)
+Message: Score calculated successfully: 1/20 total questions answered correctly (20 valid tasks attempted). Score did not improve previous record, leaderboard not updated.
+## Questions and Answers
+| Task ID | Question | Submitted Answer |
+|---------|----------|------------------|
+| 8e867cd7-cff9-4e6c-867a-ff5ddc2550be | How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can ... | Unable to answer |
+| a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird spec... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
+| 2d83110e-a098-4ebb-9987-066c06fa42d0 | .rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI | ERROR: No evidence collected. Details: Tool selection returned no tools - using fallback keyword ... |
+| cca530fc-4052-43b2-b130-b30968d8aa44 | Review the chess position provided in the image. It is black's turn. Provide the correct next mov... | ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Ge... |
+| 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8 | Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted i... | FunkMonk |
+| 6f37996b-2ac7-44b0-8e68-6d28256631b4 | Given this table defining * on the set S = {a, b, c, d, e}  \|*\|a\|b\|c\|d\|e\| \|---\|---\|---\... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
+| 9d191bce-651d-4746-be2d-7ef8ecadb9c2 | Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.  What does Teal'c say in respon... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
+| cabe07ed-9eca-40ea-8ead-410ef5e83f91 | What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry mate... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
+| 3cef3a44-215e-4aed-8e3b-b1e3f08063b7 | I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler w... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
+| 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3 | Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need fo... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
+| 305ac316-eef6-4446-960a-92d80d542f82 | Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play i... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
+| f918266a-b3e0-4914-865d-4faa564f1aef | What is the final numeric output from the attached Python code? | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
+| 3f57289b-8c60-48be-bd80-01f8099ca449 | How many at bats did the Yankee with the most walks in the 1977 regular season have that same sea... | 589 |
+| 1f975693-876d-457b-a649-393859e79bf3 | Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study fo... | ERROR: No evidence collected. Details: Tool parse_file failed: ValueError: Unsupported file type:... |
+| 840bfca7-4f7b-481a-8794-c560c340185d | On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This art... | Unable to answer |
+| bda648d7-d618-4883-88f4-3466eabd860e | Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually... | Unable to answer |
+| cf106601-ab4f-4af9-b045-5295fe67b37d | What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
+| a0c07678-e491-4bbc-8f0b-07405144218f | Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |
+| 7bd855d8-463d-4ed5-93ca-5fe35145f733 | The attached Excel file contains the sales of menu items for a local fast-food chain. What were t... | ERROR: No evidence collected. Details: Planning error: Exception: Planning failed with all LLMs. ... |
+| 5a0c1adf-205e-4841-a666-7c3ef95def9d | What is the first name of the only Malko Competition recipient from the 20th Century (after 1977)... | ERROR: Answer synthesis failed - Exception: Answer synthesis failed with all LLMs. Gemini: 429 Yo... |