agentbee

Sleeping

mangubee Claude Sonnet 4.5 committed on Jan 3

Commit

2a449c8

1 Parent(s): 5d84945

Feat: Add markdown export for GAIA evaluation results

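Added export_results_to_markdown() function that saves evaluation results to ~/Downloads/gaia_results_TIMESTAMP.md as a formatted markdown table. Updated all return paths in run_and_submit_all() to return a third value, the export path: paths reached after the agent has run (success and submission-error cases) export the results, while early failures (not logged in, agent initialization error, question fetch error) return an empty path. Added export_output UI component to display the file path.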

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show

CHANGELOG.md +4 -0
app.py +74 -13

CHANGELOG.md CHANGED Viewed

@@ -25,6 +25,10 @@
 - **app.py**
   - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
   - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
 - **src/tools/__init__.py** (Fixed earlier in session)
   - Fixed TOOLS schema bug - Changed parameters from list to dict format

 - **app.py**
   - Updated `check_api_keys()` - Added HF_TOKEN status display in Test & Debug tab
   - UI now shows: "HF_TOKEN (HuggingFace): ✓ SET" or "✗ MISSING"
+  - Added `export_results_to_markdown(results_log, submission_status)` - Export evaluation results to markdown file
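+  - Updated `run_and_submit_all()` - ALL return paths now return 3 values; paths reached after the agent has run also export results to ~/Downloads/gaia_results_TIMESTAMP.md (early failures return an empty export path)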
+  - Added export_output UI component - Displays exported file path to user
+  - Updated run_button click handler - Now outputs 3 values (status, table, export_path)
 - **src/tools/__init__.py** (Fixed earlier in session)
   - Fixed TOOLS schema bug - Changed parameters from list to dict format

app.py CHANGED Viewed

@@ -34,6 +34,52 @@ def check_api_keys():
     return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
 def format_diagnostics(final_state: dict) -> str:
     """Format agent state for diagnostic display."""
     diagnostics = []
@@ -147,7 +193,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
-        return "Please Login to Hugging Face with the button.", None
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
@@ -161,7 +207,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     except Exception as e:
         logger.error(f"Error instantiating agent: {e}")
         print(f"Error instantiating agent: {e}")
-        return f"Error initializing agent: {e}", None
     # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
@@ -174,18 +220,18 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         questions_data = response.json()
         if not questions_data:
             print("Fetched questions list is empty.")
-            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
-        return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
         print(f"Error decoding JSON response from questions endpoint: {e}")
         print(f"Response text: {response.text[:500]}")
-        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
-        return f"An unexpected error occurred fetching questions: {e}", None
     # 3. Run your Agent
     results_log = []
@@ -221,7 +267,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     # 4. Prepare Submission
     submission_data = {
@@ -247,7 +296,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         )
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
-        return final_status, results_df
     except requests.exceptions.HTTPError as e:
         error_detail = f"Server responded with status {e.response.status_code}."
         try:
@@ -258,22 +309,26 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         status_message = f"Submission Failed: {error_detail}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except requests.exceptions.Timeout:
         status_message = "Submission Failed: The request timed out."
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except requests.exceptions.RequestException as e:
         status_message = f"Submission Failed: Network error - {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
-        return status_message, results_df
 # --- Build Gradio Interface using Blocks ---
@@ -359,7 +414,13 @@ with gr.Blocks() as demo:
             # Removed max_rows=10 from DataFrame constructor
             results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-            run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 if __name__ == "__main__":
     print("\n" + "-" * 30 + " App Starting " + "-" * 30)

     return "\n".join([f"{k}: {v}" for k, v in keys_status.items()])
+def export_results_to_markdown(results_log: list, submission_status: str) -> str:
+    """Export evaluation results to a markdown file in the Downloads folder.
+
+    Writes ``~/Downloads/gaia_results_<YYYYMMDD_HHMMSS>.md`` containing a
+    header, the submission status and a table of questions and answers.
+
+    Args:
+        results_log: List of dicts. The "Task ID", "Question" and
+            "Submitted Answer" keys are read; a missing or ``None``
+            question/answer is rendered as "N/A".
+        submission_status: Status text written under "Submission Status".
+
+    Returns:
+        Path of the markdown file that was written.
+
+    Raises:
+        OSError: If the Downloads folder cannot be created or the file
+            cannot be written.
+    """
+    from datetime import datetime
+
+    def _cell(value, limit: int = 100) -> str:
+        """Return value as a single-line, length-capped, pipe-escaped table cell."""
+        # Values are not guaranteed to be strings (None, numbers, ...).
+        text = "N/A" if value is None else str(value)
+        text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
+        # Truncate long text for readability.
+        if len(text) > limit:
+            text = text[:limit - 3] + "..."
+        # Escape after truncating so the cut can never split a "\|" pair.
+        return text.replace("|", "\\|")
+
+    # Take a single timestamp so the filename and the "Generated" line agree.
+    now = datetime.now()
+    downloads_dir = os.path.expanduser("~/Downloads")
+    # The Downloads folder may not exist yet; create it instead of failing.
+    os.makedirs(downloads_dir, exist_ok=True)
+    filename = f"gaia_results_{now.strftime('%Y%m%d_%H%M%S')}.md"
+    filepath = os.path.join(downloads_dir, filename)
+
+    # Explicit UTF-8: questions and answers may contain non-ASCII text and
+    # the platform default encoding is not guaranteed to handle it.
+    with open(filepath, "w", encoding="utf-8") as f:
+        # Header
+        f.write("# GAIA Agent Evaluation Results\n\n")
+        f.write(f"**Generated:** {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+        # Submission status
+        f.write("## Submission Status\n\n")
+        f.write(f"{submission_status}\n\n")
+        # Results table
+        f.write("## Questions and Answers\n\n")
+        if results_log:
+            f.write("| Task ID | Question | Submitted Answer |\n")
+            f.write("|---------|----------|------------------|\n")
+            for result in results_log:
+                task_id = result.get("Task ID", "N/A")
+                question = _cell(result.get("Question"))
+                answer = _cell(result.get("Submitted Answer"))
+                f.write(f"| {task_id} | {question} | {answer} |\n")
+        else:
+            f.write("*No results available*\n")
+
+    # Logged on every path, including the empty-results case.
+    logger.info(f"Results exported to: {filepath}")
+    return filepath
 def format_diagnostics(final_state: dict) -> str:
     """Format agent state for diagnostic display."""
     diagnostics = []
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
+        return "Please Login to Hugging Face with the button.", None, ""
     api_url = DEFAULT_API_URL
     questions_url = f"{api_url}/questions"
     except Exception as e:
         logger.error(f"Error instantiating agent: {e}")
         print(f"Error instantiating agent: {e}")
+        return f"Error initializing agent: {e}", None, ""
     # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     print(agent_code)
         questions_data = response.json()
         if not questions_data:
             print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None, ""
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
+        return f"Error fetching questions: {e}", None, ""
     except requests.exceptions.JSONDecodeError as e:
         print(f"Error decoding JSON response from questions endpoint: {e}")
         print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None, ""
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
+        return f"An unexpected error occurred fetching questions: {e}", None, ""
     # 3. Run your Agent
     results_log = []
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
+        status_message = "Agent did not produce any answers to submit."
+        results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_markdown(results_log, status_message)
+        return status_message, results_df, export_path
     # 4. Prepare Submission
     submission_data = {
         )
         print("Submission successful.")
         results_df = pd.DataFrame(results_log)
+        # Export to markdown
+        export_path = export_results_to_markdown(results_log, final_status)
+        return final_status, results_df, export_path
     except requests.exceptions.HTTPError as e:
         error_detail = f"Server responded with status {e.response.status_code}."
         try:
         status_message = f"Submission Failed: {error_detail}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_markdown(results_log, status_message)
+        return status_message, results_df, export_path
     except requests.exceptions.Timeout:
         status_message = "Submission Failed: The request timed out."
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_markdown(results_log, status_message)
+        return status_message, results_df, export_path
     except requests.exceptions.RequestException as e:
         status_message = f"Submission Failed: Network error - {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_markdown(results_log, status_message)
+        return status_message, results_df, export_path
     except Exception as e:
         status_message = f"An unexpected error occurred during submission: {e}"
         print(status_message)
         results_df = pd.DataFrame(results_log)
+        export_path = export_results_to_markdown(results_log, status_message)
+        return status_message, results_df, export_path
 # --- Build Gradio Interface using Blocks ---
             # Removed max_rows=10 from DataFrame constructor
             results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+            export_output = gr.Textbox(
+                label="Exported Results",
+                placeholder="Results will be exported to markdown file in ~/Downloads",
+                interactive=False
+            )
+            run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table, export_output])
 if __name__ == "__main__":
     print("\n" + "-" * 30 + " App Starting " + "-" * 30)