Update app.py
Browse files
app.py
CHANGED
|
@@ -1,40 +1,156 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
-
import inspect
|
| 5 |
import pandas as pd
|
| 6 |
-
from
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# (Keep Constants as is)
|
| 10 |
-
# --- Constants ---
|
| 11 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
def __init__(self):
|
| 17 |
-
print("
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
messages = self.graph.invoke({"messages": messages})
|
| 25 |
-
answer = messages['messages'][-1].content
|
| 26 |
-
return answer
|
| 27 |
-
|
| 28 |
-
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 29 |
-
"""
|
| 30 |
-
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 31 |
-
and displays the results.
|
| 32 |
-
"""
|
| 33 |
-
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 34 |
-
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if profile:
|
| 37 |
-
username= f"{profile.username}"
|
| 38 |
print(f"User logged in: {username}")
|
| 39 |
else:
|
| 40 |
print("User not logged in.")
|
|
@@ -44,20 +160,24 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 44 |
questions_url = f"{api_url}/questions"
|
| 45 |
submit_url = f"{api_url}/submit"
|
| 46 |
|
| 47 |
-
# 1. Instantiate Agent ( modify this part to create your agent)
|
| 48 |
try:
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
except Exception as e:
|
| 51 |
print(f"Error instantiating agent: {e}")
|
|
|
|
| 52 |
return f"Error initializing agent: {e}", None
|
| 53 |
-
# In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
|
| 54 |
-
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 55 |
-
print(agent_code)
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
| 58 |
print(f"Fetching questions from: {questions_url}")
|
| 59 |
try:
|
| 60 |
-
response = requests.get(questions_url, timeout=
|
| 61 |
response.raise_for_status()
|
| 62 |
questions_data = response.json()
|
| 63 |
if not questions_data:
|
|
@@ -73,12 +193,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 73 |
return f"Error decoding server response for questions: {e}", None
|
| 74 |
except Exception as e:
|
| 75 |
print(f"An unexpected error occurred fetching questions: {e}")
|
|
|
|
| 76 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 77 |
|
| 78 |
-
# 3. Run your Agent
|
| 79 |
results_log = []
|
| 80 |
answers_payload = []
|
| 81 |
-
print(f"Running agent on {len(questions_data)} questions...")
|
| 82 |
for item in questions_data:
|
| 83 |
task_id = item.get("task_id")
|
| 84 |
question_text = item.get("question")
|
|
@@ -86,81 +206,108 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 86 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 87 |
continue
|
| 88 |
try:
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 91 |
-
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 92 |
except Exception as e:
|
| 93 |
print(f"Error running agent on task {task_id}: {e}")
|
|
|
|
|
|
|
|
|
|
| 94 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 95 |
|
| 96 |
if not answers_payload:
|
| 97 |
print("Agent did not produce any answers to submit.")
|
| 98 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 99 |
|
| 100 |
-
# 4. Prepare Submission
|
| 101 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 102 |
-
|
| 103 |
-
print(status_update)
|
| 104 |
|
| 105 |
-
# 5. Submit
|
| 106 |
-
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 107 |
try:
|
| 108 |
-
response = requests.post(submit_url, json=submission_data, timeout=
|
| 109 |
response.raise_for_status()
|
| 110 |
result_data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
final_status = (
|
| 112 |
f"Submission Successful!\n"
|
| 113 |
f"User: {result_data.get('username')}\n"
|
| 114 |
-
f"Overall Score: {
|
| 115 |
-
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 116 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 117 |
)
|
| 118 |
print("Submission successful.")
|
|
|
|
| 119 |
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
return final_status, results_df
|
|
|
|
| 121 |
except requests.exceptions.HTTPError as e:
|
| 122 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 123 |
-
try:
|
| 124 |
-
|
| 125 |
-
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 126 |
-
except requests.exceptions.JSONDecodeError:
|
| 127 |
-
error_detail += f" Response: {e.response.text[:500]}"
|
| 128 |
status_message = f"Submission Failed: {error_detail}"
|
| 129 |
print(status_message)
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
return status_message, results_df
|
| 132 |
except requests.exceptions.Timeout:
|
| 133 |
status_message = "Submission Failed: The request timed out."
|
| 134 |
print(status_message)
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
| 136 |
return status_message, results_df
|
| 137 |
except requests.exceptions.RequestException as e:
|
| 138 |
status_message = f"Submission Failed: Network error - {e}"
|
| 139 |
print(status_message)
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
| 141 |
return status_message, results_df
|
| 142 |
except Exception as e:
|
| 143 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 144 |
print(status_message)
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
| 146 |
return status_message, results_df
|
| 147 |
|
| 148 |
|
| 149 |
-
# --- Build Gradio Interface using Blocks ---
|
| 150 |
with gr.Blocks() as demo:
|
| 151 |
-
gr.Markdown("# Basic Agent Evaluation Runner")
|
| 152 |
gr.Markdown(
|
| 153 |
-
"""
|
| 154 |
-
|
| 155 |
|
| 156 |
-
|
|
|
|
| 157 |
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 158 |
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 159 |
-
|
| 160 |
---
|
| 161 |
**Disclaimers:**
|
| 162 |
-
Once clicking on the
|
| 163 |
-
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
| 164 |
"""
|
| 165 |
)
|
| 166 |
|
|
@@ -169,34 +316,25 @@ with gr.Blocks() as demo:
|
|
| 169 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 170 |
|
| 171 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 172 |
-
|
| 173 |
-
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 174 |
|
| 175 |
run_button.click(
|
| 176 |
fn=run_and_submit_all,
|
| 177 |
-
outputs=[status_output, results_table]
|
|
|
|
| 178 |
)
|
| 179 |
|
|
|
|
| 180 |
if __name__ == "__main__":
|
| 181 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 182 |
-
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 183 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 184 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 185 |
-
|
| 186 |
-
if space_host_startup:
|
| 187 |
-
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 188 |
-
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
| 189 |
-
else:
|
| 190 |
-
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 191 |
|
| 192 |
-
if
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
else:
|
| 197 |
-
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 198 |
|
| 199 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
demo.launch(debug=True, share=False)
|
|
|
|
| 1 |
import os
|
| 2 |
+
import io
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
import traceback
|
| 8 |
+
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 11 |
+
from langchain.agents import AgentExecutor, create_openai_tools_agent
|
| 12 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 13 |
+
from langchain_core.tools import BaseTool
|
| 14 |
+
from langchain.pydantic_v1 import BaseModel, Field
|
| 15 |
+
from openai import OpenAI
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 19 |
+
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
| 20 |
+
|
| 21 |
+
if not OPENAI_API_KEY:
|
| 22 |
+
print("⚠️ WARNING: OPENAI_API_KEY environment variable not set.")
|
| 23 |
+
if not TAVILY_API_KEY:
|
| 24 |
+
print("⚠️ WARNING: TAVILY_API_KEY environment variable not set.")
|
| 25 |
|
|
|
|
|
|
|
| 26 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 27 |
+
MAX_AGENT_ITERATIONS = 35
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TranscribeAudioByIdInput(BaseModel):
    """Argument schema for the audio-transcription tool.

    The scoring server serves each task's attachment at /files/{task_id},
    so the only input the tool needs is the task identifier itself.
    """

    task_id: str = Field(description="The task_id associated with the audio file to be downloaded and transcribed.")
|
| 32 |
+
|
| 33 |
+
class AudioTranscriptionByIdTool(BaseTool):
    """LangChain tool that fetches a task's audio attachment and transcribes it.

    Downloads the file from the scoring server's ``/files/{task_id}`` endpoint,
    then runs it through OpenAI's ``whisper-1`` model. On any failure an error
    string (not an exception) is returned, so the agent can keep reasoning.
    """

    # Metadata the LLM sees when deciding whether to call this tool.
    name: str = "transcribe_audio_by_id"
    description: str = "Downloads an audio file associated with a specific task_id from the scoring server's /files/{task_id} endpoint and transcribes it using OpenAI's whisper-1 model. Input must be the task_id."
    args_schema: type[BaseModel] = TranscribeAudioByIdInput
    # One OpenAI client per tool instance, built from the module-level key.
    openai_client: OpenAI = Field(default_factory=lambda: OpenAI(api_key=OPENAI_API_KEY))

    def _run(self, task_id: str) -> str:
        """Download and transcribe the audio for *task_id*; return text or an error string."""
        file_url = f"{DEFAULT_API_URL}/files/{task_id}"
        print(f"Attempting to download audio file from: {file_url}")
        try:
            response = requests.get(file_url, timeout=20)
            response.raise_for_status()
            audio_content = response.content
            print(f"Downloaded {len(audio_content)} bytes of audio data.")

            # Whisper requires a *named* file-like object; the extension is
            # assumed to be .wav (NOTE(review): the server's content type is
            # not checked — confirm the files really are wav).
            audio_file_like = io.BytesIO(audio_content)
            audio_file_like.name = f"{task_id}.wav"

            print(f"Attempting to transcribe audio for task_id: {task_id}")
            transcription = self.openai_client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file_like
            )
            print("Transcription successful.")
            return transcription.text
        except requests.exceptions.RequestException as e:
            # Network / HTTP failure while fetching the file.
            error_msg = f"Error downloading audio file for task_id {task_id}: {e}"
            print(error_msg)
            return error_msg
        except Exception as e:
            # Anything else (e.g. the OpenAI call) — log the traceback too.
            error_msg = f"Error during audio transcription for task_id {task_id}: {e}"
            print(error_msg)
            traceback.print_exc()
            return f"Error transcribing audio: {str(e)}"

    async def _arun(self, task_id: str) -> str:
        """Async entry point — simply delegates to the blocking implementation."""
        return self._run(task_id)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class EnhancedAgent:
    """GAIA-benchmark agent: an OpenAI tools-agent wired to web search and audio transcription.

    ``__call__`` takes the question text (optionally annotated with a task_id)
    and returns a single final-answer string suitable for submission.
    """

    def __init__(self):
        """Build the LLM, tools, prompt, and executor.

        Raises:
            ValueError: if OPENAI_API_KEY or TAVILY_API_KEY is missing.
        """
        print("Initializing GAIA Agent...")
        if not OPENAI_API_KEY or not TAVILY_API_KEY:
            raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them.")

        self.llm = ChatOpenAI(model="o4-mini", openai_api_key=OPENAI_API_KEY)
        self.search_tool = TavilySearchResults(max_results=5, tavily_api_key=TAVILY_API_KEY)
        self.transcription_tool = AudioTranscriptionByIdTool()

        self.tools = [self.search_tool, self.transcription_tool]
        print(f"Total tools available: {len(self.tools)}")

        # System prompt encodes the GAIA answer-formatting contract; the agent's
        # final line is parsed by __call__, so the template wording matters.
        system_prompt = """You are a general AI assistant designed to answer GAIA benchmark questions. I will ask you a question, which may include a task_id for accessing related files.
First, think step-by-step about how to answer the question. Use the available tools if necessary. The available tools are: tavily_search, transcribe_audio_by_id.
The 'transcribe_audio_by_id' tool downloads and transcribes an audio file using its task_id. If the question refers to an audio file (e.g., "transcribe file X.wav"), you MUST use the task_id provided within the user's input message (formatted like "(Use task_id: <the_task_id>)") as the input for the 'transcribe_audio_by_id' tool. Do NOT try to use a filename as input for this tool.
Prioritize using the search tool for general knowledge or recent information. Use the audio tool only if explicitly asked to process an audio file using its task_id.
Report your thoughts clearly.
Finally, conclude your response STRICTLY with the following template:
YOUR FINAL ANSWER

YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
Specific formatting rules for the FINAL ANSWER:
- If a number is requested: Write only the number. Do not use thousands separators (commas). Do not include units like '$' or '%' unless the question explicitly asks for the unit. Example: 12345
- If a string is requested: Use minimal words. Do not use articles (a, an, the). Write digits as words (e.g., "five" not "5") unless the question specifies digits. Do not use abbreviations (e.g., write "Los Angeles" not "LA"). Example: Mount Everest
- If a comma-separated list is requested: Apply the number or string rules to each element in the list. Separate elements with a comma and a single space. Example for numbers: 10, 25, 300. Example for strings: red, blue, green. Example for mixed: 5, apple, ten"""

        self.prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ])

        agent = create_openai_tools_agent(self.llm, self.tools, self.prompt)

        self.agent_executor = AgentExecutor(
            agent=agent,
            tools=self.tools,
            verbose=True,
            max_iterations=MAX_AGENT_ITERATIONS,
            # On unparseable LLM output, feed this string back instead of raising.
            handle_parsing_errors="Error: LLM output parsing failed. Please check the thought process.",
        )
        print(f"GAIA Agent initialized successfully with {len(self.tools)} tools and max_iterations={MAX_AGENT_ITERATIONS}.")

    def __call__(self, user_input: str) -> str:
        """Run the agent on *user_input* and return the extracted final answer.

        The answer is taken from the last non-empty line of the agent's output;
        if that line contains the "YOUR FINAL ANSWER" marker, only the text
        after the first ':' is returned. On any executor failure an error
        string is returned instead of raising.
        """
        print(f"Agent received input (first 100 chars): {user_input[:100]}...")
        try:
            response = self.agent_executor.invoke({"input": user_input})
            output = response.get("output", "Agent did not produce an output.")
            lines = output.strip().split('\n')

            # Last non-empty line is the answer candidate. (The previous
            # version keyed off lines[-1] and had a dead `elif` for a blank
            # last line — impossible after .strip() — fixed here.)
            final_answer = next((ln for ln in reversed(lines) if ln.strip()), "")

            if "YOUR FINAL ANSWER" in final_answer.upper():
                _, sep, tail = final_answer.partition(':')
                if sep:
                    final_answer = tail.strip()

            print(f"Agent returning final answer: {final_answer}")
            return final_answer

        except Exception as e:
            print(f"\n!!! Critical Error during agent execution !!!")
            print(f"Error Type: {type(e).__name__}")
            print(f"Error Details: {e}")
            print("--- Traceback ---")
            traceback.print_exc()
            print("--- End Traceback ---\n")
            return f"Error processing input - {type(e).__name__}: {str(e)}"
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 149 |
+
print("\n--- Starting Evaluation and Submission Run ---")
|
| 150 |
+
|
| 151 |
+
space_id = os.getenv("SPACE_ID")
|
| 152 |
if profile:
|
| 153 |
+
username = f"{profile.username}"
|
| 154 |
print(f"User logged in: {username}")
|
| 155 |
else:
|
| 156 |
print("User not logged in.")
|
|
|
|
| 160 |
questions_url = f"{api_url}/questions"
|
| 161 |
submit_url = f"{api_url}/submit"
|
| 162 |
|
|
|
|
| 163 |
try:
|
| 164 |
+
if not OPENAI_API_KEY or not TAVILY_API_KEY:
|
| 165 |
+
raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them as environment variables or secrets.")
|
| 166 |
+
agent_instance = EnhancedAgent()
|
| 167 |
+
except ValueError as e:
|
| 168 |
+
print(f"Error instantiating agent: {e}")
|
| 169 |
+
return f"Error initializing agent: {e}", None
|
| 170 |
except Exception as e:
|
| 171 |
print(f"Error instantiating agent: {e}")
|
| 172 |
+
traceback.print_exc()
|
| 173 |
return f"Error initializing agent: {e}", None
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code link unavailable (SPACE_ID not set)"
|
| 176 |
+
print(f"Agent code link: {agent_code}")
|
| 177 |
+
|
| 178 |
print(f"Fetching questions from: {questions_url}")
|
| 179 |
try:
|
| 180 |
+
response = requests.get(questions_url, timeout=30)
|
| 181 |
response.raise_for_status()
|
| 182 |
questions_data = response.json()
|
| 183 |
if not questions_data:
|
|
|
|
| 193 |
return f"Error decoding server response for questions: {e}", None
|
| 194 |
except Exception as e:
|
| 195 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 196 |
+
traceback.print_exc()
|
| 197 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 198 |
|
|
|
|
| 199 |
results_log = []
|
| 200 |
answers_payload = []
|
| 201 |
+
print(f"Running agent on {len(questions_data)} questions for submission...")
|
| 202 |
for item in questions_data:
|
| 203 |
task_id = item.get("task_id")
|
| 204 |
question_text = item.get("question")
|
|
|
|
| 206 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 207 |
continue
|
| 208 |
try:
|
| 209 |
+
agent_input = f"{question_text}\n(Use task_id: {task_id} if you need to access associated files)"
|
| 210 |
+
submitted_answer = agent_instance(agent_input)
|
| 211 |
+
if submitted_answer.upper().startswith("FINAL ANSWER:"):
|
| 212 |
+
submitted_answer = submitted_answer.split(":", 1)[1].strip()
|
| 213 |
+
|
| 214 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 215 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 216 |
except Exception as e:
|
| 217 |
print(f"Error running agent on task {task_id}: {e}")
|
| 218 |
+
traceback.print_exc()
|
| 219 |
+
error_answer = f"Agent Error - {str(e)}"
|
| 220 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
|
| 221 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 222 |
|
| 223 |
if not answers_payload:
|
| 224 |
print("Agent did not produce any answers to submit.")
|
| 225 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 226 |
|
|
|
|
| 227 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 228 |
+
print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
|
|
|
|
| 229 |
|
|
|
|
|
|
|
| 230 |
try:
|
| 231 |
+
response = requests.post(submit_url, json=submission_data, timeout=120)
|
| 232 |
response.raise_for_status()
|
| 233 |
result_data = response.json()
|
| 234 |
+
|
| 235 |
+
score_details = result_data.get('evaluation_results', [])
|
| 236 |
+
correct_count = result_data.get('correct_count', '?')
|
| 237 |
+
total_attempted = result_data.get('total_attempted', '?')
|
| 238 |
+
score_percent = result_data.get('score', 'N/A')
|
| 239 |
+
|
| 240 |
final_status = (
|
| 241 |
f"Submission Successful!\n"
|
| 242 |
f"User: {result_data.get('username')}\n"
|
| 243 |
+
f"Overall Score: {score_percent}% ({correct_count}/{total_attempted} correct)\n"
|
|
|
|
| 244 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 245 |
)
|
| 246 |
print("Submission successful.")
|
| 247 |
+
|
| 248 |
results_df = pd.DataFrame(results_log)
|
| 249 |
+
|
| 250 |
+
if score_details:
|
| 251 |
+
eval_map = {res['task_id']: {'Correct': res.get('is_correct'), 'Expected': res.get('expected_answer')} for res in score_details}
|
| 252 |
+
results_df['Correct'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Correct'))
|
| 253 |
+
results_df['Expected Answer'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Expected'))
|
| 254 |
+
results_df = results_df[['Task ID', 'Question', 'Correct', 'Expected Answer', 'Submitted Answer']]
|
| 255 |
+
else:
|
| 256 |
+
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
print("--- Evaluation and Submission Run Complete ---")
|
| 260 |
return final_status, results_df
|
| 261 |
+
|
| 262 |
except requests.exceptions.HTTPError as e:
|
| 263 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 264 |
+
try: error_json = e.response.json(); error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 265 |
+
except requests.exceptions.JSONDecodeError: error_detail += f" Response: {e.response.text[:500]}"
|
|
|
|
|
|
|
|
|
|
| 266 |
status_message = f"Submission Failed: {error_detail}"
|
| 267 |
print(status_message)
|
| 268 |
+
traceback.print_exc()
|
| 269 |
+
results_df = pd.DataFrame(results_log) if results_log else None
|
| 270 |
+
if results_df is not None:
|
| 271 |
+
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 272 |
return status_message, results_df
|
| 273 |
except requests.exceptions.Timeout:
|
| 274 |
status_message = "Submission Failed: The request timed out."
|
| 275 |
print(status_message)
|
| 276 |
+
traceback.print_exc()
|
| 277 |
+
results_df = pd.DataFrame(results_log) if results_log else None
|
| 278 |
+
if results_df is not None:
|
| 279 |
+
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 280 |
return status_message, results_df
|
| 281 |
except requests.exceptions.RequestException as e:
|
| 282 |
status_message = f"Submission Failed: Network error - {e}"
|
| 283 |
print(status_message)
|
| 284 |
+
traceback.print_exc()
|
| 285 |
+
results_df = pd.DataFrame(results_log) if results_log else None
|
| 286 |
+
if results_df is not None:
|
| 287 |
+
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 288 |
return status_message, results_df
|
| 289 |
except Exception as e:
|
| 290 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 291 |
print(status_message)
|
| 292 |
+
traceback.print_exc()
|
| 293 |
+
results_df = pd.DataFrame(results_log) if results_log else None
|
| 294 |
+
if results_df is not None:
|
| 295 |
+
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 296 |
return status_message, results_df
|
| 297 |
|
| 298 |
|
|
|
|
| 299 |
with gr.Blocks() as demo:
|
|
|
|
| 300 |
gr.Markdown(
|
| 301 |
+
f"""
|
| 302 |
+
# GAIA Agent Evaluation Runner
|
| 303 |
|
| 304 |
+
**Instructions:**
|
| 305 |
+
1. Ensure `OPENAI_API_KEY` and `TAVILY_API_KEY` are set as Hugging Face Secrets or environment variables.
|
| 306 |
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 307 |
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
|
|
|
| 308 |
---
|
| 309 |
**Disclaimers:**
|
| 310 |
+
Once clicking on the submit button, it can take some time for the agent to process all questions.
|
|
|
|
| 311 |
"""
|
| 312 |
)
|
| 313 |
|
|
|
|
| 316 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 317 |
|
| 318 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 319 |
+
results_table = gr.DataFrame(label="Evaluation Results (includes Correctness if available)", wrap=True)
|
|
|
|
| 320 |
|
| 321 |
run_button.click(
|
| 322 |
fn=run_and_submit_all,
|
| 323 |
+
outputs=[status_output, results_table],
|
| 324 |
+
api_name="run_submit_gaia"
|
| 325 |
)
|
| 326 |
|
| 327 |
+
|
| 328 |
if __name__ == "__main__":
    # Startup banner.
    print("\n" + "-"*30 + " App Starting " + "-"*30)

    # Report whether we appear to be running inside a HF Space.
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
    else:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:
        print(f"✅ SPACE_ID found: {space_id_startup}")
    else:
        print("ℹ️ SPACE_ID environment variable not found (running locally?).")

    print("-"*(60 + len(" App Starting ")) + "\n")
    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=False, share=False)
|
|
|