Tuan197 committed on
Commit
9dd713e
·
verified ·
1 Parent(s): 01e506d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -83
app.py CHANGED
@@ -1,40 +1,156 @@
1
  import os
 
2
  import gradio as gr
3
  import requests
4
- import inspect
5
  import pandas as pd
6
- from langchain_core.messages import HumanMessage
7
- from agent import build_graph
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # (Keep Constants as is)
10
- # --- Constants ---
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # --- Basic Agent Definition ---
14
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
15
- class BasicAgent:
 
 
16
  def __init__(self):
17
- print("BasicAgent initialized.")
18
- # self.graph = build_graph(provider = "huggingface")
19
-
20
- self.graph = build_graph(provider = "groq")
21
- def __call__(self, question: str) -> str:
22
- print(f"Agent received question (first 50 chars): {question[:50]}...")
23
- messages = [HumanMessage(content=question)]
24
- messages = self.graph.invoke({"messages": messages})
25
- answer = messages['messages'][-1].content
26
- return answer
27
-
28
- def run_and_submit_all( profile: gr.OAuthProfile | None):
29
- """
30
- Fetches all questions, runs the BasicAgent on them, submits all answers,
31
- and displays the results.
32
- """
33
- # --- Determine HF Space Runtime URL and Repo URL ---
34
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  if profile:
37
- username= f"{profile.username}"
38
  print(f"User logged in: {username}")
39
  else:
40
  print("User not logged in.")
@@ -44,20 +160,24 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
44
  questions_url = f"{api_url}/questions"
45
  submit_url = f"{api_url}/submit"
46
 
47
- # 1. Instantiate Agent ( modify this part to create your agent)
48
  try:
49
- agent = BasicAgent()
 
 
 
 
 
50
  except Exception as e:
51
  print(f"Error instantiating agent: {e}")
 
52
  return f"Error initializing agent: {e}", None
53
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
54
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
55
- print(agent_code)
56
 
57
- # 2. Fetch Questions
 
 
58
  print(f"Fetching questions from: {questions_url}")
59
  try:
60
- response = requests.get(questions_url, timeout=15)
61
  response.raise_for_status()
62
  questions_data = response.json()
63
  if not questions_data:
@@ -73,12 +193,12 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
73
  return f"Error decoding server response for questions: {e}", None
74
  except Exception as e:
75
  print(f"An unexpected error occurred fetching questions: {e}")
 
76
  return f"An unexpected error occurred fetching questions: {e}", None
77
 
78
- # 3. Run your Agent
79
  results_log = []
80
  answers_payload = []
81
- print(f"Running agent on {len(questions_data)} questions...")
82
  for item in questions_data:
83
  task_id = item.get("task_id")
84
  question_text = item.get("question")
@@ -86,81 +206,108 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
86
  print(f"Skipping item with missing task_id or question: {item}")
87
  continue
88
  try:
89
- submitted_answer = agent(question_text)
 
 
 
 
90
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
91
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
92
  except Exception as e:
93
  print(f"Error running agent on task {task_id}: {e}")
 
 
 
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
95
 
96
  if not answers_payload:
97
  print("Agent did not produce any answers to submit.")
98
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
99
 
100
- # 4. Prepare Submission
101
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
102
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
103
- print(status_update)
104
 
105
- # 5. Submit
106
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
107
  try:
108
- response = requests.post(submit_url, json=submission_data, timeout=60)
109
  response.raise_for_status()
110
  result_data = response.json()
 
 
 
 
 
 
111
  final_status = (
112
  f"Submission Successful!\n"
113
  f"User: {result_data.get('username')}\n"
114
- f"Overall Score: {result_data.get('score', 'N/A')}% "
115
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
116
  f"Message: {result_data.get('message', 'No message received.')}"
117
  )
118
  print("Submission successful.")
 
119
  results_df = pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
120
  return final_status, results_df
 
121
  except requests.exceptions.HTTPError as e:
122
  error_detail = f"Server responded with status {e.response.status_code}."
123
- try:
124
- error_json = e.response.json()
125
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
126
- except requests.exceptions.JSONDecodeError:
127
- error_detail += f" Response: {e.response.text[:500]}"
128
  status_message = f"Submission Failed: {error_detail}"
129
  print(status_message)
130
- results_df = pd.DataFrame(results_log)
 
 
 
131
  return status_message, results_df
132
  except requests.exceptions.Timeout:
133
  status_message = "Submission Failed: The request timed out."
134
  print(status_message)
135
- results_df = pd.DataFrame(results_log)
 
 
 
136
  return status_message, results_df
137
  except requests.exceptions.RequestException as e:
138
  status_message = f"Submission Failed: Network error - {e}"
139
  print(status_message)
140
- results_df = pd.DataFrame(results_log)
 
 
 
141
  return status_message, results_df
142
  except Exception as e:
143
  status_message = f"An unexpected error occurred during submission: {e}"
144
  print(status_message)
145
- results_df = pd.DataFrame(results_log)
 
 
 
146
  return status_message, results_df
147
 
148
 
149
- # --- Build Gradio Interface using Blocks ---
150
  with gr.Blocks() as demo:
151
- gr.Markdown("# Basic Agent Evaluation Runner")
152
  gr.Markdown(
153
- """
154
- **Instructions:**
155
 
156
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
 
157
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
158
  3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
159
-
160
  ---
161
  **Disclaimers:**
162
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
163
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
164
  """
165
  )
166
 
@@ -169,34 +316,25 @@ with gr.Blocks() as demo:
169
  run_button = gr.Button("Run Evaluation & Submit All Answers")
170
 
171
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
172
- # Removed max_rows=10 from DataFrame constructor
173
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
174
 
175
  run_button.click(
176
  fn=run_and_submit_all,
177
- outputs=[status_output, results_table]
 
178
  )
179
 
 
180
  if __name__ == "__main__":
181
  print("\n" + "-"*30 + " App Starting " + "-"*30)
182
- # Check for SPACE_HOST and SPACE_ID at startup for information
183
  space_host_startup = os.getenv("SPACE_HOST")
184
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
185
-
186
- if space_host_startup:
187
- print(f"✅ SPACE_HOST found: {space_host_startup}")
188
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
189
- else:
190
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
191
 
192
- if space_id_startup: # Print repo URLs if SPACE_ID is found
193
- print(f" SPACE_ID found: {space_id_startup}")
194
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
195
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
196
- else:
197
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
198
 
199
  print("-"*(60 + len(" App Starting ")) + "\n")
200
-
201
- print("Launching Gradio Interface for Basic Agent Evaluation...")
202
- demo.launch(debug=True, share=False)
 
1
  import os
2
+ import io
3
  import gradio as gr
4
  import requests
 
5
  import pandas as pd
6
+ from dotenv import load_dotenv
7
+ import traceback
8
+
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_community.tools.tavily_search import TavilySearchResults
11
+ from langchain.agents import AgentExecutor, create_openai_tools_agent
12
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
13
+ from langchain_core.tools import BaseTool
14
+ from langchain.pydantic_v1 import BaseModel, Field
15
+ from openai import OpenAI
16
+
17
+ load_dotenv()
18
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
19
+ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
20
+
21
+ if not OPENAI_API_KEY:
22
+ print("⚠️ WARNING: OPENAI_API_KEY environment variable not set.")
23
+ if not TAVILY_API_KEY:
24
+ print("⚠️ WARNING: TAVILY_API_KEY environment variable not set.")
25
 
 
 
26
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
27
+ MAX_AGENT_ITERATIONS = 35
28
+
29
+
30
+ class TranscribeAudioByIdInput(BaseModel):
31
+ task_id: str = Field(description="The task_id associated with the audio file to be downloaded and transcribed.")
32
+
33
+ class AudioTranscriptionByIdTool(BaseTool):
34
+ name: str = "transcribe_audio_by_id"
35
+ description: str = "Downloads an audio file associated with a specific task_id from the scoring server's /files/{task_id} endpoint and transcribes it using OpenAI's whisper-1 model. Input must be the task_id."
36
+ args_schema: type[BaseModel] = TranscribeAudioByIdInput
37
+ openai_client: OpenAI = Field(default_factory=lambda: OpenAI(api_key=OPENAI_API_KEY))
38
+
39
+ def _run(self, task_id: str) -> str:
40
+ file_url = f"{DEFAULT_API_URL}/files/{task_id}"
41
+ print(f"Attempting to download audio file from: {file_url}")
42
+ try:
43
+ response = requests.get(file_url, timeout=20)
44
+ response.raise_for_status()
45
+ audio_content = response.content
46
+ print(f"Downloaded {len(audio_content)} bytes of audio data.")
47
+
48
+ audio_file_like = io.BytesIO(audio_content)
49
+ audio_file_like.name = f"{task_id}.wav"
50
+
51
+ print(f"Attempting to transcribe audio for task_id: {task_id}")
52
+ transcription = self.openai_client.audio.transcriptions.create(
53
+ model="whisper-1",
54
+ file=audio_file_like
55
+ )
56
+ print("Transcription successful.")
57
+ return transcription.text
58
+ except requests.exceptions.RequestException as e:
59
+ error_msg = f"Error downloading audio file for task_id {task_id}: {e}"
60
+ print(error_msg)
61
+ return error_msg
62
+ except Exception as e:
63
+ error_msg = f"Error during audio transcription for task_id {task_id}: {e}"
64
+ print(error_msg)
65
+ traceback.print_exc()
66
+ return f"Error transcribing audio: {str(e)}"
67
 
68
+ async def _arun(self, task_id: str) -> str:
69
+ return self._run(task_id)
70
+
71
+
72
+ class EnhancedAgent:
73
  def __init__(self):
74
+ print("Initializing GAIA Agent...")
75
+ if not OPENAI_API_KEY or not TAVILY_API_KEY:
76
+ raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them.")
77
+
78
+ self.llm = ChatOpenAI(model="o4-mini", openai_api_key=OPENAI_API_KEY)
79
+ self.search_tool = TavilySearchResults(max_results=5, tavily_api_key=TAVILY_API_KEY)
80
+ self.transcription_tool = AudioTranscriptionByIdTool()
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ self.tools = [self.search_tool, self.transcription_tool]
83
+ print(f"Total tools available: {len(self.tools)}")
84
+
85
+ system_prompt = """You are a general AI assistant designed to answer GAIA benchmark questions. I will ask you a question, which may include a task_id for accessing related files.
86
+ First, think step-by-step about how to answer the question. Use the available tools if necessary. The available tools are: tavily_search, transcribe_audio_by_id.
87
+ The 'transcribe_audio_by_id' tool downloads and transcribes an audio file using its task_id. If the question refers to an audio file (e.g., "transcribe file X.wav"), you MUST use the task_id provided within the user's input message (formatted like "(Use task_id: <the_task_id>)") as the input for the 'transcribe_audio_by_id' tool. Do NOT try to use a filename as input for this tool.
88
+ Prioritize using the search tool for general knowledge or recent information. Use the audio tool only if explicitly asked to process an audio file using its task_id.
89
+ Report your thoughts clearly.
90
+ Finally, conclude your response STRICTLY with the following template:
91
+ YOUR FINAL ANSWER
92
+
93
+ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
94
+ Specific formatting rules for the FINAL ANSWER:
95
+ - If a number is requested: Write only the number. Do not use thousands separators (commas). Do not include units like '$' or '%' unless the question explicitly asks for the unit. Example: 12345
96
+ - If a string is requested: Use minimal words. Do not use articles (a, an, the). Write digits as words (e.g., "five" not "5") unless the question specifies digits. Do not use abbreviations (e.g., write "Los Angeles" not "LA"). Example: Mount Everest
97
+ - If a comma-separated list is requested: Apply the number or string rules to each element in the list. Separate elements with a comma and a single space. Example for numbers: 10, 25, 300. Example for strings: red, blue, green. Example for mixed: 5, apple, ten"""
98
+
99
+ self.prompt = ChatPromptTemplate.from_messages([
100
+ ("system", system_prompt),
101
+ ("human", "{input}"),
102
+ MessagesPlaceholder(variable_name="agent_scratchpad"),
103
+ ])
104
+
105
+ agent = create_openai_tools_agent(self.llm, self.tools, self.prompt)
106
+
107
+ self.agent_executor = AgentExecutor(
108
+ agent=agent,
109
+ tools=self.tools,
110
+ verbose=True,
111
+ max_iterations=MAX_AGENT_ITERATIONS,
112
+ handle_parsing_errors="Error: LLM output parsing failed. Please check the thought process.",
113
+ )
114
+ print(f"GAIA Agent initialized successfully with {len(self.tools)} tools and max_iterations={MAX_AGENT_ITERATIONS}.")
115
+
116
+ def __call__(self, user_input: str) -> str:
117
+ print(f"Agent received input (first 100 chars): {user_input[:100]}...")
118
+ try:
119
+ response = self.agent_executor.invoke({"input": user_input})
120
+ output = response.get("output", "Agent did not produce an output.")
121
+ lines = output.strip().split('\n')
122
+ final_answer = lines[-1] if lines else output
123
+
124
+
125
+ if "YOUR FINAL ANSWER" in lines[-1].upper():
126
+ parts = lines[-1].split(':', 1)
127
+ if len(parts) > 1:
128
+ final_answer = parts[1].strip()
129
+ else:
130
+ final_answer = lines[-1]
131
+ elif not lines[-1].strip():
132
+ if len(lines) > 1:
133
+ final_answer = lines[-2]
134
+
135
+ print(f"Agent returning final answer: {final_answer}")
136
+ return final_answer
137
+
138
+ except Exception as e:
139
+ print(f"\n!!! Critical Error during agent execution !!!")
140
+ print(f"Error Type: {type(e).__name__}")
141
+ print(f"Error Details: {e}")
142
+ print("--- Traceback ---")
143
+ traceback.print_exc()
144
+ print("--- End Traceback ---\n")
145
+ return f"Error processing input - {type(e).__name__}: {str(e)}"
146
+
147
+
148
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
149
+ print("\n--- Starting Evaluation and Submission Run ---")
150
+
151
+ space_id = os.getenv("SPACE_ID")
152
  if profile:
153
+ username = f"{profile.username}"
154
  print(f"User logged in: {username}")
155
  else:
156
  print("User not logged in.")
 
160
  questions_url = f"{api_url}/questions"
161
  submit_url = f"{api_url}/submit"
162
 
 
163
  try:
164
+ if not OPENAI_API_KEY or not TAVILY_API_KEY:
165
+ raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them as environment variables or secrets.")
166
+ agent_instance = EnhancedAgent()
167
+ except ValueError as e:
168
+ print(f"Error instantiating agent: {e}")
169
+ return f"Error initializing agent: {e}", None
170
  except Exception as e:
171
  print(f"Error instantiating agent: {e}")
172
+ traceback.print_exc()
173
  return f"Error initializing agent: {e}", None
 
 
 
174
 
175
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code link unavailable (SPACE_ID not set)"
176
+ print(f"Agent code link: {agent_code}")
177
+
178
  print(f"Fetching questions from: {questions_url}")
179
  try:
180
+ response = requests.get(questions_url, timeout=30)
181
  response.raise_for_status()
182
  questions_data = response.json()
183
  if not questions_data:
 
193
  return f"Error decoding server response for questions: {e}", None
194
  except Exception as e:
195
  print(f"An unexpected error occurred fetching questions: {e}")
196
+ traceback.print_exc()
197
  return f"An unexpected error occurred fetching questions: {e}", None
198
 
 
199
  results_log = []
200
  answers_payload = []
201
+ print(f"Running agent on {len(questions_data)} questions for submission...")
202
  for item in questions_data:
203
  task_id = item.get("task_id")
204
  question_text = item.get("question")
 
206
  print(f"Skipping item with missing task_id or question: {item}")
207
  continue
208
  try:
209
+ agent_input = f"{question_text}\n(Use task_id: {task_id} if you need to access associated files)"
210
+ submitted_answer = agent_instance(agent_input)
211
+ if submitted_answer.upper().startswith("FINAL ANSWER:"):
212
+ submitted_answer = submitted_answer.split(":", 1)[1].strip()
213
+
214
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
215
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
216
  except Exception as e:
217
  print(f"Error running agent on task {task_id}: {e}")
218
+ traceback.print_exc()
219
+ error_answer = f"Agent Error - {str(e)}"
220
+ answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
221
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
222
 
223
  if not answers_payload:
224
  print("Agent did not produce any answers to submit.")
225
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
226
 
 
227
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
228
+ print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
 
229
 
 
 
230
  try:
231
+ response = requests.post(submit_url, json=submission_data, timeout=120)
232
  response.raise_for_status()
233
  result_data = response.json()
234
+
235
+ score_details = result_data.get('evaluation_results', [])
236
+ correct_count = result_data.get('correct_count', '?')
237
+ total_attempted = result_data.get('total_attempted', '?')
238
+ score_percent = result_data.get('score', 'N/A')
239
+
240
  final_status = (
241
  f"Submission Successful!\n"
242
  f"User: {result_data.get('username')}\n"
243
+ f"Overall Score: {score_percent}% ({correct_count}/{total_attempted} correct)\n"
 
244
  f"Message: {result_data.get('message', 'No message received.')}"
245
  )
246
  print("Submission successful.")
247
+
248
  results_df = pd.DataFrame(results_log)
249
+
250
+ if score_details:
251
+ eval_map = {res['task_id']: {'Correct': res.get('is_correct'), 'Expected': res.get('expected_answer')} for res in score_details}
252
+ results_df['Correct'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Correct'))
253
+ results_df['Expected Answer'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Expected'))
254
+ results_df = results_df[['Task ID', 'Question', 'Correct', 'Expected Answer', 'Submitted Answer']]
255
+ else:
256
+ results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
257
+
258
+
259
+ print("--- Evaluation and Submission Run Complete ---")
260
  return final_status, results_df
261
+
262
  except requests.exceptions.HTTPError as e:
263
  error_detail = f"Server responded with status {e.response.status_code}."
264
+ try: error_json = e.response.json(); error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
265
+ except requests.exceptions.JSONDecodeError: error_detail += f" Response: {e.response.text[:500]}"
 
 
 
266
  status_message = f"Submission Failed: {error_detail}"
267
  print(status_message)
268
+ traceback.print_exc()
269
+ results_df = pd.DataFrame(results_log) if results_log else None
270
+ if results_df is not None:
271
+ results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
272
  return status_message, results_df
273
  except requests.exceptions.Timeout:
274
  status_message = "Submission Failed: The request timed out."
275
  print(status_message)
276
+ traceback.print_exc()
277
+ results_df = pd.DataFrame(results_log) if results_log else None
278
+ if results_df is not None:
279
+ results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
280
  return status_message, results_df
281
  except requests.exceptions.RequestException as e:
282
  status_message = f"Submission Failed: Network error - {e}"
283
  print(status_message)
284
+ traceback.print_exc()
285
+ results_df = pd.DataFrame(results_log) if results_log else None
286
+ if results_df is not None:
287
+ results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
288
  return status_message, results_df
289
  except Exception as e:
290
  status_message = f"An unexpected error occurred during submission: {e}"
291
  print(status_message)
292
+ traceback.print_exc()
293
+ results_df = pd.DataFrame(results_log) if results_log else None
294
+ if results_df is not None:
295
+ results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
296
  return status_message, results_df
297
 
298
 
 
299
  with gr.Blocks() as demo:
 
300
  gr.Markdown(
301
+ f"""
302
+ # GAIA Agent Evaluation Runner
303
 
304
+ **Instructions:**
305
+ 1. Ensure `OPENAI_API_KEY` and `TAVILY_API_KEY` are set as Hugging Face Secrets or environment variables.
306
  2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
307
  3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
 
308
  ---
309
  **Disclaimers:**
310
+ Once clicking on the submit button, it can take some time for the agent to process all questions.
 
311
  """
312
  )
313
 
 
316
  run_button = gr.Button("Run Evaluation & Submit All Answers")
317
 
318
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
319
+ results_table = gr.DataFrame(label="Evaluation Results (includes Correctness if available)", wrap=True)
 
320
 
321
  run_button.click(
322
  fn=run_and_submit_all,
323
+ outputs=[status_output, results_table],
324
+ api_name="run_submit_gaia"
325
  )
326
 
327
+
328
  if __name__ == "__main__":
329
  print("\n" + "-"*30 + " App Starting " + "-"*30)
 
330
  space_host_startup = os.getenv("SPACE_HOST")
331
+ space_id_startup = os.getenv("SPACE_ID")
 
 
 
 
 
 
332
 
333
+ if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
334
+ else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
335
+ if space_id_startup: print(f" SPACE_ID found: {space_id_startup}")
336
+ else: print("ℹ️ SPACE_ID environment variable not found (running locally?).")
 
 
337
 
338
  print("-"*(60 + len(" App Starting ")) + "\n")
339
+ print("Launching Gradio Interface for GAIA Agent Evaluation...")
340
+ demo.launch(debug=False, share=False)