Update app.py
Browse files
app.py
CHANGED
|
@@ -1,156 +1,46 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
import pandas as pd
|
| 6 |
-
from
|
| 7 |
-
import
|
| 8 |
|
| 9 |
-
from langchain_openai import ChatOpenAI
|
| 10 |
-
from langchain_community.tools.tavily_search import TavilySearchResults
|
| 11 |
-
from langchain.agents import AgentExecutor, create_openai_tools_agent
|
| 12 |
-
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 13 |
-
from langchain_core.tools import BaseTool
|
| 14 |
-
from langchain.pydantic_v1 import BaseModel, Field
|
| 15 |
-
from openai import OpenAI
|
| 16 |
|
| 17 |
-
load_dotenv()
|
| 18 |
-
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 19 |
-
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
|
| 20 |
-
|
| 21 |
-
if not OPENAI_API_KEY:
|
| 22 |
-
print("⚠️ WARNING: OPENAI_API_KEY environment variable not set.")
|
| 23 |
-
if not TAVILY_API_KEY:
|
| 24 |
-
print("⚠️ WARNING: TAVILY_API_KEY environment variable not set.")
|
| 25 |
|
|
|
|
|
|
|
| 26 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 27 |
-
MAX_AGENT_ITERATIONS = 35
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
class TranscribeAudioByIdInput(BaseModel):
|
| 31 |
-
task_id: str = Field(description="The task_id associated with the audio file to be downloaded and transcribed.")
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
description: str = "Downloads an audio file associated with a specific task_id from the scoring server's /files/{task_id} endpoint and transcribes it using OpenAI's whisper-1 model. Input must be the task_id."
|
| 36 |
-
args_schema: type[BaseModel] = TranscribeAudioByIdInput
|
| 37 |
-
openai_client: OpenAI = Field(default_factory=lambda: OpenAI(api_key=OPENAI_API_KEY))
|
| 38 |
|
| 39 |
-
def _run(self, task_id: str) -> str:
|
| 40 |
-
file_url = f"{DEFAULT_API_URL}/files/{task_id}"
|
| 41 |
-
print(f"Attempting to download audio file from: {file_url}")
|
| 42 |
-
try:
|
| 43 |
-
response = requests.get(file_url, timeout=20)
|
| 44 |
-
response.raise_for_status()
|
| 45 |
-
audio_content = response.content
|
| 46 |
-
print(f"Downloaded {len(audio_content)} bytes of audio data.")
|
| 47 |
-
|
| 48 |
-
audio_file_like = io.BytesIO(audio_content)
|
| 49 |
-
audio_file_like.name = f"{task_id}.wav"
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
model="whisper-1",
|
| 54 |
-
file=audio_file_like
|
| 55 |
-
)
|
| 56 |
-
print("Transcription successful.")
|
| 57 |
-
return transcription.text
|
| 58 |
-
except requests.exceptions.RequestException as e:
|
| 59 |
-
error_msg = f"Error downloading audio file for task_id {task_id}: {e}"
|
| 60 |
-
print(error_msg)
|
| 61 |
-
return error_msg
|
| 62 |
-
except Exception as e:
|
| 63 |
-
error_msg = f"Error during audio transcription for task_id {task_id}: {e}"
|
| 64 |
-
print(error_msg)
|
| 65 |
-
traceback.print_exc()
|
| 66 |
-
return f"Error transcribing audio: {str(e)}"
|
| 67 |
-
|
| 68 |
-
async def _arun(self, task_id: str) -> str:
|
| 69 |
-
return self._run(task_id)
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
class EnhancedAgent:
|
| 73 |
def __init__(self):
|
| 74 |
-
print("
|
| 75 |
-
|
| 76 |
-
raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them.")
|
| 77 |
-
|
| 78 |
-
self.llm = ChatOpenAI(model="o4-mini", openai_api_key=OPENAI_API_KEY)
|
| 79 |
-
self.search_tool = TavilySearchResults(max_results=5, tavily_api_key=TAVILY_API_KEY)
|
| 80 |
-
self.transcription_tool = AudioTranscriptionByIdTool()
|
| 81 |
-
|
| 82 |
-
self.tools = [self.search_tool, self.transcription_tool]
|
| 83 |
-
print(f"Total tools available: {len(self.tools)}")
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
YOUR FINAL ANSWER
|
| 92 |
|
| 93 |
-
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
| 94 |
-
Specific formatting rules for the FINAL ANSWER:
|
| 95 |
-
- If a number is requested: Write only the number. Do not use thousands separators (commas). Do not include units like '$' or '%' unless the question explicitly asks for the unit. Example: 12345
|
| 96 |
-
- If a string is requested: Use minimal words. Do not use articles (a, an, the). Write digits as words (e.g., "five" not "5") unless the question specifies digits. Do not use abbreviations (e.g., write "Los Angeles" not "LA"). Example: Mount Everest
|
| 97 |
-
- If a comma-separated list is requested: Apply the number or string rules to each element in the list. Separate elements with a comma and a single space. Example for numbers: 10, 25, 300. Example for strings: red, blue, green. Example for mixed: 5, apple, ten"""
|
| 98 |
-
|
| 99 |
-
self.prompt = ChatPromptTemplate.from_messages([
|
| 100 |
-
("system", system_prompt),
|
| 101 |
-
("human", "{input}"),
|
| 102 |
-
MessagesPlaceholder(variable_name="agent_scratchpad"),
|
| 103 |
-
])
|
| 104 |
-
|
| 105 |
-
agent = create_openai_tools_agent(self.llm, self.tools, self.prompt)
|
| 106 |
-
|
| 107 |
-
self.agent_executor = AgentExecutor(
|
| 108 |
-
agent=agent,
|
| 109 |
-
tools=self.tools,
|
| 110 |
-
verbose=True,
|
| 111 |
-
max_iterations=MAX_AGENT_ITERATIONS,
|
| 112 |
-
handle_parsing_errors="Error: LLM output parsing failed. Please check the thought process.",
|
| 113 |
-
)
|
| 114 |
-
print(f"GAIA Agent initialized successfully with {len(self.tools)} tools and max_iterations={MAX_AGENT_ITERATIONS}.")
|
| 115 |
-
|
| 116 |
-
def __call__(self, user_input: str) -> str:
|
| 117 |
-
print(f"Agent received input (first 100 chars): {user_input[:100]}...")
|
| 118 |
-
try:
|
| 119 |
-
response = self.agent_executor.invoke({"input": user_input})
|
| 120 |
-
output = response.get("output", "Agent did not produce an output.")
|
| 121 |
-
lines = output.strip().split('\n')
|
| 122 |
-
final_answer = lines[-1] if lines else output
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
if "YOUR FINAL ANSWER" in lines[-1].upper():
|
| 126 |
-
parts = lines[-1].split(':', 1)
|
| 127 |
-
if len(parts) > 1:
|
| 128 |
-
final_answer = parts[1].strip()
|
| 129 |
-
else:
|
| 130 |
-
final_answer = lines[-1]
|
| 131 |
-
elif not lines[-1].strip():
|
| 132 |
-
if len(lines) > 1:
|
| 133 |
-
final_answer = lines[-2]
|
| 134 |
-
|
| 135 |
-
print(f"Agent returning final answer: {final_answer}")
|
| 136 |
-
return final_answer
|
| 137 |
-
|
| 138 |
-
except Exception as e:
|
| 139 |
-
print(f"\n!!! Critical Error during agent execution !!!")
|
| 140 |
-
print(f"Error Type: {type(e).__name__}")
|
| 141 |
-
print(f"Error Details: {e}")
|
| 142 |
-
print("--- Traceback ---")
|
| 143 |
-
traceback.print_exc()
|
| 144 |
-
print("--- End Traceback ---\n")
|
| 145 |
-
return f"Error processing input - {type(e).__name__}: {str(e)}"
|
| 146 |
|
| 147 |
|
| 148 |
-
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
space_id = os.getenv("SPACE_ID")
|
| 152 |
if profile:
|
| 153 |
-
username
|
| 154 |
print(f"User logged in: {username}")
|
| 155 |
else:
|
| 156 |
print("User not logged in.")
|
|
@@ -160,24 +50,20 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 160 |
questions_url = f"{api_url}/questions"
|
| 161 |
submit_url = f"{api_url}/submit"
|
| 162 |
|
|
|
|
| 163 |
try:
|
| 164 |
-
|
| 165 |
-
raise ValueError("Missing required API keys (OpenAI or Tavily). Please set them as environment variables or secrets.")
|
| 166 |
-
agent_instance = EnhancedAgent()
|
| 167 |
-
except ValueError as e:
|
| 168 |
-
print(f"Error instantiating agent: {e}")
|
| 169 |
-
return f"Error initializing agent: {e}", None
|
| 170 |
except Exception as e:
|
| 171 |
print(f"Error instantiating agent: {e}")
|
| 172 |
-
traceback.print_exc()
|
| 173 |
return f"Error initializing agent: {e}", None
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
|
| 176 |
-
print(f"Agent code link: {agent_code}")
|
| 177 |
-
|
| 178 |
print(f"Fetching questions from: {questions_url}")
|
| 179 |
try:
|
| 180 |
-
response = requests.get(questions_url, timeout=
|
| 181 |
response.raise_for_status()
|
| 182 |
questions_data = response.json()
|
| 183 |
if not questions_data:
|
|
@@ -193,12 +79,12 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 193 |
return f"Error decoding server response for questions: {e}", None
|
| 194 |
except Exception as e:
|
| 195 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 196 |
-
traceback.print_exc()
|
| 197 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 198 |
|
|
|
|
| 199 |
results_log = []
|
| 200 |
answers_payload = []
|
| 201 |
-
print(f"Running agent on {len(questions_data)} questions
|
| 202 |
for item in questions_data:
|
| 203 |
task_id = item.get("task_id")
|
| 204 |
question_text = item.get("question")
|
|
@@ -206,108 +92,79 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 206 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 207 |
continue
|
| 208 |
try:
|
| 209 |
-
|
| 210 |
-
submitted_answer = agent_instance(agent_input)
|
| 211 |
-
if submitted_answer.upper().startswith("FINAL ANSWER:"):
|
| 212 |
-
submitted_answer = submitted_answer.split(":", 1)[1].strip()
|
| 213 |
-
|
| 214 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 215 |
-
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 216 |
except Exception as e:
|
| 217 |
print(f"Error running agent on task {task_id}: {e}")
|
| 218 |
-
traceback.print_exc()
|
| 219 |
-
error_answer = f"Agent Error - {str(e)}"
|
| 220 |
-
answers_payload.append({"task_id": task_id, "submitted_answer": error_answer})
|
| 221 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 222 |
|
| 223 |
if not answers_payload:
|
| 224 |
print("Agent did not produce any answers to submit.")
|
| 225 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 226 |
|
|
|
|
| 227 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 228 |
-
|
|
|
|
| 229 |
|
|
|
|
|
|
|
| 230 |
try:
|
| 231 |
-
response = requests.post(submit_url, json=submission_data, timeout=
|
| 232 |
response.raise_for_status()
|
| 233 |
result_data = response.json()
|
| 234 |
-
|
| 235 |
-
score_details = result_data.get('evaluation_results', [])
|
| 236 |
-
correct_count = result_data.get('correct_count', '?')
|
| 237 |
-
total_attempted = result_data.get('total_attempted', '?')
|
| 238 |
-
score_percent = result_data.get('score', 'N/A')
|
| 239 |
-
|
| 240 |
final_status = (
|
| 241 |
f"Submission Successful!\n"
|
| 242 |
f"User: {result_data.get('username')}\n"
|
| 243 |
-
f"Overall Score: {
|
|
|
|
| 244 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 245 |
)
|
| 246 |
print("Submission successful.")
|
| 247 |
-
|
| 248 |
results_df = pd.DataFrame(results_log)
|
| 249 |
-
|
| 250 |
-
if score_details:
|
| 251 |
-
eval_map = {res['task_id']: {'Correct': res.get('is_correct'), 'Expected': res.get('expected_answer')} for res in score_details}
|
| 252 |
-
results_df['Correct'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Correct'))
|
| 253 |
-
results_df['Expected Answer'] = results_df['Task ID'].map(lambda tid: eval_map.get(tid, {}).get('Expected'))
|
| 254 |
-
results_df = results_df[['Task ID', 'Question', 'Correct', 'Expected Answer', 'Submitted Answer']]
|
| 255 |
-
else:
|
| 256 |
-
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
print("--- Evaluation and Submission Run Complete ---")
|
| 260 |
return final_status, results_df
|
| 261 |
-
|
| 262 |
except requests.exceptions.HTTPError as e:
|
| 263 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 264 |
-
try:
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
| 266 |
status_message = f"Submission Failed: {error_detail}"
|
| 267 |
print(status_message)
|
| 268 |
-
|
| 269 |
-
results_df = pd.DataFrame(results_log) if results_log else None
|
| 270 |
-
if results_df is not None:
|
| 271 |
-
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 272 |
return status_message, results_df
|
| 273 |
except requests.exceptions.Timeout:
|
| 274 |
status_message = "Submission Failed: The request timed out."
|
| 275 |
print(status_message)
|
| 276 |
-
|
| 277 |
-
results_df = pd.DataFrame(results_log) if results_log else None
|
| 278 |
-
if results_df is not None:
|
| 279 |
-
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 280 |
return status_message, results_df
|
| 281 |
except requests.exceptions.RequestException as e:
|
| 282 |
status_message = f"Submission Failed: Network error - {e}"
|
| 283 |
print(status_message)
|
| 284 |
-
|
| 285 |
-
results_df = pd.DataFrame(results_log) if results_log else None
|
| 286 |
-
if results_df is not None:
|
| 287 |
-
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 288 |
return status_message, results_df
|
| 289 |
except Exception as e:
|
| 290 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 291 |
print(status_message)
|
| 292 |
-
|
| 293 |
-
results_df = pd.DataFrame(results_log) if results_log else None
|
| 294 |
-
if results_df is not None:
|
| 295 |
-
results_df = results_df[['Task ID', 'Question', 'Submitted Answer']]
|
| 296 |
return status_message, results_df
|
| 297 |
|
| 298 |
|
|
|
|
| 299 |
with gr.Blocks() as demo:
|
|
|
|
| 300 |
gr.Markdown(
|
| 301 |
-
|
| 302 |
-
# GAIA Agent Evaluation Runner
|
| 303 |
-
|
| 304 |
**Instructions:**
|
| 305 |
-
1.
|
| 306 |
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 307 |
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 308 |
---
|
| 309 |
**Disclaimers:**
|
| 310 |
-
Once clicking on the submit button, it can take some time for the agent to
|
|
|
|
| 311 |
"""
|
| 312 |
)
|
| 313 |
|
|
@@ -316,25 +173,34 @@ with gr.Blocks() as demo:
|
|
| 316 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 317 |
|
| 318 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 319 |
-
|
|
|
|
| 320 |
|
| 321 |
run_button.click(
|
| 322 |
fn=run_and_submit_all,
|
| 323 |
-
outputs=[status_output, results_table]
|
| 324 |
-
api_name="run_submit_gaia"
|
| 325 |
)
|
| 326 |
|
| 327 |
-
|
| 328 |
if __name__ == "__main__":
|
| 329 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
|
|
|
| 330 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 331 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 332 |
|
| 333 |
-
if space_host_startup:
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import inspect
|
| 3 |
import gradio as gr
|
| 4 |
import requests
|
| 5 |
import pandas as pd
|
| 6 |
+
from langchain_core.messages import HumanMessage
|
| 7 |
+
from agent import build_graph
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
# (Keep Constants as is)
|
| 12 |
+
# --- Constants ---
|
| 13 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
# --- Basic Agent Definition ---
|
| 16 |
+
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
class BasicAgent:
    """A langgraph agent.

    Thin wrapper around a compiled langgraph graph (built by the project-local
    ``build_graph``): each call feeds one question into the graph and returns
    the content of the last message the graph produced.
    """
    def __init__(self):
        # Build the langgraph workflow once at construction time so repeated
        # calls reuse the same compiled graph.
        print("BasicAgent initialized.")
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Run the graph on a single question and return the final answer text.

        Args:
            question: The raw question text to answer.

        Returns:
            The ``content`` of the last message in the graph's resulting
            message list (assumed to be the agent's final answer —
            TODO confirm against build_graph's output contract).
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        # The graph's state is a dict with a "messages" list; seed it with the
        # user's question as a single HumanMessage.
        messages = [HumanMessage(content=question)]
        result = self.graph.invoke({"messages": messages})
        # Last message in the conversation holds the final answer.
        answer = result['messages'][-1].content
        return answer  # no [14:] slicing needed anymore
|
|
|
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
+
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
| 35 |
+
"""
|
| 36 |
+
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 37 |
+
and displays the results.
|
| 38 |
+
"""
|
| 39 |
+
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 40 |
+
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 41 |
|
|
|
|
| 42 |
if profile:
|
| 43 |
+
username= f"{profile.username}"
|
| 44 |
print(f"User logged in: {username}")
|
| 45 |
else:
|
| 46 |
print("User not logged in.")
|
|
|
|
| 50 |
questions_url = f"{api_url}/questions"
|
| 51 |
submit_url = f"{api_url}/submit"
|
| 52 |
|
| 53 |
+
# 1. Instantiate Agent ( modify this part to create your agent)
|
| 54 |
try:
|
| 55 |
+
agent = BasicAgent()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
except Exception as e:
|
| 57 |
print(f"Error instantiating agent: {e}")
|
|
|
|
| 58 |
return f"Error initializing agent: {e}", None
|
| 59 |
+
# In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
|
| 60 |
+
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 61 |
+
print(agent_code)
|
| 62 |
|
| 63 |
+
# 2. Fetch Questions
|
|
|
|
|
|
|
| 64 |
print(f"Fetching questions from: {questions_url}")
|
| 65 |
try:
|
| 66 |
+
response = requests.get(questions_url, timeout=15)
|
| 67 |
response.raise_for_status()
|
| 68 |
questions_data = response.json()
|
| 69 |
if not questions_data:
|
|
|
|
| 79 |
return f"Error decoding server response for questions: {e}", None
|
| 80 |
except Exception as e:
|
| 81 |
print(f"An unexpected error occurred fetching questions: {e}")
|
|
|
|
| 82 |
return f"An unexpected error occurred fetching questions: {e}", None
|
| 83 |
|
| 84 |
+
# 3. Run your Agent
|
| 85 |
results_log = []
|
| 86 |
answers_payload = []
|
| 87 |
+
print(f"Running agent on {len(questions_data)} questions...")
|
| 88 |
for item in questions_data:
|
| 89 |
task_id = item.get("task_id")
|
| 90 |
question_text = item.get("question")
|
|
|
|
| 92 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 93 |
continue
|
| 94 |
try:
|
| 95 |
+
submitted_answer = agent(question_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 97 |
+
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 98 |
except Exception as e:
|
| 99 |
print(f"Error running agent on task {task_id}: {e}")
|
|
|
|
|
|
|
|
|
|
| 100 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 101 |
|
| 102 |
if not answers_payload:
|
| 103 |
print("Agent did not produce any answers to submit.")
|
| 104 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 105 |
|
| 106 |
+
# 4. Prepare Submission
|
| 107 |
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
|
| 108 |
+
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 109 |
+
print(status_update)
|
| 110 |
|
| 111 |
+
# 5. Submit
|
| 112 |
+
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 113 |
try:
|
| 114 |
+
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 115 |
response.raise_for_status()
|
| 116 |
result_data = response.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
final_status = (
|
| 118 |
f"Submission Successful!\n"
|
| 119 |
f"User: {result_data.get('username')}\n"
|
| 120 |
+
f"Overall Score: {result_data.get('score', 'N/A')}% "
|
| 121 |
+
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
|
| 122 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 123 |
)
|
| 124 |
print("Submission successful.")
|
|
|
|
| 125 |
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
return final_status, results_df
|
|
|
|
| 127 |
except requests.exceptions.HTTPError as e:
|
| 128 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 129 |
+
try:
|
| 130 |
+
error_json = e.response.json()
|
| 131 |
+
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 132 |
+
except requests.exceptions.JSONDecodeError:
|
| 133 |
+
error_detail += f" Response: {e.response.text[:500]}"
|
| 134 |
status_message = f"Submission Failed: {error_detail}"
|
| 135 |
print(status_message)
|
| 136 |
+
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 137 |
return status_message, results_df
|
| 138 |
except requests.exceptions.Timeout:
|
| 139 |
status_message = "Submission Failed: The request timed out."
|
| 140 |
print(status_message)
|
| 141 |
+
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 142 |
return status_message, results_df
|
| 143 |
except requests.exceptions.RequestException as e:
|
| 144 |
status_message = f"Submission Failed: Network error - {e}"
|
| 145 |
print(status_message)
|
| 146 |
+
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 147 |
return status_message, results_df
|
| 148 |
except Exception as e:
|
| 149 |
status_message = f"An unexpected error occurred during submission: {e}"
|
| 150 |
print(status_message)
|
| 151 |
+
results_df = pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 152 |
return status_message, results_df
|
| 153 |
|
| 154 |
|
| 155 |
+
# --- Build Gradio Interface using Blocks ---
|
| 156 |
with gr.Blocks() as demo:
|
| 157 |
+
gr.Markdown("# Basic Agent Evaluation Runner")
|
| 158 |
gr.Markdown(
|
| 159 |
+
"""
|
|
|
|
|
|
|
| 160 |
**Instructions:**
|
| 161 |
+
1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
|
| 162 |
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 163 |
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 164 |
---
|
| 165 |
**Disclaimers:**
|
| 166 |
+
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
| 167 |
+
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
| 168 |
"""
|
| 169 |
)
|
| 170 |
|
|
|
|
| 173 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 174 |
|
| 175 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 176 |
+
# Removed max_rows=10 from DataFrame constructor
|
| 177 |
+
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 178 |
|
| 179 |
run_button.click(
|
| 180 |
fn=run_and_submit_all,
|
| 181 |
+
outputs=[status_output, results_table]
|
|
|
|
| 182 |
)
|
| 183 |
|
|
|
|
| 184 |
if __name__ == "__main__":
|
| 185 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 186 |
+
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 187 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 188 |
+
space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
|
| 189 |
|
| 190 |
+
if space_host_startup:
|
| 191 |
+
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 192 |
+
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
| 193 |
+
else:
|
| 194 |
+
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 195 |
+
|
| 196 |
+
if space_id_startup: # Print repo URLs if SPACE_ID is found
|
| 197 |
+
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 198 |
+
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 199 |
+
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 200 |
+
else:
|
| 201 |
+
print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
|
| 202 |
|
| 203 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 204 |
+
|
| 205 |
+
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 206 |
+
demo.launch(debug=True, share=False)
|