File size: 15,486 Bytes
953a28e b06fdec e378548 a0b1b1d b06fdec 953a28e e378548 fb9fdbd 407e466 953a28e 9168c5e 953a28e fb9fdbd a1e2111 fb9fdbd f30003b a1e2111 953a28e a0b1b1d 953a28e fb9fdbd 8298566 407e466 1b94a5b 407e466 1b94a5b fb9fdbd 407e466 fb9fdbd 1b94a5b fb9fdbd 1b94a5b b06fdec 1b94a5b b06fdec 1b94a5b fb9fdbd 1b94a5b 57e2365 41273dd 407e466 41273dd b06fdec 57e2365 b06fdec 41273dd fb9fdbd 41273dd fb9fdbd 41273dd a1e2111 41273dd 407e466 953a28e 407e466 832bd05 1b94a5b 953a28e 4db235f 9168c5e b06fdec 9168c5e 953a28e 9168c5e 953a28e 9168c5e 953a28e 4db235f 9168c5e b06fdec 9168c5e 4db235f 953a28e 9168c5e b06fdec 953a28e 9168c5e 953a28e 9168c5e b06fdec 9168c5e b06fdec 9168c5e b06fdec 9168c5e 92925a0 9168c5e 92925a0 b06fdec 9168c5e b06fdec 407e466 953a28e 8298566 fb9fdbd 8298566 fb9fdbd 9168c5e fb9fdbd 407e466 9168c5e 832bd05 407e466 953a28e a1e2111 953a28e 41273dd 8298566 407e466 b06fdec 4db235f 407e466 953a28e b06fdec 5f04c51 953a28e 25c5c17 8298566 92925a0 953a28e 92925a0 b06fdec 953a28e a0b1b1d 953a28e a0b1b1d fb9fdbd a0b1b1d 953a28e b06fdec 953a28e a0b1b1d 5f04c51 a0b1b1d 5f04c51 a0b1b1d 8298566 25c5c17 a0b1b1d 8298566 a0b1b1d 92925a0 a0b1b1d 953a28e fb9fdbd 953a28e 5f04c51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | import os
import argparse
import requests
import pandas as pd
import json
import time
import warnings
import logging
from enum import Enum
from colorama import init
# Initialize colorama for Windows compatibility
init(autoreset=True)
# Suppress asyncio event loop cleanup warnings (common on HF Spaces)
warnings.filterwarnings('ignore', message='.*Invalid file descriptor.*')
logging.getLogger('asyncio').setLevel(logging.ERROR)
# Import configuration
import config
# Agent-related code is imported via agent_runner module
# Import Gradio UI creation function
from gradioapp import create_ui
# Import scoring function for answer verification
from scorer import question_scorer
# Import new utilities
from question_loader import QuestionLoader
from result_formatter import ResultFormatter
from agent_runner import AgentRunner
from validators import InputValidator, ValidationError
from utils import retry_with_backoff
from langfuse_tracking import track_session
# --- Run Modes ---
class RunMode(Enum):
    """Supported application run modes."""
    UI = "ui"    # interactive Gradio interface
    CLI = "cli"  # command-line test execution
@retry_with_backoff(max_retries=3, initial_delay=2.0)
def _submit_to_server(submit_url: str, submission_data: dict) -> dict:
    """POST the submission payload to the scoring server.

    Transient failures are retried automatically by the
    retry_with_backoff decorator (3 attempts, 2s initial delay).

    Args:
        submit_url: Full URL of the /submit endpoint.
        submission_data: JSON-serializable payload to POST.

    Returns:
        dict: Parsed JSON body of the server response.

    Raises:
        requests.exceptions.HTTPError: On a non-2xx status code.
    """
    resp = requests.post(submit_url, json=submission_data, timeout=config.SUBMIT_TIMEOUT)
    resp.raise_for_status()
    return resp.json()
def submit_and_score(username: str, results: list) -> str:
    """
    Submit answers to the GAIA scoring server and return status message.
    Args:
        username: Hugging Face username for submission
        results: List of tuples (task_id, question_text, answer)
    Returns:
        str: Status message (success or error details)
    """
    # Validate username
    try:
        username = InputValidator.validate_username(username)
    except ValidationError as e:
        error_msg = f"Invalid username: {e}"
        print(error_msg)
        return error_msg
    # Format results for API submission
    answers_payload = ResultFormatter.format_for_api(results)
    if not answers_payload:
        error_msg = "No answers to submit."
        print(error_msg)
        return error_msg
    space_id = config.SPACE_ID
    submit_url = f"{config.DEFAULT_API_URL}/submit"
    # agent_code links back to this Space's source tree so graders can inspect it.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    # Prepare submission data
    submission_data = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers_payload
    }
    print(f"\n{'=' * config.SEPARATOR_WIDTH}")
    print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
    print(f"{'=' * config.SEPARATOR_WIDTH}\n")
    # Submit to server
    print(f"Submitting to: {submit_url}")
    try:
        # _submit_to_server retries transient failures via retry_with_backoff.
        result_data = _submit_to_server(submit_url, submission_data)
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        return final_status
    except requests.exceptions.HTTPError as e:
        # raise_for_status() attaches the failing response, so e.response is set here.
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            # Non-JSON error body: include a truncated raw excerpt instead.
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        return status_message
    except requests.exceptions.Timeout:
        # NOTE: Timeout is a RequestException subclass — it must precede the
        # generic RequestException handler to yield the specific message.
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        return status_message
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        return status_message
    except Exception as e:
        # Last-resort catch so the caller (UI or CLI) always receives a
        # readable status string instead of an unhandled exception.
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        return status_message
def run_and_submit_all(username: str, active_agent: str = None) -> tuple:
    """
    Fetches all questions, runs the GAIA agent on them, submits all answers,
    and displays the results.
    Args:
        username: Hugging Face username for submission
        active_agent: The agent type to use (default: config.AGENT_LANGGRAPH)
    Returns:
        tuple: (status_message: str, results_df: pd.DataFrame)
    """
    # For a real submission the questions are always fetched live from the API.
    try:
        questions = QuestionLoader().get_questions(test_mode=False)
    except Exception as e:
        return f"Error loading questions: {e}", None

    try:
        questions = InputValidator.validate_questions_data(questions)
    except ValidationError as e:
        return f"Invalid questions data: {e}", None

    # Group all agent traces for this run under a single Langfuse session.
    session_meta = {
        "agent": active_agent or config.ACTIVE_AGENT,
        "username": username,
        "question_count": len(questions),
        "mode": "submission"
    }
    with track_session("Submit_All", session_meta):
        answers = AgentRunner(active_agent=active_agent).run_on_questions(questions)

    if answers is None:
        return "Error initializing agent.", None

    # API payload formatting happens inside submit_and_score.
    status_message = submit_and_score(username, answers)

    # Build the table shown in the Gradio UI.
    display_rows = ResultFormatter.format_for_display(answers)
    return status_message, pd.DataFrame(display_rows)
def _load_ground_truth(file_path: str = config.METADATA_FILE) -> dict:
"""Load ground truth data indexed by task_id.
Args:
file_path: Path to the metadata file
Returns:
dict: Mapping of task_id -> {"question": str, "answer": str}
"""
truth_mapping = {}
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
data = json.loads(line)
task_id = data.get("task_id")
question = data.get("Question")
answer = data.get("Final answer")
if task_id and answer:
truth_mapping[task_id] = {
"question": question,
"answer": answer
}
except Exception as e:
print(f"Error loading ground truth: {e}")
return truth_mapping
def _verify_answers(results: list, log_output: list, runtime: tuple = None) -> None:
    """Verify answers against ground truth using the official GAIA scorer.

    Args:
        results: List of tuples (task_id, question_text, answer)
        log_output: List to append verification results to
        runtime: Optional tuple of (minutes, seconds) for total runtime
    """
    truth_by_id = _load_ground_truth()
    log_output.append("\n=== Verification Results ===")

    scored = 0
    correct = 0
    for task_id, question_text, answer in results:
        log_output.append(f"Task ID: {task_id}")
        log_output.append(f"Question: {question_text[:config.ERROR_MESSAGE_LENGTH]}...")
        entry = truth_by_id.get(task_id)
        if entry is None:
            log_output.append("No ground truth found.\n")
            continue
        expected = entry["answer"]
        # The official GAIA question_scorer handles numbers, lists, and
        # strings with proper normalization before comparing.
        matched = question_scorer(str(answer), str(expected))
        scored += 1
        if matched:
            correct += 1
        log_output.append(f"Expected: {expected}")
        log_output.append(f"Got: {answer}")
        log_output.append(f"Match: {'✓ Correct' if matched else '✗ Incorrect'}\n")

    # A summary is only meaningful when at least one answer could be scored.
    if scored > 0:
        pct = (correct / scored) * 100
        divider = "=" * config.SEPARATOR_WIDTH
        log_output.append(divider)
        log_output.append(f"SUMMARY: {correct}/{scored} correct ({pct:.1f}%)")
        if runtime:
            mins, secs = runtime
            log_output.append(f"Runtime: {mins}m {secs}s")
        log_output.append(divider)
def run_test_code(filter=None, active_agent=None) -> pd.DataFrame:
    """Run test code on selected questions.

    Args:
        filter: Optional tuple/list of question indices to test (e.g., (4, 7, 15)).
            If None, processes all questions.
        active_agent: Optional agent type to use (e.g., "LangGraph",
            "ReActLangGraph", "LLamaIndex"). If None, uses config.ACTIVE_AGENT.

    Returns:
        pd.DataFrame: Results and verification output
    """
    started = time.time()
    log_lines = ["=== Processing Example Questions One by One ==="]

    # Test mode loads questions from the offline/local source.
    try:
        all_questions = QuestionLoader().get_questions(test_mode=True)
    except Exception as e:
        return pd.DataFrame([f"Error loading questions: {e}"])

    try:
        all_questions = InputValidator.validate_questions_data(all_questions)
    except ValidationError as e:
        return pd.DataFrame([f"Invalid questions data: {e}"])

    try:
        filter = InputValidator.validate_filter_indices(filter, len(all_questions))
    except ValidationError as e:
        return pd.DataFrame([f"Invalid filter: {e}"])

    # A filter narrows the run to specific question indices; None means all.
    if filter is None:
        selected = all_questions
        log_lines.append(f"Testing all {len(selected)} questions")
    else:
        selected = [all_questions[i] for i in filter]
        log_lines.append(f"Testing {len(selected)} selected questions (indices: {filter})")

    # Group all agent traces for this run under a single Langfuse session.
    with track_session("Test_Run", {
        "agent": active_agent or config.ACTIVE_AGENT,
        "question_count": len(selected),
        "filter": str(filter) if filter else "all",
        "mode": "test"
    }):
        results = AgentRunner(active_agent=active_agent).run_on_questions(selected)

    if results is None:
        return pd.DataFrame(["Error initializing agent."])

    log_lines.append("\n=== Completed Example Questions ===")

    # Report total wall-clock runtime alongside the verification summary.
    elapsed = time.time() - started
    _verify_answers(results, log_lines,
                    runtime=(int(elapsed // 60), int(elapsed % 60)))
    return pd.DataFrame(log_lines)
def main() -> None:
    """Main entry point for the application.

    Parses CLI flags, then either launches the Gradio UI (the default) or
    runs the local test harness (--test / --testall) and prints the results.

    Fixes vs. previous version:
    - The unknown-agent error message listed "react", which is not an
      accepted value; it now lists the actual argparse choices.
    - The --agent help text said "react langgraph" (with a space), which
      argparse's `choices` would reject; it now matches the real options.
    """
    parser = argparse.ArgumentParser(description="Run the agent application.")
    # nargs='?' + const='default' lets "--test" appear with or without indices.
    parser.add_argument(
        "--test", type=str, nargs='?', const='default',
        help="Run local tests on selected questions and exit. Optionally provide "
             "comma-separated question indices (e.g., --test 2,4,6). If no indices "
             "provided, uses default test questions.")
    parser.add_argument(
        "--testall", action="store_true",
        help="Run local tests on all questions and exit.")
    parser.add_argument(
        "--agent", type=str, choices=['langgraph', 'reactlangg', 'llamaindex'],
        help="Agent to use in CLI mode (case-insensitive). Options: langgraph, "
             "reactlangg, llamaindex. Default: uses config.ACTIVE_AGENT")
    args = parser.parse_args()

    # Map agent name to config constant (case-insensitive).
    agent_mapping = {
        'langgraph': config.AGENT_LANGGRAPH,
        'reactlangg': config.AGENT_REACT_LANGGRAPH,
        'llamaindex': config.AGENT_LLAMAINDEX,
    }
    active_agent = None
    if args.agent:
        agent_key = args.agent.lower()
        active_agent = agent_mapping.get(agent_key)
        if not active_agent:
            # argparse `choices` normally prevents this branch; kept as a
            # safety net in case the mapping and choices drift apart.
            print(f"Error: Unknown agent '{args.agent}'. Valid options: {', '.join(agent_mapping)}")
            return
        print(f"[CLI] Using agent: {active_agent}")

    print(f"\n{'-' * 30} App Starting {'-' * 30}")

    # Any test flag selects CLI mode; otherwise serve the Gradio UI.
    run_mode = RunMode.CLI if (args.test or args.testall) else RunMode.UI

    # Print environment info only in UI mode
    if run_mode == RunMode.UI:
        space_host = config.SPACE_HOST
        space_id = config.SPACE_ID
        if space_host:
            print(f"[OK] SPACE_HOST found: {space_host}")
            print(f" Runtime URL should be: https://{space_host}.hf.space")
        else:
            print("[INFO] SPACE_HOST environment variable not found (running locally?).")
        if space_id:
            print(f"[OK] SPACE_ID found: {space_id}")
            print(f" Repo URL: https://huggingface.co/spaces/{space_id}")
            print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
        else:
            print("[INFO] SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print(f"{'-' * (60 + len(' App Starting '))}\n")

    # Execute based on run mode
    if run_mode == RunMode.UI:
        print("Launching Gradio Interface for Basic Agent Evaluation...")
        grTestApp = create_ui(run_and_submit_all, run_test_code)
        grTestApp.launch()
    else:  # RunMode.CLI
        # Determine test filter based on which CLI flag was used
        if args.test:
            if args.test == 'default':
                # No indices provided, use the configured default set.
                test_filter = config.DEFAULT_TEST_FILTER
            else:
                # Parse comma-separated indices
                try:
                    test_filter = tuple(int(idx.strip()) for idx in args.test.split(','))
                except ValueError:
                    print(f"Error: Invalid test indices '{args.test}'. Must be comma-separated integers (e.g., 2,4,6)")
                    return
        else:  # args.testall
            test_filter = None  # Test all questions
        print(f"Running test code on {len(test_filter) if test_filter else 'ALL'} questions (CLI mode)...")
        result = run_test_code(filter=test_filter, active_agent=active_agent)
        # Print results
        if isinstance(result, pd.DataFrame):
            ResultFormatter.print_dataframe(result)
        else:
            print(result)
# Run the application entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|