Update app.py
Browse files
app.py
CHANGED
|
@@ -8,319 +8,237 @@ import base64
|
|
| 8 |
from typing import Optional, Dict, List, Any
|
| 9 |
import anthropic
|
| 10 |
|
| 11 |
-
# API URL для GAIA
|
| 12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
|
| 14 |
class GAIAAgent:
|
| 15 |
def __init__(self):
|
| 16 |
print("Initializing GAIA Agent powered by Claude...")
|
| 17 |
-
# Получение API-ключа Claude из переменных окружения
|
| 18 |
self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
|
| 19 |
if not self.claude_key:
|
| 20 |
raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
|
| 21 |
|
| 22 |
-
# Инициализация клиента Claude
|
| 23 |
self.client = anthropic.Anthropic(api_key=self.claude_key)
|
| 24 |
-
|
| 25 |
-
# API URL для GAIA
|
| 26 |
self.api_url = DEFAULT_API_URL
|
| 27 |
-
|
| 28 |
-
# Словарь для кеширования результатов поиска и ответов
|
| 29 |
-
self.search_cache = {}
|
| 30 |
self.file_cache = {}
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
self.system_prompt = """
|
| 34 |
-
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
|
| 35 |
-
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.
|
| 36 |
-
|
| 37 |
-
Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.
|
| 38 |
-
|
| 39 |
-
Guidelines for GAIA answers:
|
| 40 |
-
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
|
| 41 |
-
2. Format is critical - follow the instructions in the question precisely
|
| 42 |
-
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
|
| 43 |
-
4. For numeric answers, provide just the number without units unless specifically requested
|
| 44 |
-
5. Maintain exact capitalization and spacing as requested in the question
|
| 45 |
-
6. If asked to order items, follow the requested ordering precisely
|
| 46 |
-
|
| 47 |
-
Examples of correct formatting:
|
| 48 |
-
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
|
| 49 |
-
- If asked for a single word: "photosynthesis"
|
| 50 |
-
- If asked for a number: "42"
|
| 51 |
-
- If asked for a date in MM/DD/YY format: "05/04/25"
|
| 52 |
-
|
| 53 |
-
Remember, your score depends on exact matching against the reference answer.
|
| 54 |
-
"""
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
# Собираем результаты из разных полей
|
| 73 |
-
results = []
|
| 74 |
-
if data.get("AbstractText"):
|
| 75 |
-
results.append(f"Abstract: {data['AbstractText']}")
|
| 76 |
-
if data.get("RelatedTopics"):
|
| 77 |
-
topics = data.get("RelatedTopics", [])
|
| 78 |
-
for i, topic in enumerate(topics[:5]): # Ограничиваем 5 результатами
|
| 79 |
-
if isinstance(topic, dict) and topic.get("Text"):
|
| 80 |
-
results.append(f"Related Topic {i+1}: {topic['Text']}")
|
| 81 |
-
|
| 82 |
-
result_text = "\n\n".join(results) if results else "No results found"
|
| 83 |
-
|
| 84 |
-
# Вторичный поиск с использованием серпапи.com (если бы у нас был ключ API)
|
| 85 |
-
# В реальном приложении здесь можно было бы использовать другой поисковый API
|
| 86 |
-
|
| 87 |
-
# Кешируем и возвращаем результаты
|
| 88 |
-
self.search_cache[query] = result_text
|
| 89 |
-
return result_text
|
| 90 |
-
except Exception as e:
|
| 91 |
-
print(f"Web search error: {e}")
|
| 92 |
-
return f"Web search failed: {str(e)}"
|
| 93 |
|
| 94 |
def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
|
| 95 |
"""Fetches and processes a file associated with a task"""
|
| 96 |
if task_id in self.file_cache:
|
| 97 |
-
print(f"Using cached file for task: {task_id}")
|
| 98 |
return self.file_cache[task_id]
|
| 99 |
|
| 100 |
print(f"Fetching file for task: {task_id}")
|
| 101 |
try:
|
| 102 |
response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)
|
| 103 |
|
| 104 |
-
if response.status_code =
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
"content": file_content,
|
| 108 |
-
"content_type": response.headers.get("Content-Type", ""),
|
| 109 |
-
"size": len(file_content)
|
| 110 |
-
}
|
| 111 |
-
|
| 112 |
-
# Определяем тип файла и обрабатываем соответственно
|
| 113 |
-
content_type = file_info["content_type"].lower()
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
try:
|
| 127 |
-
file_info["text"] = file_content.decode('
|
| 128 |
file_info["type"] = "text"
|
| 129 |
-
|
| 130 |
-
except UnicodeDecodeError:
|
| 131 |
file_info["type"] = "binary"
|
| 132 |
-
print(f"Could not decode text file ({file_info['size']} bytes)")
|
| 133 |
-
else:
|
| 134 |
-
file_info["type"] = "binary"
|
| 135 |
-
print(f"Detected binary file ({file_info['size']} bytes, {content_type})")
|
| 136 |
-
|
| 137 |
-
# Кешируем файл
|
| 138 |
-
self.file_cache[task_id] = file_info
|
| 139 |
-
return file_info
|
| 140 |
else:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
except Exception as e:
|
| 145 |
-
print(f"Error fetching file: {e}")
|
| 146 |
return None
|
| 147 |
|
| 148 |
def extract_answer(self, response_text: str) -> str:
|
| 149 |
-
"""Extract
|
| 150 |
-
#
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
#
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
first_line = lines[0].strip()
|
| 160 |
-
# Если первая строка выглядит как полный ответ, возвращаем только её
|
| 161 |
-
if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
|
| 162 |
-
return first_line
|
| 163 |
-
|
| 164 |
-
# Вычищаем кавычки в начале и конце
|
| 165 |
-
cleaned = cleaned.strip()
|
| 166 |
-
if cleaned.startswith('"') and cleaned.endswith('"'):
|
| 167 |
-
cleaned = cleaned[1:-1]
|
| 168 |
-
|
| 169 |
-
return cleaned.strip()
|
| 170 |
-
|
| 171 |
-
def process_question(self, question: str, task_id: str = None) -> Dict[str, Any]:
|
| 172 |
-
"""Processes a question to extract relevant information and prepare for Claude"""
|
| 173 |
-
question_info = {
|
| 174 |
-
"original": question,
|
| 175 |
-
"task_id": task_id,
|
| 176 |
-
"has_file": False,
|
| 177 |
-
"file_info": None,
|
| 178 |
-
"contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
|
| 179 |
-
"requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
|
| 180 |
-
"format_requirements": None
|
| 181 |
-
}
|
| 182 |
|
| 183 |
-
#
|
| 184 |
-
|
| 185 |
-
if
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
if task_id and self.fetch_file(task_id):
|
| 190 |
-
question_info["has_file"] = True
|
| 191 |
-
question_info["file_info"] = self.fetch_file(task_id)
|
| 192 |
|
| 193 |
-
return
|
| 194 |
|
| 195 |
def __call__(self, question: str, task_id: str = None) -> str:
|
| 196 |
-
"""
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
if match:
|
| 201 |
-
task_id = match.group(1)
|
| 202 |
-
|
| 203 |
-
print(f"Processing question for task_id: {task_id}")
|
| 204 |
-
print(f"Question: {question[:100]}...")
|
| 205 |
-
|
| 206 |
-
# Обработка вопроса
|
| 207 |
-
question_info = self.process_question(question, task_id)
|
| 208 |
|
| 209 |
try:
|
| 210 |
-
#
|
| 211 |
-
|
| 212 |
|
| 213 |
-
#
|
| 214 |
-
user_content
|
| 215 |
"type": "text",
|
| 216 |
-
"text": f""
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
Remember:
|
| 220 |
-
1. Provide ONLY the final answer
|
| 221 |
-
2. Format exactly as requested
|
| 222 |
-
3. No explanations or reasoning
|
| 223 |
-
"""
|
| 224 |
-
}]
|
| 225 |
|
| 226 |
-
#
|
| 227 |
-
|
| 228 |
-
if
|
| 229 |
-
|
| 230 |
-
"type": "text",
|
| 231 |
-
"text": f"""
|
| 232 |
-
Web search results related to this question:
|
| 233 |
-
|
| 234 |
-
{web_results}
|
| 235 |
-
"""
|
| 236 |
-
})
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
file_info = question_info["file_info"]
|
| 241 |
|
| 242 |
-
if
|
| 243 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
user_content.append({
|
| 245 |
"type": "image",
|
| 246 |
"source": {
|
| 247 |
-
"type": "base64",
|
| 248 |
-
"media_type":
|
| 249 |
"data": file_info["base64"]
|
| 250 |
}
|
| 251 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
user_content.append({
|
| 254 |
"type": "text",
|
| 255 |
-
"text": "The
|
| 256 |
})
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
user_content.append({
|
| 260 |
"type": "text",
|
| 261 |
-
"text": f""
|
| 262 |
-
The question includes a text file with the following content:
|
| 263 |
-
|
| 264 |
-
{file_info["text"][:4000]} # ограничиваем, чтобы не превысить лимиты токенов
|
| 265 |
-
"""
|
| 266 |
})
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
if question_info["format_requirements"]:
|
| 270 |
-
user_content.append({
|
| 271 |
-
"type": "text",
|
| 272 |
-
"text": f"""
|
| 273 |
-
Important format requirement: {question_info["format_requirements"]}
|
| 274 |
-
Make sure your answer follows this format EXACTLY.
|
| 275 |
-
"""
|
| 276 |
-
})
|
| 277 |
-
|
| 278 |
-
messages.append({
|
| 279 |
-
"role": "user",
|
| 280 |
-
"content": user_content
|
| 281 |
-
})
|
| 282 |
-
|
| 283 |
-
# Запрос к Claude
|
| 284 |
response = self.client.messages.create(
|
| 285 |
model="claude-sonnet-4-6",
|
| 286 |
system=self.system_prompt,
|
| 287 |
-
messages=
|
| 288 |
-
temperature=0
|
| 289 |
max_tokens=4096
|
| 290 |
)
|
| 291 |
|
| 292 |
-
# Получаем ответ
|
| 293 |
raw_answer = response.content[0].text.strip()
|
|
|
|
| 294 |
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
-
|
| 299 |
-
print(f"Clean answer: {clean_answer}")
|
| 300 |
|
| 301 |
-
return clean_answer
|
| 302 |
except Exception as e:
|
| 303 |
-
print(f"Error
|
| 304 |
import traceback
|
| 305 |
traceback.print_exc()
|
| 306 |
-
return f"
|
| 307 |
|
| 308 |
|
| 309 |
-
# Используем наш агент как BasicAgent для совместимости с остальным кодом
|
| 310 |
class BasicAgent(GAIAAgent):
|
| 311 |
pass
|
| 312 |
|
| 313 |
|
| 314 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 315 |
-
""
|
| 316 |
-
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 317 |
-
and displays the results.
|
| 318 |
-
"""
|
| 319 |
-
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 320 |
-
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 321 |
|
| 322 |
if profile:
|
| 323 |
-
username= f"{profile.username}"
|
| 324 |
print(f"User logged in: {username}")
|
| 325 |
else:
|
| 326 |
print("User not logged in.")
|
|
@@ -330,41 +248,31 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 330 |
questions_url = f"{api_url}/questions"
|
| 331 |
submit_url = f"{api_url}/submit"
|
| 332 |
|
| 333 |
-
# 1. Instantiate Agent ( modify this part to create your agent)
|
| 334 |
try:
|
| 335 |
agent = BasicAgent()
|
| 336 |
except Exception as e:
|
| 337 |
print(f"Error instantiating agent: {e}")
|
| 338 |
return f"Error initializing agent: {e}", None
|
| 339 |
-
|
| 340 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 341 |
print(agent_code)
|
| 342 |
|
| 343 |
-
# 2. Fetch Questions
|
| 344 |
print(f"Fetching questions from: {questions_url}")
|
| 345 |
try:
|
| 346 |
response = requests.get(questions_url, timeout=15)
|
| 347 |
response.raise_for_status()
|
| 348 |
questions_data = response.json()
|
| 349 |
if not questions_data:
|
| 350 |
-
|
| 351 |
-
return "Fetched questions list is empty or invalid format.", None
|
| 352 |
print(f"Fetched {len(questions_data)} questions.")
|
| 353 |
-
except
|
| 354 |
print(f"Error fetching questions: {e}")
|
| 355 |
return f"Error fetching questions: {e}", None
|
| 356 |
-
except requests.exceptions.JSONDecodeError as e:
|
| 357 |
-
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 358 |
-
print(f"Response text: {response.text[:500]}")
|
| 359 |
-
return f"Error decoding server response for questions: {e}", None
|
| 360 |
-
except Exception as e:
|
| 361 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
| 362 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
| 363 |
|
| 364 |
-
# 3. Run your Agent
|
| 365 |
results_log = []
|
| 366 |
answers_payload = []
|
| 367 |
print(f"Running agent on {len(questions_data)} questions...")
|
|
|
|
| 368 |
for item in questions_data:
|
| 369 |
task_id = item.get("task_id")
|
| 370 |
question_text = item.get("question")
|
|
@@ -374,22 +282,29 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 374 |
try:
|
| 375 |
submitted_answer = agent(question_text, task_id)
|
| 376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 377 |
-
results_log.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
except Exception as e:
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
if not answers_payload:
|
| 383 |
-
print("Agent did not produce any answers to submit.")
|
| 384 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
|
|
|
|
|
|
| 390 |
|
| 391 |
-
# 5. Submit
|
| 392 |
-
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 393 |
try:
|
| 394 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 395 |
response.raise_for_status()
|
|
@@ -402,53 +317,33 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 402 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 403 |
)
|
| 404 |
print("Submission successful.")
|
| 405 |
-
|
| 406 |
-
return final_status, results_df
|
| 407 |
except requests.exceptions.HTTPError as e:
|
| 408 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 409 |
try:
|
| 410 |
error_json = e.response.json()
|
| 411 |
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 412 |
-
except
|
| 413 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 414 |
-
|
| 415 |
-
print(status_message)
|
| 416 |
-
results_df = pd.DataFrame(results_log)
|
| 417 |
-
return status_message, results_df
|
| 418 |
-
except requests.exceptions.Timeout:
|
| 419 |
-
status_message = "Submission Failed: The request timed out."
|
| 420 |
-
print(status_message)
|
| 421 |
-
results_df = pd.DataFrame(results_log)
|
| 422 |
-
return status_message, results_df
|
| 423 |
-
except requests.exceptions.RequestException as e:
|
| 424 |
-
status_message = f"Submission Failed: Network error - {e}"
|
| 425 |
-
print(status_message)
|
| 426 |
-
results_df = pd.DataFrame(results_log)
|
| 427 |
-
return status_message, results_df
|
| 428 |
except Exception as e:
|
| 429 |
-
|
| 430 |
-
print(status_message)
|
| 431 |
-
results_df = pd.DataFrame(results_log)
|
| 432 |
-
return status_message, results_df
|
| 433 |
|
| 434 |
|
| 435 |
-
# --- Build Gradio Interface using Blocks ---
|
| 436 |
with gr.Blocks() as demo:
|
| 437 |
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
| 438 |
gr.Markdown(
|
| 439 |
"""
|
| 440 |
**Instructions:**
|
| 441 |
-
1. Log in to your Hugging Face account using the button below.
|
| 442 |
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 443 |
|
| 444 |
-
This agent uses Claude
|
| 445 |
"""
|
| 446 |
)
|
| 447 |
|
| 448 |
gr.LoginButton()
|
| 449 |
-
|
| 450 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 451 |
-
|
| 452 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 453 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 454 |
|
|
@@ -459,24 +354,19 @@ with gr.Blocks() as demo:
|
|
| 459 |
|
| 460 |
if __name__ == "__main__":
|
| 461 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 462 |
-
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 463 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 464 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 465 |
|
| 466 |
if space_host_startup:
|
| 467 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 468 |
-
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
| 469 |
else:
|
| 470 |
-
print("ℹ️ SPACE_HOST
|
| 471 |
|
| 472 |
-
if space_id_startup:
|
| 473 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 474 |
-
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 475 |
-
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 476 |
else:
|
| 477 |
-
print("ℹ️ SPACE_ID
|
| 478 |
-
|
| 479 |
-
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 480 |
|
|
|
|
| 481 |
print("Launching Gradio Interface for GAIA Agent Evaluation...")
|
| 482 |
demo.launch(debug=True, share=False)
|
|
|
|
| 8 |
from typing import Optional, Dict, List, Any
|
| 9 |
import anthropic
|
| 10 |
|
|
|
|
| 11 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
|
| 13 |
class GAIAAgent:
    """Agent that answers GAIA benchmark questions with the Anthropic API.

    The agent fetches an optional task file from the scoring server, builds a
    multimodal message (text / image / PDF), asks Claude, and extracts the
    final answer from ``<answer>`` tags for GAIA's exact-string grading.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # API key must come from the environment; fail fast if it is missing.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
        self.client = anthropic.Anthropic(api_key=self.claude_key)
        self.api_url = DEFAULT_API_URL
        # task_id -> processed file_info dict, to avoid re-downloading files.
        self.file_cache = {}

        # System prompt — instructs to wrap final answer in <answer> tags
        self.system_prompt = """You are a highly accurate assistant solving GAIA benchmark tasks.

GAIA uses EXACT STRING MATCHING to grade answers. Your response format is CRITICAL.

Instructions:
1. Think step by step to figure out the correct answer.
2. At the very end of your response, output your final answer wrapped in <answer> tags like this:
<answer>your exact answer here</answer>

Rules for the content inside <answer>:
- For numbers: just the number, e.g. <answer>42</answer>
- For lists: comma-separated, e.g. <answer>apples, bananas, oranges</answer>
- For single words: just the word, e.g. <answer>photosynthesis</answer>
- For dates: use the format specified in the question
- NO extra punctuation, quotes, or explanation inside the tags
- Follow the exact format requested by the question

Think carefully before giving the final answer."""

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to *task_id*, if any.

        Returns a dict with keys ``content``, ``content_type``, ``size``,
        ``type`` (``image``/``pdf``/``text``/``binary``) and, depending on the
        type, ``base64`` or ``text``. Returns ``None`` when no file exists or
        the request fails. Results are cached per task_id.
        """
        if task_id in self.file_cache:
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code != 200:
                print(f"No file found for task {task_id}, status: {response.status_code}")
                return None

            file_content = response.content
            content_type = response.headers.get("Content-Type", "").lower()

            file_info = {
                "content": file_content,
                "content_type": content_type,
                "size": len(file_content)
            }

            if "image" in content_type:
                # Images are passed to Claude as base64 content blocks.
                file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                file_info["type"] = "image"
                print(f"Image file: {file_info['size']} bytes, type: {content_type}")
            elif "pdf" in content_type:
                # PDFs go to Claude as base64 "document" content blocks.
                file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                file_info["type"] = "pdf"
                print(f"PDF file: {file_info['size']} bytes")
            elif any(t in content_type for t in ["text", "json", "csv"]):
                try:
                    file_info["text"] = file_content.decode('utf-8')
                    file_info["type"] = "text"
                    print(f"Text file: {file_info['size']} bytes")
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this decode cannot fail.
                    file_info["text"] = file_content.decode('latin-1')
                    file_info["type"] = "text"
            else:
                # Unknown content type: try to decode as text anyway.
                try:
                    file_info["text"] = file_content.decode('utf-8')
                    file_info["type"] = "text"
                    print(f"Unknown type decoded as text: {content_type}")
                except UnicodeDecodeError:
                    file_info["type"] = "binary"
                    print(f"Binary file: {content_type}, {file_info['size']} bytes")

            self.file_cache[task_id] = file_info
            return file_info
        except Exception as e:
            print(f"Error fetching file for {task_id}: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract the final answer from a model response.

        Tries, in order: ``<answer>...</answer>`` tags, a "final answer:" /
        "the answer is" phrase, the last non-empty line, then the whole
        stripped text as a last resort.
        """
        # Primary: look for <answer> tags
        match = re.search(r'<answer>(.*?)</answer>', response_text, re.DOTALL | re.IGNORECASE)
        if match:
            answer = match.group(1).strip()
            print(f"Extracted from <answer> tags: {repr(answer)}")
            return answer

        # Fallback: look for "Final answer:" pattern
        match = re.search(r'(?:final answer|the answer is)[:\s]+(.+?)(?:\n|$)', response_text, re.IGNORECASE)
        if match:
            answer = match.group(1).strip().strip('"\'')
            print(f"Extracted from 'final answer:' pattern: {repr(answer)}")
            return answer

        # Last resort: take the last non-empty line
        lines = [l.strip() for l in response_text.strip().split('\n') if l.strip()]
        if lines:
            answer = lines[-1].strip('"\'.,')
            print(f"Fallback to last line: {repr(answer)}")
            return answer

        return response_text.strip()

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Process a question (plus any attached file) and return an answer.

        Never raises: any failure is reported as an ``"ERROR: ..."`` string so
        the evaluation loop can keep going.
        """
        print(f"\n{'='*60}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question[:200]}...")

        try:
            # Build message content
            user_content = []

            # Add question text
            user_content.append({
                "type": "text",
                "text": f"Question: {question}"
            })

            # Try to fetch associated file
            file_info = None
            if task_id:
                file_info = self.fetch_file(task_id)

            if file_info:
                file_type = file_info.get("type", "unknown")

                if file_type == "image" and "base64" in file_info:
                    # Determine the media type Claude expects for the image block;
                    # fall back to PNG when the Content-Type is unrecognized.
                    ct = file_info["content_type"]
                    if "jpeg" in ct or "jpg" in ct:
                        media_type = "image/jpeg"
                    elif "png" in ct:
                        media_type = "image/png"
                    elif "gif" in ct:
                        media_type = "image/gif"
                    elif "webp" in ct:
                        media_type = "image/webp"
                    else:
                        media_type = "image/png"  # default

                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": file_info["base64"]
                        }
                    })
                    user_content.append({
                        "type": "text",
                        "text": "The image above is provided as part of this question. Analyze it carefully."
                    })
                    print("Added image to message")

                elif file_type == "pdf" and "base64" in file_info:
                    user_content.append({
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": file_info["base64"]
                        }
                    })
                    user_content.append({
                        "type": "text",
                        "text": "The PDF document above is provided as part of this question. Read it carefully."
                    })
                    print("Added PDF to message")

                elif file_type == "text" and "text" in file_info:
                    file_text = file_info["text"]
                    # Limit to avoid token overflow but keep enough context
                    max_chars = 8000
                    if len(file_text) > max_chars:
                        file_text = file_text[:max_chars] + f"\n... [truncated, total {len(file_info['text'])} chars]"

                    user_content.append({
                        "type": "text",
                        "text": f"\nAttached file content:\n```\n{file_text}\n```"
                    })
                    print(f"Added text file content ({len(file_info['text'])} chars)")
                else:
                    print(f"File type {file_type} not added to message")

            # Call Claude
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                system=self.system_prompt,
                messages=[{"role": "user", "content": user_content}],
                temperature=0,  # deterministic
                max_tokens=4096
            )

            raw_answer = response.content[0].text.strip()
            print(f"\nRaw response:\n{raw_answer[:500]}...")

            final_answer = self.extract_answer(raw_answer)
            print(f"Final answer: {repr(final_answer)}")

            return final_answer

        except Exception as e:
            print(f"Error processing task {task_id}: {e}")
            import traceback
            traceback.print_exc()
            return f"ERROR: {str(e)}"
|
| 231 |
|
| 232 |
|
|
|
|
| 233 |
# Alias kept for compatibility with the course template / submission harness,
# which expects an agent class named ``BasicAgent``.
class BasicAgent(GAIAAgent):
    """Compatibility alias for :class:`GAIAAgent`; behavior is identical."""
    pass
|
| 235 |
|
| 236 |
|
| 237 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 238 |
+
space_id = os.getenv("SPACE_ID")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
if profile:
|
| 241 |
+
username = f"{profile.username}"
|
| 242 |
print(f"User logged in: {username}")
|
| 243 |
else:
|
| 244 |
print("User not logged in.")
|
|
|
|
| 248 |
questions_url = f"{api_url}/questions"
|
| 249 |
submit_url = f"{api_url}/submit"
|
| 250 |
|
|
|
|
| 251 |
try:
|
| 252 |
agent = BasicAgent()
|
| 253 |
except Exception as e:
|
| 254 |
print(f"Error instantiating agent: {e}")
|
| 255 |
return f"Error initializing agent: {e}", None
|
| 256 |
+
|
| 257 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 258 |
print(agent_code)
|
| 259 |
|
|
|
|
| 260 |
print(f"Fetching questions from: {questions_url}")
|
| 261 |
try:
|
| 262 |
response = requests.get(questions_url, timeout=15)
|
| 263 |
response.raise_for_status()
|
| 264 |
questions_data = response.json()
|
| 265 |
if not questions_data:
|
| 266 |
+
return "Fetched questions list is empty or invalid format.", None
|
|
|
|
| 267 |
print(f"Fetched {len(questions_data)} questions.")
|
| 268 |
+
except Exception as e:
|
| 269 |
print(f"Error fetching questions: {e}")
|
| 270 |
return f"Error fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
|
|
|
| 272 |
results_log = []
|
| 273 |
answers_payload = []
|
| 274 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 275 |
+
|
| 276 |
for item in questions_data:
|
| 277 |
task_id = item.get("task_id")
|
| 278 |
question_text = item.get("question")
|
|
|
|
| 282 |
try:
|
| 283 |
submitted_answer = agent(question_text, task_id)
|
| 284 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 285 |
+
results_log.append({
|
| 286 |
+
"Task ID": task_id,
|
| 287 |
+
"Question": question_text[:100],
|
| 288 |
+
"Submitted Answer": submitted_answer
|
| 289 |
+
})
|
| 290 |
except Exception as e:
|
| 291 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 292 |
+
results_log.append({
|
| 293 |
+
"Task ID": task_id,
|
| 294 |
+
"Question": question_text[:100],
|
| 295 |
+
"Submitted Answer": f"AGENT ERROR: {e}"
|
| 296 |
+
})
|
| 297 |
|
| 298 |
if not answers_payload:
|
|
|
|
| 299 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 300 |
|
| 301 |
+
submission_data = {
|
| 302 |
+
"username": username.strip(),
|
| 303 |
+
"agent_code": agent_code,
|
| 304 |
+
"answers": answers_payload
|
| 305 |
+
}
|
| 306 |
+
print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
|
| 307 |
|
|
|
|
|
|
|
| 308 |
try:
|
| 309 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 310 |
response.raise_for_status()
|
|
|
|
| 317 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 318 |
)
|
| 319 |
print("Submission successful.")
|
| 320 |
+
return final_status, pd.DataFrame(results_log)
|
|
|
|
| 321 |
except requests.exceptions.HTTPError as e:
|
| 322 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 323 |
try:
|
| 324 |
error_json = e.response.json()
|
| 325 |
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 326 |
+
except:
|
| 327 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 328 |
+
return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
except Exception as e:
|
| 330 |
+
return f"Submission Failed: {e}", pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
|
|
|
|
| 333 |
with gr.Blocks() as demo:
|
| 334 |
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
| 335 |
gr.Markdown(
|
| 336 |
"""
|
| 337 |
**Instructions:**
|
| 338 |
+
1. Log in to your Hugging Face account using the button below.
|
| 339 |
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 340 |
|
| 341 |
+
This agent uses Claude Sonnet 4.6 to solve GAIA benchmark tasks.
|
| 342 |
"""
|
| 343 |
)
|
| 344 |
|
| 345 |
gr.LoginButton()
|
|
|
|
| 346 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
|
|
|
| 347 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 348 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 349 |
|
|
|
|
| 354 |
|
| 355 |
if __name__ == "__main__":
    # Startup banner plus a quick report on the HF Space environment variables.
    print(f"\n{'-'*30} App Starting {'-'*30}")
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    # Report each Space-related variable in turn (SPACE_HOST, then SPACE_ID).
    for env_name, env_value in (("SPACE_HOST", space_host_startup),
                                ("SPACE_ID", space_id_startup)):
        if env_value:
            print(f"✅ {env_name} found: {env_value}")
        else:
            print(f"ℹ️ {env_name} not found (running locally?).")

    print("-"*60 + "\n")
    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)
|