Update app.py
Browse files
app.py
CHANGED
|
@@ -2,72 +2,316 @@ import os
|
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
|
|
|
| 6 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 7 |
|
| 8 |
-
|
| 9 |
-
CodeAgent,
|
| 10 |
-
tool,
|
| 11 |
-
DuckDuckGoSearchTool,
|
| 12 |
-
PythonInterpreterTool,
|
| 13 |
-
FinalAnswerTool,
|
| 14 |
-
)
|
| 15 |
-
from smolagents.llms.anthropic_api import AnthropicModel
|
| 16 |
-
|
| 17 |
-
# Прокладка над DuckDuckGoSearchTool, возвращает "" при ошибках
|
| 18 |
-
@tool
|
| 19 |
-
def web_search(query: str) -> str:
|
| 20 |
-
"""
|
| 21 |
-
Performs a web search using DuckDuckGo.
|
| 22 |
-
Args:
|
| 23 |
-
query: The search query string.
|
| 24 |
-
Returns:
|
| 25 |
-
The raw text results, or empty string on failure.
|
| 26 |
-
"""
|
| 27 |
-
try:
|
| 28 |
-
return DuckDuckGoSearchTool()(query=query)
|
| 29 |
-
except Exception:
|
| 30 |
-
return ""
|
| 31 |
-
|
| 32 |
-
class BasicAgent:
|
| 33 |
def __init__(self):
|
| 34 |
-
print("Initializing
|
| 35 |
-
#
|
| 36 |
-
claude_key = os.environ
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
def
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
"""
|
| 72 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 73 |
and displays the results.
|
|
@@ -128,7 +372,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 128 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 129 |
continue
|
| 130 |
try:
|
| 131 |
-
submitted_answer = agent(question_text)
|
| 132 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 133 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 134 |
except Exception as e:
|
|
@@ -190,19 +434,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 190 |
|
| 191 |
# --- Build Gradio Interface using Blocks ---
|
| 192 |
with gr.Blocks() as demo:
|
| 193 |
-
gr.Markdown("#
|
| 194 |
gr.Markdown(
|
| 195 |
"""
|
| 196 |
**Instructions:**
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
---
|
| 203 |
-
**Disclaimers:**
|
| 204 |
-
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
| 205 |
-
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
| 206 |
"""
|
| 207 |
)
|
| 208 |
|
|
@@ -211,7 +450,6 @@ with gr.Blocks() as demo:
|
|
| 211 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 212 |
|
| 213 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 214 |
-
# Removed max_rows=10 from DataFrame constructor
|
| 215 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 216 |
|
| 217 |
run_button.click(
|
|
@@ -240,5 +478,5 @@ if __name__ == "__main__":
|
|
| 240 |
|
| 241 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 242 |
|
| 243 |
-
print("Launching Gradio Interface for
|
| 244 |
demo.launch(debug=True, share=False)
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import requests
|
| 4 |
import pandas as pd
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
import base64
|
| 8 |
+
from typing import Optional, Dict, List, Any
|
| 9 |
+
import anthropic
|
| 10 |
|
| 11 |
+
# API URL для GAIA
|
| 12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
|
| 14 |
+
class GAIAAgent:
    """Agent that answers GAIA benchmark questions with Anthropic's Claude.

    For each question the agent optionally gathers DuckDuckGo search
    results and any file attached to the task (fetched from the GAIA
    scoring API), then asks Claude for a bare, exactly-formatted answer,
    because GAIA scores by exact string matching.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # The Claude API key must be provided via the environment.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        # Anthropic client used for all model calls.
        self.client = anthropic.Anthropic(api_key=self.claude_key)

        # Base URL of the GAIA scoring API (also serves task files).
        self.api_url = DEFAULT_API_URL

        # Per-process caches: search results keyed by query, files by task id.
        self.search_cache = {}
        self.file_cache = {}

        # System prompt steering Claude toward bare, exact-match answers.
        self.system_prompt = """
        You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
        The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.

        Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.

        Guidelines for GAIA answers:
        1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
        2. Format is critical - follow the instructions in the question precisely
        3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
        4. For numeric answers, provide just the number without units unless specifically requested
        5. Maintain exact capitalization and spacing as requested in the question
        6. If asked to order items, follow the requested ordering precisely

        Examples of correct formatting:
        - If asked for fruits in alphabetical order: "apples, bananas, oranges"
        - If asked for a single word: "photosynthesis"
        - If asked for a number: "42"
        - If asked for a date in MM/DD/YY format: "05/04/25"

        Remember, your score depends on exact matching against the reference answer.
        """

    def search_web(self, query: str) -> str:
        """Web search via the DuckDuckGo Instant Answer API, with caching.

        Returns the joined result text, "No results found" when the API
        yields nothing, or a "Web search failed: ..." string on error
        (errors are deliberately not cached so a retry can succeed).
        """
        if query in self.search_cache:
            print(f"Using cached search results for: {query}")
            return self.search_cache[query]

        print(f"Performing web search for: {query}")
        try:
            # DuckDuckGo Instant Answer API
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={"q": query, "format": "json"},
                timeout=10,
            )
            data = response.json()

            # Collect results from the abstract and related topics.
            results = []
            if data.get("AbstractText"):
                results.append(f"Abstract: {data['AbstractText']}")
            if data.get("RelatedTopics"):
                topics = data.get("RelatedTopics", [])
                for i, topic in enumerate(topics[:5]):  # cap at 5 results
                    if isinstance(topic, dict) and topic.get("Text"):
                        results.append(f"Related Topic {i+1}: {topic['Text']}")

            result_text = "\n\n".join(results) if results else "No results found"

            # Cache and return the (successful) results.
            self.search_cache[query] = result_text
            return result_text
        except Exception as e:
            print(f"Web search error: {e}")
            return f"Web search failed: {str(e)}"

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to *task_id*, with caching.

        Returns a dict with keys ``content``, ``content_type``, ``size``,
        ``type`` ("image"/"pdf"/"text"/"binary") plus ``base64`` for images
        and ``text`` for decodable text files; ``None`` on any failure.
        """
        if task_id in self.file_cache:
            print(f"Using cached file for task: {task_id}")
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code == 200:
                file_content = response.content
                file_info = {
                    "content": file_content,
                    "content_type": response.headers.get("Content-Type", ""),
                    "size": len(file_content),
                }

                # Classify by MIME type and pre-process accordingly.
                content_type = file_info["content_type"].lower()

                if "image" in content_type:
                    # Claude accepts images as base64-encoded blocks.
                    file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                    file_info["type"] = "image"
                    print(f"Processed image file ({file_info['size']} bytes)")
                elif "pdf" in content_type:
                    # PDFs are only flagged; no text extraction is done here.
                    file_info["type"] = "pdf"
                    print(f"Detected PDF file ({file_info['size']} bytes)")
                elif "text" in content_type or "json" in content_type or "csv" in content_type:
                    # Textual files: try to decode as UTF-8.
                    try:
                        file_info["text"] = file_content.decode('utf-8')
                        file_info["type"] = "text"
                        print(f"Processed text file ({file_info['size']} bytes)")
                    except UnicodeDecodeError:
                        file_info["type"] = "binary"
                        print(f"Could not decode text file ({file_info['size']} bytes)")
                else:
                    file_info["type"] = "binary"
                    print(f"Detected binary file ({file_info['size']} bytes, {content_type})")

                # Cache the processed file for subsequent calls.
                self.file_cache[task_id] = file_info
                return file_info
            else:
                print(f"Failed to fetch file, status code: {response.status_code}")
                print(f"Response: {response.text[:1000]}")
                return None
        except Exception as e:
            print(f"Error fetching file: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract just the final answer from Claude's response."""
        # Strip obvious lead-in phrases ("The answer is: ...", etc.).
        cleaned = re.sub(r'^(final answer|the answer is|answer|Here\'s the answer|response):?\s*', '', response_text, flags=re.IGNORECASE)

        # Drop trailing explanation paragraphs.
        cleaned = re.sub(r'\n.*?explain.*?$', '', cleaned, flags=re.IGNORECASE | re.DOTALL)

        # For multi-line output, keep only the first line when it looks
        # like a complete answer rather than the start of a sentence.
        lines = cleaned.strip().split('\n')
        if len(lines) > 1:
            first_line = lines[0].strip()
            if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
                return first_line

        # Remove surrounding quotes, if any.
        cleaned = cleaned.strip()
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]

        return cleaned.strip()

    def process_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Analyze *question*: detect math/list hints, a requested format,
        and fetch any attached file once (previously fetched twice)."""
        question_info = {
            "original": question,
            "task_id": task_id,
            "has_file": False,
            "file_info": None,
            "contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
            "requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
            "format_requirements": None
        }

        # Extract an explicit format request, if present.  Word boundaries
        # keep short alternatives like "in" from matching inside words
        # such as "within".
        format_match = re.search(r'\b(format|in the format|formatted as|as a|in)\b ([^\.]+)', question, re.IGNORECASE)
        if format_match:
            question_info["format_requirements"] = format_match.group(2).strip()

        # Fetch the attached file exactly once and reuse the result
        # (the original fetched it twice back-to-back).
        if task_id:
            file_info = self.fetch_file(task_id)
            if file_info:
                question_info["has_file"] = True
                question_info["file_info"] = file_info

        return question_info

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Main method to process a question and return an answer."""
        if task_id is None:
            # Try to pull a task id out of the question text itself.
            match = re.search(r'task[\s_-]?id:?\s*(\w+)', question, re.IGNORECASE)
            if match:
                task_id = match.group(1)

        print(f"Processing question for task_id: {task_id}")
        print(f"Question: {question[:100]}...")

        # Analyze the question (format hints, attached file, ...).
        question_info = self.process_question(question, task_id)

        try:
            # Build the message list for Claude.
            messages = []

            # Base user content: the question plus answer-format reminders.
            user_content = [{
                "type": "text",
                "text": f"""
Question from GAIA benchmark: {question}

Remember:
1. Provide ONLY the final answer
2. Format exactly as requested
3. No explanations or reasoning
"""
            }]

            # Attach web search context when available.
            web_results = self.search_web(question)
            if web_results:
                user_content.append({
                    "type": "text",
                    "text": f"""
Web search results related to this question:

{web_results}
"""
                })

            # Attach the task file, if one exists.
            if question_info["has_file"] and question_info["file_info"]:
                file_info = question_info["file_info"]

                if file_info["type"] == "image":
                    # Pass the image to Claude as a base64 block.  Strip
                    # any ";charset=..." suffix from the header value —
                    # Anthropic expects a bare media type like "image/png".
                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": file_info["content_type"].split(";")[0].strip(),
                            "data": file_info["base64"]
                        }
                    })

                    user_content.append({
                        "type": "text",
                        "text": "The above image is part of the question. Please analyze it carefully."
                    })
                elif file_info["type"] == "text" and "text" in file_info:
                    # Include text file content, truncated to stay within
                    # token limits.  (The truncation note used to sit
                    # inside the f-string and leaked into the prompt.)
                    user_content.append({
                        "type": "text",
                        "text": f"""
The question includes a text file with the following content:

{file_info["text"][:4000]}
"""
                    })

            # Restate any detected format requirement.
            if question_info["format_requirements"]:
                user_content.append({
                    "type": "text",
                    "text": f"""
Important format requirement: {question_info["format_requirements"]}
Make sure your answer follows this format EXACTLY.
"""
                })

            messages.append({
                "role": "user",
                "content": user_content
            })

            # Query Claude.
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                system=self.system_prompt,
                messages=messages,
                temperature=0.1,  # low temperature for precise answers
                max_tokens=4096
            )

            # Extract and clean the answer text.
            raw_answer = response.content[0].text.strip()
            clean_answer = self.extract_answer(raw_answer)

            print(f"Raw answer: {raw_answer}")
            print(f"Clean answer: {clean_answer}")

            return clean_answer
        except Exception as e:
            print(f"Error in agent: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
|
| 307 |
|
| 308 |
+
|
| 309 |
+
# Используем наш агент как BasicAgent для совместимости с остальным кодом
|
| 310 |
+
class BasicAgent(GAIAAgent):
    """Backward-compatible alias: the surrounding app code instantiates
    ``BasicAgent``, so expose ``GAIAAgent`` under that name unchanged."""
    pass
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 315 |
"""
|
| 316 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 317 |
and displays the results.
|
|
|
|
| 372 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 373 |
continue
|
| 374 |
try:
|
| 375 |
+
submitted_answer = agent(question_text, task_id)
|
| 376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 377 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 378 |
except Exception as e:
|
|
|
|
| 434 |
|
| 435 |
# --- Build Gradio Interface using Blocks ---
|
| 436 |
with gr.Blocks() as demo:
|
| 437 |
+
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
| 438 |
gr.Markdown(
|
| 439 |
"""
|
| 440 |
**Instructions:**
|
| 441 |
+
1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
| 442 |
+
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 443 |
+
|
| 444 |
+
This agent uses Claude 3.5 Sonnet to solve GAIA benchmark tasks.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
"""
|
| 446 |
)
|
| 447 |
|
|
|
|
| 450 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 451 |
|
| 452 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
|
|
| 453 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 454 |
|
| 455 |
run_button.click(
|
|
|
|
| 478 |
|
| 479 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 480 |
|
| 481 |
+
print("Launching Gradio Interface for GAIA Agent Evaluation...")
|
| 482 |
demo.launch(debug=True, share=False)
|