Update app.py
Browse files
app.py
CHANGED
|
@@ -8,319 +8,237 @@ import base64
|
|
| 8 |
from typing import Optional, Dict, List, Any
|
| 9 |
import anthropic
|
| 10 |
|
| 11 |
-
# API URL для GAIA
|
| 12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
|
| 14 |
class GAIAAgent:
|
| 15 |
def __init__(self):
|
| 16 |
print("Initializing GAIA Agent powered by Claude...")
|
| 17 |
-
# Получение API-ключа Claude из переменных окружения
|
| 18 |
self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
|
| 19 |
if not self.claude_key:
|
| 20 |
raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
|
| 21 |
|
| 22 |
-
# Инициализация клиента Claude
|
| 23 |
self.client = anthropic.Anthropic(api_key=self.claude_key)
|
| 24 |
-
|
| 25 |
-
# API URL для GAIA
|
| 26 |
self.api_url = DEFAULT_API_URL
|
| 27 |
-
|
| 28 |
-
# Словарь для кеширования результатов поиска и ответов
|
| 29 |
-
self.search_cache = {}
|
| 30 |
self.file_cache = {}
|
| 31 |
|
| 32 |
-
#
|
| 33 |
-
self.system_prompt = """
|
| 34 |
-
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
|
| 35 |
-
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.
|
| 36 |
-
|
| 37 |
-
Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.
|
| 38 |
-
|
| 39 |
-
Guidelines for GAIA answers:
|
| 40 |
-
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
|
| 41 |
-
2. Format is critical - follow the instructions in the question precisely
|
| 42 |
-
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
|
| 43 |
-
4. For numeric answers, provide just the number without units unless specifically requested
|
| 44 |
-
5. Maintain exact capitalization and spacing as requested in the question
|
| 45 |
-
6. If asked to order items, follow the requested ordering precisely
|
| 46 |
-
|
| 47 |
-
Examples of correct formatting:
|
| 48 |
-
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
|
| 49 |
-
- If asked for a single word: "photosynthesis"
|
| 50 |
-
- If asked for a number: "42"
|
| 51 |
-
- If asked for a date in MM/DD/YY format: "05/04/25"
|
| 52 |
-
|
| 53 |
-
Remember, your score depends on exact matching against the reference answer.
|
| 54 |
-
"""
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
# Собираем результаты из разных полей
|
| 73 |
-
results = []
|
| 74 |
-
if data.get("AbstractText"):
|
| 75 |
-
results.append(f"Abstract: {data['AbstractText']}")
|
| 76 |
-
if data.get("RelatedTopics"):
|
| 77 |
-
topics = data.get("RelatedTopics", [])
|
| 78 |
-
for i, topic in enumerate(topics[:5]): # Ограничиваем 5 результатами
|
| 79 |
-
if isinstance(topic, dict) and topic.get("Text"):
|
| 80 |
-
results.append(f"Related Topic {i+1}: {topic['Text']}")
|
| 81 |
-
|
| 82 |
-
result_text = "\n\n".join(results) if results else "No results found"
|
| 83 |
-
|
| 84 |
-
# Вторичный поиск с использованием серпапи.com (если бы у нас был ключ API)
|
| 85 |
-
# В реальном приложении здесь можно было бы использовать другой поисковый API
|
| 86 |
-
|
| 87 |
-
# Кешируем и возвращаем результаты
|
| 88 |
-
self.search_cache[query] = result_text
|
| 89 |
-
return result_text
|
| 90 |
-
except Exception as e:
|
| 91 |
-
print(f"Web search error: {e}")
|
| 92 |
-
return f"Web search failed: {str(e)}"
|
| 93 |
|
| 94 |
def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
|
| 95 |
"""Fetches and processes a file associated with a task"""
|
| 96 |
if task_id in self.file_cache:
|
| 97 |
-
print(f"Using cached file for task: {task_id}")
|
| 98 |
return self.file_cache[task_id]
|
| 99 |
|
| 100 |
print(f"Fetching file for task: {task_id}")
|
| 101 |
try:
|
| 102 |
response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)
|
| 103 |
|
| 104 |
-
if response.status_code =
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
"content": file_content,
|
| 108 |
-
"content_type": response.headers.get("Content-Type", ""),
|
| 109 |
-
"size": len(file_content)
|
| 110 |
-
}
|
| 111 |
-
|
| 112 |
-
# Определяем тип файла и обрабатываем соответственно
|
| 113 |
-
content_type = file_info["content_type"].lower()
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
try:
|
| 127 |
-
file_info["text"] = file_content.decode('
|
| 128 |
file_info["type"] = "text"
|
| 129 |
-
|
| 130 |
-
except UnicodeDecodeError:
|
| 131 |
file_info["type"] = "binary"
|
| 132 |
-
print(f"Could not decode text file ({file_info['size']} bytes)")
|
| 133 |
-
else:
|
| 134 |
-
file_info["type"] = "binary"
|
| 135 |
-
print(f"Detected binary file ({file_info['size']} bytes, {content_type})")
|
| 136 |
-
|
| 137 |
-
# Кешируем файл
|
| 138 |
-
self.file_cache[task_id] = file_info
|
| 139 |
-
return file_info
|
| 140 |
else:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
except Exception as e:
|
| 145 |
-
print(f"Error fetching file: {e}")
|
| 146 |
return None
|
| 147 |
|
| 148 |
def extract_answer(self, response_text: str) -> str:
|
| 149 |
-
"""Extract
|
| 150 |
-
#
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
#
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
first_line = lines[0].strip()
|
| 160 |
-
# Если первая строка выглядит как полный ответ, возвращаем только её
|
| 161 |
-
if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
|
| 162 |
-
return first_line
|
| 163 |
-
|
| 164 |
-
# Вычищаем кавычки в начале и конце
|
| 165 |
-
cleaned = cleaned.strip()
|
| 166 |
-
if cleaned.startswith('"') and cleaned.endswith('"'):
|
| 167 |
-
cleaned = cleaned[1:-1]
|
| 168 |
-
|
| 169 |
-
return cleaned.strip()
|
| 170 |
-
|
| 171 |
-
def process_question(self, question: str, task_id: str = None) -> Dict[str, Any]:
|
| 172 |
-
"""Processes a question to extract relevant information and prepare for Claude"""
|
| 173 |
-
question_info = {
|
| 174 |
-
"original": question,
|
| 175 |
-
"task_id": task_id,
|
| 176 |
-
"has_file": False,
|
| 177 |
-
"file_info": None,
|
| 178 |
-
"contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
|
| 179 |
-
"requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
|
| 180 |
-
"format_requirements": None
|
| 181 |
-
}
|
| 182 |
|
| 183 |
-
#
|
| 184 |
-
|
| 185 |
-
if
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
if task_id and self.fetch_file(task_id):
|
| 190 |
-
question_info["has_file"] = True
|
| 191 |
-
question_info["file_info"] = self.fetch_file(task_id)
|
| 192 |
|
| 193 |
-
return
|
| 194 |
|
| 195 |
def __call__(self, question: str, task_id: str = None) -> str:
|
| 196 |
-
"""
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
if match:
|
| 201 |
-
task_id = match.group(1)
|
| 202 |
-
|
| 203 |
-
print(f"Processing question for task_id: {task_id}")
|
| 204 |
-
print(f"Question: {question[:100]}...")
|
| 205 |
-
|
| 206 |
-
# Обработка вопроса
|
| 207 |
-
question_info = self.process_question(question, task_id)
|
| 208 |
|
| 209 |
try:
|
| 210 |
-
#
|
| 211 |
-
|
| 212 |
|
| 213 |
-
#
|
| 214 |
-
user_content
|
| 215 |
"type": "text",
|
| 216 |
-
"text": f""
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
Remember:
|
| 220 |
-
1. Provide ONLY the final answer
|
| 221 |
-
2. Format exactly as requested
|
| 222 |
-
3. No explanations or reasoning
|
| 223 |
-
"""
|
| 224 |
-
}]
|
| 225 |
|
| 226 |
-
#
|
| 227 |
-
|
| 228 |
-
if
|
| 229 |
-
|
| 230 |
-
"type": "text",
|
| 231 |
-
"text": f"""
|
| 232 |
-
Web search results related to this question:
|
| 233 |
-
|
| 234 |
-
{web_results}
|
| 235 |
-
"""
|
| 236 |
-
})
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
file_info = question_info["file_info"]
|
| 241 |
|
| 242 |
-
if
|
| 243 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
user_content.append({
|
| 245 |
"type": "image",
|
| 246 |
"source": {
|
| 247 |
-
"type": "base64",
|
| 248 |
-
"media_type":
|
| 249 |
"data": file_info["base64"]
|
| 250 |
}
|
| 251 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
user_content.append({
|
| 254 |
"type": "text",
|
| 255 |
-
"text": "The
|
| 256 |
})
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
user_content.append({
|
| 260 |
"type": "text",
|
| 261 |
-
"text": f""
|
| 262 |
-
The question includes a text file with the following content:
|
| 263 |
-
|
| 264 |
-
{file_info["text"][:4000]} # ограничиваем, чтобы не превысить лимиты токенов
|
| 265 |
-
"""
|
| 266 |
})
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
if question_info["format_requirements"]:
|
| 270 |
-
user_content.append({
|
| 271 |
-
"type": "text",
|
| 272 |
-
"text": f"""
|
| 273 |
-
Important format requirement: {question_info["format_requirements"]}
|
| 274 |
-
Make sure your answer follows this format EXACTLY.
|
| 275 |
-
"""
|
| 276 |
-
})
|
| 277 |
-
|
| 278 |
-
messages.append({
|
| 279 |
-
"role": "user",
|
| 280 |
-
"content": user_content
|
| 281 |
-
})
|
| 282 |
-
|
| 283 |
-
# Запрос к Claude
|
| 284 |
response = self.client.messages.create(
|
| 285 |
model="claude-sonnet-4-6",
|
| 286 |
system=self.system_prompt,
|
| 287 |
-
messages=
|
| 288 |
-
temperature=0
|
| 289 |
max_tokens=4096
|
| 290 |
)
|
| 291 |
|
| 292 |
-
# Получаем ответ
|
| 293 |
raw_answer = response.content[0].text.strip()
|
|
|
|
| 294 |
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
-
|
| 299 |
-
print(f"Clean answer: {clean_answer}")
|
| 300 |
|
| 301 |
-
return clean_answer
|
| 302 |
except Exception as e:
|
| 303 |
-
print(f"Error
|
| 304 |
import traceback
|
| 305 |
traceback.print_exc()
|
| 306 |
-
return f"
|
| 307 |
|
| 308 |
|
| 309 |
-
# Используем наш агент как BasicAgent для совместимости с остальным кодом
|
| 310 |
class BasicAgent(GAIAAgent):
|
| 311 |
pass
|
| 312 |
|
| 313 |
|
| 314 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 315 |
-
""
|
| 316 |
-
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 317 |
-
and displays the results.
|
| 318 |
-
"""
|
| 319 |
-
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 320 |
-
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 321 |
|
| 322 |
if profile:
|
| 323 |
-
username= f"{profile.username}"
|
| 324 |
print(f"User logged in: {username}")
|
| 325 |
else:
|
| 326 |
print("User not logged in.")
|
|
@@ -330,41 +248,31 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 330 |
questions_url = f"{api_url}/questions"
|
| 331 |
submit_url = f"{api_url}/submit"
|
| 332 |
|
| 333 |
-
# 1. Instantiate Agent ( modify this part to create your agent)
|
| 334 |
try:
|
| 335 |
agent = BasicAgent()
|
| 336 |
except Exception as e:
|
| 337 |
print(f"Error instantiating agent: {e}")
|
| 338 |
return f"Error initializing agent: {e}", None
|
| 339 |
-
|
| 340 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 341 |
print(agent_code)
|
| 342 |
|
| 343 |
-
# 2. Fetch Questions
|
| 344 |
print(f"Fetching questions from: {questions_url}")
|
| 345 |
try:
|
| 346 |
response = requests.get(questions_url, timeout=15)
|
| 347 |
response.raise_for_status()
|
| 348 |
questions_data = response.json()
|
| 349 |
if not questions_data:
|
| 350 |
-
|
| 351 |
-
return "Fetched questions list is empty or invalid format.", None
|
| 352 |
print(f"Fetched {len(questions_data)} questions.")
|
| 353 |
-
except
|
| 354 |
print(f"Error fetching questions: {e}")
|
| 355 |
return f"Error fetching questions: {e}", None
|
| 356 |
-
except requests.exceptions.JSONDecodeError as e:
|
| 357 |
-
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 358 |
-
print(f"Response text: {response.text[:500]}")
|
| 359 |
-
return f"Error decoding server response for questions: {e}", None
|
| 360 |
-
except Exception as e:
|
| 361 |
-
print(f"An unexpected error occurred fetching questions: {e}")
|
| 362 |
-
return f"An unexpected error occurred fetching questions: {e}", None
|
| 363 |
|
| 364 |
-
# 3. Run your Agent
|
| 365 |
results_log = []
|
| 366 |
answers_payload = []
|
| 367 |
print(f"Running agent on {len(questions_data)} questions...")
|
|
|
|
| 368 |
for item in questions_data:
|
| 369 |
task_id = item.get("task_id")
|
| 370 |
question_text = item.get("question")
|
|
@@ -374,22 +282,29 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 374 |
try:
|
| 375 |
submitted_answer = agent(question_text, task_id)
|
| 376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 377 |
-
results_log.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
except Exception as e:
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
if not answers_payload:
|
| 383 |
-
print("Agent did not produce any answers to submit.")
|
| 384 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
|
|
|
|
|
|
| 390 |
|
| 391 |
-
# 5. Submit
|
| 392 |
-
print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
|
| 393 |
try:
|
| 394 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 395 |
response.raise_for_status()
|
|
@@ -402,53 +317,33 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 402 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 403 |
)
|
| 404 |
print("Submission successful.")
|
| 405 |
-
|
| 406 |
-
return final_status, results_df
|
| 407 |
except requests.exceptions.HTTPError as e:
|
| 408 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 409 |
try:
|
| 410 |
error_json = e.response.json()
|
| 411 |
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 412 |
-
except
|
| 413 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 414 |
-
|
| 415 |
-
print(status_message)
|
| 416 |
-
results_df = pd.DataFrame(results_log)
|
| 417 |
-
return status_message, results_df
|
| 418 |
-
except requests.exceptions.Timeout:
|
| 419 |
-
status_message = "Submission Failed: The request timed out."
|
| 420 |
-
print(status_message)
|
| 421 |
-
results_df = pd.DataFrame(results_log)
|
| 422 |
-
return status_message, results_df
|
| 423 |
-
except requests.exceptions.RequestException as e:
|
| 424 |
-
status_message = f"Submission Failed: Network error - {e}"
|
| 425 |
-
print(status_message)
|
| 426 |
-
results_df = pd.DataFrame(results_log)
|
| 427 |
-
return status_message, results_df
|
| 428 |
except Exception as e:
|
| 429 |
-
|
| 430 |
-
print(status_message)
|
| 431 |
-
results_df = pd.DataFrame(results_log)
|
| 432 |
-
return status_message, results_df
|
| 433 |
|
| 434 |
|
| 435 |
-
# --- Build Gradio Interface using Blocks ---
|
| 436 |
with gr.Blocks() as demo:
|
| 437 |
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
| 438 |
gr.Markdown(
|
| 439 |
"""
|
| 440 |
**Instructions:**
|
| 441 |
-
1. Log in to your Hugging Face account using the button below.
|
| 442 |
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 443 |
|
| 444 |
-
This agent uses Claude
|
| 445 |
"""
|
| 446 |
)
|
| 447 |
|
| 448 |
gr.LoginButton()
|
| 449 |
-
|
| 450 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 451 |
-
|
| 452 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 453 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 454 |
|
|
@@ -459,24 +354,19 @@ with gr.Blocks() as demo:
|
|
| 459 |
|
| 460 |
if __name__ == "__main__":
|
| 461 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 462 |
-
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 463 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 464 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 465 |
|
| 466 |
if space_host_startup:
|
| 467 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 468 |
-
print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
|
| 469 |
else:
|
| 470 |
-
print("ℹ️ SPACE_HOST
|
| 471 |
|
| 472 |
-
if space_id_startup:
|
| 473 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 474 |
-
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 475 |
-
print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
|
| 476 |
else:
|
| 477 |
-
print("ℹ️ SPACE_ID
|
| 478 |
-
|
| 479 |
-
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 480 |
|
|
|
|
| 481 |
print("Launching Gradio Interface for GAIA Agent Evaluation...")
|
| 482 |
demo.launch(debug=True, share=False)
|
|
|
|
| 8 |
from typing import Optional, Dict, List, Any
|
| 9 |
import anthropic
|
| 10 |
|
|
|
|
| 11 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 12 |
|
| 13 |
class GAIAAgent:
    """Agent that answers GAIA benchmark questions with the Anthropic API.

    The agent fetches an optional task file from the scoring server, builds a
    multimodal message (text / image / PDF), asks Claude, and extracts the
    final answer from ``<answer>`` tags for GAIA's exact-string grading.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # API key must come from the environment; fail fast if it is missing.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
        self.client = anthropic.Anthropic(api_key=self.claude_key)
        self.api_url = DEFAULT_API_URL
        # task_id -> processed file_info dict, to avoid re-downloading files.
        self.file_cache = {}

        # System prompt — instructs to wrap final answer in <answer> tags
        self.system_prompt = """You are a highly accurate assistant solving GAIA benchmark tasks.

GAIA uses EXACT STRING MATCHING to grade answers. Your response format is CRITICAL.

Instructions:
1. Think step by step to figure out the correct answer.
2. At the very end of your response, output your final answer wrapped in <answer> tags like this:
<answer>your exact answer here</answer>

Rules for the content inside <answer>:
- For numbers: just the number, e.g. <answer>42</answer>
- For lists: comma-separated, e.g. <answer>apples, bananas, oranges</answer>
- For single words: just the word, e.g. <answer>photosynthesis</answer>
- For dates: use the format specified in the question
- NO extra punctuation, quotes, or explanation inside the tags
- Follow the exact format requested by the question

Think carefully before giving the final answer."""

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to *task_id*, if any.

        Returns a dict with keys ``content``, ``content_type``, ``size``,
        ``type`` (``image``/``pdf``/``text``/``binary``) and, depending on the
        type, ``base64`` or ``text``. Returns ``None`` when no file exists or
        the request fails. Results are cached per task_id.
        """
        if task_id in self.file_cache:
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code != 200:
                print(f"No file found for task {task_id}, status: {response.status_code}")
                return None

            file_content = response.content
            content_type = response.headers.get("Content-Type", "").lower()

            file_info = {
                "content": file_content,
                "content_type": content_type,
                "size": len(file_content)
            }

            if "image" in content_type:
                # Images are passed to Claude as base64 content blocks.
                file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                file_info["type"] = "image"
                print(f"Image file: {file_info['size']} bytes, type: {content_type}")
            elif "pdf" in content_type:
                # PDFs go to Claude as base64 "document" content blocks.
                file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                file_info["type"] = "pdf"
                print(f"PDF file: {file_info['size']} bytes")
            elif any(t in content_type for t in ["text", "json", "csv"]):
                try:
                    file_info["text"] = file_content.decode('utf-8')
                    file_info["type"] = "text"
                    print(f"Text file: {file_info['size']} bytes")
                except UnicodeDecodeError:
                    # latin-1 maps every byte value, so this decode cannot fail.
                    file_info["text"] = file_content.decode('latin-1')
                    file_info["type"] = "text"
            else:
                # Unknown content type: try to decode as text anyway.
                try:
                    file_info["text"] = file_content.decode('utf-8')
                    file_info["type"] = "text"
                    print(f"Unknown type decoded as text: {content_type}")
                except UnicodeDecodeError:
                    file_info["type"] = "binary"
                    print(f"Binary file: {content_type}, {file_info['size']} bytes")

            self.file_cache[task_id] = file_info
            return file_info
        except Exception as e:
            print(f"Error fetching file for {task_id}: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract the final answer from a model response.

        Tries, in order: ``<answer>...</answer>`` tags, a "final answer:" /
        "the answer is" phrase, the last non-empty line, then the whole
        stripped text as a last resort.
        """
        # Primary: look for <answer> tags
        match = re.search(r'<answer>(.*?)</answer>', response_text, re.DOTALL | re.IGNORECASE)
        if match:
            answer = match.group(1).strip()
            print(f"Extracted from <answer> tags: {repr(answer)}")
            return answer

        # Fallback: look for "Final answer:" pattern
        match = re.search(r'(?:final answer|the answer is)[:\s]+(.+?)(?:\n|$)', response_text, re.IGNORECASE)
        if match:
            answer = match.group(1).strip().strip('"\'')
            print(f"Extracted from 'final answer:' pattern: {repr(answer)}")
            return answer

        # Last resort: take the last non-empty line
        lines = [l.strip() for l in response_text.strip().split('\n') if l.strip()]
        if lines:
            answer = lines[-1].strip('"\'.,')
            print(f"Fallback to last line: {repr(answer)}")
            return answer

        return response_text.strip()

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Process a question (plus any attached file) and return an answer.

        Never raises: any failure is reported as an ``"ERROR: ..."`` string so
        the evaluation loop can keep going.
        """
        print(f"\n{'='*60}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question[:200]}...")

        try:
            # Build message content
            user_content = []

            # Add question text
            user_content.append({
                "type": "text",
                "text": f"Question: {question}"
            })

            # Try to fetch associated file
            file_info = None
            if task_id:
                file_info = self.fetch_file(task_id)

            if file_info:
                file_type = file_info.get("type", "unknown")

                if file_type == "image" and "base64" in file_info:
                    # Determine the media type Claude expects for the image block;
                    # fall back to PNG when the Content-Type is unrecognized.
                    ct = file_info["content_type"]
                    if "jpeg" in ct or "jpg" in ct:
                        media_type = "image/jpeg"
                    elif "png" in ct:
                        media_type = "image/png"
                    elif "gif" in ct:
                        media_type = "image/gif"
                    elif "webp" in ct:
                        media_type = "image/webp"
                    else:
                        media_type = "image/png"  # default

                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": file_info["base64"]
                        }
                    })
                    user_content.append({
                        "type": "text",
                        "text": "The image above is provided as part of this question. Analyze it carefully."
                    })
                    print("Added image to message")

                elif file_type == "pdf" and "base64" in file_info:
                    user_content.append({
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": file_info["base64"]
                        }
                    })
                    user_content.append({
                        "type": "text",
                        "text": "The PDF document above is provided as part of this question. Read it carefully."
                    })
                    print("Added PDF to message")

                elif file_type == "text" and "text" in file_info:
                    file_text = file_info["text"]
                    # Limit to avoid token overflow but keep enough context
                    max_chars = 8000
                    if len(file_text) > max_chars:
                        file_text = file_text[:max_chars] + f"\n... [truncated, total {len(file_info['text'])} chars]"

                    user_content.append({
                        "type": "text",
                        "text": f"\nAttached file content:\n```\n{file_text}\n```"
                    })
                    print(f"Added text file content ({len(file_info['text'])} chars)")
                else:
                    print(f"File type {file_type} not added to message")

            # Call Claude
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                system=self.system_prompt,
                messages=[{"role": "user", "content": user_content}],
                temperature=0,  # deterministic
                max_tokens=4096
            )

            raw_answer = response.content[0].text.strip()
            print(f"\nRaw response:\n{raw_answer[:500]}...")

            final_answer = self.extract_answer(raw_answer)
            print(f"Final answer: {repr(final_answer)}")

            return final_answer

        except Exception as e:
            print(f"Error processing task {task_id}: {e}")
            import traceback
            traceback.print_exc()
            return f"ERROR: {str(e)}"
|
| 231 |
|
| 232 |
|
|
|
|
| 233 |
# Alias kept for compatibility with the course template / submission harness,
# which expects an agent class named ``BasicAgent``.
class BasicAgent(GAIAAgent):
    """Compatibility alias for :class:`GAIAAgent`; behavior is identical."""
    pass
|
| 235 |
|
| 236 |
|
| 237 |
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 238 |
+
space_id = os.getenv("SPACE_ID")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
if profile:
|
| 241 |
+
username = f"{profile.username}"
|
| 242 |
print(f"User logged in: {username}")
|
| 243 |
else:
|
| 244 |
print("User not logged in.")
|
|
|
|
| 248 |
questions_url = f"{api_url}/questions"
|
| 249 |
submit_url = f"{api_url}/submit"
|
| 250 |
|
|
|
|
| 251 |
try:
|
| 252 |
agent = BasicAgent()
|
| 253 |
except Exception as e:
|
| 254 |
print(f"Error instantiating agent: {e}")
|
| 255 |
return f"Error initializing agent: {e}", None
|
| 256 |
+
|
| 257 |
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
|
| 258 |
print(agent_code)
|
| 259 |
|
|
|
|
| 260 |
print(f"Fetching questions from: {questions_url}")
|
| 261 |
try:
|
| 262 |
response = requests.get(questions_url, timeout=15)
|
| 263 |
response.raise_for_status()
|
| 264 |
questions_data = response.json()
|
| 265 |
if not questions_data:
|
| 266 |
+
return "Fetched questions list is empty or invalid format.", None
|
|
|
|
| 267 |
print(f"Fetched {len(questions_data)} questions.")
|
| 268 |
+
except Exception as e:
|
| 269 |
print(f"Error fetching questions: {e}")
|
| 270 |
return f"Error fetching questions: {e}", None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
|
|
|
| 272 |
results_log = []
|
| 273 |
answers_payload = []
|
| 274 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 275 |
+
|
| 276 |
for item in questions_data:
|
| 277 |
task_id = item.get("task_id")
|
| 278 |
question_text = item.get("question")
|
|
|
|
| 282 |
try:
|
| 283 |
submitted_answer = agent(question_text, task_id)
|
| 284 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 285 |
+
results_log.append({
|
| 286 |
+
"Task ID": task_id,
|
| 287 |
+
"Question": question_text[:100],
|
| 288 |
+
"Submitted Answer": submitted_answer
|
| 289 |
+
})
|
| 290 |
except Exception as e:
|
| 291 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 292 |
+
results_log.append({
|
| 293 |
+
"Task ID": task_id,
|
| 294 |
+
"Question": question_text[:100],
|
| 295 |
+
"Submitted Answer": f"AGENT ERROR: {e}"
|
| 296 |
+
})
|
| 297 |
|
| 298 |
if not answers_payload:
|
|
|
|
| 299 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 300 |
|
| 301 |
+
submission_data = {
|
| 302 |
+
"username": username.strip(),
|
| 303 |
+
"agent_code": agent_code,
|
| 304 |
+
"answers": answers_payload
|
| 305 |
+
}
|
| 306 |
+
print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
|
| 307 |
|
|
|
|
|
|
|
| 308 |
try:
|
| 309 |
response = requests.post(submit_url, json=submission_data, timeout=60)
|
| 310 |
response.raise_for_status()
|
|
|
|
| 317 |
f"Message: {result_data.get('message', 'No message received.')}"
|
| 318 |
)
|
| 319 |
print("Submission successful.")
|
| 320 |
+
return final_status, pd.DataFrame(results_log)
|
|
|
|
| 321 |
except requests.exceptions.HTTPError as e:
|
| 322 |
error_detail = f"Server responded with status {e.response.status_code}."
|
| 323 |
try:
|
| 324 |
error_json = e.response.json()
|
| 325 |
error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
|
| 326 |
+
except:
|
| 327 |
error_detail += f" Response: {e.response.text[:500]}"
|
| 328 |
+
return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
except Exception as e:
|
| 330 |
+
return f"Submission Failed: {e}", pd.DataFrame(results_log)
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
|
|
|
|
| 333 |
with gr.Blocks() as demo:
|
| 334 |
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
| 335 |
gr.Markdown(
|
| 336 |
"""
|
| 337 |
**Instructions:**
|
| 338 |
+
1. Log in to your Hugging Face account using the button below.
|
| 339 |
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
| 340 |
|
| 341 |
+
This agent uses Claude Sonnet 4.6 to solve GAIA benchmark tasks.
|
| 342 |
"""
|
| 343 |
)
|
| 344 |
|
| 345 |
gr.LoginButton()
|
|
|
|
| 346 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
|
|
|
| 347 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
| 348 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 349 |
|
|
|
|
| 354 |
|
| 355 |
if __name__ == "__main__":
    # Startup banner plus a quick report on the HF Space environment variables.
    print(f"\n{'-'*30} App Starting {'-'*30}")
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    # Report each Space-related variable in turn (SPACE_HOST, then SPACE_ID).
    for env_name, env_value in (("SPACE_HOST", space_host_startup),
                                ("SPACE_ID", space_id_startup)):
        if env_value:
            print(f"✅ {env_name} found: {env_value}")
        else:
            print(f"ℹ️ {env_name} not found (running locally?).")

    print("-"*60 + "\n")
    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)
|