FinalTest

Runtime error

App Files Files Community

yoshizen committed on May 29, 2025

Commit

f7cf33f

verified ·

1 Parent(s): f0bb83e

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -152

app.py CHANGED Viewed

@@ -1,20 +1,16 @@
-import os
 import json
 import time
-import torch
 import requests
 import gradio as gr
 import pandas as pd
-from typing import List, Dict, Any, Optional, Union, Callable, Tuple
-from agent_gaia import GAIAExpertAgent as OptimizedGAIAAgent
 # Константы
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-MAX_RETRIES = 3
-RETRY_DELAY = 5
 class EvaluationRunner:
-    """Обрабатывает процесс оценки: получение вопросов, запуск агента, отправку ответов"""
     def __init__(self, api_url=DEFAULT_API_URL):
         self.api_url = api_url
@@ -24,213 +20,132 @@ class EvaluationRunner:
         self.correct_answers = 0
         self.total_questions = 0
-    def run_evaluation(self,
-                      agent: Callable[[str], str],
-                      username: str,
-                      agent_code: str) -> tuple[str, pd.DataFrame]:
-        # Получаем вопросы
         questions_data = self._fetch_questions()
-        if isinstance(questions_data, str):  # Сообщение об ошибке
-            return questions_data, None
-        # Запускаем агента на всех вопросах
         results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
         if not answers_payload:
-            return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-        # Отправляем ответы
         submission_result = self._submit_answers(username, agent_code, answers_payload)
-        # Проверяем результаты
-        self._check_results(username)
-        self.print_evaluation_summary(username)
         return submission_result, pd.DataFrame(results_log)
-    def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
         try:
-            response = requests.get(self.questions_url, timeout=15)
             response.raise_for_status()
             questions_data = response.json()
-            if not questions_data:
-                return "Fetched questions list is empty or invalid format."
             self.total_questions = len(questions_data)
-            print(f"Successfully fetched {self.total_questions} questions.")
             return questions_data
         except Exception as e:
-            return f"Error fetching questions: {e}"
-    def _run_agent_on_questions(self,
-                               agent: Any,
-                               questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
         results_log = []
         answers_payload = []
-        print(f"Running agent on {len(questions_data)} questions...")
-        for item in questions_data:
             task_id = item.get("task_id")
             question_text = item.get("question")
-            if not task_id or question_text is None:
                 continue
             try:
                 json_response = agent(question_text, task_id)
                 response_obj = json.loads(json_response)
-                submitted_answer = response_obj.get("final_answer", "")
-                answers_payload.append({
-                    "task_id": task_id,
-                    "submitted_answer": submitted_answer
-                })
                 results_log.append({
-                    "Task ID": task_id,
-                    "Question": question_text,
-                    "Submitted Answer": submitted_answer,
-                    "Full Response": json_response
                 })
             except Exception as e:
                 results_log.append({
-                    "Task ID": task_id,
-                    "Question": question_text,
-                    "Submitted Answer": f"AGENT ERROR: {e}"
                 })
         return results_log, answers_payload
-    def _submit_answers(self,
-                       username: str,
-                       agent_code: str,
-                       answers_payload: List[Dict[str, Any]]) -> str:
         submission_data = {
             "username": username.strip(),
-            "agent_code": agent_code.strip(),  # Ключевое исправление: agent_code вместо agent_code_url
             "answers": answers_payload
         }
-        print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
-        print("Submission data:", json.dumps(submission_data, indent=2))
-        for attempt in range(1, MAX_RETRIES + 1):
-            try:
-                response = requests.post(
-                    self.submit_url,
-                    json=submission_data,
-                    headers={"Content-Type": "application/json"},
-                    timeout=30
-                )
-                response.raise_for_status()
-                try:
-                    result = response.json()
-                    if "message" in result:
-                        return result["message"]
-                    return "Evaluation submitted successfully"
-                except:
-                    return f"Submission successful, but response was not JSON: {response.text}"
-            except Exception as e:
-                print(f"Submission attempt {attempt} failed: {e}")
-                time.sleep(RETRY_DELAY)
-        return "Error submitting answers after multiple attempts"
-    def _check_results(self, username: str) -> None:
         try:
-            results_url = f"{self.results_url}?username={username}"
-            response = requests.get(results_url, timeout=15)
-            if response.status_code == 200:
-                data = response.json()
-                if isinstance(data, dict) and "score" in data:
-                    self.correct_answers = int(data["score"])
         except Exception as e:
-            print(f"Error checking results: {e}")
-    def get_correct_answers_count(self) -> int:
-        return self.correct_answers
-    def get_total_questions_count(self) -> int:
-        return self.total_questions
-    def print_evaluation_summary(self, username: str) -> None:
-        print("\n===== EVALUATION SUMMARY =====")
-        print(f"User: {username}")
-        print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
-        print("=============================\n")
-def run_evaluation(username: str,
-                  agent_code: str,  # Исправлено имя параметра
-                  model_name: str = "google/flan-t5-base",
-                  use_cache: bool = False) -> Tuple[str, int, int, str, str, str]:  # Кэш отключен по умолчанию
-    start_time = time.time()
-    # Инициализируем агента
-    agent = EnhancedGAIAAgent(model_name=model_name, use_cache=use_cache)
-    # Инициализируем runner
-    runner = EvaluationRunner(api_url=DEFAULT_API_URL)
-    # Запускаем оценку
-    result, results_log = runner.run_evaluation(agent, username, agent_code)
-    # Вычисляем время выполнения
-    elapsed_time = time.time() - start_time
-    elapsed_time_str = f"{elapsed_time:.2f} seconds"
-    # Формируем URL результатов
-    results_url = f"{DEFAULT_API_URL}/results?username={username}"
-    cache_status = "Cache enabled and used" if use_cache else "Cache disabled"
-    return (
-        result,
-        runner.get_correct_answers_count(),
-        runner.get_total_questions_count(),
-        elapsed_time_str,
-        results_url,
-        cache_status
-    )
 def create_gradio_interface():
-    with gr.Blocks(title="GAIA Agent Evaluation") as demo:
-        gr.Markdown("# GAIA Agent Evaluation")
         with gr.Row():
             with gr.Column():
-                username = gr.Textbox(label="Hugging Face Username")
-                agent_code = gr.Textbox(label="Agent Code", lines=2, placeholder="Your agent code here")
                 model_name = gr.Dropdown(
                     label="Model",
-                    choices=["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"],
-                    value="google/flan-t5-base"
                 )
-                use_cache = gr.Checkbox(label="Use Answer Cache", value=False)
-                run_button = gr.Button("Run Evaluation & Submit All Answers")
             with gr.Column():
-                result_text = gr.Textbox(label="Result", lines=2)
                 correct_answers = gr.Number(label="Correct Answers")
                 total_questions = gr.Number(label="Total Questions")
-                elapsed_time = gr.Textbox(label="Elapsed Time")
-                results_url = gr.Textbox(label="Results URL")
-                cache_status = gr.Textbox(label="Cache Status")
         run_button.click(
             fn=run_evaluation,
-            inputs=[username, agent_code, model_name, use_cache],
-            outputs=[
-                result_text,
-                correct_answers,
-                total_questions,
-                elapsed_time,
-                results_url,
-                cache_status
-            ]
         )
     return demo
@@ -238,4 +153,4 @@ def create_gradio_interface():
 if __name__ == "__main__":
     demo = create_gradio_interface()
-    demo.launch(share=True)

 import json
 import time
 import requests
 import gradio as gr
 import pandas as pd
+from tqdm import tqdm
+from agent import GAIAExpertAgent
 # Constants
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 class EvaluationRunner:
+    """Оптимизированный обработчик оценки"""
     def __init__(self, api_url=DEFAULT_API_URL):
         self.api_url = api_url
         self.correct_answers = 0
         self.total_questions = 0
+    def run_evaluation(self, agent, username: str, agent_code: str) -> Tuple[str, pd.DataFrame]:
         questions_data = self._fetch_questions()
+        if not isinstance(questions_data, list):
+            return questions_data, pd.DataFrame()
         results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
         if not answers_payload:
+            return "No answers generated", pd.DataFrame()
         submission_result = self._submit_answers(username, agent_code, answers_payload)
         return submission_result, pd.DataFrame(results_log)
+    def _fetch_questions(self):
         try:
+            response = requests.get(self.questions_url, timeout=30)
             response.raise_for_status()
             questions_data = response.json()
             self.total_questions = len(questions_data)
+            print(f"Fetched {self.total_questions} questions")
             return questions_data
         except Exception as e:
+            return f"Error: {str(e)}"
+    def _run_agent_on_questions(self, agent, questions_data):
         results_log = []
         answers_payload = []
+        print(f"Processing {len(questions_data)} questions...")
+        for item in tqdm(questions_data, desc="Questions"):
             task_id = item.get("task_id")
             question_text = item.get("question")
+            if not task_id or not question_text:
                 continue
             try:
                 json_response = agent(question_text, task_id)
                 response_obj = json.loads(json_response)
+                answer = response_obj.get("final_answer", "")
+                answers_payload.append({"task_id": task_id, "submitted_answer": answer})
                 results_log.append({
+                    "Task ID": task_id,
+                    "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
+                    "Answer": answer[:50] + "..." if len(answer) > 50 else answer
                 })
             except Exception as e:
+                answers_payload.append({"task_id": task_id, "submitted_answer": f"ERROR: {str(e)}"})
                 results_log.append({
+                    "Task ID": task_id,
+                    "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
+                    "Answer": f"ERROR: {str(e)}"
                 })
         return results_log, answers_payload
+    def _submit_answers(self, username: str, agent_code: str, answers_payload):
         submission_data = {
             "username": username.strip(),
+            "agent_code": agent_code.strip(),
             "answers": answers_payload
         }
+        print("Submitting answers...")
         try:
+            response = requests.post(
+                self.submit_url,
+                json=submission_data,
+                headers={"Content-Type": "application/json"},
+                timeout=60
+            )
+            response.raise_for_status()
+            return response.json().get("message", "Answers submitted successfully")
         except Exception as e:
+            return f"Submission failed: {str(e)}"
def run_evaluation(username: str, agent_code: str, model_name: str):
    """Build the agent, run one full evaluation and return the values for the UI.

    Returns:
        ``(status, correct_answers, total_questions, results_df)`` where the
        status and DataFrame come from ``EvaluationRunner.run_evaluation`` and
        the two counters are read from the runner afterwards.
    """
    print("Initializing GAIA Expert Agent...")
    expert_agent = GAIAExpertAgent(model_name=model_name)

    print("Starting evaluation...")
    evaluation = EvaluationRunner()
    status, results_df = evaluation.run_evaluation(expert_agent, username, agent_code)

    # Question counters; the correct-answer count falls back to 0 if the
    # runner does not carry the attribute.
    asked = evaluation.total_questions
    correct = getattr(evaluation, "correct_answers", 0)
    return status, correct, asked, results_df
 def create_gradio_interface():
+    with gr.Blocks(title="GAIA Expert Agent") as demo:
+        gr.Markdown("# 🧠 GAIA Expert Agent Evaluation")
         with gr.Row():
             with gr.Column():
+                gr.Markdown("### Configuration")
+                username = gr.Textbox(label="Hugging Face Username", value="yoshizen")
+                agent_code = gr.Textbox(
+                    label="Agent Code",
+                    value="https://huggingface.co/spaces/yoshizen/FinalTest"
+                )
                 model_name = gr.Dropdown(
                     label="Model",
+                    choices=[
+                        "google/flan-t5-small",
+                        "google/flan-t5-base",
+                        "google/flan-t5-large"
+                    ],
+                    value="google/flan-t5-large"
                 )
+                run_button = gr.Button("🚀 Run Evaluation", variant="primary")
             with gr.Column():
+                gr.Markdown("### Results")
+                result_text = gr.Textbox(label="Submission Status")
                 correct_answers = gr.Number(label="Correct Answers")
                 total_questions = gr.Number(label="Total Questions")
+                results_table = gr.Dataframe(label="Processed Questions", interactive=False)
         run_button.click(
             fn=run_evaluation,
+            inputs=[username, agent_code, model_name],
+            outputs=[result_text, correct_answers, total_questions, results_table]
         )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
+    demo.launch(server_name="0.0.0.0", server_port=7860)