| import os |
| import json |
| import time |
| import torch |
| import requests |
| import gradio as gr |
| import pandas as pd |
| from typing import List, Dict, Any, Optional, Union, Callable, Tuple |
| from agent import EnhancedGAIAAgent |
|
|
| |
# Base URL of the scoring service; EvaluationRunner appends "/questions",
# "/submit" and "/results" to it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Retry policy for answer submission.
MAX_RETRIES = 3  # total number of POST attempts
RETRY_DELAY = 5  # seconds passed to time.sleep() after a failed attempt
|
|
class EvaluationRunner:
    """Drive one evaluation round against the scoring API.

    The runner fetches the question list, calls the agent once per question,
    submits the collected answers and finally queries the results endpoint.

    Instance fields:
        api_url, questions_url, submit_url, results_url: endpoint URLs
            derived from ``api_url``.
        correct_answers: last ``score`` value read from the results
            endpoint (0 until one is read).
        total_questions: number of questions returned by the last fetch.
    """

    def __init__(self, api_url=DEFAULT_API_URL):
        """Store the base URL and derive the three endpoint URLs from it."""
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.results_url = f"{api_url}/results"
        self.correct_answers = 0
        self.total_questions = 0

    def run_evaluation(self,
                       agent: Callable[[str, str], str],
                       username: str,
                       agent_code: str) -> Tuple[str, Optional[pd.DataFrame]]:
        """Run the full fetch -> answer -> submit -> check pipeline.

        Args:
            agent: Callable invoked as ``agent(question_text, task_id)`` that
                returns a JSON string containing a ``final_answer`` field.
            username: Account name sent with the submission.
            agent_code: Agent code reference sent with the submission.

        Returns:
            ``(status_message, results)``. ``results`` is a DataFrame with one
            row per processed question, or ``None`` when the questions could
            not be fetched.
        """
        questions_data = self._fetch_questions()
        if isinstance(questions_data, str):
            # _fetch_questions reports failure as an error message string.
            return questions_data, None

        results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

        submission_result = self._submit_answers(username, agent_code, answers_payload)

        self._check_results(username)
        self.print_evaluation_summary(username)

        return submission_result, pd.DataFrame(results_log)

    def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
        """Download the question list.

        Returns:
            The decoded, non-empty list of questions on success; otherwise a
            human-readable error string (callers distinguish by type).
        """
        try:
            response = requests.get(self.questions_url, timeout=15)
            response.raise_for_status()
            questions_data = response.json()
        except Exception as e:
            # Best-effort boundary: network, HTTP and decoding failures are
            # all reported to the caller as text instead of being raised.
            return f"Error fetching questions: {e}"

        # Anything but a non-empty list (e.g. a JSON object carrying an error
        # detail) would break the per-item loop later, so reject it here.
        if not isinstance(questions_data, list) or not questions_data:
            return "Fetched questions list is empty or invalid format."

        self.total_questions = len(questions_data)
        print(f"Successfully fetched {self.total_questions} questions.")
        return questions_data

    def _run_agent_on_questions(self,
                                agent: Callable[[str, str], str],
                                questions_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Call the agent on every question and collect its answers.

        Items without a ``task_id`` or with a missing ``question`` are
        skipped. A failure while answering one question is recorded in the
        log and does not stop the remaining questions.

        Returns:
            ``(results_log, answers_payload)``: the log has one entry per
            attempted question (including failures); the payload contains
            only the answers that were produced successfully.
        """
        results_log = []
        answers_payload = []

        print(f"Running agent on {len(questions_data)} questions...")
        for item in questions_data:
            task_id = item.get("task_id")
            question_text = item.get("question")

            if not task_id or question_text is None:
                continue

            try:
                json_response = agent(question_text, task_id)
                response_obj = json.loads(json_response)
                submitted_answer = response_obj.get("final_answer", "")

                answers_payload.append({
                    "task_id": task_id,
                    "submitted_answer": submitted_answer
                })

                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": submitted_answer,
                    "Full Response": json_response
                })
            except Exception as e:
                # Per-question boundary: log the failure, keep going.
                results_log.append({
                    "Task ID": task_id,
                    "Question": question_text,
                    "Submitted Answer": f"AGENT ERROR: {e}"
                })

        return results_log, answers_payload

    def _submit_answers(self,
                        username: str,
                        agent_code: str,
                        answers_payload: List[Dict[str, Any]]) -> str:
        """POST the answers, retrying up to ``MAX_RETRIES`` times.

        Returns:
            The server's ``message`` field when the JSON response has one, a
            generic success text otherwise, or an error text once every
            attempt has failed.
        """
        submission_data = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),
            "answers": answers_payload
        }

        print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
        print("Submission data:", json.dumps(submission_data, indent=2))

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = requests.post(
                    self.submit_url,
                    json=submission_data,
                    headers={"Content-Type": "application/json"},
                    timeout=30
                )
                response.raise_for_status()
            except Exception as e:
                print(f"Submission attempt {attempt} failed: {e}")
                # Only wait when another attempt will follow; sleeping after
                # the last failure would just delay the error return.
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                continue

            # The request succeeded; from here on nothing is retried, so a
            # malformed body can never cause a duplicate submission.
            try:
                result = response.json()
            except ValueError:
                # JSON decoding errors raised by requests derive from ValueError.
                return f"Submission successful, but response was not JSON: {response.text}"

            if isinstance(result, dict) and "message" in result:
                return result["message"]
            return "Evaluation submitted successfully"

        return "Error submitting answers after multiple attempts"

    def _check_results(self, username: str) -> None:
        """Read the score for ``username`` and store it in ``correct_answers``.

        Failures are printed and otherwise ignored; a non-200 response or a
        payload without ``score`` leaves ``correct_answers`` unchanged.
        """
        try:
            # Let requests build the query string so the username is
            # percent-encoded; strip it to match what _submit_answers sent.
            response = requests.get(
                self.results_url,
                params={"username": username.strip()},
                timeout=15
            )
            if response.status_code == 200:
                data = response.json()
                if isinstance(data, dict) and "score" in data:
                    # NOTE(review): assumes "score" is a count of correct
                    # answers rather than a percentage -- confirm with the API.
                    self.correct_answers = int(data["score"])
        except Exception as e:
            print(f"Error checking results: {e}")

    def get_correct_answers_count(self) -> int:
        """Return the score stored by the last successful results check."""
        return self.correct_answers

    def get_total_questions_count(self) -> int:
        """Return the number of questions returned by the last fetch."""
        return self.total_questions

    def print_evaluation_summary(self, username: str) -> None:
        """Print the user name and the stored score / question count."""
        print("\n===== EVALUATION SUMMARY =====")
        print(f"User: {username}")
        print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
        print("=============================\n")
|
|
|
|
def run_evaluation(username: str,
                   agent_code: str,
                   model_name: str = "google/flan-t5-base",
                   use_cache: bool = False) -> Tuple[str, int, int, str, str, str]:
    """Build an agent, run a full evaluation and return values for the UI.

    Args:
        username: Account name used for the submission and the results link.
        agent_code: Agent code reference sent with the submission.
        model_name: Model identifier passed to ``EnhancedGAIAAgent``.
        use_cache: Passed to ``EnhancedGAIAAgent``; also selects the cache
            status text.

    Returns:
        ``(result_message, correct_answers, total_questions, elapsed_time,
        results_url, cache_status)`` in the order the UI outputs expect.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during a long run.
    start_time = time.perf_counter()

    agent = EnhancedGAIAAgent(model_name=model_name, use_cache=use_cache)
    runner = EvaluationRunner(api_url=DEFAULT_API_URL)

    # The per-question log is not shown by the UI, only the status message.
    result, _results_log = runner.run_evaluation(agent, username, agent_code)

    elapsed_time = time.perf_counter() - start_time
    elapsed_time_str = f"{elapsed_time:.2f} seconds"

    # The submission uses the stripped username, so the link must as well;
    # percent-encode it so spaces or reserved characters cannot break the URL.
    encoded_username = quote(username.strip(), safe="")
    results_url = f"{DEFAULT_API_URL}/results?username={encoded_username}"
    cache_status = "Cache enabled and used" if use_cache else "Cache disabled"

    return (
        result,
        runner.get_correct_answers_count(),
        runner.get_total_questions_count(),
        elapsed_time_str,
        results_url,
        cache_status
    )
|
|
|
|
def create_gradio_interface():
    """Assemble the evaluation UI and return it without launching.

    One row holds two columns: the inputs plus the run button, and the output
    fields. Clicking the button calls ``run_evaluation`` with the four inputs
    and writes its six return values to the outputs, in order.
    """
    model_options = [
        "google/flan-t5-small",
        "google/flan-t5-base",
        "google/flan-t5-large",
    ]

    with gr.Blocks(title="GAIA Agent Evaluation") as demo:
        gr.Markdown("# GAIA Agent Evaluation")

        with gr.Row():
            # Input column
            with gr.Column():
                username_box = gr.Textbox(label="Hugging Face Username")
                agent_code_box = gr.Textbox(
                    label="Agent Code",
                    lines=2,
                    placeholder="Your agent code here",
                )
                model_dropdown = gr.Dropdown(
                    label="Model",
                    choices=model_options,
                    value="google/flan-t5-base",
                )
                cache_checkbox = gr.Checkbox(label="Use Answer Cache", value=False)
                run_button = gr.Button("Run Evaluation & Submit All Answers")

            # Output column
            with gr.Column():
                result_box = gr.Textbox(label="Result", lines=2)
                correct_box = gr.Number(label="Correct Answers")
                total_box = gr.Number(label="Total Questions")
                elapsed_box = gr.Textbox(label="Elapsed Time")
                results_url_box = gr.Textbox(label="Results URL")
                cache_status_box = gr.Textbox(label="Cache Status")

        # Order here must match run_evaluation's parameters and return tuple.
        click_inputs = [username_box, agent_code_box, model_dropdown, cache_checkbox]
        click_outputs = [
            result_box,
            correct_box,
            total_box,
            elapsed_box,
            results_url_box,
            cache_status_box,
        ]
        run_button.click(fn=run_evaluation, inputs=click_inputs, outputs=click_outputs)

    return demo
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then start serving it.
    demo = create_gradio_interface()
    demo.launch(share=True)