"""Build a Langfuse evaluation dataset from GAIA questions and known answers.

The script fetches the question list from the scoring API, joins it with the
answers stored in a local CSV export, writes the result to
``evaluation_data.csv`` and uploads every row as a Langfuse dataset item.
"""

import ast
import json
import os
import re
from typing import Tuple

import pandas as pd
import requests
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import (login, logout)
from langfuse import Langfuse

from tools.download_attachments import download_file

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

load_dotenv()


def prepare_dataset(base_doc: str) -> pd.DataFrame:
    """Load the answer CSV and split each row into question, answer and task id.

    Args:
        base_doc: Path of a CSV file that has a ``content`` column (question
            text followed by a ``Final answer :`` marker and the answer) and a
            ``metadata`` column (a dict-like string containing ``task_id``).

    Returns:
        A DataFrame with the columns ``question``, ``answer`` and ``task_id``.

    Raises:
        ValueError: If a ``content`` cell has no ``Final answer :`` marker.
    """
    df = pd.read_csv(base_doc)
    answer_data = []
    for _, row in df.iterrows():
        question, answer = get_question_and_answer(row['content'])
        answer_data.append({
            'question': question,
            'answer': answer,
            'task_id': get_tag_id(row['metadata']),
        })
    return pd.DataFrame(answer_data)


def get_questions_from_gaia():
    """Fetch the question list from the scoring API.

    Returns:
        On success, the decoded JSON payload of ``{DEFAULT_API_URL}/questions``.
        On any failure (network error, bad status, undecodable body or empty
        payload), a ``(error_message, None)`` tuple; callers must check for
        this before iterating over the result.
    """
    questions_url = f"{DEFAULT_API_URL}/questions"
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    # JSONDecodeError derives from RequestException, so it has to be handled
    # first; listed after the base class this handler could never run.
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None
    return questions_data


def prepare_evaluation_data(gaia_questions: list, answer_df: pd.DataFrame) -> pd.DataFrame:
    """Join the fetched questions with their expected answers.

    Args:
        gaia_questions: Question dicts; the ``task_id`` and ``question`` keys
            are read from each one.
        answer_df: DataFrame with ``task_id`` and ``answer`` columns, as
            produced by ``prepare_dataset``.

    Returns:
        A DataFrame with the columns ``task_id``, ``attachment``, ``question``
        and ``answer``. ``attachment`` is whatever ``download_file`` returned
        when that value is truthy, otherwise ``False``. ``answer`` is ``None``
        when the task id is not present in ``answer_df``.
    """
    evaluation_data = []
    for item in gaia_questions:
        task_id = item.get("task_id")

        # NOTE(review): download_file presumably returns the local filename of
        # the task's attachment, or a falsy value when there is none - confirm.
        attachment = download_file(task_id) or False

        # Look the task up in the answer table; the first match wins.
        answer_row = answer_df[answer_df['task_id'] == task_id]
        evaluation_data.append({
            'task_id': task_id,
            'attachment': attachment,
            'question': item.get("question"),
            'answer': answer_row['answer'].values[0] if not answer_row.empty else None,
        })
    return pd.DataFrame(evaluation_data)


def get_tag_id(line: str) -> str:
    """Extract ``task_id`` from a dict-like metadata string.

    The string is first parsed as JSON after replacing single quotes with
    double quotes. That replacement corrupts values which themselves contain
    an apostrophe, so on failure the untouched string is parsed as a Python
    literal instead.

    Args:
        line: Metadata string such as ``"{'task_id': 'abc'}"``.

    Returns:
        The value stored under ``task_id``.

    Raises:
        json.JSONDecodeError: If the string cannot be parsed either way.
        KeyError: If the parsed mapping has no ``task_id`` entry.
    """
    try:
        metadata = json.loads(line.replace("'", '"'))
    except json.JSONDecodeError as json_error:
        try:
            metadata = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # Surface the same exception type callers saw before the fallback.
            raise json_error from None
        if not isinstance(metadata, dict):
            raise json_error from None
    return metadata['task_id']


def get_question_and_answer(line: str) -> Tuple[str, str]:
    """Split a content string at the first ``Final answer :`` marker.

    Args:
        line: Text containing the question, the marker and the answer.

    Returns:
        ``(question, answer)``: the text before the marker, unchanged, and the
        text after it with surrounding whitespace stripped.

    Raises:
        ValueError: If the marker does not occur in ``line``.
    """
    question, marker, answer = line.partition("Final answer :")
    if not marker:
        raise ValueError("Final answer not found in line: " + line)
    return question, answer.strip()


def create_langfuse_dataset(evaluation_df: pd.DataFrame, dataset_name: str, dataset_description: str):
    """Create a Langfuse dataset and upload one item per evaluation row.

    Args:
        evaluation_df: DataFrame with ``question``, ``answer``, ``task_id``
            and ``attachment`` columns.
        dataset_name: Name of the Langfuse dataset to create.
        dataset_description: Description stored with the dataset.
    """
    langfuse = Langfuse()
    langfuse.create_dataset(
        name=dataset_name,
        description=dataset_description,
        metadata={
            # The key used to be "source " with a stray trailing space.
            "source": "GAIA",
            "type": "benchmark",
            "date": "2025-06-29",
        },
    )
    for _, row in evaluation_df.iterrows():
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=row['question'],
            expected_output=row['answer'],
            metadata={"task_id": row['task_id'], "attachment": row['attachment']},
        )


def main():
    """Log in to Hugging Face, build the evaluation table and upload it.

    Exits with status -2 when ``HF_TOKEN`` is not set and with status 1 when
    the questions could not be fetched.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("HF token not in env vars")
        # SystemExit instead of exit(): exit() is a helper added by the site
        # module and is not guaranteed to exist.
        raise SystemExit(-2)
    login(hf_token)
    print("Hugging Face token set successfully.")

    gaia_questions = get_questions_from_gaia()
    if isinstance(gaia_questions, tuple):
        # Failure is reported as (message, None); iterating over that tuple
        # would crash further down, so stop here with the message instead.
        print(f"Could not build the evaluation dataset: {gaia_questions[0]}")
        raise SystemExit(1)

    answer_df = prepare_dataset('supabase_docs.csv')
    evaluation_df = prepare_evaluation_data(gaia_questions, answer_df)
    evaluation_df.to_csv('evaluation_data.csv', index=False)
    create_langfuse_dataset(
        evaluation_df,
        "GAIA_Evaluation_Dataset",
        "Evaluation of 20 questions level 1 from GAIA",
    )


if __name__ == "__main__":
    main()