import pandas as pd
import ast
import json
import os
import re
import sys

import requests
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import (login, logout)
from langfuse import Langfuse

from tools.download_attachments import download_file
# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
load_dotenv()
def prepare_dataset(base_doc: str) -> pd.DataFrame:
    """Parse a CSV of scraped docs into a question/answer/task_id DataFrame.

    Args:
        base_doc: Path to a CSV file with 'content' and 'metadata' columns.

    Returns:
        DataFrame with columns 'question', 'answer' and 'task_id'.
    """
    df = pd.read_csv(base_doc)
    answer_data = []
    # The positional index from iterrows() is unused, so discard it.
    for _, row in df.iterrows():
        question, answer = get_question_and_answer(row['content'])
        answer_data.append({
            'question': question,
            'answer': answer,
            'task_id': get_tag_id(row['metadata']),
        })
    return pd.DataFrame(answer_data)
def get_questions_from_gaia():
    """Fetch the GAIA evaluation questions from the scoring endpoint.

    Returns:
        list: The parsed questions on success.
        tuple[str, None]: An (error_message, None) pair on any failure —
            callers must check for this before iterating the result.
    """
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    # JSONDecodeError is a subclass of RequestException, so it must be
    # caught FIRST — in the original ordering this handler was unreachable.
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None
    return questions_data
def prepare_evaluation_data(gaia_questions: list, answer_df: pd.DataFrame) -> pd.DataFrame:
    """Join the fetched GAIA questions with the locally prepared answers.

    Args:
        gaia_questions: List of dicts with 'task_id' and 'question' keys.
        answer_df: DataFrame with 'task_id' and 'answer' columns.

    Returns:
        DataFrame with columns 'task_id', 'attachment', 'question', 'answer'.
    """
    records = []
    for entry in gaia_questions:
        tid = entry.get("task_id")
        # Attachment marker: the downloaded filename when one exists,
        # otherwise False.
        attachment = download_file(tid) or False
        # Look up the prepared answer for this task id, if any.
        matches = answer_df.loc[answer_df['task_id'] == tid, 'answer']
        records.append({
            'task_id': tid,
            'attachment': attachment,
            'question': entry.get("question"),
            'answer': None if matches.empty else matches.values[0],
        })
    return pd.DataFrame(records)
def get_tag_id(line: str) -> str:
    """Extract the 'task_id' field from a stringified metadata dict.

    The metadata column stores a Python-repr dict (single-quoted), so it is
    parsed with ast.literal_eval; the original naive quote substitution
    (replace ' with ") broke on any value containing an apostrophe.

    Args:
        line: String representation of a dict containing a 'task_id' key.

    Returns:
        The task id value.

    Raises:
        json.JSONDecodeError: If neither parser can read the string.
        KeyError: If the parsed dict has no 'task_id' key.
    """
    try:
        return ast.literal_eval(line)['task_id']
    except (ValueError, SyntaxError):
        # Fallback to the legacy conversion for JSON-style input
        # (e.g. containing literals such as null/true/false).
        return json.loads(line.replace("'", '"'))['task_id']
def get_question_and_answer(line: str) -> tuple:
    """Split a content line into its question and answer parts.

    The line format is '<question>Final answer : <answer>'.

    Args:
        line: Raw content string containing the 'Final answer :' marker.

    Returns:
        tuple[str, str]: (question text, stripped answer text).
        Note: the original annotation claimed a plain str return; this
        function has always returned a 2-tuple.

    Raises:
        ValueError: If the marker is not present (subclass of the bare
            Exception previously raised, so existing handlers still match).
    """
    marker = "Final answer :"
    pos = line.find(marker)
    if pos == -1:
        raise ValueError("Final answer not found in line: " + line)
    return line[:pos], line[pos + len(marker):].strip()
def create_langfuse_dataset(evaluation_df: pd.DataFrame, dataset_name: str,
                            dataset_description: str):
    """Create a Langfuse dataset and upload one item per evaluation row.

    Args:
        evaluation_df: DataFrame with 'question', 'answer', 'task_id' and
            'attachment' columns.
        dataset_name: Name for the Langfuse dataset.
        dataset_description: Human-readable description of the dataset.
    """
    langfuse = Langfuse()
    langfuse.create_dataset(
        name=dataset_name,
        description=dataset_description,
        metadata={
            # Fixed: this key previously had a trailing space ("source ").
            "source": "GAIA",
            "type": "benchmark",
            "date": "2025-06-29",
        },
    )
    # The positional index from iterrows() is unused, so discard it.
    for _, row in evaluation_df.iterrows():
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=row['question'],
            expected_output=row['answer'],
            metadata={"task_id": row['task_id'], "attachment": row['attachment']},
        )
def main():
    """Entry point: authenticate, fetch GAIA questions, build and upload the dataset."""
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(hf_token)
        print("Hugging Face token set successfully.")
    else:
        print("HF token not in env vars")
        # sys.exit is the canonical way to abort; builtin exit() is meant
        # for the interactive interpreter.
        sys.exit(-2)
    gaia_questions = get_questions_from_gaia()
    # On failure the fetcher returns an (error_message, None) tuple; iterating
    # that as if it were the question list would crash further down.
    if isinstance(gaia_questions, tuple):
        print(f"Aborting: {gaia_questions[0]}")
        sys.exit(-1)
    answer_df = prepare_dataset('supabase_docs.csv')
    evaluation_df = prepare_evaluation_data(gaia_questions, answer_df)
    evaluation_df.to_csv('evaluation_data.csv', index=False)
    create_langfuse_dataset(evaluation_df, "GAIA_Evaluation_Dataset",
                            "Evaluation of 20 questions level 1 from GAIA")


if __name__ == "__main__":
    main()
|