File size: 4,439 Bytes
fc6b400
 
 
 
 
 
 
 
 
 
 
 
61c17f1
fc6b400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import ast
import json
import os
import re

import pandas as pd
import requests
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import login, logout
from langfuse import Langfuse

from tools.download_attachments import download_file

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"


load_dotenv()


def prepare_dataset(base_doc: str) -> pd.DataFrame:
    """Load the raw export CSV and split it into question/answer/task_id rows.

    Each row's ``content`` cell is split at the final-answer marker and the
    ``task_id`` is pulled out of the ``metadata`` cell.
    """
    source_df = pd.read_csv(base_doc)

    records = []
    for _, source_row in source_df.iterrows():
        q_text, final_answer = get_question_and_answer(source_row['content'])
        records.append({
            'question': q_text,
            'answer': final_answer,
            'task_id': get_tag_id(source_row['metadata']),
        })
    return pd.DataFrame(records)

def get_questions_from_gaia():
    """Fetch the GAIA question list from the scoring API.

    Returns:
        list: the question dicts on success.
        tuple[str, None]: ``(error_message, None)`` on any failure — callers
            must check the return type before iterating.
    """
    questions_url = f"{DEFAULT_API_URL}/questions"
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    # JSONDecodeError subclasses RequestException, so it must be caught
    # first — the original order made this branch unreachable.
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None
    return questions_data

    
def prepare_evaluation_data(gaia_questions: list, answer_df: pd.DataFrame) -> pd.DataFrame:
    """Join GAIA questions with reference answers and attachment filenames.

    ``attachment`` holds the downloaded filename for tasks that have one,
    otherwise ``False``; ``answer`` is ``None`` when no reference answer
    matches the task id.
    """
    rows = []
    for question_item in gaia_questions:
        tid = question_item.get("task_id")

        # download_file returns the local filename, or a falsy value when
        # the task has no attachment — keep the filename, else False.
        attachment = download_file(tid) or False

        matches = answer_df[answer_df['task_id'] == tid]
        reference_answer = None if matches.empty else matches['answer'].values[0]

        rows.append({
            'task_id': tid,
            'attachment': attachment,
            'question': question_item.get("question"),
            'answer': reference_answer,
        })
    return pd.DataFrame(rows)

def get_tag_id(line: str) -> str:
    """Extract ``task_id`` from a metadata cell.

    The metadata column stores a Python-repr dict (single-quoted), so parse it
    with ``ast.literal_eval``; the previous ``replace("'", '"')`` + ``json.loads``
    approach corrupted any value containing an apostrophe. Genuine JSON
    (e.g. containing ``true``/``null``) still parses via the fallback.
    """
    try:
        metadata = ast.literal_eval(line)
    except (ValueError, SyntaxError):
        metadata = json.loads(line)
    return metadata['task_id']

def get_question_and_answer(line: str) -> tuple[str, str]:
    """Split a content cell at the ``"Final answer :"`` marker.

    Returns:
        tuple[str, str]: (text before the marker, stripped text after it).

    Raises:
        ValueError: if the marker is absent from *line*.
    """
    marker = "Final answer :"
    pos = line.find(marker)
    if pos == -1:
        # ValueError instead of bare Exception: callers can catch precisely.
        raise ValueError("Final answer not found in line: " + line)
    return line[:pos], line[pos + len(marker):].strip()

def create_langfuse_dataset(evaluation_df: pd.DataFrame, dataset_name: str, dataset_description: str):
    """Create a Langfuse dataset and upload one item per evaluation row.

    Each item carries the question as input, the reference answer as
    expected output, and task_id/attachment in the item metadata.
    """
    langfuse = Langfuse()

    langfuse.create_dataset(
        name=dataset_name,
        description=dataset_description,
        metadata={
            # Fixed typo: the key was "source " (trailing space).
            "source": "GAIA",
            "type": "benchmark",
            "date": "2025-06-29",
        },
    )
    for _, row in evaluation_df.iterrows():
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=row['question'],
            expected_output=row['answer'],
            metadata={"task_id": row['task_id'], "attachment": row['attachment']},
        )
    


def main():
    """Entry point: authenticate with HF, fetch GAIA questions, build and
    upload the evaluation dataset."""
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("HF token not in env vars")
        # SystemExit instead of exit(): exit() is a site builtin not
        # guaranteed outside interactive sessions; exit code preserved.
        raise SystemExit(-2)
    login(hf_token)
    print("Hugging Face token set successfully.")

    gaia_questions = get_questions_from_gaia()
    # On failure get_questions_from_gaia returns an (error_message, None)
    # tuple; iterating that as questions would silently misbehave, so abort.
    if not isinstance(gaia_questions, list):
        print(f"Aborting: {gaia_questions[0]}")
        raise SystemExit(-1)

    answer_df = prepare_dataset('supabase_docs.csv')
    evaluation_df = prepare_evaluation_data(gaia_questions, answer_df)
    evaluation_df.to_csv('evaluation_data.csv', index=False)
    create_langfuse_dataset(
        evaluation_df,
        "GAIA_Evaluation_Dataset",
        "Evaluation of 20 questions level 1 from GAIA",
    )
    


if __name__ == "__main__":
    main()