File size: 2,197 Bytes
72ea7b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# This script submits previously generated answers (stored in a JSON file) to a Langfuse dataset as a scored evaluation run.
import os
import json
from dotenv import load_dotenv

from langfuse import Langfuse
from evaluation import simple_evaluation
# Load environment variables
load_dotenv()
langfuse = Langfuse()

def find_task(responses:list, task_id:str)->dict:
    """Return the first response dict whose "task_id" equals *task_id*.

    Args:
        responses: List of response dicts, each expected to carry a
            "task_id" key.
        task_id: The task identifier to look up.

    Returns:
        The matching response dict, or None when no entry matches.
    """
    return next(
        (entry for entry in responses if entry.get("task_id") == task_id),
        None,
    )



def submit_evaluation(evaluation_file:str,langfuse_dataset:str,run_name:str,model_id:str)->None:
    """Score stored answers against a Langfuse dataset and record a run.

    Loads previously generated answers from *evaluation_file* (a JSON list of
    dicts with "task_id" and "submitted_answer" keys), matches each Langfuse
    dataset item to its answer by task_id, and records an "exact_match" score
    computed by simple_evaluation.

    Args:
        evaluation_file: Path to the JSON file holding the responses.
        langfuse_dataset: Name of the Langfuse dataset to evaluate against.
        run_name: Name under which this evaluation run is recorded.
        model_id: Model identifier; currently unused, kept for interface
            compatibility with existing callers.
    """
    # open the evaluation file
    if not os.path.exists(evaluation_file):
        print(f"Evaluation file {evaluation_file} does not exist.")
        return
    with open(evaluation_file, "r") as file:
        responses = json.load(file)

    dataset = langfuse.get_dataset(langfuse_dataset)

    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(
            run_name = run_name
        ) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            # Look up the stored answer explicitly instead of letting a
            # missing entry raise a TypeError inside a broad except block.
            response = find_task(responses, task_id)
            if response is None:
                # No stored answer for this item; score the error text so the
                # run still records a (zero) result for it.
                output = f"Error running agent: no response found for task_id {task_id}"
            else:
                output = response['submitted_answer']
            root_span.update(output=output)

            # score the result against the expected output
            root_span.score_trace(name="exact_match", value = simple_evaluation(output, item.expected_output))

    langfuse.flush()

if __name__ == "__main__":
    # Submit the stored OpenAI 4o responses for scoring against the GAIA dataset.
    evaluation_file = "responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json"
    dataset_name = "GAIA_Evaluation_Dataset"
    run_name = "Single Smolagent with tools OpenAI gtp4o 3planningSteps"
    model_id = "openai/gpt4o"
    submit_evaluation(evaluation_file, dataset_name, run_name, model_id)