Spaces:
Sleeping
Sleeping
File size: 2,197 Bytes
72ea7b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# This script reads previously generated answers from a JSON responses file,
# attaches each one to the matching item of a Langfuse dataset run, and scores it.
import json
import os
from typing import Optional

from dotenv import load_dotenv
from langfuse import Langfuse

from evaluation import simple_evaluation
# Load environment variables before the client below is created.
# NOTE(review): Langfuse() is constructed without arguments, so it presumably
# reads its credentials/host from the environment — confirm.
load_dotenv()
# Module-level Langfuse client shared by submit_evaluation().
langfuse = Langfuse()
def find_task(responses: list, task_id: str) -> Optional[dict]:
    """Return the first response whose ``"task_id"`` equals *task_id*.

    Args:
        responses: Response records; each is queried with ``.get("task_id")``.
        task_id: Identifier of the task to look up.

    Returns:
        The first matching response, or ``None`` when no response matches
        (including when *responses* is empty).
    """
    # A generator keeps the first-match semantics of a linear scan and stops
    # as soon as a match is found.
    return next(
        (response for response in responses if response.get("task_id") == task_id),
        None,
    )
def submit_evaluation(evaluation_file: str, langfuse_dataset: str, run_name: str, model_id: str) -> None:
    """Attach stored answers to a Langfuse dataset run and score each item.

    The answers are read from *evaluation_file* (JSON). For every item of the
    Langfuse dataset, the answer whose ``"task_id"`` equals the item's
    ``metadata["task_id"]`` is recorded as the run output and scored with
    ``simple_evaluation(output, item.expected_output)`` under the score name
    ``"exact_match"``.

    Args:
        evaluation_file: Path to the JSON file holding the responses; each
            response is expected to carry ``"task_id"`` and
            ``"submitted_answer"`` keys.
        langfuse_dataset: Name of the Langfuse dataset to iterate over.
        run_name: Name under which the dataset run is recorded.
        model_id: Currently unused; kept so existing callers keep working.

    Returns:
        None. If *evaluation_file* does not exist, a message is printed and
        nothing is submitted.
    """
    if not os.path.exists(evaluation_file):
        print(f"Evaluation file {evaluation_file} does not exist.")
        return

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(evaluation_file, "r", encoding="utf-8") as file:
        responses = json.load(file)

    dataset = langfuse.get_dataset(langfuse_dataset)
    for item in dataset.items:
        task_id = item.metadata["task_id"]
        print(f"Processing item with task_id: {task_id}")
        with item.run(run_name=run_name) as root_span:
            root_span.update(input=item.input)

            # Look the answer up explicitly instead of relying on a broad
            # ``except`` to catch subscripting ``None``; the loaded responses
            # are only read, never appended to.
            response = find_task(responses, task_id)
            if response is None:
                output = f"Error running agent: no response found for task_id {task_id}"
            elif "submitted_answer" not in response:
                output = f"Error running agent: response for task_id {task_id} has no 'submitted_answer'"
            else:
                output = response["submitted_answer"]

            # Record the output even for the error cases so the trace shows
            # what was actually scored.
            root_span.update(output=output)

            # Score the result against the expected output.
            root_span.score_trace(
                name="exact_match",
                value=simple_evaluation(output, item.expected_output),
            )

    # Flush once after all items so buffered events are sent before returning.
    langfuse.flush()
if __name__ == "__main__":
    # Alternative run, kept for reference:
    # submit_evaluation("responses_GAIA_Evaluation_DatasetSingle Smolagent with tools Qwen 32B 3planningSteps.json","GAIA_Evaluation_Dataset","Single Smolagent with tools Qwen 32B 3planningSteps","dQwen/Qwen2.5-Coder-32B-Instruct")
    submit_evaluation(
        evaluation_file="responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json",
        langfuse_dataset="GAIA_Evaluation_Dataset",
        run_name="Single Smolagent with tools OpenAI gtp4o 3planningSteps",
        model_id="openai/gpt4o",
    )