# Submits agent responses from a JSON file as a run against a Langfuse dataset,
# scoring each item with an exact-match evaluation.
import os
import json

from dotenv import load_dotenv
from langfuse import Langfuse

from evaluation import simple_evaluation

# Load environment variables (Langfuse credentials, etc.)
load_dotenv()

langfuse = Langfuse()


def find_task(responses: list, task_id: str) -> dict | None:
    """Return the response dict matching task_id, or None if not found."""
    for response in responses:
        if response.get("task_id") == task_id:
            return response
    return None


def submit_evaluation(evaluation_file: str, langfuse_dataset: str, run_name: str, model_id: str) -> None:
    # model_id is accepted for bookkeeping but not used when replaying stored responses.
    if not os.path.exists(evaluation_file):
        print(f"Evaluation file {evaluation_file} does not exist.")
        return

    # Load the previously generated responses from the evaluation file.
    with open(evaluation_file, "r") as file:
        responses = json.load(file)

    dataset = langfuse.get_dataset(langfuse_dataset)
    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(run_name=run_name) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            try:
                output = find_task(responses, task_id)["submitted_answer"]
            except (TypeError, KeyError) as e:
                # find_task returns None when the task_id is missing from the file,
                # which makes the subscript above raise TypeError.
                output = f"No submitted answer found for task {task_id}: {e}"
            root_span.update(output=output)
            # Score the submitted answer against the expected output.
            root_span.score_trace(
                name="exact_match",
                value=simple_evaluation(output, item.expected_output),
            )

    # Ensure all queued events are sent before the script exits.
    langfuse.flush()


if __name__ == "__main__":
    # submit_evaluation(
    #     "responses_GAIA_Evaluation_DatasetSingle Smolagent with tools Qwen 32B 3planningSteps.json",
    #     "GAIA_Evaluation_Dataset",
    #     "Single Smolagent with tools Qwen 32B 3planningSteps",
    #     "Qwen/Qwen2.5-Coder-32B-Instruct",
    # )
    submit_evaluation(
        "responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json",
        "GAIA_Evaluation_Dataset",
        "Single Smolagent with tools OpenAI gpt4o 3planningSteps",
        "openai/gpt4o",
    )
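
# For reference: `simple_evaluation` is imported from the local evaluation module,
# which is not shown here. A minimal sketch of what it is assumed to do, based on
# the "exact_match" score name above (an illustration, not the actual implementation):
#
#   def simple_evaluation(submitted_answer, expected_output) -> int:
#       # Assumed behavior: exact match after trimming whitespace, returning 1 or 0.
#       return int(str(submitted_answer).strip() == str(expected_output).strip())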