Spaces:
Sleeping
Sleeping
| # This script submits the answers stored in a JSON evaluation file to a Langfuse dataset run and scores each item. | |
| import os | |
| import json | |
| from dotenv import load_dotenv | |
| from langfuse import Langfuse | |
| from evaluation import simple_evaluation | |
| # Load environment variables (load_dotenv() is called without arguments, so python-dotenv's default .env lookup applies) | |
| load_dotenv() | |
| # Module-level Langfuse client used by submit_evaluation; constructed with no explicit arguments. | |
| # NOTE(review): presumably it picks up its credentials from the environment loaded above - confirm. | |
| langfuse = Langfuse() | |
def find_task(responses:list, task_id:str)->dict:
    """Return the first entry of ``responses`` whose "task_id" equals ``task_id``.

    Each entry is queried with ``.get("task_id")``, so entries are expected to
    be dict-like. Entries are examined in list order and the search stops at
    the first hit.

    Args:
        responses: Sequence of response records to search.
        task_id: Identifier to look for.

    Returns:
        The matching record, or ``None`` when no record matches (note that the
        ``dict`` return annotation does not reflect the ``None`` case).
    """
    matching_entries = (
        entry for entry in responses if entry.get("task_id") == task_id
    )
    # next() with a default yields the first match or None without raising.
    return next(matching_entries, None)
def submit_evaluation(evaluation_file: str, langfuse_dataset: str, run_name: str, model_id: str) -> None:
    """Replay stored answers against a Langfuse dataset and score every item.

    The JSON file is expected to contain a list of records, each carrying a
    "task_id" and a "submitted_answer". For every item of the Langfuse dataset
    a run named ``run_name`` is opened, the item's input and the stored answer
    are recorded on the run's root span, and an "exact_match" score computed
    by ``simple_evaluation`` is attached to the trace.

    Args:
        evaluation_file: Path of the JSON file holding the submitted answers.
            If the path does not exist a message is printed and the function
            returns without contacting Langfuse.
        langfuse_dataset: Name of the Langfuse dataset to iterate over.
        run_name: Name under which the dataset run is recorded.
        model_id: Identifier of the model that produced the answers. It is
            currently not used by this function and is kept only so existing
            callers keep working.

    Returns:
        None.
    """
    if not os.path.exists(evaluation_file):
        print(f"Evaluation file {evaluation_file} does not exist.")
        return

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(evaluation_file, "r", encoding="utf-8") as file:
        responses = json.load(file)

    # Index the answers once instead of scanning the list for every dataset
    # item. setdefault keeps the FIRST record for a duplicated task_id, which
    # is the same record a linear first-match search would return. Records
    # that are not dicts cannot carry an answer and are skipped.
    answers_by_task = {}
    for response in responses:
        if isinstance(response, dict):
            answers_by_task.setdefault(response.get("task_id"), response)

    dataset = langfuse.get_dataset(langfuse_dataset)
    try:
        for item in dataset.items:
            task_id = item.metadata["task_id"]
            print(f"Processing item with task_id: {task_id}")
            with item.run(run_name=run_name) as root_span:
                root_span.update(input=item.input)

                response = answers_by_task.get(task_id)
                if response is not None and "submitted_answer" in response:
                    output = response["submitted_answer"]
                else:
                    # Missing answers are reported explicitly rather than via
                    # an incidental TypeError/KeyError swallowed by a blanket
                    # except; the item is still scored (best effort).
                    output = f"Error running agent: no submitted answer found for task_id {task_id}"

                # Record the output on the span in both cases so the trace
                # shows exactly what was scored.
                root_span.update(output=output)

                # score the result against the expected output
                root_span.score_trace(
                    name="exact_match",
                    value=simple_evaluation(output, item.expected_output),
                )
    finally:
        # Send whatever has been queued so far, even if an item failed.
        langfuse.flush()
if __name__ == "__main__":
    # Alternative run kept for reference (Qwen 32B responses):
    # submit_evaluation("responses_GAIA_Evaluation_DatasetSingle Smolagent with tools Qwen 32B 3planningSteps.json","GAIA_Evaluation_Dataset","Single Smolagent with tools Qwen 32B 3planningSteps","dQwen/Qwen2.5-Coder-32B-Instruct")

    # Submit the stored OpenAI 4o answers to the GAIA evaluation dataset.
    submit_evaluation(
        evaluation_file="responses_GAIA_Evaluation_DatasetSingle Smolagent with tools OpenAI 4o 3planningSteps.json",
        langfuse_dataset="GAIA_Evaluation_Dataset",
        run_name="Single Smolagent with tools OpenAI gtp4o 3planningSteps",
        model_id="openai/gpt4o",
    )