Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| from dotenv import load_dotenv | |
| from opentelemetry.trace import format_trace_id, get_tracer | |
| from opentelemetry import trace | |
| from opentelemetry.sdk.trace import TracerProvider | |
| from opentelemetry.sdk.trace.export import SimpleSpanProcessor | |
| from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter | |
| from openinference.instrumentation.smolagents import SmolagentsInstrumentor | |
| from langfuse import observe | |
| from PIL import Image | |
| from single_smolagent import build_agents | |
| from langfuse import Langfuse | |
# Load environment variables (Langfuse keys, model-provider keys) from .env.
load_dotenv()
# Module-level Langfuse client shared by run_evaluation below.
langfuse = Langfuse()
| # Initialize OpenTelemetry Tracer | |
| #trace_provider = TracerProvider() | |
| #trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter())) | |
| #trace.set_tracer_provider(trace_provider) # Set as global provider | |
| #tracer = trace.get_tracer(__name__) # Get a tracer instance | |
| #SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) | |
def add_image(metadata) -> list:
    """Load the image attachment (if any) referenced by a task's metadata.

    Args:
        metadata: dict with a "task_id" key and an optional "attachment"
            filename, looked up under the local "attachments/" directory.

    Returns:
        A list with one RGB PIL Image when the attachment exists and has a
        .jpg/.jpeg/.png extension, otherwise an empty list.
    """
    images = []
    task_id = metadata["task_id"]
    attachment = metadata.get("attachment", False)
    if attachment:
        # Fix: use a plain local variable. The original stashed the path on
        # the os module itself (os.file_path), mutating shared global state.
        file_path = f"attachments/{attachment}"
        if os.path.exists(file_path):
            print("Attachments found for task_id:", task_id)
        else:
            print(f"No attachments found for task_id: {task_id}")
        # Only image files are forwarded to the agent.
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() in ('.jpg', '.jpeg', '.png'):
            with open(file_path, "rb") as file:
                # .convert("RGB") forces a full pixel load, so the image
                # remains usable after the file handle is closed.
                images.append(Image.open(file).convert("RGB"))
    return images
def run_agent(agent, question, trace_name, metadata):
    """Run `agent` on `question` plus any image attachments; return its answer.

    The task_id from `metadata` is appended to the prompt so the agent can
    refer to it. Any exception raised by the agent is converted into an
    error string so the evaluation loop keeps going.

    NOTE(review): `trace_name` is currently unused — it belonged to the
    commented-out OpenTelemetry instrumentation; kept for caller compatibility.
    """
    attachments = add_image(metadata)
    prompt = question + " The task_id is: " + metadata["task_id"]
    try:
        answer = agent.run(prompt, images=attachments)
    except Exception as exc:
        print(f"Error running agent: {exc}")
        answer = f"Error running agent: {exc}"
    return answer
def simple_evaluation(output, expected_output):
    """Score an agent answer against the expected answer.

    Both strings are treated as comma-separated item lists. Each expected
    item that has an exact (1.0) or case-insensitive (0.8) match among the
    output items counts as matched, and the score is the fraction of
    expected items matched (a single-item case-insensitive match therefore
    scores 1.0). When both sides split to empty lists, the raw strings are
    compared directly.

    Args:
        output: the agent's answer (any type; coerced to str).
        expected_output: the reference answer string.

    Returns:
        float in [0.0, 1.0].
    """
    trimmed_output = str(output).strip().strip('"').strip("$")
    expected_items = [item.strip() for item in expected_output.split(",") if item.strip()]
    output_items = [item.strip() for item in trimmed_output.split(",") if item.strip()]

    # Both sides empty: fall back to direct string comparison.
    # Fix: the original called `.toLower()` (a JavaScript method), which
    # raised AttributeError whenever this branch was reached.
    if not expected_items and not output_items:
        if trimmed_output == expected_output:
            return 1.0
        if trimmed_output.lower() == expected_output.lower():
            return 0.8
        return 0.0
    # Fix: guard against ZeroDivisionError when only the expected side
    # splits to an empty list.
    if not expected_items:
        return 0.0

    # Greedy one-to-one matching: each output item satisfies at most one
    # expected item (exact or case-insensitive, mirroring the >= 0.8 rule).
    matched = [False] * len(output_items)
    for expected_item in expected_items:
        for i, candidate in enumerate(output_items):
            if matched[i]:
                continue
            if expected_item == candidate or expected_item.lower() == candidate.lower():
                matched[i] = True
                break
    return sum(matched) / len(expected_items)
def run_evaluation(agent, langfuse_dataset, run_name, model_id, trace_name):
    """Run the agent over a Langfuse dataset, score each answer, and dump
    the submitted answers to a JSON file.

    Args:
        agent: the agent to evaluate (passed through to run_agent).
        langfuse_dataset: name of the Langfuse dataset to fetch.
        model_id: model identifier (currently only referenced by the
            commented-out run-metadata linking; kept for compatibility).
        run_name: label for this dataset run in Langfuse.
        trace_name: trace label forwarded to run_agent.
    """
    dataset = langfuse.get_dataset(langfuse_dataset)
    responses = []
    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(
            run_name=run_name
        ) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            # NOTE(review): hard-coded single-task filter — presumably a
            # debugging leftover; confirm before running a full evaluation.
            if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
                try:
                    output = run_agent(agent, item.input, trace_name, item.metadata)
                    responses.append({"task_id": task_id, "submitted_answer": output})
                    root_span.update(output=output)
                except Exception as e:
                    output = f"Error running agent: {e}"
                # Score the result against the expected output.
                root_span.score_trace(
                    name="exact_match",
                    value=simple_evaluation(output, item.expected_output),
                )
    # Flush so all telemetry is sent before the process exits.
    langfuse.flush()
    print("Saving responses to file...")
    # Fix: the original f-string had no placeholder and always wrote a file
    # with a literal constant name; `filename` was computed but never used.
    filename = langfuse_dataset + run_name
    output_file = f"responses_{filename}.json"
    with open(output_file, "w") as f:
        json.dump(responses, f, indent=4)
    print(f"Responses saved to {output_file}")
def evaluate():
    """Build the smolagents agent and run it against the GAIA dataset."""
    print("Starting agent...")
    agent = build_agents()
    print("Agent built successfully.")
    run_evaluation(
        agent,
        "GAIA_Evaluation_Dataset",
        "Single Smolagent with tools OpenAI 4o 3planningSteps youtube",
        "OpenAI gpt4o",
        "smolagent-trace",
    )


if __name__ == "__main__":
    evaluate()