File size: 7,234 Bytes
fc6b400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72ea7b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc6b400
 
 
 
 
 
 
 
72ea7b9
fc6b400
 
 
9ccff9e
 
 
 
 
 
 
fc6b400
9ccff9e
 
fc6b400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ccff9e
 
 
 
 
 
 
fc6b400
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import json
from dotenv import load_dotenv
from opentelemetry.trace import format_trace_id, get_tracer
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
from langfuse import observe
from PIL import Image

from single_smolagent import build_agents
from langfuse import Langfuse
# Load environment variables (Langfuse keys, model API keys) from a .env file.
load_dotenv()
# Module-level Langfuse client, shared by run_evaluation below.
langfuse = Langfuse()
# Initialize OpenTelemetry Tracer (currently disabled; Langfuse spans are used instead).
#trace_provider = TracerProvider()
#trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
#trace.set_tracer_provider(trace_provider) # Set as global provider
#tracer = trace.get_tracer(__name__) # Get a tracer instance

#SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)

def add_image(metadata) -> list:
    """Return the task's image attachment (if any) as a list of RGB PIL images.

    Looks for ``attachments/<attachment>`` based on the ``attachment`` key in
    *metadata*. Only ``.jpg``/``.jpeg``/``.png`` files are loaded; anything
    else (or a missing file) yields an empty list.

    Args:
        metadata: dict with at least ``task_id``; may contain ``attachment``,
            the attachment's file name.

    Returns:
        list of PIL.Image.Image, empty when there is no loadable image.
    """
    images = []
    task_id = metadata["task_id"]
    attachment = metadata.get("attachment", False)
    if attachment:
        # BUG FIX: the original stored this path as `os.file_path`, silently
        # monkey-patching the stdlib `os` module. Use a plain local variable.
        file_path = f"attachments/{attachment}"
        if os.path.exists(file_path):
            print("Attachments found for task_id:", task_id)
        else:
            print(f"No attachments found for task_id: {task_id}")
        # If the attachment is an image file, load it so it can be passed to
        # the agent alongside the question.
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1].lower() in ('.jpg', '.jpeg', '.png'):
            with open(file_path, "rb") as file:
                # Convert to RGB so downstream consumers get a uniform mode.
                image = Image.open(file).convert("RGB")
                images.append(image)
    return images


def run_agent(agent, question, trace_name, metadata):
    """Run *agent* on one question, attaching the task's image if present.

    Args:
        agent: an object with a ``run(question, images=...)`` method.
        question: the question text posed to the agent.
        trace_name: currently unused — kept for interface compatibility with
            the disabled OpenTelemetry tracing path.
        metadata: dataset-item metadata dict; must contain ``task_id`` and may
            name an ``attachment`` (see ``add_image``).

    Returns:
        The agent's answer, or an ``"Error running agent: ..."`` string when
        the run raised. (Removed the large commented-out OTel/Langfuse tracing
        scaffolding that previously cluttered this function.)
    """
    images = add_image(metadata)

    # Append the task id so tools that need it can recover it from the prompt.
    question = question + " The task_id is: " + metadata["task_id"]

    try:
        output = agent.run(question, images=images)
    except Exception as e:
        # Best-effort: surface the failure as the answer rather than aborting
        # the whole evaluation run.
        print(f"Error running agent: {e}")
        output = f"Error running agent: {e}"

    return output
def simple_evaluation(output, expected_output):
    """Score *output* against *expected_output*, returning a float in [0, 1].

    Both strings are treated as comma-separated lists. The score is the
    fraction of expected items that have a match in the output; an exact
    match or a case-insensitive match both count as matched. Surrounding
    whitespace, double quotes and ``$`` signs on the output are stripped
    before comparison.
    """
    trimmed_output = str(output).strip().strip('"').strip("$")
    expected_output_list = [item.strip() for item in expected_output.split(",") if item.strip()]
    output_list = [item.strip() for item in trimmed_output.split(",") if item.strip()]
    if not expected_output_list and not output_list:
        # Both sides split to nothing (empty or only commas/whitespace):
        # fall back to a direct string comparison.
        if trimmed_output == expected_output:
            return 1.0
        # BUG FIX: the original called `.toLower()` (JavaScript) here, which
        # would raise AttributeError in Python; the method is `.lower()`.
        if expected_output.lower() == trimmed_output.lower():
            return 0.8
        return 0.0
    # Greedy one-to-one matching: each output item may satisfy at most one
    # expected item (tracked via `matched`).
    matched = [False] * len(output_list)
    for expected_item in expected_output_list:
        for i, out_item in enumerate(output_list):
            if matched[i]:
                continue
            if expected_item == out_item or expected_item.lower() == out_item.lower():
                matched[i] = True
                break
    return sum(matched) / len(expected_output_list)


def run_evaluation(agent, langfuse_dataset, run_name, model_id, trace_name):
    """Run *agent* over a Langfuse dataset, score each answer, save results.

    Args:
        agent: agent passed through to ``run_agent``.
        langfuse_dataset: name of the Langfuse dataset to fetch.
        run_name: name recorded for this dataset run in Langfuse.
        model_id: currently unused — kept for interface compatibility (was
            meant for run metadata in the disabled trace-linking code).
        trace_name: forwarded to ``run_agent`` (also currently unused there).

    Side effects: creates Langfuse spans/scores and writes the collected
    answers to ``responses_<dataset><run_name>.json``.
    """
    dataset = langfuse.get_dataset(langfuse_dataset)
    responses = []
    for item in dataset.items:
        print(f"Processing item with task_id: {item.metadata['task_id']}")
        with item.run(run_name=run_name) as root_span:
            root_span.update(input=item.input)
            task_id = item.metadata["task_id"]
            # NOTE(review): hard-coded filter — only this one dataset item is
            # evaluated; looks like a debugging leftover. Remove the condition
            # to run the whole dataset. Behavior intentionally preserved here.
            if task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6":
                try:
                    output = run_agent(agent, item.input, trace_name, item.metadata)
                    responses.append({"task_id": task_id, "submitted_answer": output})
                    root_span.update(output=output)
                except Exception as e:
                    output = f"Error running agent: {e}"
                # Score the result against the expected output.
                root_span.score_trace(name="exact_match", value=simple_evaluation(output, item.expected_output))

    # Flush to ensure all telemetry is sent before writing results.
    langfuse.flush()

    # Save the responses to a JSON file.
    # BUG FIX: the original built `filename` but then wrote to the literal
    # name "responses_(unknown).json" (an f-string with no placeholder); it
    # also rebuilt `responses` via an identity list comprehension.
    print("Saving responses to file...")
    filename = langfuse_dataset + run_name
    output_file = f"responses_{filename}.json"
    with open(output_file, "w") as f:
        json.dump(responses, f, indent=4)
    print(f"Responses saved to {output_file}")


def evaluate():
    """Build the agent and evaluate it against the GAIA dataset in Langfuse."""
    print("Starting agent...")
    agent = build_agents()
    print("Agent built successfully.")
    run_evaluation(
        agent,
        "GAIA_Evaluation_Dataset",
        "Single Smolagent with tools OpenAI 4o 3planningSteps youtube",
        "OpenAI gpt4o",
        "smolagent-trace",
    )

# Script entry point: run the evaluation only when executed directly.
if __name__ == "__main__":
    evaluate()