In [29]:
import pandas as pd
import json
from phoenix.client import Client

# Load the existing spans
spans_df = Client().spans.get_spans_dataframe(project_name="default")

In [30]:
# Load the source of truth
dataset_df = pd.read_json("../data/metadata.jsonl", lines=True)

In [31]:
# Filter by root agents
agents_df = spans_df[(spans_df.span_kind == 'AGENT') & (spans_df.parent_id.isna()) & (spans_df.status_code == 'OK')]

In [32]:
# Retrieve the right question and add the answer
agents_df["task"] = agents_df["attributes.input.value"].apply(json.loads).apply(lambda x : x["task"]).str.replace(r'\s*The mentionned file can be downloaded from.*$', '', regex=True)
agents_merged_df = pd.merge(agents_df,dataset_df,how="left",left_on="task", right_on="Question")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agents_df["task"] = agents_df["attributes.input.value"].apply(json.loads).apply(lambda x : x["task"]).str.replace(r'\s*The mentionned file can be downloaded from.*$', '', regex=True)


In [33]:
from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe
from evaluators import conciseness_evaluator
from scorer import question_scorer_wrapper as question_scorer

# Define the evaluator
conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ "output": "attributes.output.value", "expected": "Final answer"})
question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping={ "output": "attributes.output.value", "expected": "Final answer"})
results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[exact_match_eval, conciseness_evaluator, question_scorer_eval])


In [34]:
results_df["conciseness"] = results_df.conciseness_evaluator_score.apply(json.loads).apply(lambda x : x["label"])
results_df["question_scorer"] = results_df.question_scorer_score.apply(json.loads).apply(lambda x : x["score"])
results_df["agent_type"] = results_df["attributes.smolagents"].apply(lambda x : "multi_agent" if "managed_agents" in x else "llm_agent")
results_filtered_df = results_df[["name", "span_kind", "start_time", "context.span_id", "context.trace_id","attributes.output.value", "task_id", "Question", "Final answer", "agent_type", "conciseness_evaluator_score", "question_scorer_score", "conciseness", "question_scorer"]]

In [35]:
# Upload results
import numpy as np
from phoenix.evals.utils import to_annotation_dataframe

annotation_df = to_annotation_dataframe(results_filtered_df)
annotation_df = annotation_df.replace({np.nan: None})
Client().spans.log_span_annotations_dataframe(dataframe=annotation_df)


  result_df = pd.concat(result_dfs, ignore_index=True)
