File size: 7,306 Bytes
3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed a0888ca 3a7aaed a0888ca 279f51f 3a7aaed 279f51f 3a7aaed a0888ca 3a7aaed a0888ca 3a7aaed 279f51f 3a7aaed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import pandas as pd\n",
"import json\n",
"from phoenix.client import Client\n",
"\n",
"# Configuration: tune these instead of editing the query call below.\n",
"PROJECT_NAME = \"default\"\n",
"START_TIME = \"2025-10-23\"  # earliest span timestamp to fetch\n",
"\n",
"# Load the existing spans from Phoenix into a flat DataFrame.\n",
"spans_df = Client().spans.get_spans_dataframe(project_name=PROJECT_NAME, start_time=START_TIME)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the source of truth\n",
"dataset_df = pd.read_json(\"../data/metadata.jsonl\", lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Filter to successful root agent spans (AGENT kind, no parent, OK status).\n",
"# .copy() materialises the slice so later column assignments operate on an\n",
"# independent frame instead of a view — this removes the\n",
"# SettingWithCopyWarning raised by the next cell's assignment.\n",
"agents_df = spans_df[(spans_df.span_kind == 'AGENT') & (spans_df.parent_id.isna()) & (spans_df.status_code == 'OK')].copy()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_35129/3107371246.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" agents_df[\"task\"] = agents_df[\"attributes.input.value\"].apply(json.loads).apply(lambda x : x[\"task\"]).str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n"
]
}
],
"source": [
"# Extract the original question (\"task\") from each agent span's JSON input\n",
"# and strip the trailing file-download hint so it matches the dataset text.\n",
"# NOTE(review): \"mentionned\" is kept verbatim — the pattern must match the\n",
"# prompt text exactly; confirm the spelling against the prompt template.\n",
"# .assign() returns a new frame, avoiding the SettingWithCopyWarning that a\n",
"# direct column assignment on a sliced frame produced here.\n",
"agents_df = agents_df.assign(\n",
"    task=agents_df[\"attributes.input.value\"]\n",
"    .apply(json.loads)\n",
"    .apply(lambda x: x[\"task\"])\n",
"    .str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n",
")\n",
"# Join each agent run to its ground-truth row from the dataset.\n",
"agents_merged_df = pd.merge(agents_df, dataset_df, how=\"left\", left_on=\"task\", right_on=\"Question\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
"from evaluators import conciseness_evaluator\n",
"from scorer import question_scorer_wrapper as question_scorer\n",
"\n",
"# Both evaluators compare the agent's final answer against the ground truth.\n",
"EVAL_INPUT_MAPPING = {\"output\": \"attributes.output.value\", \"expected\": \"Final answer\"}\n",
"\n",
"# Bind under distinct names so the imported conciseness_evaluator is not\n",
"# shadowed by its bound wrapper (re-using one name for two different kinds\n",
"# of object makes out-of-order re-runs fragile).\n",
"conciseness_eval = bind_evaluator(evaluator=conciseness_evaluator, input_mapping=EVAL_INPUT_MAPPING)\n",
"question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping=EVAL_INPUT_MAPPING)\n",
"results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[conciseness_eval, question_scorer_eval])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the JSON evaluator scores into plain columns, derive a readable\n",
"# agent type, then keep only the columns needed for annotation upload.\n",
"def _score_field(column, field):\n",
"    \"\"\"Parse a JSON-encoded score column and extract one field per row.\"\"\"\n",
"    return column.apply(json.loads).apply(lambda parsed: parsed[field])\n",
"\n",
"results_df[\"conciseness\"] = _score_field(results_df.conciseness_evaluator_score, \"label\")\n",
"results_df[\"question_scorer\"] = _score_field(results_df.question_scorer_score, \"score\")\n",
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(\n",
"    lambda attrs: \"multi_agent\" if \"managed_agents\" in attrs else \"llm_agent\"\n",
")\n",
"results_filtered_df = results_df[[\n",
"    \"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\n",
"    \"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\",\n",
"    \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\",\n",
"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/phoenix/evals/utils.py:367: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" result_df = pd.concat(result_dfs, ignore_index=True)\n"
]
}
],
"source": [
"# Upload results\n",
"import numpy as np\n",
"from phoenix.evals.utils import to_annotation_dataframe\n",
"\n",
"annotation_df = to_annotation_dataframe(results_filtered_df)\n",
"annotation_df = annotation_df.replace({np.nan: None})\n",
"Client().spans.log_span_annotations_dataframe(dataframe=annotation_df)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Final_Assignment_Template",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|