File size: 7,306 Bytes
3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed 279f51f 3a7aaed a0888ca 3a7aaed a0888ca 279f51f 3a7aaed 279f51f 3a7aaed a0888ca 3a7aaed a0888ca 3a7aaed 279f51f 3a7aaed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import pandas as pd\n",
"import json\n",
"from phoenix.client import Client\n",
"\n",
"# Configuration: tune these instead of editing the query call below.\n",
"PROJECT_NAME = \"default\"\n",
"START_TIME = \"2025-10-23\"  # earliest span timestamp to fetch\n",
"\n",
"# Load the existing spans from Phoenix into a flat DataFrame.\n",
"spans_df = Client().spans.get_spans_dataframe(project_name=PROJECT_NAME, start_time=START_TIME)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the source of truth\n",
"dataset_df = pd.read_json(\"../data/metadata.jsonl\", lines=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Filter to successful root agent spans (AGENT kind, no parent, OK status).\n",
"# .copy() materialises the slice so later column assignments operate on an\n",
"# independent frame instead of a view — this removes the\n",
"# SettingWithCopyWarning raised by the next cell's assignment.\n",
"agents_df = spans_df[(spans_df.span_kind == 'AGENT') & (spans_df.parent_id.isna()) & (spans_df.status_code == 'OK')].copy()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_35129/3107371246.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" agents_df[\"task\"] = agents_df[\"attributes.input.value\"].apply(json.loads).apply(lambda x : x[\"task\"]).str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n"
]
}
],
"source": [
"# Extract the original question (\"task\") from each agent span's JSON input\n",
"# and strip the trailing file-download hint so it matches the dataset text.\n",
"# NOTE(review): \"mentionned\" is kept verbatim — the pattern must match the\n",
"# prompt text exactly; confirm the spelling against the prompt template.\n",
"# .assign() returns a new frame, avoiding the SettingWithCopyWarning that a\n",
"# direct column assignment on a sliced frame produced here.\n",
"agents_df = agents_df.assign(\n",
"    task=agents_df[\"attributes.input.value\"]\n",
"    .apply(json.loads)\n",
"    .apply(lambda x: x[\"task\"])\n",
"    .str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n",
")\n",
"# Join each agent run to its ground-truth row from the dataset.\n",
"agents_merged_df = pd.merge(agents_df, dataset_df, how=\"left\", left_on=\"task\", right_on=\"Question\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
"from evaluators import conciseness_evaluator\n",
"from scorer import question_scorer_wrapper as question_scorer\n",
"\n",
"# Both evaluators compare the agent's final answer against the ground truth.\n",
"EVAL_INPUT_MAPPING = {\"output\": \"attributes.output.value\", \"expected\": \"Final answer\"}\n",
"\n",
"# Bind under distinct names so the imported conciseness_evaluator is not\n",
"# shadowed by its bound wrapper (re-using one name for two different kinds\n",
"# of object makes out-of-order re-runs fragile).\n",
"conciseness_eval = bind_evaluator(evaluator=conciseness_evaluator, input_mapping=EVAL_INPUT_MAPPING)\n",
"question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping=EVAL_INPUT_MAPPING)\n",
"results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[conciseness_eval, question_scorer_eval])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the JSON evaluator scores into plain columns, derive a readable\n",
"# agent type, then keep only the columns needed for annotation upload.\n",
"def _score_field(column, field):\n",
"    \"\"\"Parse a JSON-encoded score column and extract one field per row.\"\"\"\n",
"    return column.apply(json.loads).apply(lambda parsed: parsed[field])\n",
"\n",
"results_df[\"conciseness\"] = _score_field(results_df.conciseness_evaluator_score, \"label\")\n",
"results_df[\"question_scorer\"] = _score_field(results_df.question_scorer_score, \"score\")\n",
"results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(\n",
"    lambda attrs: \"multi_agent\" if \"managed_agents\" in attrs else \"llm_agent\"\n",
")\n",
"results_filtered_df = results_df[[\n",
"    \"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\n",
"    \"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\",\n",
"    \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\",\n",
"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/phoenix/evals/utils.py:367: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" result_df = pd.concat(result_dfs, ignore_index=True)\n"
]
}
],
"source": [
"# Upload results\n",
"import numpy as np\n",
"from phoenix.evals.utils import to_annotation_dataframe\n",
"\n",
"annotation_df = to_annotation_dataframe(results_filtered_df)\n",
"annotation_df = annotation_df.replace({np.nan: None})\n",
"Client().spans.log_span_annotations_dataframe(dataframe=annotation_df)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Final_Assignment_Template",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|