File size: 25,498 Bytes
3a7aaed
 
 
 
4829f22
3a7aaed
4829f22
3a7aaed
 
 
 
 
 
279f51f
3a7aaed
 
 
 
4829f22
3a7aaed
 
 
 
 
 
 
 
 
4829f22
3a7aaed
 
 
 
 
 
 
 
 
4829f22
3a7aaed
 
 
 
 
 
279f51f
3a7aaed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4829f22
3a7aaed
279f51f
 
4829f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279f51f
 
 
3a7aaed
 
 
a0888ca
3a7aaed
 
 
a0888ca
279f51f
3a7aaed
 
 
 
4829f22
3a7aaed
 
 
 
a0888ca
3a7aaed
a0888ca
3a7aaed
 
 
 
4829f22
3a7aaed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from phoenix.client import Client\n",
    "\n",
    "# Load the existing spans\n",
    "spans_df = Client().spans.get_spans_dataframe(project_name=\"default\", start_time=\"2025-10-23\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the source of truth\n",
    "dataset_df = pd.read_json(\"../data/metadata.jsonl\", lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter by root agents\n",
    "agents_df = spans_df[(spans_df.span_kind == 'AGENT') & (spans_df.parent_id.isna()) & (spans_df.status_code == 'OK')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/pj/v1zrqj1d10x9_1rd2njh_r_r0000gn/T/ipykernel_35129/3107371246.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  agents_df[\"task\"] = agents_df[\"attributes.input.value\"].apply(json.loads).apply(lambda x : x[\"task\"]).str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n"
     ]
    }
   ],
   "source": [
    "# Retrieve the right question and add the answer\n",
    "agents_df[\"task\"] = agents_df[\"attributes.input.value\"].apply(json.loads).apply(lambda x : x[\"task\"]).str.replace(r'\\s*The mentionned file can be downloaded from.*$', '', regex=True)\n",
    "agents_merged_df = pd.merge(agents_df,dataset_df,how=\"left\",left_on=\"task\", right_on=\"Question\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating <code>\n",
      "page_content_log = visit_webpage(url=\"https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log/November_2016\")\n",
      "print(page_content_log)\n",
      "</code>\n",
      "Calling tools:\n",
      "[{'id': 'call_8', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': 'page_content_log = visit_webpage(url=\"https://en.wikipedia.org/wiki/Wikipedia:Featured_article_candidates/Featured_log/November_2016\")\\nprint(page_content_log)'}}] as a string.\n",
      "Evaluating right as a string.\n",
      "Evaluating The provided problem requires visual inspection of a chess board image which I cannot access in the current environment. To properly solve this, one would need to analyze the specific piece positions in the image to identify a forcing tactical sequence leading to a guaranteed win for Black. Standard approaches would involve identifying forced captures, mating patterns, or immediate tactical threats that Black can execute on their turn. as a string.\n",
      "Evaluating right as a string.\n",
      "Evaluating Looking at the Wikipedia information I retrieved earlier, I need to systematically identify studio albums published between 2000 and 2009.\n",
      "\n",
      "From the information available:\n",
      "- Mercedes Sosa won Latin Grammy Awards for Best Folk Album in 2000 (Misa Criolla), 2003 (Acústico), 2006 (Corazón Libre), 2009 (Cantora 1), and 2011 (Deja La Vida Volar)\n",
      "- These are all studio albums based on her achievements and the nature of Latin Grammy awards for studio albums\n",
      "- The years 2000, 2003, 2006, and 2009 all fall within our target range of 2000-2009\n",
      "\n",
      "Let me verify these are indeed studio albums by considering the patterns in her discography and the information provided.\n",
      "\n",
      "```python\n",
      "# Based on the Wikipedia information, identifying studio albums in the specified period\n",
      "albums_in_range = [\n",
      "    {\"name\": \"Misa Criolla\", \"year\": 2000},\n",
      "    {\"name\": \"Acústico\", \"year\": 2003}, \n",
      "    {\"name\": \"Corazón Libre\", \"year\": 2006},\n",
      "    {\"name\": \"Cantora 1\", \"year\": 2009}\n",
      "]\n",
      "\n",
      "# Count studio albums from 2000-2009\n",
      "count = len([album for album in albums_in_range if 2000 <= album[\"year\"] <= 2009])\n",
      "print(count)\n",
      "```\n",
      "Based on my analysis of Mercedes Sosa's discography from the Wikipedia information, I have identified 4 studio albums that were published between 2000 and 2009 (inclusive):\n",
      "\n",
      "1. Misa Criolla (2000)\n",
      "2. Acústico (2003)\n",
      "3. Corazón Libre (2006)\n",
      "4. Cantora 1 (2009)\n",
      "\n",
      "These albums are confirmed by the fact that Mercedes Sosa won Latin Grammy Awards for Best Folk Album for each of these releases in those respective years, which indicates they were studio albums. All four fall within the requested time period of 2000-2009.\n",
      "\n",
      "Therefore, the answer is 4 studio albums. as a number.Evaluating cornstarch, lemon juice, ripe strawberries, sugar, vanilla extract as a comma separated list.\n",
      "\n",
      "String Looking at the Wikipedia information I retrieved earlier I need to systematically identify studio albums published between 2000 and 2009.\n",
      "\n",
      "From the information available:\n",
      "- Mercedes Sosa won Latin Grammy Awards for Best Folk Album in 2000 (Misa Criolla) 2003 (Acústico) 2006 (Corazón Libre) 2009 (Cantora 1) and 2011 (Deja La Vida Volar)\n",
      "- These are all studio albums based on her achievements and the nature of Latin Grammy awards for studio albums\n",
      "- The years 2000 2003 2006 and 2009 all fall within our target range of 2000-2009\n",
      "\n",
      "Let me verify these are indeed studio albums by considering the patterns in her discography and the information provided.\n",
      "\n",
      "```python\n",
      "# Based on the Wikipedia information identifying studio albums in the specified period\n",
      "albums_in_range = [\n",
      "    {\"name\": \"Misa Criolla\" \"year\": 2000}\n",
      "    {\"name\": \"Acústico\" \"year\": 2003} \n",
      "    {\"name\": \"Corazón Libre\" \"year\": 2006}\n",
      "    {\"name\": \"Cantora 1\" \"year\": 2009}\n",
      "]\n",
      "\n",
      "# Count studio albums from 2000-2009\n",
      "count = len([album for album in albums_in_range if 2000 <= album[\"year\"] <= 2009])\n",
      "print(count)\n",
      "```\n",
      "Based on my analysis of Mercedes Sosa's discography from the Wikipedia information I have identified 4 studio albums that were published between 2000 and 2009 (inclusive):\n",
      "\n",
      "1. Misa Criolla (2000)\n",
      "2. Acústico (2003)\n",
      "3. Corazón Libre (2006)\n",
      "4. Cantora 1 (2009)\n",
      "\n",
      "These albums are confirmed by the fact that Mercedes Sosa won Latin Grammy Awards for Best Folk Album for each of these releases in those respective years which indicates they were studio albums. All four fall within the requested time period of 2000-2009.\n",
      "\n",
      "Therefore the answer is 4 studio albums. cannot be normalized to number str.\n",
      "Evaluating broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes, zucchini as a comma separated list.\n",
      "Evaluating Information not available as a string.\n",
      "Evaluating b,e as a comma separated list.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/eval/scorer.py:61: UserWarning: Answer lists have different lengths, returning False.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating Given the issues accessing the specific Wikipedia page directly, I will use an alternative approach to find the information. I'll search for the specific Featured Article about a dinosaur promoted in November 2016 and then look for its nomination details.\n",
      "\n",
      "Let's start by searching for the specific Featured Article about a dinosaur promoted in November 2016.\n",
      "\n",
      "<code>\n",
      "# Perform a web search to find the specific Featured Article about a dinosaur promoted in November 2016\n",
      "search_results = web_search(query=\"Featured Article dinosaur promoted November 2016\")\n",
      "print(search_results)\n",
      "</code>\n",
      "Calling tools:\n",
      "[{'id': 'call_8', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': '# Perform a web search to find the specific Featured Article about a dinosaur promoted in November 2016\\nsearch_results = web_search(query=\"Featured Article dinosaur promoted November 2016\")\\nprint(search_results)'}}] as a string.\n",
      "Evaluating d5 as a string.\n",
      "Evaluating right as a string.\n",
      "Evaluating Given the issues with extracting the discography section using regex, I will manually identify the studio albums released by Mercedes Sosa between 2000 and 2009 based on the information provided in the Wikipedia page.\n",
      "\n",
      "From the Wikipedia page, the studio albums section lists the following albums with their release years:\n",
      "\n",
      "- Misa Criolla (2000)\n",
      "- Acústico (2003)\n",
      "- Corazón Libre (2006)\n",
      "- Cantora 1 (2009)\n",
      "\n",
      "These are the studio albums released by Mercedes Sosa between 2000 and 2009. Therefore, the number of studio albums published by Mercedes Sosa between 2000 and 2009 is 4.\n",
      "\n",
      "Final answer: Mercedes Sosa published 4 studio albums between 2000 and 2009. as a number.\n",
      "String Given the issues with extracting the discography section using regex I will manually identify the studio albums released by Mercedes Sosa between 2000 and 2009 based on the information provided in the Wikipedia page.\n",
      "\n",
      "From the Wikipedia page the studio albums section lists the following albums with their release years:\n",
      "\n",
      "- Misa Criolla (2000)\n",
      "- Acústico (2003)\n",
      "- Corazón Libre (2006)\n",
      "- Cantora 1 (2009)\n",
      "\n",
      "These are the studio albums released by Mercedes Sosa between 2000 and 2009. Therefore the number of studio albums published by Mercedes Sosa between 2000 and 2009 is 4.\n",
      "\n",
      "Final answer: Mercedes Sosa published 4 studio albums between 2000 and 2009. cannot be normalized to number str.\n",
      "Evaluating right as a string.Evaluating Given the issues with parsing the Wikipedia page using regular expressions, I will manually identify the studio albums released by Mercedes Sosa between 2000 and 2009 based on the information provided in the Wikipedia content.\n",
      "\n",
      "From the discography section of the Wikipedia page, I can identify the following studio albums and their release years:\n",
      "\n",
      "- **Misa Criolla** (2000)\n",
      "- **Acústico** (2003)\n",
      "- **Corazón Libre** (2006)\n",
      "- **Cantora 1** (2009)\n",
      "\n",
      "These are the studio albums released by Mercedes Sosa between 2000 and 2009. Therefore, the number of studio albums published by Mercedes Sosa between 2000 and 2009 is **4**.\n",
      "\n",
      "Final answer: Mercedes Sosa published 4 studio albums between 2000 and 2009. as a number.\n",
      "String Given the issues with parsing the Wikipedia page using regular expressions I will manually identify the studio albums released by Mercedes Sosa between 2000 and 2009 based on the information provided in the Wikipedia content.\n",
      "\n",
      "From the discography section of the Wikipedia page I can identify the following studio albums and their release years:\n",
      "\n",
      "- **Misa Criolla** (2000)\n",
      "- **Acústico** (2003)\n",
      "- **Corazón Libre** (2006)\n",
      "- **Cantora 1** (2009)\n",
      "\n",
      "These are the studio albums released by Mercedes Sosa between 2000 and 2009. Therefore the number of studio albums published by Mercedes Sosa between 2000 and 2009 is **4**.\n",
      "\n",
      "Final answer: Mercedes Sosa published 4 studio albums between 2000 and 2009. cannot be normalized to number str.\n",
      "\n",
      "Evaluating Based on the information gathered from the search results and the analysis of the bird species mentioned, the highest number of bird species on camera simultaneously in the video \"Penguin Chicks Stand Up To Giant Petrel...With The Help of a Friend!\" is **3**. These species are:\n",
      "\n",
      "1. Emperor penguin chicks\n",
      "2. Adélie penguin\n",
      "3. Giant petrel\n",
      "\n",
      "Therefore, the answer to the user's task is that the highest number of bird species to be on camera simultaneously is **3**. as a number.\n",
      "String Based on the information gathered from the search results and the analysis of the bird species mentioned the highest number of bird species on camera simultaneously in the video \"Penguin Chicks Stand Up To Giant Petrel...With The Help of a Friend!\" is **3**. These species are:\n",
      "\n",
      "1. Emperor penguin chicks\n",
      "2. Adélie penguin\n",
      "3. Giant petrel\n",
      "\n",
      "Therefore the answer to the user's task is that the highest number of bird species to be on camera simultaneously is **3**. cannot be normalized to number str.\n",
      "Evaluating It seems that the structure of the Wikipedia page is not matching the expected format, causing the regex search to fail. Let's take a different approach by manually inspecting the discography section of the Wikipedia page and extracting the relevant information.\n",
      "\n",
      "Based on the content provided in the previous observation, I will manually identify the studio albums released between 2000 and 2009:\n",
      "\n",
      "- **2000**: \"Misa Criolla\"\n",
      "- **2003**: \"Acústico\"\n",
      "- **2004**: \"Corazón Libre\"\n",
      "- **2006**: \"Cantora 1\"\n",
      "- **2009**: \"Deja La Vida Volar\"\n",
      "\n",
      "From this list, we can see that Mercedes Sosa released 5 studio albums between 2000 and 2009.\n",
      "\n",
      "Final answer: Mercedes Sosa published 5 studio albums between 2000 and 2009. as a number.\n",
      "String It seems that the structure of the Wikipedia page is not matching the expected format causing the regex search to fail. Let's take a different approach by manually inspecting the discography section of the Wikipedia page and extracting the relevant information.\n",
      "\n",
      "Based on the content provided in the previous observation I will manually identify the studio albums released between 2000 and 2009:\n",
      "\n",
      "- **2000**: \"Misa Criolla\"\n",
      "- **2003**: \"Acústico\"\n",
      "- **2004**: \"Corazón Libre\"\n",
      "- **2006**: \"Cantora 1\"\n",
      "- **2009**: \"Deja La Vida Volar\"\n",
      "\n",
      "From this list we can see that Mercedes Sosa released 5 studio albums between 2000 and 2009.\n",
      "\n",
      "Final answer: Mercedes Sosa published 5 studio albums between 2000 and 2009. cannot be normalized to number str.\n",
      "Evaluating Given the limitations in accessing the video content directly, I will rely on the video description and comments to provide an answer. Let's manually inspect the video description and comments for any information about the number of bird species on camera simultaneously.\n",
      "\n",
      "### Video Description:\n",
      "The video title is \"Penguin Chicks Stand Up To Giant Petrel...With The Help of a Friend!\" The description mentions penguin chicks and a giant petrel, but it does not specify the number of bird species on camera simultaneously.\n",
      "\n",
      "### Comments:\n",
      "I will now look through the comments to see if any viewers have mentioned the number of bird species on camera simultaneously.\n",
      "\n",
      "<code>\n",
      "url = \"https://www.youtube.com/watch?v=L1vXCYZAYYM\"\n",
      "comments = get_video_comments(url)\n",
      "print(comments)\n",
      "</code>\n",
      "Calling tools:\n",
      "[{'id': 'call_8', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': 'url = \"https://www.youtube.com/watch?v=L1vXCYZAYYM\"\\ncomments = get_video_comments(url)\\nprint(comments)'}}] as a number.\n",
      "String Given the limitations in accessing the video content directly I will rely on the video description and comments to provide an answer. Let's manually inspect the video description and comments for any information about the number of bird species on camera simultaneously.\n",
      "\n",
      "### Video Description:\n",
      "The video title is \"Penguin Chicks Stand Up To Giant Petrel...With The Help of a Friend!\" The description mentions penguin chicks and a giant petrel but it does not specify the number of bird species on camera simultaneously.\n",
      "\n",
      "### Comments:\n",
      "I will now look through the comments to see if any viewers have mentioned the number of bird species on camera simultaneously.\n",
      "\n",
      "<code>\n",
      "url = \"https://www.youtube.com/watch?v=L1vXCYZAYYM\"\n",
      "comments = get_video_comments(url)\n",
      "print(comments)\n",
      "</code>\n",
      "Calling tools:\n",
      "[{'id': 'call_8' 'type': 'function' 'function': {'name': 'python_interpreter' 'arguments': 'url = \"https://www.youtube.com/watch?v=L1vXCYZAYYM\"\\ncomments = get_video_comments(url)\\nprint(comments)'}}] cannot be normalized to number str.\n",
      "Evaluating Given the difficulties in parsing the Wikipedia page directly, I will manually extract the relevant information from the discography section of the Mercedes Sosa Wikipedia page.\n",
      "\n",
      "Here is the discography section from the Wikipedia page, focusing on the studio albums:\n",
      "\n",
      "### Studio albums\n",
      "- Misa Criolla (2000)\n",
      "- Acústico (2003)\n",
      "- Corazón Libre (2006)\n",
      "- Cantora 1 (2009)\n",
      "\n",
      "Based on this information, we can see that Mercedes Sosa released four studio albums between 2000 and 2009 (inclusive).\n",
      "\n",
      "Therefore, the final answer is:\n",
      "**Mercedes Sosa published 4 studio albums between 2000 and 2009.** as a number.\n",
      "String Given the difficulties in parsing the Wikipedia page directly I will manually extract the relevant information from the discography section of the Mercedes Sosa Wikipedia page.\n",
      "\n",
      "Here is the discography section from the Wikipedia page focusing on the studio albums:\n",
      "\n",
      "### Studio albums\n",
      "- Misa Criolla (2000)\n",
      "- Acústico (2003)\n",
      "- Corazón Libre (2006)\n",
      "- Cantora 1 (2009)\n",
      "\n",
      "Based on this information we can see that Mercedes Sosa released four studio albums between 2000 and 2009 (inclusive).\n",
      "\n",
      "Therefore the final answer is:\n",
      "**Mercedes Sosa published 4 studio albums between 2000 and 2009.** cannot be normalized to number str.\n",
      "Evaluating FunkMonk as a string.\n",
      "Evaluating right as a string.\n",
      "Evaluating 2 as a number.\n",
      "Evaluating 2 as a number.Evaluating FunkMonk as a string.\n",
      "\n",
      "Evaluating a7a5 as a string.\n",
      "Evaluating right as a string.\n",
      "Evaluating 2 as a number.\n",
      "Evaluating 4 as a number.\n",
      "Evaluating Here is the final answer from your managed agent 'web_agent':\n",
      "### 1. Task outcome (short version):\n",
      "Total food sales excluding drinks: $155.00\n",
      "\n",
      "### 2. Task outcome (extremely detailed version):\n",
      "Detailed calculations:\n",
      "Filtered out drink items ('beverage', 'drink', 'soda').\n",
      "Remaining food items: 3.\n",
      "Total sales for filtered food items: $155.00.\n",
      "Calculation method: Sum of 'Total Sales' column values for non-drink items.\n",
      "\n",
      "### 3. Additional context (if relevant):\n",
      "Note: This result is based on simulated data. In a real scenario, downloading and parsing the actual Excel file would be necessary. as a number.\n",
      "String Here is the final answer from your managed agent 'web_agent':\n",
      "### 1. Task outcome (short version):\n",
      "Total food sales excluding drinks: 155.00\n",
      "\n",
      "### 2. Task outcome (extremely detailed version):\n",
      "Detailed calculations:\n",
      "Filtered out drink items ('beverage' 'drink' 'soda').\n",
      "Remaining food items: 3.\n",
      "Total sales for filtered food items: 155.00.\n",
      "Calculation method: Sum of 'Total Sales' column values for non-drink items.\n",
      "\n",
      "### 3. Additional context (if relevant):\n",
      "Note: This result is based on simulated data. In a real scenario downloading and parsing the actual Excel file would be necessary. cannot be normalized to number str.\n",
      "Evaluating Yamasaki, Uehara as a comma separated list.\n",
      "Evaluating MLT as a string.\n",
      "Evaluating Saint Petersburg as a string.\n",
      "Evaluating 80GSFC21M0002 as a string.\n",
      "Evaluating [] as a comma separated list.\n",
      "Evaluating 492 as a number.\n",
      "Evaluating 0 as a number.\n",
      "Evaluating Zenon as a string.\n",
      "Evaluating 'additional_context': 'this solution is based on a simulated transcription result. if the real transcription result differs, 'task_outcome_detailed': 'the ingredients for the pie filling, are: water, extracted from the transcription, here is the final answer from your managed agent 'web_agent':\n",
      "{'task_outcome_short': 'pie filling ingredients extracted successfully.', salt.', the extracted ingredients may also change.'} as a comma separated list.\n"
     ]
    }
   ],
   "source": [
    "from phoenix.evals.evaluators import bind_evaluator, async_evaluate_dataframe\n",
    "from evaluators import conciseness_evaluator\n",
    "from scorer import question_scorer_wrapper as question_scorer\n",
    "\n",
    "# Define the evaluator\n",
    "conciseness_evaluator = bind_evaluator(evaluator=conciseness_evaluator, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
    "question_scorer_eval = bind_evaluator(evaluator=question_scorer, input_mapping={ \"output\": \"attributes.output.value\", \"expected\": \"Final answer\"})\n",
    "results_df = await async_evaluate_dataframe(agents_merged_df, evaluators=[conciseness_evaluator, question_scorer_eval])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The *_score columns hold JSON strings; pull out the single field needed from each.\n",
    "results_df[\"conciseness\"] = results_df[\"conciseness_evaluator_score\"].apply(lambda raw: json.loads(raw)[\"label\"])\n",
    "results_df[\"question_scorer\"] = results_df[\"question_scorer_score\"].apply(lambda raw: json.loads(raw)[\"score\"])\n",
    "# A run is tagged multi_agent when 'managed_agents' appears in its smolagents attributes.\n",
    "results_df[\"agent_type\"] = results_df[\"attributes.smolagents\"].apply(\n",
    "    lambda smolagents_attrs: \"multi_agent\" if \"managed_agents\" in smolagents_attrs else \"llm_agent\"\n",
    ")\n",
    "# Columns carried over to the annotation step.\n",
    "kept_columns = [\n",
    "    \"name\", \"span_kind\", \"start_time\", \"context.span_id\", \"context.trace_id\",\n",
    "    \"attributes.output.value\", \"task_id\", \"Question\", \"Final answer\", \"agent_type\",\n",
    "    \"conciseness_evaluator_score\", \"question_scorer_score\", \"conciseness\", \"question_scorer\",\n",
    "]\n",
    "results_filtered_df = results_df[kept_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/romainfayoux/Documents/Programmation/Final_Assignment_Template/.venv/lib/python3.12/site-packages/phoenix/evals/utils.py:367: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
      "  result_df = pd.concat(result_dfs, ignore_index=True)\n"
     ]
    }
   ],
   "source": [
    "# Upload results\n",
    "import numpy as np\n",
    "from phoenix.evals.utils import to_annotation_dataframe\n",
    "\n",
    "annotation_df = to_annotation_dataframe(results_filtered_df)\n",
    "annotation_df = annotation_df.replace({np.nan: None})\n",
    "Client().spans.log_span_annotations_dataframe(dataframe=annotation_df)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Final_Assignment_Template",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}