Deleted unused notebook
Browse files
notebooks/flatten_examples_analysis.ipynb
DELETED
|
@@ -1,568 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "markdown",
|
| 5 |
-
"id": "a1",
|
| 6 |
-
"metadata": {},
|
| 7 |
-
"source": [
|
| 8 |
-
"# Flatten to Examples — Inspection\n",
|
| 9 |
-
"\n",
|
| 10 |
-
"Visual walkthrough of what `flatten_to_examples` produces for each mode.\n",
|
| 11 |
-
"\n",
|
| 12 |
-
"**Structure**\n",
|
| 13 |
-
"1. Load Augmented Data\n",
|
| 14 |
-
"2. Marker Mode Examples\n",
|
| 15 |
-
"3. QA-M Mode Examples\n",
|
| 16 |
-
"4. QA-B Mode Examples\n",
|
| 17 |
-
"5. Example Counts Summary"
|
| 18 |
-
]
|
| 19 |
-
},
|
| 20 |
-
{
|
| 21 |
-
"cell_type": "code",
|
| 22 |
-
"id": "a2",
|
| 23 |
-
"metadata": {
|
| 24 |
-
"ExecuteTime": {
|
| 25 |
-
"end_time": "2026-04-19T00:12:35.836199Z",
|
| 26 |
-
"start_time": "2026-04-19T00:12:35.833236Z"
|
| 27 |
-
}
|
| 28 |
-
},
|
| 29 |
-
"source": [
|
| 30 |
-
"import os\n",
|
| 31 |
-
"import sys\n",
|
| 32 |
-
"from collections import Counter\n",
|
| 33 |
-
"\n",
|
| 34 |
-
"import pandas as pd\n",
|
| 35 |
-
"\n",
|
| 36 |
-
"sys.path.insert(0, os.path.abspath(\"..\"))\n",
|
| 37 |
-
"\n",
|
| 38 |
-
"from src.models.dataset import load_data, flatten_to_examples\n",
|
| 39 |
-
"from src.schemas.labels import SENTIMENT_LABELS"
|
| 40 |
-
],
|
| 41 |
-
"outputs": [],
|
| 42 |
-
"execution_count": 34
|
| 43 |
-
},
|
| 44 |
-
{
|
| 45 |
-
"cell_type": "markdown",
|
| 46 |
-
"id": "a3",
|
| 47 |
-
"metadata": {},
|
| 48 |
-
"source": [
|
| 49 |
-
"## 1. Load Augmented Data"
|
| 50 |
-
]
|
| 51 |
-
},
|
| 52 |
-
{
|
| 53 |
-
"cell_type": "code",
|
| 54 |
-
"id": "a4",
|
| 55 |
-
"metadata": {
|
| 56 |
-
"ExecuteTime": {
|
| 57 |
-
"end_time": "2026-04-19T00:12:35.991309Z",
|
| 58 |
-
"start_time": "2026-04-19T00:12:35.845709Z"
|
| 59 |
-
}
|
| 60 |
-
},
|
| 61 |
-
"source": [
|
| 62 |
-
"samples = load_data(os.path.join(\"..\", \"data\", \"data_augmented_256.jsonl\"))\n",
|
| 63 |
-
"print(f\"Loaded {len(samples)} samples\")\n",
|
| 64 |
-
"print(f\"First sample has {len(samples[0]['entities'])} entities\")\n",
|
| 65 |
-
"print(f\"First entity has {len(samples[0]['entities'][0]['positions'])} positions\")"
|
| 66 |
-
],
|
| 67 |
-
"outputs": [
|
| 68 |
-
{
|
| 69 |
-
"name": "stdout",
|
| 70 |
-
"output_type": "stream",
|
| 71 |
-
"text": [
|
| 72 |
-
"Loaded 1629 samples\n",
|
| 73 |
-
"First sample has 3 entities\n",
|
| 74 |
-
"First entity has 1 positions\n"
|
| 75 |
-
]
|
| 76 |
-
}
|
| 77 |
-
],
|
| 78 |
-
"execution_count": 35
|
| 79 |
-
},
|
| 80 |
-
{
|
| 81 |
-
"cell_type": "markdown",
|
| 82 |
-
"id": "a5",
|
| 83 |
-
"metadata": {},
|
| 84 |
-
"source": [
|
| 85 |
-
"### Raw structure of one entity (before flattening)"
|
| 86 |
-
]
|
| 87 |
-
},
|
| 88 |
-
{
|
| 89 |
-
"cell_type": "code",
|
| 90 |
-
"id": "a6",
|
| 91 |
-
"metadata": {
|
| 92 |
-
"ExecuteTime": {
|
| 93 |
-
"end_time": "2026-04-19T00:12:36.000280Z",
|
| 94 |
-
"start_time": "2026-04-19T00:12:35.997489Z"
|
| 95 |
-
}
|
| 96 |
-
},
|
| 97 |
-
"source": [
|
| 98 |
-
"s = samples[0]\n",
|
| 99 |
-
"e = s[\"entities\"][0]\n",
|
| 100 |
-
"p = e[\"positions\"][0]\n",
|
| 101 |
-
"\n",
|
| 102 |
-
"print(f\"Sample ID: {s['id']}\")\n",
|
| 103 |
-
"print(f\"Entity: {e['entity_text']} ({e['entity_type']})\")\n",
|
| 104 |
-
"print(f\"Label: {e['label']}\")\n",
|
| 105 |
-
"print(f\"Positions: {len(e['positions'])}\")\n",
|
| 106 |
-
"print()\n",
|
| 107 |
-
"print(\"Position fields:\")\n",
|
| 108 |
-
"for key in p:\n",
|
| 109 |
-
" val = p[key]\n",
|
| 110 |
-
" if isinstance(val, str) and len(val) > 80:\n",
|
| 111 |
-
" val = val[:80] + \"...\"\n",
|
| 112 |
-
" print(f\" {key}: {val}\")"
|
| 113 |
-
],
|
| 114 |
-
"outputs": [
|
| 115 |
-
{
|
| 116 |
-
"name": "stdout",
|
| 117 |
-
"output_type": "stream",
|
| 118 |
-
"text": [
|
| 119 |
-
"Sample ID: 0\n",
|
| 120 |
-
"Entity: Verge (company)\n",
|
| 121 |
-
"Label: neutral\n",
|
| 122 |
-
"Positions: 1\n",
|
| 123 |
-
"\n",
|
| 124 |
-
"Position fields:\n",
|
| 125 |
-
" position_text: Verge\n",
|
| 126 |
-
" offset: 2082\n",
|
| 127 |
-
" length: 5\n",
|
| 128 |
-
" entity_centered_window: if the companies don't comply with the law. The two mole and skin tag removal pr...\n",
|
| 129 |
-
" marker_text: if the companies don't comply with the law. The two mole and skin tag removal pr...\n",
|
| 130 |
-
" qa_m_question: What do you think of the sentiment of the company Verge ?\n",
|
| 131 |
-
" qa_b_hypotheses: {'negative': 'The polarity of the company Verge is negative .', 'neutral': 'The polarity of the company Verge is neutral .', 'positive': 'The polarity of the company Verge is positive .'}\n"
|
| 132 |
-
]
|
| 133 |
-
}
|
| 134 |
-
],
|
| 135 |
-
"execution_count": 36
|
| 136 |
-
},
|
| 137 |
-
{
|
| 138 |
-
"cell_type": "code",
|
| 139 |
-
"id": "99904016",
|
| 140 |
-
"source": "print(\"All positions for first few entities:\\n\")\nfor s in samples[:2]:\n print(f\"Sample {s['id']}:\")\n for e in s[\"entities\"]:\n print(f\" Entity: {e['entity_text']} ({e['entity_type']}) — label: {e.get('label', 'N/A')}\")\n for j, p in enumerate(e[\"positions\"]):\n print(f\" Position {j}: offset={p['offset']}, length={p['length']}, text=\\\"{p['position_text']}\\\"\")\n print()",
|
| 141 |
-
"metadata": {
|
| 142 |
-
"ExecuteTime": {
|
| 143 |
-
"end_time": "2026-04-19T00:12:36.009947Z",
|
| 144 |
-
"start_time": "2026-04-19T00:12:36.007896Z"
|
| 145 |
-
}
|
| 146 |
-
},
|
| 147 |
-
"outputs": [
|
| 148 |
-
{
|
| 149 |
-
"name": "stdout",
|
| 150 |
-
"output_type": "stream",
|
| 151 |
-
"text": [
|
| 152 |
-
"All positions for first few entities:\n",
|
| 153 |
-
"\n",
|
| 154 |
-
"Sample 0:\n",
|
| 155 |
-
" Entity: Verge (company) — label: neutral\n",
|
| 156 |
-
" Position 0: offset=2082, length=5, text=\"Verge\"\n",
|
| 157 |
-
" Entity: Amazon (company) — label: negative\n",
|
| 158 |
-
" Position 0: offset=0, length=6, text=\"Amazon\"\n",
|
| 159 |
-
" Position 1: offset=121, length=6, text=\"Amazon\"\n",
|
| 160 |
-
" Position 2: offset=449, length=6, text=\"Amazon\"\n",
|
| 161 |
-
" Position 3: offset=556, length=6, text=\"Amazon\"\n",
|
| 162 |
-
" Position 4: offset=1689, length=6, text=\"Amazon\"\n",
|
| 163 |
-
" Position 5: offset=1798, length=6, text=\"Amazon\"\n",
|
| 164 |
-
" Position 6: offset=1848, length=6, text=\"Amazon\"\n",
|
| 165 |
-
" Position 7: offset=2097, length=6, text=\"Amazon\"\n",
|
| 166 |
-
" Entity: US (location) — label: neutral\n",
|
| 167 |
-
" Position 0: offset=1201, length=2, text=\"US\"\n",
|
| 168 |
-
"\n",
|
| 169 |
-
"Sample 1:\n",
|
| 170 |
-
" Entity: Sputnik International (company) — label: neutral\n",
|
| 171 |
-
" Position 0: offset=79, length=21, text=\"Sputnik International\"\n",
|
| 172 |
-
" Entity: Kremlin (company) — label: neutral\n",
|
| 173 |
-
" Position 0: offset=378, length=7, text=\"Kremlin\"\n",
|
| 174 |
-
" Position 1: offset=1126, length=7, text=\"Kremlin\"\n",
|
| 175 |
-
" Entity: Russian (company) — label: neutral\n",
|
| 176 |
-
" Position 0: offset=336, length=7, text=\"Russian\"\n",
|
| 177 |
-
" Entity: Foreign Ministry (company) — label: neutral\n",
|
| 178 |
-
" Position 0: offset=344, length=16, text=\"Foreign Ministry\"\n",
|
| 179 |
-
" Position 1: offset=1024, length=16, text=\"Foreign Ministry\"\n",
|
| 180 |
-
" Position 2: offset=2293, length=16, text=\"Foreign Ministry\"\n",
|
| 181 |
-
" Entity: FBI (company) — label: neutral\n",
|
| 182 |
-
" Position 0: offset=1591, length=3, text=\"FBI\"\n",
|
| 183 |
-
" Entity: SolarWinds (company) — label: neutral\n",
|
| 184 |
-
" Position 0: offset=271, length=10, text=\"SolarWinds\"\n",
|
| 185 |
-
" Position 1: offset=2767, length=10, text=\"SolarWinds\"\n",
|
| 186 |
-
" Entity: CIA (company) — label: neutral\n",
|
| 187 |
-
" Position 0: offset=1993, length=3, text=\"CIA\"\n",
|
| 188 |
-
" Entity: Federal Bureau of Prisons (company) — label: neutral\n",
|
| 189 |
-
" Position 0: offset=1723, length=25, text=\"Federal Bureau of Prisons\"\n",
|
| 190 |
-
" Entity: Sputnik News (company) — label: neutral\n",
|
| 191 |
-
" Position 0: offset=55, length=12, text=\"Sputnik News\"\n",
|
| 192 |
-
"\n"
|
| 193 |
-
]
|
| 194 |
-
}
|
| 195 |
-
],
|
| 196 |
-
"execution_count": 37
|
| 197 |
-
},
|
| 198 |
-
{
|
| 199 |
-
"cell_type": "markdown",
|
| 200 |
-
"id": "a7",
|
| 201 |
-
"metadata": {},
|
| 202 |
-
"source": [
|
| 203 |
-
"## 2. Marker Mode Examples\n",
|
| 204 |
-
"\n",
|
| 205 |
-
"One example per position. `seg_a` = entity wrapped with `[E]...[/E]`, `seg_b` = None."
|
| 206 |
-
]
|
| 207 |
-
},
|
| 208 |
-
{
|
| 209 |
-
"cell_type": "code",
|
| 210 |
-
"id": "a8",
|
| 211 |
-
"metadata": {
|
| 212 |
-
"ExecuteTime": {
|
| 213 |
-
"end_time": "2026-04-19T00:12:36.032677Z",
|
| 214 |
-
"start_time": "2026-04-19T00:12:36.018233Z"
|
| 215 |
-
}
|
| 216 |
-
},
|
| 217 |
-
"source": "marker_exs = flatten_to_examples(samples, mode=\"marker\")\nprint(f\"Total marker examples: {len(marker_exs)}\")\nprint()",
|
| 218 |
-
"outputs": [
|
| 219 |
-
{
|
| 220 |
-
"name": "stdout",
|
| 221 |
-
"output_type": "stream",
|
| 222 |
-
"text": [
|
| 223 |
-
"Total marker examples: 26776\n",
|
| 224 |
-
"\n"
|
| 225 |
-
]
|
| 226 |
-
}
|
| 227 |
-
],
|
| 228 |
-
"execution_count": 38
|
| 229 |
-
},
|
| 230 |
-
{
|
| 231 |
-
"cell_type": "code",
|
| 232 |
-
"id": "a9",
|
| 233 |
-
"metadata": {
|
| 234 |
-
"ExecuteTime": {
|
| 235 |
-
"end_time": "2026-04-19T00:12:36.040278Z",
|
| 236 |
-
"start_time": "2026-04-19T00:12:36.038083Z"
|
| 237 |
-
}
|
| 238 |
-
},
|
| 239 |
-
"source": [
|
| 240 |
-
"for i, ex in enumerate(marker_exs[:3]):\n",
|
| 241 |
-
" print(f\"--- Example {i} ---\")\n",
|
| 242 |
-
" print(f\" entity: {ex['entity_text']} ({ex['entity_type']})\")\n",
|
| 243 |
-
" print(f\" label: {SENTIMENT_LABELS.id2label[ex['label']]} (id={ex['label']})\")\n",
|
| 244 |
-
" print(f\" seg_a: {ex['seg_a']}...\")\n",
|
| 245 |
-
" print(f\" seg_b: {ex['seg_b']}\")\n",
|
| 246 |
-
" print()"
|
| 247 |
-
],
|
| 248 |
-
"outputs": [
|
| 249 |
-
{
|
| 250 |
-
"name": "stdout",
|
| 251 |
-
"output_type": "stream",
|
| 252 |
-
"text": [
|
| 253 |
-
"--- Example 0 ---\n",
|
| 254 |
-
" entity: Verge (company)\n",
|
| 255 |
-
" label: neutral (id=1)\n",
|
| 256 |
-
" seg_a: if the companies don't comply with the law. The two mole and skin tag removal products appear to no longer be available on Amazon's website. But there are still multiple other mole and skin tag removal serums and creams for sale on Amazon, according to a search for \"mole remover.\" Amazon has received warnings from the FDA before. In 2021, the FDA sent the company an untitled letter (a step below a warning letter), saying that the sale of sexual enhancement and weight loss products violated the law. Source: The [E] Verge [/E] The post Amazon sold unauthorized mole removers, and the FDA isn't happy about it appeared first on Trend Fool ....\n",
|
| 257 |
-
" seg_b: None\n",
|
| 258 |
-
"\n",
|
| 259 |
-
"--- Example 1 ---\n",
|
| 260 |
-
" entity: Amazon (company)\n",
|
| 261 |
-
" label: negative (id=0)\n",
|
| 262 |
-
" seg_a: [E] Amazon [/E] sold unauthorized mole removers, and the FDA isn't happy about it Unauthorized mole and skin tag removers sold on Amazon put the company in the crosshairs of the Food and Drug Administration, which sent a warning letter to the retail giant this month asking that it remove the products from its website. There are no authorized over-the-counter drugs that remove moles or skin tags, the FDA said in its warning letter, which was addressed to Amazon CEO Andy Jassy. As part of its research, the agency says it bought two...\n",
|
| 263 |
-
" seg_b: None\n",
|
| 264 |
-
"\n",
|
| 265 |
-
"--- Example 2 ---\n",
|
| 266 |
-
" entity: Amazon (company)\n",
|
| 267 |
-
" label: negative (id=0)\n",
|
| 268 |
-
" seg_a: Amazon sold unauthorized mole removers, and the FDA isn't happy about it Unauthorized mole and skin tag removers sold on [E] Amazon [/E] put the company in the crosshairs of the Food and Drug Administration, which sent a warning letter to the retail giant this month asking that it remove the products from its website. There are no authorized over-the-counter drugs that remove moles or skin tags, the FDA said in its warning letter, which was addressed to Amazon CEO Andy Jassy. As part of its research, the agency says it bought two of the offending products on Amazon: the \"Deisana Skin Tag Remover, Mole Remover and Repair Gel Set\" and the \"Skincell...\n",
|
| 269 |
-
" seg_b: None\n",
|
| 270 |
-
"\n"
|
| 271 |
-
]
|
| 272 |
-
}
|
| 273 |
-
],
|
| 274 |
-
"execution_count": 39
|
| 275 |
-
},
|
| 276 |
-
{
|
| 277 |
-
"cell_type": "markdown",
|
| 278 |
-
"id": "a10",
|
| 279 |
-
"metadata": {},
|
| 280 |
-
"source": [
|
| 281 |
-
"## 3. QA-M Mode Examples\n",
|
| 282 |
-
"\n",
|
| 283 |
-
"One example per position. `seg_a` = context window, `seg_b` = question about the entity."
|
| 284 |
-
]
|
| 285 |
-
},
|
| 286 |
-
{
|
| 287 |
-
"cell_type": "code",
|
| 288 |
-
"id": "a11",
|
| 289 |
-
"metadata": {
|
| 290 |
-
"ExecuteTime": {
|
| 291 |
-
"end_time": "2026-04-19T00:12:36.119437Z",
|
| 292 |
-
"start_time": "2026-04-19T00:12:36.049072Z"
|
| 293 |
-
}
|
| 294 |
-
},
|
| 295 |
-
"source": "qa_m_exs = flatten_to_examples(samples, mode=\"qa_m\")\nprint(f\"Total qa_m examples: {len(qa_m_exs)}\")\nprint()",
|
| 296 |
-
"outputs": [
|
| 297 |
-
{
|
| 298 |
-
"name": "stdout",
|
| 299 |
-
"output_type": "stream",
|
| 300 |
-
"text": [
|
| 301 |
-
"Total qa_m examples: 26776\n",
|
| 302 |
-
"\n"
|
| 303 |
-
]
|
| 304 |
-
}
|
| 305 |
-
],
|
| 306 |
-
"execution_count": 40
|
| 307 |
-
},
|
| 308 |
-
{
|
| 309 |
-
"cell_type": "code",
|
| 310 |
-
"id": "a12",
|
| 311 |
-
"metadata": {
|
| 312 |
-
"ExecuteTime": {
|
| 313 |
-
"end_time": "2026-04-19T00:12:36.130551Z",
|
| 314 |
-
"start_time": "2026-04-19T00:12:36.128443Z"
|
| 315 |
-
}
|
| 316 |
-
},
|
| 317 |
-
"source": [
|
| 318 |
-
"for i, ex in enumerate(qa_m_exs[:3]):\n",
|
| 319 |
-
" print(f\"--- Example {i} ---\")\n",
|
| 320 |
-
" print(f\" entity: {ex['entity_text']} ({ex['entity_type']})\")\n",
|
| 321 |
-
" print(f\" label: {SENTIMENT_LABELS.id2label[ex['label']]} (id={ex['label']})\")\n",
|
| 322 |
-
" print(f\" seg_a: {ex['seg_a']}...\")\n",
|
| 323 |
-
" print(f\" seg_b: {ex['seg_b']}\")\n",
|
| 324 |
-
" print()"
|
| 325 |
-
],
|
| 326 |
-
"outputs": [
|
| 327 |
-
{
|
| 328 |
-
"name": "stdout",
|
| 329 |
-
"output_type": "stream",
|
| 330 |
-
"text": [
|
| 331 |
-
"--- Example 0 ---\n",
|
| 332 |
-
" entity: Verge (company)\n",
|
| 333 |
-
" label: neutral (id=1)\n",
|
| 334 |
-
" seg_a: if the companies don't comply with the law. The two mole and skin tag removal products appear to no longer be available on Amazon's website. But there are still multiple other mole and skin tag removal serums and creams for sale on Amazon, according to a search for \"mole remover.\" Amazon has received warnings from the FDA before. In 2021, the FDA sent the company an untitled letter (a step below a warning letter), saying that the sale of sexual enhancement and weight loss products violated the law. Source: The Verge The post Amazon sold unauthorized mole removers, and the FDA isn't happy about it appeared first on Trend Fool ....\n",
|
| 335 |
-
" seg_b: What do you think of the sentiment of the company Verge ?\n",
|
| 336 |
-
"\n",
|
| 337 |
-
"--- Example 1 ---\n",
|
| 338 |
-
" entity: Amazon (company)\n",
|
| 339 |
-
" label: negative (id=0)\n",
|
| 340 |
-
" seg_a: Amazon sold unauthorized mole removers, and the FDA isn't happy about it\n",
|
| 341 |
-
"Unauthorized mole and skin tag removers sold on Amazon put the company in the crosshairs of the Food and Drug Administration, which sent a warning letter to the retail giant this month asking that it remove the products from its website. There are no authorized over-the-counter drugs that remove moles or skin tags, the FDA said in its warning letter, which was addressed to Amazon CEO Andy Jassy. As part of its research, the agency says it bought two...\n",
|
| 342 |
-
" seg_b: What do you think of the sentiment of the company Amazon ?\n",
|
| 343 |
-
"\n",
|
| 344 |
-
"--- Example 2 ---\n",
|
| 345 |
-
" entity: Amazon (company)\n",
|
| 346 |
-
" label: negative (id=0)\n",
|
| 347 |
-
" seg_a: Amazon sold unauthorized mole removers, and the FDA isn't happy about it\n",
|
| 348 |
-
"Unauthorized mole and skin tag removers sold on Amazon put the company in the crosshairs of the Food and Drug Administration, which sent a warning letter to the retail giant this month asking that it remove the products from its website. There are no authorized over-the-counter drugs that remove moles or skin tags, the FDA said in its warning letter, which was addressed to Amazon CEO Andy Jassy. As part of its research, the agency says it bought two of the offending products on Amazon: the \"Deisana Skin Tag Remover, Mole Remover and Repair Gel Set\" and the \"Skincell...\n",
|
| 349 |
-
" seg_b: What do you think of the sentiment of the company Amazon ?\n",
|
| 350 |
-
"\n"
|
| 351 |
-
]
|
| 352 |
-
}
|
| 353 |
-
],
|
| 354 |
-
"execution_count": 41
|
| 355 |
-
},
|
| 356 |
-
{
|
| 357 |
-
"cell_type": "markdown",
|
| 358 |
-
"id": "a13",
|
| 359 |
-
"metadata": {},
|
| 360 |
-
"source": [
|
| 361 |
-
"## 4. QA-B Mode Examples\n",
|
| 362 |
-
"\n",
|
| 363 |
-
"**Three** examples per position — one per sentiment hypothesis. Labels are binary (1 = correct sentiment, 0 = incorrect). Triplets are always in order: negative, neutral, positive."
|
| 364 |
-
]
|
| 365 |
-
},
|
| 366 |
-
{
|
| 367 |
-
"cell_type": "code",
|
| 368 |
-
"id": "a14",
|
| 369 |
-
"metadata": {
|
| 370 |
-
"ExecuteTime": {
|
| 371 |
-
"end_time": "2026-04-19T00:12:36.226209Z",
|
| 372 |
-
"start_time": "2026-04-19T00:12:36.198371Z"
|
| 373 |
-
}
|
| 374 |
-
},
|
| 375 |
-
"source": "qa_b_exs = flatten_to_examples(samples, mode=\"qa_b\")\nprint(f\"Total qa_b examples: {len(qa_b_exs)}\")\nprint(f\" (= {len(qa_b_exs) // 3} triplets x 3 sentiments)\")\nprint()",
|
| 376 |
-
"outputs": [
|
| 377 |
-
{
|
| 378 |
-
"name": "stdout",
|
| 379 |
-
"output_type": "stream",
|
| 380 |
-
"text": [
|
| 381 |
-
"Total qa_b examples: 80328\n",
|
| 382 |
-
" (= 26776 triplets x 3 sentiments)\n",
|
| 383 |
-
"\n"
|
| 384 |
-
]
|
| 385 |
-
}
|
| 386 |
-
],
|
| 387 |
-
"execution_count": 42
|
| 388 |
-
},
|
| 389 |
-
{
|
| 390 |
-
"cell_type": "code",
|
| 391 |
-
"id": "a15",
|
| 392 |
-
"metadata": {
|
| 393 |
-
"ExecuteTime": {
|
| 394 |
-
"end_time": "2026-04-19T00:12:36.236907Z",
|
| 395 |
-
"start_time": "2026-04-19T00:12:36.234978Z"
|
| 396 |
-
}
|
| 397 |
-
},
|
| 398 |
-
"source": [
|
| 399 |
-
"print(\"First triplet (3 examples for one entity-position pair):\")\n",
|
| 400 |
-
"print()\n",
|
| 401 |
-
"for i, ex in enumerate(qa_b_exs[:3]):\n",
|
| 402 |
-
" print(f\"--- Triplet example {i} ({ex['sentiment']}) ---\")\n",
|
| 403 |
-
" print(f\" entity: {ex['entity_text']} ({ex['entity_type']})\")\n",
|
| 404 |
-
" print(f\" sentiment: {ex['sentiment']}\")\n",
|
| 405 |
-
" print(f\" label: {ex['label']} ({'yes' if ex['label'] == 1 else 'no'})\")\n",
|
| 406 |
-
" print(f\" seg_a: {ex['seg_a'][:80]}...\")\n",
|
| 407 |
-
" print(f\" seg_b: {ex['seg_b']}\")\n",
|
| 408 |
-
" print()"
|
| 409 |
-
],
|
| 410 |
-
"outputs": [
|
| 411 |
-
{
|
| 412 |
-
"name": "stdout",
|
| 413 |
-
"output_type": "stream",
|
| 414 |
-
"text": [
|
| 415 |
-
"First triplet (3 examples for one entity-position pair):\n",
|
| 416 |
-
"\n",
|
| 417 |
-
"--- Triplet example 0 (negative) ---\n",
|
| 418 |
-
" entity: Verge (company)\n",
|
| 419 |
-
" sentiment: negative\n",
|
| 420 |
-
" label: 0 (no)\n",
|
| 421 |
-
" seg_a: if the companies don't comply with the law. The two mole and skin tag removal pr...\n",
|
| 422 |
-
" seg_b: The polarity of the company Verge is negative .\n",
|
| 423 |
-
"\n",
|
| 424 |
-
"--- Triplet example 1 (neutral) ---\n",
|
| 425 |
-
" entity: Verge (company)\n",
|
| 426 |
-
" sentiment: neutral\n",
|
| 427 |
-
" label: 1 (yes)\n",
|
| 428 |
-
" seg_a: if the companies don't comply with the law. The two mole and skin tag removal pr...\n",
|
| 429 |
-
" seg_b: The polarity of the company Verge is neutral .\n",
|
| 430 |
-
"\n",
|
| 431 |
-
"--- Triplet example 2 (positive) ---\n",
|
| 432 |
-
" entity: Verge (company)\n",
|
| 433 |
-
" sentiment: positive\n",
|
| 434 |
-
" label: 0 (no)\n",
|
| 435 |
-
" seg_a: if the companies don't comply with the law. The two mole and skin tag removal pr...\n",
|
| 436 |
-
" seg_b: The polarity of the company Verge is positive .\n",
|
| 437 |
-
"\n"
|
| 438 |
-
]
|
| 439 |
-
}
|
| 440 |
-
],
|
| 441 |
-
"execution_count": 43
|
| 442 |
-
},
|
| 443 |
-
{
|
| 444 |
-
"cell_type": "code",
|
| 445 |
-
"id": "a16",
|
| 446 |
-
"metadata": {
|
| 447 |
-
"ExecuteTime": {
|
| 448 |
-
"end_time": "2026-04-19T00:12:36.251498Z",
|
| 449 |
-
"start_time": "2026-04-19T00:12:36.249261Z"
|
| 450 |
-
}
|
| 451 |
-
},
|
| 452 |
-
"source": [
|
| 453 |
-
"print(\"Second triplet (different entity/position):\")\n",
|
| 454 |
-
"print()\n",
|
| 455 |
-
"for i, ex in enumerate(qa_b_exs[3:6]):\n",
|
| 456 |
-
" print(f\"--- Triplet example {i} ({ex['sentiment']}) ---\")\n",
|
| 457 |
-
" print(f\" entity: {ex['entity_text']}\")\n",
|
| 458 |
-
" print(f\" sentiment: {ex['sentiment']}\")\n",
|
| 459 |
-
" print(f\" label: {ex['label']} ({'yes' if ex['label'] == 1 else 'no'})\")\n",
|
| 460 |
-
" print(f\" seg_b: {ex['seg_b']}\")\n",
|
| 461 |
-
" print()"
|
| 462 |
-
],
|
| 463 |
-
"outputs": [
|
| 464 |
-
{
|
| 465 |
-
"name": "stdout",
|
| 466 |
-
"output_type": "stream",
|
| 467 |
-
"text": [
|
| 468 |
-
"Second triplet (different entity/position):\n",
|
| 469 |
-
"\n",
|
| 470 |
-
"--- Triplet example 0 (negative) ---\n",
|
| 471 |
-
" entity: Amazon\n",
|
| 472 |
-
" sentiment: negative\n",
|
| 473 |
-
" label: 1 (yes)\n",
|
| 474 |
-
" seg_b: The polarity of the company Amazon is negative .\n",
|
| 475 |
-
"\n",
|
| 476 |
-
"--- Triplet example 1 (neutral) ---\n",
|
| 477 |
-
" entity: Amazon\n",
|
| 478 |
-
" sentiment: neutral\n",
|
| 479 |
-
" label: 0 (no)\n",
|
| 480 |
-
" seg_b: The polarity of the company Amazon is neutral .\n",
|
| 481 |
-
"\n",
|
| 482 |
-
"--- Triplet example 2 (positive) ---\n",
|
| 483 |
-
" entity: Amazon\n",
|
| 484 |
-
" sentiment: positive\n",
|
| 485 |
-
" label: 0 (no)\n",
|
| 486 |
-
" seg_b: The polarity of the company Amazon is positive .\n",
|
| 487 |
-
"\n"
|
| 488 |
-
]
|
| 489 |
-
}
|
| 490 |
-
],
|
| 491 |
-
"execution_count": 44
|
| 492 |
-
},
|
| 493 |
-
{
|
| 494 |
-
"cell_type": "markdown",
|
| 495 |
-
"id": "a17",
|
| 496 |
-
"metadata": {},
|
| 497 |
-
"source": [
|
| 498 |
-
"## 5. Example Counts Summary"
|
| 499 |
-
]
|
| 500 |
-
},
|
| 501 |
-
{
|
| 502 |
-
"cell_type": "code",
|
| 503 |
-
"id": "a18",
|
| 504 |
-
"metadata": {
|
| 505 |
-
"ExecuteTime": {
|
| 506 |
-
"end_time": "2026-04-19T00:12:36.270922Z",
|
| 507 |
-
"start_time": "2026-04-19T00:12:36.260121Z"
|
| 508 |
-
}
|
| 509 |
-
},
|
| 510 |
-
"source": [
|
| 511 |
-
"total_positions = sum(\n",
|
| 512 |
-
" len(p)\n",
|
| 513 |
-
" for s in samples\n",
|
| 514 |
-
" for e in s[\"entities\"]\n",
|
| 515 |
-
" for p in [e[\"positions\"]]\n",
|
| 516 |
-
")\n",
|
| 517 |
-
"total_entities = sum(len(s[\"entities\"]) for s in samples)\n",
|
| 518 |
-
"\n",
|
| 519 |
-
"print(f\"Samples: {len(samples)}\")\n",
|
| 520 |
-
"print(f\"Entities: {total_entities}\")\n",
|
| 521 |
-
"print(f\"Positions: {total_positions}\")\n",
|
| 522 |
-
"print()\n",
|
| 523 |
-
"print(f\"Marker examples: {len(marker_exs):>6} (1 per position)\")\n",
|
| 524 |
-
"print(f\"QA-M examples: {len(qa_m_exs):>6} (1 per position)\")\n",
|
| 525 |
-
"print(f\"QA-B examples: {len(qa_b_exs):>6} (3 per position)\")\n",
|
| 526 |
-
"print()\n",
|
| 527 |
-
"print(\"Label distributions:\")\n",
|
| 528 |
-
"print(f\" Marker: {dict(Counter(SENTIMENT_LABELS.id2label[e['label']] for e in marker_exs))}\")\n",
|
| 529 |
-
"print(f\" QA-M: {dict(Counter(SENTIMENT_LABELS.id2label[e['label']] for e in qa_m_exs))}\")\n",
|
| 530 |
-
"print(f\" QA-B: yes={sum(1 for e in qa_b_exs if e['label']==1)} no={sum(1 for e in qa_b_exs if e['label']==0)}\")"
|
| 531 |
-
],
|
| 532 |
-
"outputs": [
|
| 533 |
-
{
|
| 534 |
-
"name": "stdout",
|
| 535 |
-
"output_type": "stream",
|
| 536 |
-
"text": [
|
| 537 |
-
"Samples: 1629\n",
|
| 538 |
-
"Entities: 10550\n",
|
| 539 |
-
"Positions: 26776\n",
|
| 540 |
-
"\n",
|
| 541 |
-
"Marker examples: 26776 (1 per position)\n",
|
| 542 |
-
"QA-M examples: 26776 (1 per position)\n",
|
| 543 |
-
"QA-B examples: 80328 (3 per position)\n",
|
| 544 |
-
"\n",
|
| 545 |
-
"Label distributions:\n",
|
| 546 |
-
" Marker: {'neutral': 10905, 'negative': 7854, 'positive': 8017}\n",
|
| 547 |
-
" QA-M: {'neutral': 10905, 'negative': 7854, 'positive': 8017}\n",
|
| 548 |
-
" QA-B: yes=26776 no=53552\n"
|
| 549 |
-
]
|
| 550 |
-
}
|
| 551 |
-
],
|
| 552 |
-
"execution_count": 45
|
| 553 |
-
}
|
| 554 |
-
],
|
| 555 |
-
"metadata": {
|
| 556 |
-
"kernelspec": {
|
| 557 |
-
"display_name": "Python 3",
|
| 558 |
-
"language": "python",
|
| 559 |
-
"name": "python3"
|
| 560 |
-
},
|
| 561 |
-
"language_info": {
|
| 562 |
-
"name": "python",
|
| 563 |
-
"version": "3.11.0"
|
| 564 |
-
}
|
| 565 |
-
},
|
| 566 |
-
"nbformat": 4,
|
| 567 |
-
"nbformat_minor": 5
|
| 568 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|