Spaces:

lamossta
/

sv-task

Sleeping

File size: 69,683 Bytes
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Preprocessing Analysis\n",
    "\n",
    "Data hygiene checks for entity-level sentiment classification.\n",
    "\n",
    "**Structure**\n",
    "1. Load Data\n",
    "2. Dataset Overview\n",
    "3. Duplicate Texts\n",
    "4. Entity Deduplication Pipeline\n",
    "5. Label Validity\n",
    "6. Position Text Mismatches\n",
    "7. HTML Tags\n",
    "8. Preprocessing Summary"
   ],
   "id": "8cdbbbb3574744ab"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.500521Z",
     "start_time": "2026-04-18T19:53:18.495644Z"
    }
   },
   "source": [
    "import os\n",
    "import json\n",
    "import warnings\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.patches as mpatches\n",
    "import seaborn as sns\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "sns.set_theme(style=\"whitegrid\", palette=\"muted\")\n",
    "plt.rcParams[\"figure.dpi\"]   = 120\n",
    "plt.rcParams[\"axes.titlesize\"] = 13\n",
    "plt.rcParams[\"axes.labelsize\"] = 11\n",
    "\n",
    "VALID_LABELS  = {\"positive\", \"neutral\", \"negative\"}\n",
    "LABEL_ORDER   = [\"negative\", \"neutral\", \"positive\"]\n",
    "LABEL_COLORS  = {\"positive\": \"#4CAF50\", \"neutral\": \"#90A4AE\", \"negative\": \"#EF5350\"}"
   ],
   "id": "32a9ac93d2dec54",
   "outputs": [],
   "execution_count": 13
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Data"
   ],
   "id": "5eabfa4c06905b4"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.574483Z",
     "start_time": "2026-04-18T19:53:18.515917Z"
    }
   },
   "source": [
    "data_path = os.path.join(\"..\", \"data\", \"data_raw.json\")\n",
    "with open(data_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    data_raw = json.load(f)\n",
    "\n",
    "# Flat article-level dataframe\n",
    "df_articles = pd.DataFrame([\n",
    "    {\"id\": s[\"id\"], \"text\": s[\"text\"], \"n_entities\": len(s[\"entities\"])}\n",
    "    for s in data_raw\n",
    "])\n",
    "\n",
    "# Flat entity-level dataframe\n",
    "df_entities = pd.DataFrame([\n",
    "    {\n",
    "        \"sample_id\":   s[\"id\"],\n",
    "        \"entity_id\":   e[\"entity_id\"],\n",
    "        \"entity_text\": e[\"entity_text\"],\n",
    "        \"entity_type\": e[\"entity_type\"],\n",
    "        \"label\":       e[\"label\"],\n",
    "        \"n_positions\": len(e[\"positions\"]),\n",
    "    }\n",
    "    for s in data_raw\n",
    "    for e in s[\"entities\"]\n",
    "])\n",
    "\n",
    "print(f\"Articles : {len(df_articles):,}\")\n",
    "print(f\"Entities : {len(df_entities):,}\")\n",
    "print(f\"Avg entities per article: {len(df_entities)/len(df_articles):.2f}\")\n",
    "df_articles.head()"
   ],
   "id": "463eb928d9be91b0",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Articles : 1,637\n",
      "Entities : 12,627\n",
      "Avg entities per article: 7.71\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "   id                                               text  n_entities\n",
       "0   0  Amazon sold unauthorized mole removers, and th...           4\n",
       "1   1  Russia Announces Response Measures to New US S...          11\n",
       "2   2  India's richest man takes on Amazon, Walmart i...          12\n",
       "3   3  Govt may impose anti-dumping duty on chemical ...          10\n",
       "4   4  Apollo Go: AI-Powered Autonomous Ride-Hailing ...          13"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>n_entities</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Amazon sold unauthorized mole removers, and th...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Russia Announces Response Measures to New US S...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>India's richest man takes on Amazon, Walmart i...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Govt may impose anti-dumping duty on chemical ...</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Apollo Go: AI-Powered Autonomous Ride-Hailing ...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 14
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Dataset Overview"
   ],
   "id": "6812878d46314b80"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.601499Z",
     "start_time": "2026-04-18T19:53:18.597184Z"
    }
   },
   "source": [
    "print(\"=\" * 45)\n",
    "print(\"DATASET OVERVIEW\")\n",
    "print(\"=\" * 45)\n",
    "print(f\"  Total samples        : {len(data_raw):,}\")\n",
    "print(f\"  Total entities       : {len(df_entities):,}\")\n",
    "print(f\"  Unique entity texts  : {df_entities['entity_text'].nunique():,}\")\n",
    "print(f\"  Unique entity types  : {df_entities['entity_type'].nunique()}\")\n",
    "print()\n",
    "print(\"Entity type breakdown:\")\n",
    "for etype, cnt in df_entities[\"entity_type\"].value_counts().items():\n",
    "    print(f\"  {etype:12s}: {cnt:6,}  ({cnt / len(df_entities) * 100:.1f}%)\")"
   ],
   "id": "c7f3929c5b35b77c",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=============================================\n",
      "DATASET OVERVIEW\n",
      "=============================================\n",
      "  Total samples        : 1,637\n",
      "  Total entities       : 12,627\n",
      "  Unique entity texts  : 5,815\n",
      "  Unique entity types  : 2\n",
      "\n",
      "Entity type breakdown:\n",
      "  company     : 10,351  (82.0%)\n",
      "  location    :  2,276  (18.0%)\n"
     ]
    }
   ],
   "execution_count": 15
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Duplicate Article Texts\n",
    "\n",
    "Two samples sharing the same article text would allow the model to see identical context\n",
    "in both training and evaluation, artificially inflating metrics."
   ],
   "id": "7623b7ffc328b9a7"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.626539Z",
     "start_time": "2026-04-18T19:53:18.621684Z"
    }
   },
   "source": [
    "dup_mask     = df_articles.duplicated(\"text\", keep=False)\n",
    "dup_articles = df_articles[dup_mask]\n",
    "\n",
    "print(f\"Duplicate article texts: {len(dup_articles)}\")\n",
    "\n",
    "if len(dup_articles) > 0:\n",
    "    print(dup_articles.head(10))"
   ],
   "id": "17313e6496ece22",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Duplicate article texts: 16\n",
      "        id                                               text  n_entities\n",
      "48      48  Elon Musk buys Twitter for $44B and will priva...          13\n",
      "254    254  Judge tosses Biden threats case against northe...          12\n",
      "265    265  Dozens feared dead as Russian shell hits Ukrai...           9\n",
      "322    322  US weekly jobless claims unexpectedly fall\\nWA...           4\n",
      "393    393  Yemen's rebels launch drone and missile strike...           9\n",
      "610    610  Dozens feared dead as Russian shell hits Ukrai...           8\n",
      "667    667  Elon Musk buys Twitter for $44B and will priva...          14\n",
      "975    975  Mercado Pago anuncia 'conta que mais rende no ...           3\n",
      "1019  1019  Yemen's rebels launch drone and missile strike...           8\n",
      "1166  1166  US weekly jobless claims unexpectedly fall\\nWA...           4\n"
     ]
    }
   ],
   "execution_count": 16
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Entity Deduplication Pipeline\n",
    "\n",
    "Within each sample, multiple entity records can refer to the same real-world entity\n",
    "with overlapping or identical position spans. The pipeline below merges and cleans\n",
    "these in four steps:\n",
    "\n",
    "1. **Merge entity records** by `(entity_text, label)` → all positions for that entity in one list\n",
    "2. **Deduplicate exact-duplicate positions** (same offset + length)\n",
    "3. **Resolve overlapping positions** (same offset, different length → keep longest)\n",
    "4. **Handle partial overlaps** (rare edge cases)"
   ],
   "id": "612bfc4a2968aa02"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1 — Merge entity records by `(entity_text, label)`\n",
    "\n",
    "Multiple entity records with the same `entity_text` and `label` inside one sample\n",
    "are merged into a single entity with a unified position list. Conflicting labels\n",
    "for the same entity text are flagged as annotation errors."
   ],
   "id": "7765b7af17c9b50d"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.656417Z",
     "start_time": "2026-04-18T19:53:18.650897Z"
    }
   },
   "source": [
    "merge_stats = {\"merged_entities\": 0, \"different_label\": 0, \"affected_samples\": set()}\n",
    "conflict_examples = []\n",
    "\n",
    "for s in data_raw:\n",
    "    seen = {}\n",
    "    for e in s[\"entities\"]:\n",
    "        key = e[\"entity_text\"].lower()\n",
    "        if key in seen:\n",
    "            merge_stats[\"affected_samples\"].add(s[\"id\"])\n",
    "            if seen[key] == e[\"label\"]:\n",
    "                merge_stats[\"merged_entities\"] += 1\n",
    "            else:\n",
    "                merge_stats[\"different_label\"] += 1\n",
    "                conflict_examples.append({\n",
    "                    \"sample_id\":   s[\"id\"],\n",
    "                    \"entity_text\": e[\"entity_text\"],\n",
    "                    \"labels\":      [seen[key], e[\"label\"]],\n",
    "                })\n",
    "        else:\n",
    "            seen[key] = e[\"label\"]\n",
    "\n",
    "total_merged = merge_stats[\"merged_entities\"] + merge_stats[\"different_label\"]\n",
    "print(\"Step 1: Merge entity records by (entity_text, label)\")\n",
    "print(\"=\" * 55)\n",
    "print(f\"Duplicate (entity_text, sample) pairs:  {total_merged:,}\")\n",
    "print(f\"Same label:                             {merge_stats['merged_entities']:,}\")\n",
    "print(f\"Different_labels:                       {merge_stats['different_label']:,}\")\n",
    "print(f\"Affected samples:                       {len(merge_stats['affected_samples']):,}\")\n",
    "print()\n",
    "\n",
    "if merge_stats[\"different_label\"] == 0:\n",
    "    print(\"All duplicates have consistent labels.\")\n",
    "    print(f\"Merging will consolidate {total_merged:,} redundant entity records ({total_merged / len(df_entities) * 100:.1f}% of total).\")\n",
    "else:\n",
    "    print(f\"{merge_stats['different_label']} different-label duplicates.\")\n",
    "    print(pd.DataFrame(conflict_examples).head())"
   ],
   "id": "e58872333d152f15",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1: Merge entity records by (entity_text, label)\n",
      "=======================================================\n",
      "Duplicate (entity_text, sample) pairs:  2,014\n",
      "Same label:                             2,014\n",
      "Different_labels:                       0\n",
      "Affected samples:                       838\n",
      "\n",
      "All duplicates have consistent labels.\n",
      "Merging will consolidate 2,014 redundant entity records (15.9% of total).\n"
     ]
    }
   ],
   "execution_count": 17
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2 — Deduplicate exact-duplicate positions\n",
    "\n",
    "After merging, a single entity may carry duplicate position entries (identical offset\n",
    "and length). These are removed, keeping one copy per unique span."
   ],
   "id": "b0eb6a25890d61b6"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.694440Z",
     "start_time": "2026-04-18T19:53:18.675611Z"
    }
   },
   "source": [
    "exact_dup_positions = []\n",
    "\n",
    "for s in data_raw:\n",
    "    merged = {}\n",
    "    for e in s[\"entities\"]:\n",
    "        key = (e[\"entity_text\"].lower(), e[\"label\"])\n",
    "        if key not in merged:\n",
    "            merged[key] = {\"entity_text\": e[\"entity_text\"], \"label\": e[\"label\"], \"positions\": []}\n",
    "        merged[key][\"positions\"].extend(e[\"positions\"])\n",
    "\n",
    "    for key, ent in merged.items():\n",
    "        seen_spans = set()\n",
    "        for p in ent[\"positions\"]:\n",
    "            span = (p[\"offset\"], p[\"length\"])\n",
    "            if span in seen_spans:\n",
    "                exact_dup_positions.append({\n",
    "                    \"sample_id\":     s[\"id\"],\n",
    "                    \"entity_text\":   ent[\"entity_text\"],\n",
    "                    \"position_text\": p[\"position_text\"],\n",
    "                    \"offset\":        p[\"offset\"],\n",
    "                    \"length\":        p[\"length\"],\n",
    "                })\n",
    "            else:\n",
    "                seen_spans.add(span)\n",
    "\n",
    "print(\"Step 2: Deduplicate exact-duplicate positions\")\n",
    "print(\"=\" * 55)\n",
    "print(f\"Exact-duplicate positions found: {len(exact_dup_positions):,}\")\n",
    "if exact_dup_positions:\n",
    "    df_exact_dup = pd.DataFrame(exact_dup_positions)\n",
    "    n_samples = df_exact_dup[\"sample_id\"].nunique()\n",
    "    print(f\"Across {n_samples:,} sample(s)\")\n",
    "    print()\n",
    "    print(df_exact_dup.head(20).to_string(index=False))\n",
    "else:\n",
    "    print(\"No exact-duplicate positions after merging.\")"
   ],
   "id": "d5acf8bf642c044",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 2: Deduplicate exact-duplicate positions\n",
      "=======================================================\n",
      "Exact-duplicate positions found: 9,634\n",
      "Across 938 sample(s)\n",
      "\n",
      " sample_id             entity_text           position_text  offset  length\n",
      "         0                   Verge                   Verge    2082       5\n",
      "         1              SolarWinds              SolarWinds     271      10\n",
      "         1              SolarWinds              SolarWinds    2767      10\n",
      "         2                 Grofers                 Grofers    2184       7\n",
      "         3 Reliance industries Ltd Reliance industries Ltd      70      23\n",
      "         3 Reliance industries Ltd Reliance industries Ltd     630      23\n",
      "         3 Reliance industries Ltd Reliance industries Ltd      70      23\n",
      "         3 Reliance industries Ltd                Reliance      70       8\n",
      "         3 Reliance industries Ltd Reliance industries Ltd     630      23\n",
      "         3 Reliance industries Ltd                Reliance     630       8\n",
      "         3   India Glycols Limited   India Glycols Limited     924      21\n",
      "         3   India Glycols Limited   India Glycols Limited     924      21\n",
      "         4                   Baidu                   Baidu     459       5\n",
      "         4                   Baidu                   Baidu     980       5\n",
      "         4                   Baidu                   Baidu    1289       5\n",
      "         4                   Baidu                   Baidu    2915       5\n",
      "         4                   Baidu                   Baidu    4180       5\n",
      "         4       global industries       global industries     175      17\n",
      "         5        Nokian Tyres plc        Nokian Tyres plc      51      16\n",
      "         5        Nokian Tyres plc        Nokian Tyres plc      51      16\n"
     ]
    }
   ],
   "execution_count": 18
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3 — Resolve overlapping positions (same offset, different length)\n",
    "\n",
    "Two positions for the same entity starting at the same offset but with different\n",
    "lengths represent the same mention captured at different granularity.\n",
    "Resolution: keep the longest span (it captures the full entity mention)."
   ],
   "id": "31dd5cc13f612c0b"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.721435Z",
     "start_time": "2026-04-18T19:53:18.703478Z"
    }
   },
   "source": [
    "same_offset_diff_len = []\n",
    "\n",
    "for s in data_raw:\n",
    "    merged = {}\n",
    "    for e in s[\"entities\"]:\n",
    "        key = (e[\"entity_text\"].lower(), e[\"label\"])\n",
    "        if key not in merged:\n",
    "            merged[key] = {\"entity_text\": e[\"entity_text\"], \"positions\": []}\n",
    "        merged[key][\"positions\"].extend(e[\"positions\"])\n",
    "\n",
    "    for key, ent in merged.items():\n",
    "        # Deduplicate exact spans first\n",
    "        unique_positions = {}\n",
    "        for p in ent[\"positions\"]:\n",
    "            span = (p[\"offset\"], p[\"length\"])\n",
    "            if span not in unique_positions:\n",
    "                unique_positions[span] = p\n",
    "\n",
    "        # Group by offset\n",
    "        by_offset = {}\n",
    "        for (off, length), p in unique_positions.items():\n",
    "            by_offset.setdefault(off, []).append(p)\n",
    "\n",
    "        for off, positions in by_offset.items():\n",
    "            if len(positions) > 1:\n",
    "                positions.sort(key=lambda p: p[\"length\"], reverse=True)\n",
    "                keep = positions[0]\n",
    "                for discard in positions[1:]:\n",
    "                    same_offset_diff_len.append({\n",
    "                        \"sample_id\":      s[\"id\"],\n",
    "                        \"entity_text\":    ent[\"entity_text\"],\n",
    "                        \"keep_text\":      keep[\"position_text\"],\n",
    "                        \"keep_span\":      f'[{keep[\"offset\"]}:{keep[\"offset\"]+keep[\"length\"]})',\n",
    "                        \"discard_text\":   discard[\"position_text\"],\n",
    "                        \"discard_span\":   f'[{discard[\"offset\"]}:{discard[\"offset\"]+discard[\"length\"]})',\n",
    "                    })\n",
    "\n",
    "print(\"Step 3: Resolve same-offset, different-length positions\")\n",
    "print(\"=\" * 55)\n",
    "print(f\"Cases found: {len(same_offset_diff_len):,}\")\n",
    "if same_offset_diff_len:\n",
    "    df_same_off = pd.DataFrame(same_offset_diff_len)\n",
    "    n_samples = df_same_off[\"sample_id\"].nunique()\n",
    "    print(f\"Across {n_samples:,} sample(s)\")\n",
    "    print()\n",
    "    print(df_same_off.head(10).to_string(index=False))\n",
    "else:\n",
    "    print(\"No same-offset length conflicts after deduplication.\")"
   ],
   "id": "2ec34f05729dd40c",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 3: Resolve same-offset, different-length positions\n",
      "=======================================================\n",
      "Cases found: 805\n",
      "Across 377 sample(s)\n",
      "\n",
      " sample_id                      entity_text                             keep_text   keep_span                     discard_text discard_span\n",
      "         3          Reliance industries Ltd               Reliance industries Ltd     [70:93)                         Reliance      [70:78)\n",
      "         3          Reliance industries Ltd               Reliance industries Ltd   [630:653)                         Reliance    [630:638)\n",
      "         7                   Charles Schwab            Charles Schwab Corporation   [171:197)                   Charles Schwab    [171:185)\n",
      "         7                   Charles Schwab                      Charles Schwab's   [823:839)                   Charles Schwab    [823:837)\n",
      "         7                   Charles Schwab                      Charles Schwab's [1208:1224)                   Charles Schwab  [1208:1222)\n",
      "         7                   Charles Schwab                      Charles Schwab's [1404:1420)                   Charles Schwab  [1404:1418)\n",
      "        10                   Cracker Barrel Cracker Barrel Old Country Store, Inc   [130:167) Cracker Barrel Old Country Store    [130:162)\n",
      "        10                   Cracker Barrel Cracker Barrel Old Country Store, Inc   [130:167)                   Cracker Barrel    [130:144)\n",
      "        14 American Water Works Association      American Water Works Association [1299:1331)                   American Water  [1299:1313)\n",
      "        15                      Sainsbury's                           Sainsbury's   [232:243)                        Sainsbury    [232:241)\n"
     ]
    }
   ],
   "execution_count": 19
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4 — Handle partial overlaps\n",
    "\n",
    "Remaining overlaps where two positions for the same entity share some characters\n",
    "but start at different offsets. These are rare and may indicate tokenisation\n",
    "differences in the original annotation."
   ],
   "id": "abdbd0fa2842058c"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.748741Z",
     "start_time": "2026-04-18T19:53:18.726218Z"
    }
   },
   "source": [
    "partial_overlaps = []\n",
    "\n",
    "for s in data_raw:\n",
    "    merged = {}\n",
    "    for e in s[\"entities\"]:\n",
    "        key = (e[\"entity_text\"].lower(), e[\"label\"])\n",
    "        if key not in merged:\n",
    "            merged[key] = {\"entity_text\": e[\"entity_text\"], \"positions\": []}\n",
    "        merged[key][\"positions\"].extend(e[\"positions\"])\n",
    "\n",
    "    for key, ent in merged.items():\n",
    "        # Deduplicate exact spans\n",
    "        unique_positions = {}\n",
    "        for p in ent[\"positions\"]:\n",
    "            span = (p[\"offset\"], p[\"length\"])\n",
    "            if span not in unique_positions:\n",
    "                unique_positions[span] = p\n",
    "\n",
    "        # Resolve same-offset conflicts (keep longest)\n",
    "        by_offset = {}\n",
    "        for (off, length), p in unique_positions.items():\n",
    "            if off not in by_offset or length > by_offset[off][\"length\"]:\n",
    "                by_offset[off] = p\n",
    "        resolved = sorted(by_offset.values(), key=lambda p: p[\"offset\"])\n",
    "\n",
    "        # Check remaining partial overlaps\n",
    "        for i in range(len(resolved)):\n",
    "            for j in range(i + 1, len(resolved)):\n",
    "                si_off = resolved[i][\"offset\"]\n",
    "                si_end = si_off + resolved[i][\"length\"]\n",
    "                sj_off = resolved[j][\"offset\"]\n",
    "                sj_end = sj_off + resolved[j][\"length\"]\n",
    "                if sj_off >= si_end:\n",
    "                    break\n",
    "                partial_overlaps.append({\n",
    "                    \"sample_id\":      s[\"id\"],\n",
    "                    \"entity_text\":    ent[\"entity_text\"],\n",
    "                    \"position_text_a\": resolved[i][\"position_text\"],\n",
    "                    \"position_text_b\": resolved[j][\"position_text\"],\n",
    "                    \"span_a\":         f'[{si_off}:{si_end})',\n",
    "                    \"span_b\":         f'[{sj_off}:{sj_end})',\n",
    "                    \"overlap_chars\":  si_end - sj_off,\n",
    "                })\n",
    "\n",
    "print(\"Step 4: Handle partial overlaps\")\n",
    "print(\"=\" * 55)\n",
    "print(f\"Partial overlaps remaining: {len(partial_overlaps):,}\")\n",
    "if partial_overlaps:\n",
    "    df_partial = pd.DataFrame(partial_overlaps)\n",
    "    n_samples = df_partial[\"sample_id\"].nunique()\n",
    "    print(f\"Across {n_samples:,} sample(s)\")\n",
    "    print()\n",
    "    print(df_partial.to_string(index=False))\n",
    "else:\n",
    "    print(\"No partial overlaps remain.\")"
   ],
   "id": "af1fb86847fd23b3",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 4: Handle partial overlaps\n",
      "=======================================================\n",
      "Partial overlaps remaining: 19\n",
      "Across 11 sample(s)\n",
      "\n",
      " sample_id             entity_text         position_text_a       position_text_b      span_a      span_b  overlap_chars\n",
      "        35 The Walt Disney Company The Walt Disney Company Walt Disney Company's      [0:23)      [4:25)             19\n",
      "        35 The Walt Disney Company The Walt Disney Company Walt Disney Company's   [136:159)   [140:161)             19\n",
      "        35 The Walt Disney Company The Walt Disney Company Walt Disney Company's   [462:485)   [466:487)             19\n",
      "       147                   Shell     Royal Dutch Shell's                 Shell   [154:173)   [166:171)              7\n",
      "       147                   Shell       Royal Dutch Shell                 Shell   [479:496)   [491:496)              5\n",
      "       397 the Walt Disney Company the Walt Disney Company   Walt Disney Company   [427:450)   [431:450)             19\n",
      "       397                  Disney          Walt Disney Co                Disney   [135:149)   [140:146)              9\n",
      "       397                  Disney             Walt Disney                Disney   [916:927)   [921:927)              6\n",
      "       537       Royal Dutch Shell       Royal Dutch Shell                 Shell  [991:1008) [1003:1008)              5\n",
      "       548                   Shell   Royal Dutch Shell PLC                 Shell [1340:1361) [1352:1357)              9\n",
      "       886                  Disney          Walt Disney Co                Disney     [70:84)     [75:81)              9\n",
      "       925       Royal Dutch Shell       Royal Dutch Shell                 Shell [2720:2737) [2732:2737)              5\n",
      "       938                  Disney             Walt Disney                Disney   [192:203)   [197:203)              6\n",
      "      1050                  Disney The Walt Disney Company   Walt Disney Company     [61:84)     [65:84)             19\n",
      "      1050                  Disney The Walt Disney Company                Disney     [61:84)     [70:76)             14\n",
      "      1050                  Disney     Walt Disney Company                Disney     [65:84)     [70:76)             14\n",
      "      1050 The Walt Disney Company The Walt Disney Company   Walt Disney Company     [61:84)     [65:84)             19\n",
      "      1400   Royal Dutch Shell Plc   Royal Dutch Shell Plc                 Shell [2864:2885) [2876:2881)              9\n",
      "      1410                    Vale       PT Vale Indonesia                  Vale   [241:258)   [244:248)             14\n"
     ]
    }
   ],
   "execution_count": 20
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Label Validity\n",
    "\n",
    "The schema defines three valid labels: `positive`, `neutral`, `negative`.\n",
    "Any other value is an annotation error. We check for remappable near-matches\n",
    "(e.g. `very positive`) that can be salvaged rather than dropped."
   ],
   "id": "fee7fd592186f064"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.765731Z",
     "start_time": "2026-04-18T19:53:18.762683Z"
    }
   },
   "source": [
    "label_counts = df_entities[\"label\"].value_counts()\n",
    "invalid = label_counts[~label_counts.index.isin(VALID_LABELS)]\n",
    "\n",
    "print(\"All label values and counts:\")\n",
    "print(label_counts.to_frame(\"count\").to_string())\n",
    "print()\n",
    "\n",
    "if len(invalid) == 0:\n",
    "    print(\"All labels are valid.\")"
   ],
   "id": "123db84848d0be7f",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All label values and counts:\n",
      "               count\n",
      "label               \n",
      "neutral         8400\n",
      "negative        2148\n",
      "positive        2078\n",
      "very positive      1\n",
      "\n"
     ]
    }
   ],
   "execution_count": 21
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Position Text Mismatches\n",
    "\n",
    "Each position stores `position_text`, `offset`, and `length`. The actual span in\n",
    "the article is `text[offset : offset+length]`. We check whether these agree.\n",
    "\n",
    "- **Case-only mismatch** (`BRUSSELS` vs `Brussels`): offset is correct, stored string differs → fix by overwriting.\n",
    "- **Content mismatch**: offset itself is wrong → requires manual review."
   ],
   "id": "62d9dd3ff74f13c6"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.785834Z",
     "start_time": "2026-04-18T19:53:18.775763Z"
    }
   },
   "source": [
    "case_mismatches, content_mismatches = [], []\n",
    "\n",
    "for s in data_raw:\n",
    "    txt = s[\"text\"]\n",
    "    for e in s[\"entities\"]:\n",
    "        for p in e[\"positions\"]:\n",
    "            end = p[\"offset\"] + p[\"length\"]\n",
    "            actual = txt[p[\"offset\"]:end]\n",
    "            stored = p[\"position_text\"]\n",
    "\n",
    "            if actual != stored:\n",
    "                row = {\n",
    "                    \"sample_id\":   s[\"id\"],\n",
    "                    \"entity_text\": e[\"entity_text\"],\n",
    "                    \"stored\":      stored,\n",
    "                    \"actual\":      actual,\n",
    "                }\n",
    "\n",
    "                if actual.lower() == stored.lower():\n",
    "                    case_mismatches.append(row)\n",
    "                else:\n",
    "                    content_mismatches.append(row)\n",
    "\n",
    "print(f\"Case-only mismatches : {len(case_mismatches):,}\")\n",
    "print(f\"Content mismatches   : {len(content_mismatches):,}\")\n",
    "print()\n",
    "\n",
    "if case_mismatches:\n",
    "    print(\"Sample case mismatches:\")\n",
    "    print(pd.DataFrame(case_mismatches).head()[[\"sample_id\", \"entity_text\", \"stored\", \"actual\"]].to_string(index=False))\n",
    "    print()\n",
    "    print(\"All case mismatches: overwrite position_text with actual span from text.\")\n",
    "\n",
    "if content_mismatches:\n",
    "    print(\"Content mismatches require manual review:\")\n",
    "    print(pd.DataFrame(content_mismatches).head().to_string(index=False))\n",
    "else:\n",
    "    print(\"No content mismatches.\")"
   ],
   "id": "ccbe961ef0ec6248",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Case-only mismatches : 224\n",
      "Content mismatches   : 0\n",
      "\n",
      "Sample case mismatches:\n",
      " sample_id entity_text   stored   actual\n",
      "        12    BRUSSELS BRUSSELS Brussels\n",
      "        12    BRUSSELS BRUSSELS Brussels\n",
      "        12    BRUSSELS BRUSSELS Brussels\n",
      "        12    BRUSSELS BRUSSELS Brussels\n",
      "        12    BRUSSELS BRUSSELS Brussels\n",
      "\n",
      "All case mismatches: overwrite position_text with actual span from text.\n",
      "No content mismatches.\n"
     ]
    }
   ],
   "execution_count": 22
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. HTML Tags\n",
    "\n",
    "Raw text may contain residual HTML tags from web scraping (e.g. `<b>`, `<a href=...>`, `&amp;`).\n",
    "These add noise to the model input and should be stripped during preprocessing."
   ],
   "id": "93ccd719e8730f47"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.838764Z",
     "start_time": "2026-04-18T19:53:18.795458Z"
    }
   },
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import html as html_lib\n",
    "\n",
    "articles_with_tags = []\n",
    "articles_with_entities = []\n",
    "tag_counts = Counter()\n",
    "entity_counts = Counter()\n",
    "\n",
    "for s in data_raw:\n",
    "    txt = s[\"text\"]\n",
    "\n",
    "    soup = BeautifulSoup(txt, \"html.parser\")\n",
    "    tags = soup.find_all()\n",
    "    if tags:\n",
    "        tag_names = [str(t) for t in tags]\n",
    "        articles_with_tags.append({\"id\": s[\"id\"], \"tags\": tag_names, \"count\": len(tag_names)})\n",
    "        for t in tags:\n",
    "            tag_counts[t.name] += 1\n",
    "\n",
    "    decoded = html_lib.unescape(txt)\n",
    "    if decoded != txt:\n",
    "        diff_chars = [\n",
    "            txt[i:i+20] for i in range(len(txt))\n",
    "            if i < len(decoded) and txt[i] != decoded[i]\n",
    "        ]\n",
    "        entities_found = []\n",
    "        i = 0\n",
    "        while i < len(txt):\n",
    "            if txt[i] == \"&\":\n",
    "                end = txt.find(\";\", i)\n",
    "                if end != -1:\n",
    "                    candidate = txt[i:end+1]\n",
    "                    if html_lib.unescape(candidate) != candidate:\n",
    "                        entities_found.append(candidate)\n",
    "                        i = end + 1\n",
    "                        continue\n",
    "            i += 1\n",
    "        if entities_found:\n",
    "            articles_with_entities.append({\"id\": s[\"id\"], \"entities\": entities_found, \"count\": len(entities_found)})\n",
    "            for e in entities_found:\n",
    "                entity_counts[e] += 1\n",
    "\n",
    "print(f\"Articles containing HTML tags     : {len(articles_with_tags):,} / {len(data_raw):,}\")\n",
    "print(f\"Articles containing HTML entities  : {len(articles_with_entities):,} / {len(data_raw):,}\")\n",
    "print()\n",
    "\n",
    "if tag_counts:\n",
    "    print(\"Most common HTML tags:\")\n",
    "    for tag, cnt in tag_counts.most_common(10):\n",
    "        print(f\"  {tag:30s}: {cnt:,}\")\n",
    "else:\n",
    "    print(\"\\u2713  No HTML tags found in article texts.\")\n",
    "\n",
    "print()\n",
    "if entity_counts:\n",
    "    print(\"Most common HTML entities:\")\n",
    "    for ent, cnt in entity_counts.most_common(10):\n",
    "        print(f\"  {ent:30s}: {cnt:,}\")\n",
    "else:\n",
    "    print(\"\\u2713  No HTML entities found in article texts.\")\n",
    "\n",
    "if articles_with_tags:\n",
    "    print()\n",
    "    print(\"Sample articles with HTML tags:\")\n",
    "    df_html = pd.DataFrame(articles_with_tags).sort_values(\"count\", ascending=False)\n",
    "    print(df_html.head(10).to_string(index=False))"
   ],
   "id": "51fe8e19c409a770",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Articles containing HTML tags     : 2 / 1,637\n",
      "Articles containing HTML entities  : 6 / 1,637\n",
      "\n",
      "Most common HTML tags:\n",
      "  azn.l                         : 1\n",
      "  jnj.n                         : 1\n",
      "  sasy.pa                       : 1\n",
      "  mrna.o                        : 1\n",
      "  cvac.o                        : 1\n",
      "  pfe.n                         : 1\n",
      "  bntx.o                        : 1\n",
      "  rena.pa                       : 1\n",
      "\n",
      "Most common HTML entities:\n",
      "  &CloseCurlyQuote;             : 13\n",
      "  &CloseCurlyDoubleQuote;       : 10\n",
      "  &sol;                         : 2\n",
      "  &euro;                        : 2\n",
      "  &mdash;                       : 1\n",
      "  &P Global raises EDB credit rating to 'AA'\n",
      "S&amp;: 1\n",
      "  &colon;                       : 1\n",
      "\n",
      "Sample articles with HTML tags:\n",
      "  id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       tags  count\n",
      "1046 [<azn.l> to secure at least 300 million doses of its potential COVID-19 vaccine, a spokesman said on Thursday. The deal covers development, liability and other costs faced by the vaccine maker. The EU has also secured an option to buy 100 million additional doses of the vaccine under development. The 27 EU states could buy it at a later stage, should the vaccine prove successful. The overall price they will pay to acquire the doses has not been revealed, but under an earlier deal struck in June with AstraZeneca by Germany, France, Italy and the Netherlands, all members of the EU, AstraZeneca agreed to sell 300 million doses for 750 million euros ($843 million). The EU deal completed the preliminary accord reached with the drug maker by the four countries, the Commission said in a statement. \"We cannot indicate at this stage the specific pricing per dose. However, a significant part of the overall costs are funded by a contribution from the overall ESI funding for vaccines,\" the commission spokesman said, referring to the 336 million euros paid through the bloc's so-called emergency support instrument. It is the first contract signed by the EU with a maker of potential COVID-19 vaccines. Brussels was previously said to be in advanced talks with Johnson &amp; Johnson <jnj.n>, Sanofi <sasy.pa>, Moderna <mrna.o> and CureVac <cvac.o> for their potential vaccines. EU officials told Reuters in July the bloc was also talking with Pfizer <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n></cvac.o></mrna.o></sasy.pa></jnj.n></azn.l>, <jnj.n>, Sanofi <sasy.pa>, Moderna <mrna.o> and CureVac <cvac.o> for their potential vaccines. EU officials told Reuters in July the bloc was also talking with Pfizer <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n></cvac.o></mrna.o></sasy.pa></jnj.n>, <sasy.pa>, Moderna <mrna.o> and CureVac <cvac.o> for their potential vaccines. EU officials told Reuters in July the bloc was also talking with Pfizer <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n></cvac.o></mrna.o></sasy.pa>, <mrna.o> and CureVac <cvac.o> for their potential vaccines. EU officials told Reuters in July the bloc was also talking with Pfizer <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n></cvac.o></mrna.o>, <cvac.o> for their potential vaccines. EU officials told Reuters in July the bloc was also talking with Pfizer <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n></cvac.o>, <pfe.n> and BionTech <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o></pfe.n>, <bntx.o> for the shot they are developing together. The contract with AstraZeneca follows an advance purchase agreement signed by Brussels with the company earlier in August. Part of the money the EU pays for supply deals covers legal risks faced by vaccine makers if their shots have unexpected side effects. These risks are increased by the hastened process to develop a vaccine in the race against the COVID-19 pandemic. \"In order to compensate for such high risks taken by manufacturers, the Advanced Purchase Agreements provide for member states to indemnify the manufacturer for liabilities incurred under certain conditions,\" the commission said. \"Liability still remains with the companies,\" it added. This issue has been one of the stumbling blocs in talks with other vaccine makers, official told Reuters, as companies prefer to have a broader shield.</bntx.o>]      7\n",
      "1585                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         [<rena.pa> and thrown Nissan into disarray as it finds itself on course to book its lowest operating profit in 11 years. The sources said Nissan will likely kill loss-making variants for the Titan full-size pickup. Unprofitable variants include the single-cab and diesel versions. A planned shuttering of under-utilised production lines will most probably hit plants in emerging markets building Datsun and other small cars hardest, they added. \"We need to chart a recovery but the rot goes deep,\" one of the sources said of the many problems facing Nissan. The second source said all markets with factories except China were being looked at for possible reductions in production capacity. That source also said, however, that there were no plans to close an entire plant or withdraw completely from any country. In the United States, one of Nissan's biggest markets, the plan calls for fresh efforts to weed out the practice of buying market share by selling vehicles to rental car and other fleet operators at heavy discounts - a practice which destroyed profitability and undermined Nissan's brand image. \"We're trying to clean up what had happened in the past,\" one of the sources said, adding that under Ghosn, Nissan sought to meet sales objectives at any cost, including \"practically giving away cars\" to fleet customers. A team led by Jun Seki, a senior vice president and incoming vice chief operating officer, is expected to unveil the wide-ranging plan this month though some aspects are still being finalised, said the sources, who were not authorised to speak to media and declined to be identified. Nissan declined to comment. Seki is part of a new management team that will see Makoto Uchida, Nissan's head of China operations, take the helm - an appointment that is expected to take effect by Jan. 1. The new steps follow plans unveiled in July to cut headcount by 12,500 globally by early 2023 and which also flagged cuts to production capacity. At the time, then-CEO Hiroto Saikawa said 14 facilities would be affected. EMERGING MARKET WOES Overall, the plan's aim is to free up resources to focus more on the United States and China, the sources said. To that end it will roll back an aggressive expansionist strategy Ghosn set in motion under a five-year plan called Power 88 which aimed to raise profit margins and global market share to 8 percent by fiscal 2016 - goals which were never achieved. The Datsun brand - revived for emerging markets under Ghosn after being phased out in the 1980s - will likely bear the brunt of the restructuring. The models are manufactured in Indonesia, India and Russia. The sources said problems emerged after Nissan began deploying the no-frills cars in 2014 in small markets such as Indonesia, India, Russia and South Africa where it also sells vehicles under its mainstay Nissan brand. In Indonesia, for example, after a relatively good start, Datsun cars soon began eating into Nissan sales. \"We ended up pushing two mainstream brands in a market where you have a one or two percent market share. You cannot do that,\" one of the sources said, adding that there had been similar outcomes in India, South Africa and Russia. In its bigger markets, a steady supply of new or significantly redesigned models – starting with the redesigned Altima which was launched in the United States late last year – is expected to help Nissan reset the way it prices its vehicles. \"Still, it takes about a year to get any sort of tangible results,\" one of the sources said, adding that until then the Japanese automaker would continue to see sales by volume fall in the U.S. market. (Reporting by Norihiko Shirouzu; Additional reporting by Aditi Shah in New Delhi, Paul Lienert in Detroit and Naomi Tajitsu in Tokyo; Editing by Edwina Gibbs)</rena.pa>]      1\n"
     ]
    }
   ],
   "execution_count": 23
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Preprocessing Summary\n",
    "\n",
    "All hygiene findings mapped to the preprocessing steps that resolve them."
   ],
   "id": "aac93d52102c5dcf"
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2026-04-18T19:53:18.856223Z",
     "start_time": "2026-04-18T19:53:18.851227Z"
    }
   },
   "source": [
    "summary = [\n",
    "    {\n",
    "        \"Issue\":    \"Duplicate article texts\",\n",
    "        \"Count\":    len(dup_articles),\n",
    "        \"Action\":   \"Deduplicate at sample level before splitting\",\n",
    "    },\n",
    "    {\n",
    "        \"Issue\":    \"Duplicate entities within sample (same label)\",\n",
    "        \"Count\":    merge_stats[\"merged_entities\"],\n",
    "        \"Action\":   \"Remove — redundant, no information loss\",\n",
    "    },\n",
    "    {\n",
    "        \"Issue\":    \"Non-standard label ('very positive')\",\n",
    "        \"Count\":    int((df_entities[\"label\"] == \"very positive\").sum()),\n",
    "        \"Action\":   \"Remap to 'positive'\",\n",
    "    },\n",
    "    {\n",
    "        \"Issue\":    \"position_text case mismatches (e.g. BRUSSELS vs Brussels)\",\n",
    "        \"Count\":    len(case_mismatches),\n",
    "        \"Action\":   \"Overwrite position_text with actual span from article text\",\n",
    "    },\n",
    "    {\n",
    "        \"Issue\":    \"position_text content mismatches\",\n",
    "        \"Count\":    len(content_mismatches),\n",
    "        \"Action\":   \"Manual review; drop affected positions if unresolvable\",\n",
    "    },\n",
    "]\n",
    "\n",
    "df_summary = pd.DataFrame(summary)\n",
    "display(\n",
    "df_summary.style\n",
    ".set_table_styles([{\n",
    "  \"selector\": \"th\",\n",
    "  \"props\": [(\"font-weight\", \"bold\"), (\"background-color\", \"#ECEFF1\")]\n",
    "}])\n",
    ")\n",
    "\n",
    "print()\n",
    "total_removed = merge_stats[\"merged_entities\"] + int((df_entities[\"label\"] == \"very positive\").sum())\n",
    "print(f\"Entities before preprocessing : {len(df_entities):,}\")\n",
    "print(f\"Entities removed              : {total_removed:,}  ({total_removed / len(df_entities) * 100:.1f}%)\")\n",
    "print(f\"Entities after preprocessing  : {len(df_entities) - total_removed:,}\")"
   ],
   "id": "77d44d31f2a37305",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x1141db4d0>"
      ],
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_b0e0e th {\n",
       "  font-weight: bold;\n",
       "  background-color: #ECEFF1;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_b0e0e\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_b0e0e_level0_col0\" class=\"col_heading level0 col0\" >Issue</th>\n",
       "      <th id=\"T_b0e0e_level0_col1\" class=\"col_heading level0 col1\" >Count</th>\n",
       "      <th id=\"T_b0e0e_level0_col2\" class=\"col_heading level0 col2\" >Action</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_b0e0e_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
       "      <td id=\"T_b0e0e_row0_col0\" class=\"data row0 col0\" >Duplicate article texts</td>\n",
       "      <td id=\"T_b0e0e_row0_col1\" class=\"data row0 col1\" >16</td>\n",
       "      <td id=\"T_b0e0e_row0_col2\" class=\"data row0 col2\" >Deduplicate at sample level before splitting</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_b0e0e_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
       "      <td id=\"T_b0e0e_row1_col0\" class=\"data row1 col0\" >Duplicate entities within sample (same label)</td>\n",
       "      <td id=\"T_b0e0e_row1_col1\" class=\"data row1 col1\" >2014</td>\n",
       "      <td id=\"T_b0e0e_row1_col2\" class=\"data row1 col2\" >Remove — redundant, no information loss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_b0e0e_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
       "      <td id=\"T_b0e0e_row2_col0\" class=\"data row2 col0\" >Non-standard label ('very positive')</td>\n",
       "      <td id=\"T_b0e0e_row2_col1\" class=\"data row2 col1\" >1</td>\n",
       "      <td id=\"T_b0e0e_row2_col2\" class=\"data row2 col2\" >Remap to 'positive'</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_b0e0e_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
       "      <td id=\"T_b0e0e_row3_col0\" class=\"data row3 col0\" >position_text case mismatches (e.g. BRUSSELS vs Brussels)</td>\n",
       "      <td id=\"T_b0e0e_row3_col1\" class=\"data row3 col1\" >224</td>\n",
       "      <td id=\"T_b0e0e_row3_col2\" class=\"data row3 col2\" >Overwrite position_text with actual span from article text</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_b0e0e_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
       "      <td id=\"T_b0e0e_row4_col0\" class=\"data row4 col0\" >position_text content mismatches</td>\n",
       "      <td id=\"T_b0e0e_row4_col1\" class=\"data row4 col1\" >0</td>\n",
       "      <td id=\"T_b0e0e_row4_col2\" class=\"data row4 col2\" >Manual review; drop affected positions if unresolvable</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Entities before preprocessing : 12,627\n",
      "Entities removed              : 2,015  (16.0%)\n",
      "Entities after preprocessing  : 10,612\n"
     ]
    }
   ],
   "execution_count": 24
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}