# Notebook setup: imports, global plotting defaults, and sentiment-label constants.
import os
import json
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Apply the seaborn theme before the rcParams overrides so the overrides are set last.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({
    "figure.dpi": 120,
    "axes.titlesize": 13,
    "axes.labelsize": 11,
})

# Sentiment label constants: ordered list, membership set, and a hex colour per label.
LABEL_ORDER = ["negative", "neutral", "positive"]
VALID_LABELS = set(LABEL_ORDER)
LABEL_COLORS = {
    "positive": "#4CAF50",
    "neutral": "#90A4AE",
    "negative": "#EF5350",
}
# Load the preprocessed dataset and flatten it into two dataframes:
# one row per article (df_articles) and one row per entity (df_entities).
data_path = os.path.join("..", "data", "data_preprocessed.jsonl")

# One JSON object per line. Whitespace-only lines (for example a trailing blank
# line at the end of the file) are skipped instead of being passed to
# json.loads, which would raise JSONDecodeError on them.
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Fail early with a clear message; otherwise an empty file would only surface
# as a ZeroDivisionError in the summary print below.
if not data:
    raise ValueError(f"No records found in {data_path}")

# Flat article-level dataframe: id, raw text, and number of annotated entities.
df_articles = pd.DataFrame([
    {"id": s["id"], "text": s["text"], "n_entities": len(s["entities"])}
    for s in data
])

# Flat entity-level dataframe: one row per (article, entity) pair.
# n_positions is the length of the entity's "positions" list.
df_entities = pd.DataFrame([
    {
        "sample_id": s["id"],
        "entity_id": e["entity_id"],
        "entity_text": e["entity_text"],
        "entity_type": e["entity_type"],
        "label": e["label"],
        "n_positions": len(e["positions"]),
    }
    for s in data
    for e in s["entities"]
])

print(f"Articles : {len(df_articles):,}")
print(f"Entities : {len(df_entities):,}")
print(f"Avg entities per article: {len(df_entities)/len(df_articles):.2f}")
df_articles.head()
| \n", " | id | \n", "text | \n", "n_entities | \n", "
|---|---|---|---|
| 0 | \n", "0 | \n", "Amazon sold unauthorized mole removers, and th... | \n", "3 | \n", "
| 1 | \n", "1 | \n", "Russia Announces Response Measures to New US S... | \n", "9 | \n", "
| 2 | \n", "2 | \n", "India's richest man takes on Amazon, Walmart i... | \n", "11 | \n", "
| 3 | \n", "3 | \n", "Govt may impose anti-dumping duty on chemical ... | \n", "6 | \n", "
| 4 | \n", "4 | \n", "Apollo Go: AI-Powered Autonomous Ride-Hailing ... | \n", "11 | \n", "