IsmatS committed on
Commit
b14ae8a
·
1 Parent(s): f3b924c
Files changed (1) hide show
  1. models/research.ipynb +616 -0
models/research.ipynb ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "authorship_tag": "ABX9TyOYWYuP39K5ztx8szll3Adf"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "language_info": {
14
+ "name": "python"
15
+ }
16
+ },
17
+ "cells": [
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 1,
21
+ "metadata": {
22
+ "colab": {
23
+ "base_uri": "https://localhost:8080/"
24
+ },
25
+ "id": "DpqFfWCx8YpB",
26
+ "outputId": "fa23a1ea-0b94-4bc3-80eb-28957bc12ed6"
27
+ },
28
+ "outputs": [
29
+ {
30
+ "output_type": "stream",
31
+ "name": "stdout",
32
+ "text": [
33
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
34
+ "Collecting datasets\n",
35
+ " Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n",
36
+ "Collecting seqeval\n",
37
+ " Downloading seqeval-1.2.2.tar.gz (43 kB)\n",
38
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
39
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
40
+ "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.24.7)\n",
41
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n",
42
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n",
43
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
44
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n",
45
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
46
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n",
47
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
48
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
49
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n",
50
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
51
+ "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
52
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
53
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
54
+ "Collecting xxhash (from datasets)\n",
55
+ " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
56
+ "Collecting multiprocess<0.70.17 (from datasets)\n",
57
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
58
+ "Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
59
+ " Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
60
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
61
+ "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n",
62
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n",
63
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
64
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
65
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
66
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
67
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
68
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n",
69
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
70
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n",
71
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n",
72
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n",
73
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n",
74
+ "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n",
75
+ "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n",
76
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n",
77
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
78
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
79
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
80
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
81
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
82
+ "Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n",
83
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
84
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
85
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
86
+ "\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
87
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
88
+ "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
89
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
90
+ "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
91
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
92
+ "\u001b[?25hBuilding wheels for collected packages: seqeval\n",
93
+ " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
94
+ " Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c55117a3e0b989cf8561c80200a7836d267b8a0cad5764952e6fa20385d174de\n",
95
+ " Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa\n",
96
+ "Successfully built seqeval\n",
97
+ "Installing collected packages: xxhash, fsspec, dill, multiprocess, seqeval, datasets\n",
98
+ " Attempting uninstall: fsspec\n",
99
+ " Found existing installation: fsspec 2024.10.0\n",
100
+ " Uninstalling fsspec-2024.10.0:\n",
101
+ " Successfully uninstalled fsspec-2024.10.0\n",
102
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
103
+ "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
104
+ "\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 seqeval-1.2.2 xxhash-3.5.0\n"
105
+ ]
106
+ }
107
+ ],
108
+ "source": [
109
+ "!pip install transformers datasets seqeval huggingface_hub"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "source": [
115
+ "# Core imports: standard library plus third-party numerical, plotting, and dataset packages\n",
116
+ "import os # Provides functions for interacting with the operating system\n",
117
+ "import warnings # Used to handle or suppress warnings\n",
118
+ "import numpy as np # Essential for numerical operations and array manipulation\n",
119
+ "import torch # PyTorch library for tensor computations and model handling\n",
120
+ "import ast # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n",
121
+ "import pandas as pd\n",
122
+ "import matplotlib.pyplot as plt\n",
123
+ "import seaborn as sns\n",
124
+ "from collections import Counter\n",
125
+ "from datasets import load_dataset\n",
126
+ "\n",
127
+ "\n",
128
+ "# Hugging Face and Transformers imports\n",
129
+ "from datasets import load_dataset # Loads datasets for model training and evaluation\n",
130
+ "from transformers import (\n",
131
+ " AutoTokenizer, # Initializes a tokenizer from a pre-trained model\n",
132
+ " DataCollatorForTokenClassification, # Handles padding and formatting of token classification data\n",
133
+ " TrainingArguments, # Defines training parameters like batch size and learning rate\n",
134
+ " Trainer, # High-level API for managing training and evaluation\n",
135
+ " AutoModelForTokenClassification, # Loads a pre-trained model for token classification tasks\n",
136
+ " get_linear_schedule_with_warmup, # Learning rate scheduler for gradual warm-up and linear decay\n",
137
+ " EarlyStoppingCallback # Callback to stop training if validation performance plateaus\n",
138
+ ")\n",
139
+ "\n",
140
+ "# Hugging Face Hub\n",
141
+ "from huggingface_hub import login # Allows logging in to Hugging Face Hub to upload models\n",
142
+ "\n",
143
+ "# seqeval metrics for NER evaluation\n",
144
+ "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n",
145
+ "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n",
146
+ "\n",
147
+ "\n",
148
+ "\n",
149
+ "# Log in to Hugging Face Hub\n",
150
+ "login(token=os.environ.get(\"HF_TOKEN\"))  # Read the token from the environment; never hard-code credentials in a notebook\n",
151
+ "\n",
152
+ "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n",
153
+ "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
154
+ "\n",
155
+ "# Suppress warning messages to keep output clean, especially during training and evaluation\n",
156
+ "warnings.filterwarnings(\"ignore\")\n",
157
+ "\n",
158
+ "\n",
159
+ "\n",
160
+ "# Load the Azerbaijani NER dataset from Hugging Face\n",
161
+ "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n",
162
+ "print(dataset) # Display dataset structure (e.g., train/validation splits)"
163
+ ],
164
+ "metadata": {
165
+ "colab": {
166
+ "base_uri": "https://localhost:8080/"
167
+ },
168
+ "id": "nIeCH4bs822V",
169
+ "outputId": "ea94d8ae-fdc0-41e7-e6a3-6473b3094b47"
170
+ },
171
+ "execution_count": 1,
172
+ "outputs": [
173
+ {
174
+ "output_type": "stream",
175
+ "name": "stdout",
176
+ "text": [
177
+ "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
178
+ "Token is valid (permission: fineGrained).\n",
179
+ "Your token has been saved to /root/.cache/huggingface/token\n",
180
+ "Login successful\n",
181
+ "DatasetDict({\n",
182
+ " train: Dataset({\n",
183
+ " features: ['index', 'tokens', 'ner_tags'],\n",
184
+ " num_rows: 99545\n",
185
+ " })\n",
186
+ "})\n"
187
+ ]
188
+ }
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "source": [
194
+ "train_df = pd.DataFrame(dataset['train'])\n",
195
+ "\n",
196
+ "# Display basic info\n",
197
+ "print(\"Dataset Information:\")\n",
198
+ "print(train_df.info())\n",
199
+ "\n",
200
+ "print(\"\\nSample Rows:\")\n",
201
+ "print(train_df.head())\n",
202
+ "\n",
203
+ "# Convert string representation of lists to actual lists (if necessary)\n",
204
+ "train_df['tokens'] = train_df['tokens'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)\n",
205
+ "train_df['ner_tags'] = train_df['ner_tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)\n"
206
+ ],
207
+ "metadata": {
208
+ "colab": {
209
+ "base_uri": "https://localhost:8080/"
210
+ },
211
+ "id": "0Gqze-Vu82vh",
212
+ "outputId": "54d2a45e-9ab4-41d3-9479-fe1476524aa7"
213
+ },
214
+ "execution_count": 2,
215
+ "outputs": [
216
+ {
217
+ "output_type": "stream",
218
+ "name": "stdout",
219
+ "text": [
220
+ "Dataset Information:\n",
221
+ "<class 'pandas.core.frame.DataFrame'>\n",
222
+ "RangeIndex: 99545 entries, 0 to 99544\n",
223
+ "Data columns (total 3 columns):\n",
224
+ " # Column Non-Null Count Dtype \n",
225
+ "--- ------ -------------- ----- \n",
226
+ " 0 index 99545 non-null object\n",
227
+ " 1 tokens 99528 non-null object\n",
228
+ " 2 ner_tags 99528 non-null object\n",
229
+ "dtypes: object(3)\n",
230
+ "memory usage: 2.3+ MB\n",
231
+ "None\n",
232
+ "\n",
233
+ "Sample Rows:\n",
234
+ " index \\\n",
235
+ "0 640b71a8-014e-424b-96e1-80c74c9317bb \n",
236
+ "1 70cd64eb-6fad-49ae-821f-5e540d9b96fd \n",
237
+ "2 ec937367-1043-4d7d-bd89-895a4002f914 \n",
238
+ "3 f32c58c9-7836-4985-82f2-8e2db283a250 \n",
239
+ "4 bd7a3758-3300-4d34-a5d6-74090b6c5d04 \n",
240
+ "\n",
241
+ " tokens \\\n",
242
+ "0 ['Komitədən', 'bildirilib', 'ki', ',', 'sovet'... \n",
243
+ "1 ['2003-2013', '-', 'cü', 'illərdə', 'ölkədə', ... \n",
244
+ "2 ['Prezidentin', 'müvafiq', 'sərəncamlarına', '... \n",
245
+ "3 ['Hazırda', 'Gəncə', 'şəhər', 'İmamzadə', 'ziy... \n",
246
+ "4 ['“', 'Gianni', 'Versace', '”', 'şirkətinin', ... \n",
247
+ "\n",
248
+ " ner_tags \n",
249
+ "0 [3, 0, 0, 0, 0, 0, 14, 0, 17, 0, 0, 0, 0, 3, 0... \n",
250
+ "1 [4, 0, 0, 0, 0, 17, 8, 0, 0, 0, 0, 0, 0, 0, 0,... \n",
251
+ "2 [0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 8, 0, 0, 8, ... \n",
252
+ "3 [0, 14, 0, 8, 8, 0, 0, 0, 0, 0] \n",
253
+ "4 [0, 1, 1, 0, 3, 0, 0, 0, 0, 0, 0] \n"
254
+ ]
255
+ }
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "source": [
261
+ "## Basic Statistics"
262
+ ],
263
+ "metadata": {
264
+ "id": "sGxTQ8HLCA_C"
265
+ }
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "source": [
270
+ "# Basic statistics\n",
271
+ "print(\"\\nBasic Statistics:\")\n",
272
+ "print(train_df.describe())\n"
273
+ ],
274
+ "metadata": {
275
+ "id": "0WNiCOFB82r-"
276
+ },
277
+ "execution_count": null,
278
+ "outputs": []
279
+ },
280
+ {
281
+ "cell_type": "markdown",
282
+ "source": [
283
+ "## Distribution of Sentence Lengths (Number of Tokens)"
284
+ ],
285
+ "metadata": {
286
+ "id": "MZl1dnrXB-AZ"
287
+ }
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "source": [
292
+ "train_df['num_tokens'] = train_df['tokens'].apply(len)\n",
293
+ "plt.figure(figsize=(10, 6))\n",
294
+ "sns.histplot(train_df['num_tokens'], bins=30, kde=True)\n",
295
+ "plt.title(\"Distribution of Sentence Lengths (Number of Tokens)\")\n",
296
+ "plt.xlabel(\"Number of Tokens\")\n",
297
+ "plt.ylabel(\"Frequency\")\n",
298
+ "plt.show()\n"
299
+ ],
300
+ "metadata": {
301
+ "id": "nhK7yHom82oX"
302
+ },
303
+ "execution_count": null,
304
+ "outputs": []
305
+ },
306
+ {
307
+ "cell_type": "markdown",
308
+ "source": [
309
+ "## Distribution of NER Tags"
310
+ ],
311
+ "metadata": {
312
+ "id": "dsP6Kq6-B8Gb"
313
+ }
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "source": [
318
+ "# Flatten the list of NER tags\n",
319
+ "all_tags = [tag for tags in train_df['ner_tags'] for tag in tags]\n",
320
+ "tag_counts = Counter(all_tags)\n",
321
+ "\n",
322
+ "# Convert to DataFrame for plotting\n",
323
+ "tag_df = pd.DataFrame(tag_counts.items(), columns=['NER Tag', 'Count']).sort_values(by='Count', ascending=False)\n",
324
+ "\n",
325
+ "plt.figure(figsize=(12, 6))\n",
326
+ "sns.barplot(data=tag_df, x='NER Tag', y='Count')\n",
327
+ "plt.title(\"Distribution of NER Tags\")\n",
328
+ "plt.xlabel(\"NER Tag\")\n",
329
+ "plt.ylabel(\"Count\")\n",
330
+ "plt.xticks(rotation=45)\n",
331
+ "plt.show()\n"
332
+ ],
333
+ "metadata": {
334
+ "id": "ZHU9_Xov82lI"
335
+ },
336
+ "execution_count": null,
337
+ "outputs": []
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "source": [
342
+ "## Average Number of NER Tags per Sentence\n"
343
+ ],
344
+ "metadata": {
345
+ "id": "G5XwARGNB0jV"
346
+ }
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "source": [
351
+ "train_df['num_tags'] = train_df['ner_tags'].apply(len)\n",
352
+ "print(\"\\nAverage Number of NER Tags per Sentence:\")\n",
353
+ "print(train_df['num_tags'].mean())\n"
354
+ ],
355
+ "metadata": {
356
+ "id": "FySAFwja82h6"
357
+ },
358
+ "execution_count": null,
359
+ "outputs": []
360
+ },
361
+ {
362
+ "cell_type": "markdown",
363
+ "source": [
364
+ "## Token Frequency Distribution"
365
+ ],
366
+ "metadata": {
367
+ "id": "YfagXljcBxL1"
368
+ }
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "source": [
373
+ "# Flatten the list of tokens\n",
374
+ "all_tokens = [token for tokens in train_df['tokens'] for token in tokens]\n",
375
+ "token_counts = Counter(all_tokens)\n",
376
+ "\n",
377
+ "# Convert to DataFrame for plotting\n",
378
+ "token_df = pd.DataFrame(token_counts.items(), columns=['Token', 'Count']).sort_values(by='Count', ascending=False)\n",
379
+ "\n",
380
+ "# Display the top 20 most frequent tokens\n",
381
+ "print(\"\\nTop 20 Most Frequent Tokens:\")\n",
382
+ "print(token_df.head(20))\n",
383
+ "\n",
384
+ "# Plot the top 20 most frequent tokens\n",
385
+ "plt.figure(figsize=(12, 6))\n",
386
+ "sns.barplot(data=token_df.head(20), x='Token', y='Count')\n",
387
+ "plt.title(\"Top 20 Most Frequent Tokens\")\n",
388
+ "plt.xlabel(\"Token\")\n",
389
+ "plt.ylabel(\"Count\")\n",
390
+ "plt.xticks(rotation=45)\n",
391
+ "plt.show()\n"
392
+ ],
393
+ "metadata": {
394
+ "id": "7Uz8VJx_82e1"
395
+ },
396
+ "execution_count": null,
397
+ "outputs": []
398
+ },
399
+ {
400
+ "cell_type": "markdown",
401
+ "source": [
402
+ "## Unique NER Tag Distribution Across Sentences"
403
+ ],
404
+ "metadata": {
405
+ "id": "KbxqjdhmBvlr"
406
+ }
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "source": [
411
+ "unique_tag_counts = train_df['ner_tags'].apply(lambda x: len(set(x)))\n",
412
+ "plt.figure(figsize=(10, 6))\n",
413
+ "sns.histplot(unique_tag_counts, bins=20, kde=True)\n",
414
+ "plt.title(\"Distribution of Unique NER Tags per Sentence\")\n",
415
+ "plt.xlabel(\"Number of Unique NER Tags\")\n",
416
+ "plt.ylabel(\"Frequency\")\n",
417
+ "plt.show()\n"
418
+ ],
419
+ "metadata": {
420
+ "id": "liUV1Xpi82bn"
421
+ },
422
+ "execution_count": null,
423
+ "outputs": []
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "source": [
428
+ "## Number of Sentences Containing Each NER Tag"
429
+ ],
430
+ "metadata": {
431
+ "id": "6qFdS_qMBqlh"
432
+ }
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "source": [
437
+ "tag_presence = {}\n",
438
+ "for tag in set(all_tags):\n",
439
+ " tag_presence[tag] = sum([1 for tags in train_df['ner_tags'] if tag in tags])\n",
440
+ "\n",
441
+ "tag_presence_df = pd.DataFrame(tag_presence.items(), columns=['NER Tag', 'Sentence Count']).sort_values(by='Sentence Count', ascending=False)\n",
442
+ "\n",
443
+ "plt.figure(figsize=(12, 6))\n",
444
+ "sns.barplot(data=tag_presence_df, x='NER Tag', y='Sentence Count')\n",
445
+ "plt.title(\"Number of Sentences Containing Each NER Tag\")\n",
446
+ "plt.xlabel(\"NER Tag\")\n",
447
+ "plt.ylabel(\"Number of Sentences\")\n",
448
+ "plt.xticks(rotation=45)\n",
449
+ "plt.show()\n"
450
+ ],
451
+ "metadata": {
452
+ "id": "9iFL0jw882Xz"
453
+ },
454
+ "execution_count": null,
455
+ "outputs": []
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "source": [
460
+ "## Sample Sentence and Tags Display"
461
+ ],
462
+ "metadata": {
463
+ "id": "w-i4AhrMBnSN"
464
+ }
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "source": [
469
+ "sample_idx = train_df.sample(1).index[0]\n",
470
+ "print(f\"\\nSample Sentence and Tags (Index {sample_idx}):\")\n",
471
+ "print(f\"Tokens: {train_df.loc[sample_idx, 'tokens']}\")\n",
472
+ "print(f\"NER Tags: {train_df.loc[sample_idx, 'ner_tags']}\")\n"
473
+ ],
474
+ "metadata": {
475
+ "id": "xz8OZh6m82SV"
476
+ },
477
+ "execution_count": null,
478
+ "outputs": []
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "source": [],
483
+ "metadata": {
484
+ "id": "3lkut05B82PX"
485
+ },
486
+ "execution_count": null,
487
+ "outputs": []
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "source": [],
492
+ "metadata": {
493
+ "id": "4farZ19482L5"
494
+ },
495
+ "execution_count": null,
496
+ "outputs": []
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "source": [],
501
+ "metadata": {
502
+ "id": "sroPMXuY82JF"
503
+ },
504
+ "execution_count": null,
505
+ "outputs": []
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "source": [],
510
+ "metadata": {
511
+ "id": "wB4lkpal82BM"
512
+ },
513
+ "execution_count": null,
514
+ "outputs": []
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "source": [],
519
+ "metadata": {
520
+ "id": "zdCsyNGZ81yE"
521
+ },
522
+ "execution_count": null,
523
+ "outputs": []
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "source": [],
528
+ "metadata": {
529
+ "id": "DgLOAamV81vG"
530
+ },
531
+ "execution_count": null,
532
+ "outputs": []
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "source": [],
537
+ "metadata": {
538
+ "id": "dl-zf4_381sI"
539
+ },
540
+ "execution_count": null,
541
+ "outputs": []
542
+ },
543
+ {
544
+ "cell_type": "code",
545
+ "source": [],
546
+ "metadata": {
547
+ "id": "lYV22K0v81pM"
548
+ },
549
+ "execution_count": null,
550
+ "outputs": []
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "source": [],
555
+ "metadata": {
556
+ "id": "T9rn2nhr81jQ"
557
+ },
558
+ "execution_count": null,
559
+ "outputs": []
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "source": [],
564
+ "metadata": {
565
+ "id": "KAiANeQx81dy"
566
+ },
567
+ "execution_count": null,
568
+ "outputs": []
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "source": [],
573
+ "metadata": {
574
+ "id": "1SwT6UJY81bD"
575
+ },
576
+ "execution_count": null,
577
+ "outputs": []
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "source": [],
582
+ "metadata": {
583
+ "id": "K8QqSRor81Yb"
584
+ },
585
+ "execution_count": null,
586
+ "outputs": []
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "source": [],
591
+ "metadata": {
592
+ "id": "Va1o3qjn81Sk"
593
+ },
594
+ "execution_count": null,
595
+ "outputs": []
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "source": [],
600
+ "metadata": {
601
+ "id": "tsvbHQ5L81O9"
602
+ },
603
+ "execution_count": null,
604
+ "outputs": []
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "source": [],
609
+ "metadata": {
610
+ "id": "FuJs0TBV81Lz"
611
+ },
612
+ "execution_count": null,
613
+ "outputs": []
614
+ }
615
+ ]
616
+ }