IsmatS
/

Named_Entity_Recognition

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyOYWYuP39K5ztx8szll3Adf"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "DpqFfWCx8YpB",
+        "outputId": "fa23a1ea-0b94-4bc3-80eb-28957bc12ed6"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.44.2)\n",
+            "Collecting datasets\n",
+            "  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n",
+            "Collecting seqeval\n",
+            "  Downloading seqeval-1.2.2.tar.gz (43 kB)\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.24.7)\n",
+            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.16.1)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.26.4)\n",
+            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
+            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.2)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.9.11)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.32.3)\n",
+            "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
+            "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
+            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.6)\n",
+            "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
+            "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
+            "  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+            "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n",
+            "Collecting xxhash (from datasets)\n",
+            "  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+            "Collecting multiprocess<0.70.17 (from datasets)\n",
+            "  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
+            "Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
+            "  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
+            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.10.10)\n",
+            "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.10/dist-packages (from seqeval) (1.5.2)\n",
+            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.12.2)\n",
+            "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n",
+            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
+            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
+            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
+            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
+            "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.0)\n",
+            "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
+            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4.0)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.10)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.2.3)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.8.30)\n",
+            "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.13.1)\n",
+            "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.4.2)\n",
+            "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.5.0)\n",
+            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
+            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n",
+            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+            "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0)\n",
+            "Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hBuilding wheels for collected packages: seqeval\n",
+            "  Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c55117a3e0b989cf8561c80200a7836d267b8a0cad5764952e6fa20385d174de\n",
+            "  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa\n",
+            "Successfully built seqeval\n",
+            "Installing collected packages: xxhash, fsspec, dill, multiprocess, seqeval, datasets\n",
+            "  Attempting uninstall: fsspec\n",
+            "    Found existing installation: fsspec 2024.10.0\n",
+            "    Uninstalling fsspec-2024.10.0:\n",
+            "      Successfully uninstalled fsspec-2024.10.0\n",
+            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+            "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
+            "\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fsspec-2024.9.0 multiprocess-0.70.16 seqeval-1.2.2 xxhash-3.5.0\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Versions are pinned to the ones recorded in this cell's saved output so a fresh runtime resolves the same packages\n",
+        "%pip install -q transformers==4.44.2 datasets==3.1.0 seqeval==1.2.2 huggingface_hub==0.24.7"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Standard library imports\n",
+        "import ast                # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)\n",
+        "import os                 # Provides functions for interacting with the operating system\n",
+        "import warnings           # Used to handle or suppress warnings\n",
+        "from collections import Counter\n",
+        "from getpass import getpass   # Prompts for the Hub token without echoing it\n",
+        "\n",
+        "# Third-party numerical and plotting imports\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np        # Essential for numerical operations and array manipulation\n",
+        "import pandas as pd\n",
+        "import seaborn as sns\n",
+        "import torch              # PyTorch library for tensor computations and model handling\n",
+        "\n",
+        "# Hugging Face and Transformers imports\n",
+        "from datasets import load_dataset                     # Loads datasets for model training and evaluation\n",
+        "from transformers import (\n",
+        "    AutoTokenizer,                                   # Initializes a tokenizer from a pre-trained model\n",
+        "    DataCollatorForTokenClassification,              # Handles padding and formatting of token classification data\n",
+        "    TrainingArguments,                               # Defines training parameters like batch size and learning rate\n",
+        "    Trainer,                                         # High-level API for managing training and evaluation\n",
+        "    AutoModelForTokenClassification,                 # Loads a pre-trained model for token classification tasks\n",
+        "    get_linear_schedule_with_warmup,                 # Learning rate scheduler for gradual warm-up and linear decay\n",
+        "    EarlyStoppingCallback                           # Callback to stop training if validation performance plateaus\n",
+        ")\n",
+        "\n",
+        "# Hugging Face Hub\n",
+        "from huggingface_hub import login                   # Allows logging in to Hugging Face Hub to upload models\n",
+        "\n",
+        "# seqeval metrics for NER evaluation\n",
+        "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n",
+        "# Provides precision, recall, F1-score, and classification report for evaluating NER model performance\n",
+        "\n",
+        "\n",
+        "\n",
+        "# Log in to Hugging Face Hub.\n",
+        "# The token is taken from the HF_TOKEN environment variable, or prompted for interactively,\n",
+        "# so that the secret is never stored in the notebook source.\n",
+        "hf_token = os.environ.get(\"HF_TOKEN\") or getpass(\"Hugging Face token: \")\n",
+        "login(token=hf_token)\n",
+        "\n",
+        "# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training\n",
+        "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
+        "\n",
+        "# Suppress warning messages to keep output clean, especially during training and evaluation\n",
+        "warnings.filterwarnings(\"ignore\")\n",
+        "\n",
+        "\n",
+        "\n",
+        "# Load the Azerbaijani NER dataset from Hugging Face\n",
+        "dataset = load_dataset(\"LocalDoc/azerbaijani-ner-dataset\")\n",
+        "print(dataset)  # Display dataset structure (e.g., train/validation splits)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "nIeCH4bs822V",
+        "outputId": "ea94d8ae-fdc0-41e7-e6a3-6473b3094b47"
+      },
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\n",
+            "Token is valid (permission: fineGrained).\n",
+            "Your token has been saved to /root/.cache/huggingface/token\n",
+            "Login successful\n",
+            "DatasetDict({\n",
+            "    train: Dataset({\n",
+            "        features: ['index', 'tokens', 'ner_tags'],\n",
+            "        num_rows: 99545\n",
+            "    })\n",
+            "})\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "train_df = pd.DataFrame(dataset['train'])\n",
+        "\n",
+        "# Display basic info (info() prints its report itself and returns None, so it is not wrapped in print)\n",
+        "print(\"Dataset Information:\")\n",
+        "train_df.info()\n",
+        "\n",
+        "print(\"\\nSample Rows:\")\n",
+        "print(train_df.head())\n",
+        "\n",
+        "# Convert string representation of lists to actual lists (if necessary)\n",
+        "for list_column in ('tokens', 'ner_tags'):\n",
+        "    train_df[list_column] = train_df[list_column].apply(\n",
+        "        lambda x: ast.literal_eval(x) if isinstance(x, str) else x\n",
+        "    )\n",
+        "\n",
+        "# Some rows carry no tokens/ner_tags (the info() report shows fewer non-null values than rows).\n",
+        "# len() and iteration in the later cells would raise TypeError on those None values,\n",
+        "# so drop the rows here. The original row labels are kept (no index reset).\n",
+        "rows_before = len(train_df)\n",
+        "train_df = train_df.dropna(subset=['tokens', 'ner_tags'])\n",
+        "print(f\"\\nDropped {rows_before - len(train_df)} rows with missing tokens or ner_tags\")\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0Gqze-Vu82vh",
+        "outputId": "54d2a45e-9ab4-41d3-9479-fe1476524aa7"
+      },
+      "execution_count": 2,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Dataset Information:\n",
+            "<class 'pandas.core.frame.DataFrame'>\n",
+            "RangeIndex: 99545 entries, 0 to 99544\n",
+            "Data columns (total 3 columns):\n",
+            " #   Column    Non-Null Count  Dtype \n",
+            "---  ------    --------------  ----- \n",
+            " 0   index     99545 non-null  object\n",
+            " 1   tokens    99528 non-null  object\n",
+            " 2   ner_tags  99528 non-null  object\n",
+            "dtypes: object(3)\n",
+            "memory usage: 2.3+ MB\n",
+            "None\n",
+            "\n",
+            "Sample Rows:\n",
+            "                                  index  \\\n",
+            "0  640b71a8-014e-424b-96e1-80c74c9317bb   \n",
+            "1  70cd64eb-6fad-49ae-821f-5e540d9b96fd   \n",
+            "2  ec937367-1043-4d7d-bd89-895a4002f914   \n",
+            "3  f32c58c9-7836-4985-82f2-8e2db283a250   \n",
+            "4  bd7a3758-3300-4d34-a5d6-74090b6c5d04   \n",
+            "\n",
+            "                                              tokens  \\\n",
+            "0  ['Komitədən', 'bildirilib', 'ki', ',', 'sovet'...   \n",
+            "1  ['2003-2013', '-', 'cü', 'illərdə', 'ölkədə', ...   \n",
+            "2  ['Prezidentin', 'müvafiq', 'sərəncamlarına', '...   \n",
+            "3  ['Hazırda', 'Gəncə', 'şəhər', 'İmamzadə', 'ziy...   \n",
+            "4  ['“', 'Gianni', 'Versace', '”', 'şirkətinin', ...   \n",
+            "\n",
+            "                                            ner_tags  \n",
+            "0  [3, 0, 0, 0, 0, 0, 14, 0, 17, 0, 0, 0, 0, 3, 0...  \n",
+            "1  [4, 0, 0, 0, 0, 17, 8, 0, 0, 0, 0, 0, 0, 0, 0,...  \n",
+            "2  [0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 8, 0, 0, 8, ...  \n",
+            "3                    [0, 14, 0, 8, 8, 0, 0, 0, 0, 0]  \n",
+            "4                  [0, 1, 1, 0, 3, 0, 0, 0, 0, 0, 0]  \n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Basic Statistics"
+      ],
+      "metadata": {
+        "id": "sGxTQ8HLCA_C"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Basic statistics\n",
+        "# After parsing, 'tokens' and 'ner_tags' hold Python lists; describe() on such object columns\n",
+        "# needs hashable values and raises TypeError, so summarise the per-sentence lengths instead.\n",
+        "length_stats = pd.DataFrame({\n",
+        "    'tokens_per_sentence': train_df['tokens'].str.len(),\n",
+        "    'tags_per_sentence': train_df['ner_tags'].str.len(),\n",
+        "}).describe()\n",
+        "print(\"\\nBasic Statistics:\")\n",
+        "print(length_stats)\n"
+      ],
+      "metadata": {
+        "id": "0WNiCOFB82r-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Distribution of Sentence Lengths (Number of Tokens)"
+      ],
+      "metadata": {
+        "id": "MZl1dnrXB-AZ"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Per-sentence token count, stored on the frame under 'num_tokens'\n",
+        "train_df['num_tokens'] = train_df['tokens'].map(len)\n",
+        "\n",
+        "# Histogram (30 bins) with a KDE overlay of the sentence lengths\n",
+        "fig, ax = plt.subplots(figsize=(10, 6))\n",
+        "sns.histplot(train_df['num_tokens'], bins=30, kde=True, ax=ax)\n",
+        "ax.set_title(\"Distribution of Sentence Lengths (Number of Tokens)\")\n",
+        "ax.set_xlabel(\"Number of Tokens\")\n",
+        "ax.set_ylabel(\"Frequency\")\n",
+        "plt.show()\n"
+      ],
+      "metadata": {
+        "id": "nhK7yHom82oX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Distribution of NER Tags"
+      ],
+      "metadata": {
+        "id": "dsP6Kq6-B8Gb"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Flatten the per-sentence tag lists into one list and count each tag id\n",
+        "all_tags = [tag for tags in train_df['ner_tags'] for tag in tags]\n",
+        "tag_counts = Counter(all_tags)\n",
+        "\n",
+        "# Convert to DataFrame for plotting, most frequent tag first\n",
+        "tag_df = pd.DataFrame(tag_counts.items(), columns=['NER Tag', 'Count']).sort_values(by='Count', ascending=False)\n",
+        "\n",
+        "fig, ax = plt.subplots(figsize=(12, 6))\n",
+        "# The tag ids are numeric, so seaborn would lay the bars out by tag id and ignore the sort above;\n",
+        "# passing `order` keeps the descending-count order in the plot.\n",
+        "sns.barplot(data=tag_df, x='NER Tag', y='Count', order=tag_df['NER Tag'].tolist(), ax=ax)\n",
+        "ax.set_title(\"Distribution of NER Tags\")\n",
+        "ax.set_xlabel(\"NER Tag\")\n",
+        "ax.set_ylabel(\"Count\")\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.show()\n"
+      ],
+      "metadata": {
+        "id": "ZHU9_Xov82lI"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Average Number of NER Tags per Sentence\n"
+      ],
+      "metadata": {
+        "id": "G5XwARGNB0jV"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Length of each sentence's tag list, stored on the frame under 'num_tags'\n",
+        "train_df['num_tags'] = train_df['ner_tags'].apply(len)\n",
+        "# The value printed is the mean tag-list length per sentence, so the label says exactly that\n",
+        "print(\"\\nAverage Number of NER Tags per Sentence:\")\n",
+        "print(train_df['num_tags'].mean())\n"
+      ],
+      "metadata": {
+        "id": "FySAFwja82h6"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Token Frequency Distribution"
+      ],
+      "metadata": {
+        "id": "YfagXljcBxL1"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Gather every token of every sentence into one flat list\n",
+        "all_tokens = []\n",
+        "for sentence_tokens in train_df['tokens']:\n",
+        "    all_tokens.extend(sentence_tokens)\n",
+        "token_counts = Counter(all_tokens)\n",
+        "\n",
+        "# Tabulate the counts, most frequent token first\n",
+        "token_df = pd.DataFrame(token_counts.items(), columns=['Token', 'Count'])\n",
+        "token_df = token_df.sort_values(by='Count', ascending=False)\n",
+        "\n",
+        "# Show the 20 most frequent tokens as a table ...\n",
+        "top_tokens = token_df.head(20)\n",
+        "print(\"\\nTop 20 Most Frequent Tokens:\")\n",
+        "print(top_tokens)\n",
+        "\n",
+        "# ... and as a bar chart\n",
+        "fig, ax = plt.subplots(figsize=(12, 6))\n",
+        "sns.barplot(data=top_tokens, x='Token', y='Count', ax=ax)\n",
+        "ax.set_title(\"Top 20 Most Frequent Tokens\")\n",
+        "ax.set_xlabel(\"Token\")\n",
+        "ax.set_ylabel(\"Count\")\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.show()\n"
+      ],
+      "metadata": {
+        "id": "7Uz8VJx_82e1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Unique NER Tag Distribution Across Sentences"
+      ],
+      "metadata": {
+        "id": "KbxqjdhmBvlr"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Number of distinct tag ids occurring in each sentence\n",
+        "unique_tag_counts = train_df['ner_tags'].map(lambda sentence_tags: len(set(sentence_tags)))\n",
+        "\n",
+        "# Histogram (20 bins) with a KDE overlay\n",
+        "fig, ax = plt.subplots(figsize=(10, 6))\n",
+        "sns.histplot(unique_tag_counts, bins=20, kde=True, ax=ax)\n",
+        "ax.set_title(\"Distribution of Unique NER Tags per Sentence\")\n",
+        "ax.set_xlabel(\"Number of Unique NER Tags\")\n",
+        "ax.set_ylabel(\"Frequency\")\n",
+        "plt.show()\n"
+      ],
+      "metadata": {
+        "id": "liUV1Xpi82bn"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Number of Sentences Containing Each NER Tag"
+      ],
+      "metadata": {
+        "id": "6qFdS_qMBqlh"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# For every tag, count the sentences in which it appears at least once.\n",
+        "# A single pass over the sentences replaces one full scan per tag; set(tags) makes sure\n",
+        "# a sentence contributes at most once to each tag's count.\n",
+        "tag_presence = Counter()\n",
+        "for tags in train_df['ner_tags']:\n",
+        "    tag_presence.update(set(tags))\n",
+        "\n",
+        "tag_presence_df = pd.DataFrame(tag_presence.items(), columns=['NER Tag', 'Sentence Count']).sort_values(by='Sentence Count', ascending=False)\n",
+        "\n",
+        "fig, ax = plt.subplots(figsize=(12, 6))\n",
+        "# The tag ids are numeric, so seaborn would lay the bars out by tag id and ignore the sort above;\n",
+        "# passing `order` keeps the descending sentence-count order in the plot.\n",
+        "sns.barplot(data=tag_presence_df, x='NER Tag', y='Sentence Count', order=tag_presence_df['NER Tag'].tolist(), ax=ax)\n",
+        "ax.set_title(\"Number of Sentences Containing Each NER Tag\")\n",
+        "ax.set_xlabel(\"NER Tag\")\n",
+        "ax.set_ylabel(\"Number of Sentences\")\n",
+        "plt.xticks(rotation=45)\n",
+        "plt.show()\n"
+      ],
+      "metadata": {
+        "id": "9iFL0jw882Xz"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Sample Sentence and Tags Display"
+      ],
+      "metadata": {
+        "id": "w-i4AhrMBnSN"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Draw one random row and print its tokens together with their tags\n",
+        "sample_row = train_df.sample(n=1)\n",
+        "sample_idx = sample_row.index[0]\n",
+        "sample_tokens = sample_row['tokens'].iloc[0]\n",
+        "sample_tags = sample_row['ner_tags'].iloc[0]\n",
+        "\n",
+        "print(f\"\\nSample Sentence and Tags (Index {sample_idx}):\")\n",
+        "print(f\"Tokens: {sample_tokens}\")\n",
+        "print(f\"NER Tags: {sample_tags}\")\n"
+      ],
+      "metadata": {
+        "id": "xz8OZh6m82SV"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "3lkut05B82PX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "4farZ19482L5"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "sroPMXuY82JF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "wB4lkpal82BM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "zdCsyNGZ81yE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "DgLOAamV81vG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "dl-zf4_381sI"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "lYV22K0v81pM"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "T9rn2nhr81jQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "KAiANeQx81dy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "1SwT6UJY81bD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "K8QqSRor81Yb"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "Va1o3qjn81Sk"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "tsvbHQ5L81O9"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "FuJs0TBV81Lz"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}