File size: 15,406 Bytes

a119f50

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "62TB1_OCUVfz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "i0hQIwu8UVc0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Toy Azerbaijani NER corpus: tokenised sentences with BIO-style entity tags\n",
        "# (PERSON, LOCATION, ORGANIZATION).\n",
        "sentences = [\n",
        "    [\"İlham\", \"Əliyev\", \"Bakıda\", \"BMT-nin\", \"konfransında\", \"iştirak\", \"etdi\"],\n",
        "    [\"Leyla\", \"Gəncə\", \"şəhərində\", \"Azərsun\", \"şirkətində\", \"işləyir\"],\n",
        "    [\"Rəşad\", \"Sumqayıt\", \"şəhərinə\", \"səyahət\", \"etdi\"],\n",
        "    [\"Nigar\", \"və\", \"Zaur\", \"İstanbulda\", \"Türk Hava Yolları\", \"ofisində\", \"görüşdülər\"],\n",
        "    [\"Samir\", \"Bakıda\", \"BP\", \"şirkətinə\", \"işə\", \"daxil\", \"oldu\"]\n",
        "]\n",
        "\n",
        "labels = [\n",
        "    [\"B-PERSON\", \"I-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\", \"O\"],\n",
        "    [\"B-PERSON\", \"B-LOCATION\", \"O\", \"B-ORGANIZATION\", \"O\", \"O\"],\n",
        "    [\"B-PERSON\", \"B-LOCATION\", \"O\", \"O\", \"O\"],\n",
        "    [\"B-PERSON\", \"O\", \"B-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\"],\n",
        "    [\"B-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\", \"O\", \"O\"]\n",
        "]\n",
        "\n",
        "# Every token needs exactly one tag; fail fast if the two lists drift out of sync.\n",
        "assert len(sentences) == len(labels), \"sentences and labels must have the same length\"\n",
        "assert all(\n",
        "    len(tokens) == len(tags) for tokens, tags in zip(sentences, labels)\n",
        "), \"each sentence needs exactly one label per token\"\n",
        "\n",
        "# Create vocabulary and label mappings.\n",
        "# The vocabulary is sorted before indices are assigned: iterating a bare set of\n",
        "# strings follows hash order, which can change between kernel restarts, so the\n",
        "# word -> index mapping would otherwise not be reproducible.\n",
        "all_words = [word for sentence in sentences for word in sentence]\n",
        "unique_words = sorted(set(all_words))\n",
        "word_to_idx = {word: idx for idx, word in enumerate(unique_words, 1)}\n",
        "word_to_idx[\"<UNK>\"] = 0  # Unknown token; index 0 is kept free for it by enumerate(..., 1)\n",
        "\n",
        "# Map labels to integers, and build the reverse lookup used when decoding predictions\n",
        "label_to_idx = {\"B-PERSON\": 0, \"I-PERSON\": 1, \"B-LOCATION\": 2, \"B-ORGANIZATION\": 3, \"O\": 4}\n",
        "idx_to_label = {idx: label for label, idx in label_to_idx.items()}\n"
      ],
      "metadata": {
        "id": "RoZCdhnaTryk"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Hold out 20% of the sentences for validation; the fixed random_state keeps the split repeatable.\n",
        "(train_sentences, val_sentences,\n",
        " train_labels, val_labels) = train_test_split(sentences, labels, test_size=0.2, random_state=42)\n"
      ],
      "metadata": {
        "id": "WrpBPRFvTrvs"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from torch.utils.data import Dataset, DataLoader\n",
        "from torch.nn.utils.rnn import pad_sequence\n",
        "\n",
        "class NERDataset(Dataset):\n",
        "    \"\"\"Dataset of tokenised sentences and their tags, served as index tensors.\"\"\"\n",
        "\n",
        "    def __init__(self, sentences, labels, word_to_idx, label_to_idx):\n",
        "        # Raw token / tag sequences plus the two lookup tables used to encode them.\n",
        "        self.sentences, self.labels = sentences, labels\n",
        "        self.word_to_idx, self.label_to_idx = word_to_idx, label_to_idx\n",
        "\n",
        "    def __len__(self):\n",
        "        # One item per sentence.\n",
        "        return len(self.sentences)\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        \"\"\"Return (word index tensor, tag index tensor) for the sentence at position idx.\"\"\"\n",
        "        tokens, tag_names = self.sentences[idx], self.labels[idx]\n",
        "        vocab = self.word_to_idx\n",
        "\n",
        "        # Words missing from the vocabulary fall back to the \"<UNK>\" index.\n",
        "        token_ids = []\n",
        "        for token in tokens:\n",
        "            token_ids.append(vocab.get(token, vocab[\"<UNK>\"]))\n",
        "        tag_ids = list(map(self.label_to_idx.__getitem__, tag_names))\n",
        "\n",
        "        return (\n",
        "            torch.tensor(token_ids, dtype=torch.long),\n",
        "            torch.tensor(tag_ids, dtype=torch.long),\n",
        "        )\n",
        "\n",
        "def pad_collate(batch, pad_token_idx=None, label_pad_idx=-100):\n",
        "    \"\"\"Collate variable-length (word_ids, tag_ids) pairs into padded, batch-first tensors.\n",
        "\n",
        "    Args:\n",
        "        batch: sequence of (word index tensor, tag index tensor) pairs.\n",
        "        pad_token_idx: value used to pad the word tensors. When None (the default),\n",
        "            the notebook-level word_to_idx[\"<UNK>\"] is used, as before.\n",
        "        label_pad_idx: value used to pad the tag tensors. The default -100 is the\n",
        "            ignore_index that train_model gives to its loss, so padded positions\n",
        "            do not contribute to training.\n",
        "\n",
        "    Returns:\n",
        "        (sentences_padded, labels_padded), each padded to the longest item in the batch.\n",
        "    \"\"\"\n",
        "    if pad_token_idx is None:\n",
        "        # Fall back to the module-level vocabulary so DataLoader can keep calling\n",
        "        # this function with the batch as its only argument.\n",
        "        pad_token_idx = word_to_idx[\"<UNK>\"]\n",
        "\n",
        "    sentences, labels = zip(*batch)\n",
        "    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=pad_token_idx)\n",
        "    labels_padded = pad_sequence(labels, batch_first=True, padding_value=label_pad_idx)\n",
        "    return sentences_padded, labels_padded\n",
        "\n",
        "# Create Dataset and DataLoader instances for train and validation\n",
        "train_dataset = NERDataset(train_sentences, train_labels, word_to_idx, label_to_idx)\n",
        "val_dataset = NERDataset(val_sentences, val_labels, word_to_idx, label_to_idx)\n",
        "\n",
        "# Shuffle with a dedicated, seeded generator so the order of training batches is\n",
        "# the same after every Restart & Run All (same seed as the train/validation split).\n",
        "shuffle_generator = torch.Generator().manual_seed(42)\n",
        "\n",
        "train_loader = DataLoader(\n",
        "    train_dataset,\n",
        "    batch_size=1,\n",
        "    shuffle=True,\n",
        "    collate_fn=pad_collate,\n",
        "    generator=shuffle_generator,\n",
        ")\n",
        "val_loader = DataLoader(val_dataset, batch_size=1, collate_fn=pad_collate)\n"
      ],
      "metadata": {
        "id": "KFbd0e77gpEh"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import torch.nn as nn\n",
        "\n",
        "class BiLSTM_NER(nn.Module):\n",
        "    \"\"\"Embedding -> bidirectional LSTM -> linear tagger that emits per-token log-probabilities.\n",
        "\n",
        "    Args:\n",
        "        vocab_size: number of rows in the embedding table.\n",
        "        tagset_size: number of output tags.\n",
        "        embedding_dim: width of each word embedding.\n",
        "        hidden_dim: requested width of the concatenated LSTM output; each\n",
        "            direction gets hidden_dim // 2 units.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, vocab_size, tagset_size, embedding_dim=64, hidden_dim=128):\n",
        "        super().__init__()\n",
        "        per_direction = hidden_dim // 2\n",
        "        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
        "        self.lstm = nn.LSTM(embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=True)\n",
        "        # A bidirectional LSTM emits 2 * per_direction features per token. Size the\n",
        "        # projection from that rather than from hidden_dim, so an odd hidden_dim\n",
        "        # (where 2 * (hidden_dim // 2) != hidden_dim) no longer causes a shape mismatch.\n",
        "        self.hidden2tag = nn.Linear(2 * per_direction, tagset_size)\n",
        "\n",
        "    def forward(self, sentence):\n",
        "        \"\"\"Map batch-first word indices (batch, seq_len) to log-probabilities (batch, seq_len, tagset_size).\"\"\"\n",
        "        embeds = self.embedding(sentence)\n",
        "        lstm_out, _ = self.lstm(embeds)\n",
        "        tag_space = self.hidden2tag(lstm_out)\n",
        "        # Normalise over the tag axis (dim=2).\n",
        "        tag_scores = torch.log_softmax(tag_space, dim=2)\n",
        "        return tag_scores\n"
      ],
      "metadata": {
        "id": "i096tTXPgpB5"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "from sklearn.metrics import classification_report\n",
        "\n",
        "def train_model(model, train_loader, val_loader, num_epochs=10, lr=0.01, momentum=0.9):\n",
        "    \"\"\"Train model with SGD and report weighted validation metrics after every epoch.\n",
        "\n",
        "    Args:\n",
        "        model: module mapping a batch of word indices to per-token tag scores.\n",
        "        train_loader: sized iterable of (sentence, tags) batches used for optimisation.\n",
        "        val_loader: iterable of batches handed to evaluate_model after each epoch.\n",
        "        num_epochs: number of passes over train_loader.\n",
        "        lr: SGD learning rate (previously hard-coded to 0.01).\n",
        "        momentum: SGD momentum (previously hard-coded to 0.9).\n",
        "\n",
        "    Returns:\n",
        "        pandas.DataFrame with one row per epoch and the columns\n",
        "        Epoch, Loss, Precision, Recall, F1-score. The table is also printed.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: if train_loader has no batches, so no average loss exists.\n",
        "\n",
        "    Note:\n",
        "        Relies on the notebook-level evaluate_model, idx_to_label and label_to_idx.\n",
        "    \"\"\"\n",
        "    # Positions labelled -100 (the label padding value used by pad_collate) are skipped by the loss.\n",
        "    loss_function = nn.CrossEntropyLoss(ignore_index=-100)\n",
        "    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)\n",
        "    tag_names = list(label_to_idx.keys())\n",
        "\n",
        "    history = []  # one record per epoch; converted to a DataFrame at the end\n",
        "\n",
        "    # Training loop with metric tracking\n",
        "    for epoch in range(1, num_epochs + 1):\n",
        "        model.train()  # evaluate_model leaves the model in eval mode, so switch back each epoch\n",
        "        total_loss = 0.0\n",
        "\n",
        "        # Training phase\n",
        "        for sentence, tags in train_loader:\n",
        "            optimizer.zero_grad()\n",
        "            tag_scores = model(sentence)\n",
        "\n",
        "            # Flatten to (tokens, n_tags) and (tokens,) as CrossEntropyLoss expects.\n",
        "            # reshape() also accepts non-contiguous tensors, which view() rejects.\n",
        "            flat_scores = tag_scores.reshape(-1, tag_scores.shape[-1])\n",
        "            flat_tags = tags.reshape(-1)\n",
        "\n",
        "            loss = loss_function(flat_scores, flat_tags)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "            total_loss += loss.item()\n",
        "\n",
        "        num_batches = len(train_loader)\n",
        "        if num_batches == 0:\n",
        "            raise ValueError(\"train_loader has no batches; cannot compute an average loss\")\n",
        "        avg_loss = total_loss / num_batches\n",
        "\n",
        "        # Evaluation phase\n",
        "        true_labels, predicted_labels = evaluate_model(model, val_loader, idx_to_label)\n",
        "        report = classification_report(true_labels, predicted_labels, labels=tag_names, zero_division=0, output_dict=True)\n",
        "\n",
        "        # Keep only the support-weighted averages for the progress table\n",
        "        weighted = report['weighted avg']\n",
        "        history.append({\n",
        "            \"Epoch\": f\"Epoch {epoch}/{num_epochs}\",\n",
        "            \"Loss\": avg_loss,\n",
        "            \"Precision\": weighted['precision'],\n",
        "            \"Recall\": weighted['recall'],\n",
        "            \"F1-score\": weighted['f1-score'],\n",
        "        })\n",
        "\n",
        "    # Create a DataFrame with the collected metrics (explicit columns keep the layout when history is empty)\n",
        "    df = pd.DataFrame(history, columns=[\"Epoch\", \"Loss\", \"Precision\", \"Recall\", \"F1-score\"])\n",
        "\n",
        "    # Display the DataFrame\n",
        "    print(\"\\nTraining Progress\")\n",
        "    print(df.to_string(index=False))\n",
        "    return df\n"
      ],
      "metadata": {
        "id": "cB2Qsvv0go-9"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def evaluate_model(model, data_loader, idx_to_label):\n",
        "    \"\"\"Collect gold and predicted tag names for every non-ignored token in data_loader.\n",
        "\n",
        "    Args:\n",
        "        model: module returning per-token tag scores with the tag axis at dim 2.\n",
        "        data_loader: iterable of (sentences, labels) batches; label positions\n",
        "            equal to -100 are excluded from both returned lists.\n",
        "        idx_to_label: mapping from integer tag index to tag name.\n",
        "\n",
        "    Returns:\n",
        "        (all_true_labels, all_predictions): two aligned, flat lists of tag names.\n",
        "\n",
        "    Side effects:\n",
        "        Leaves model in evaluation mode.\n",
        "    \"\"\"\n",
        "    all_predictions = []\n",
        "    all_true_labels = []\n",
        "\n",
        "    model.eval()  # Set the model to evaluation mode\n",
        "    with torch.no_grad():\n",
        "        for sentences, labels in data_loader:\n",
        "            # Most likely tag per token\n",
        "            predictions = torch.argmax(model(sentences), dim=2).cpu()\n",
        "            labels = labels.cpu()\n",
        "\n",
        "            # Select by mask rather than truncating predictions to the number of real\n",
        "            # labels: truncation only lines up with the gold tags when every ignored\n",
        "            # (-100) position sits at the end of the sequence.\n",
        "            keep = labels != -100\n",
        "\n",
        "            # tolist() yields plain Python ints for the idx_to_label lookups\n",
        "            all_predictions.extend(idx_to_label[p] for p in predictions[keep].tolist())\n",
        "            all_true_labels.extend(idx_to_label[t] for t in labels[keep].tolist())\n",
        "\n",
        "    return all_true_labels, all_predictions\n"
      ],
      "metadata": {
        "id": "lh4HFt20go8T"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Seed PyTorch's global RNG so weight initialisation is repeatable across kernel restarts\n",
        "torch.manual_seed(42)\n",
        "\n",
        "# Initialize the model from the vocabulary and tag-set sizes\n",
        "vocab_size = len(word_to_idx)\n",
        "tagset_size = len(label_to_idx)\n",
        "model = BiLSTM_NER(vocab_size, tagset_size)\n",
        "\n",
        "# Train the model and display training progress\n",
        "training_progress_df = train_model(model, train_loader, val_loader)\n",
        "\n",
        "# Final per-tag report. NOTE: this uses the validation split (val_loader); the notebook\n",
        "# has no separate test set, so these are not held-out test metrics.\n",
        "true_labels, predicted_labels = evaluate_model(model, val_loader, idx_to_label)\n",
        "print(classification_report(true_labels, predicted_labels, labels=list(label_to_idx.keys()), zero_division=0))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YH6j-0n7go5a",
        "outputId": "25936497-94b0-4691-86e1-b44162c89005"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Training Progress\n",
            "      Epoch     Loss  Precision  Recall  F1-score\n",
            " Epoch 1/10 1.616464   0.333333     0.5  0.396825\n",
            " Epoch 2/10 1.577114   0.250000     0.5  0.333333\n",
            " Epoch 3/10 1.519056   0.250000     0.5  0.333333\n",
            " Epoch 4/10 1.438615   0.250000     0.5  0.333333\n",
            " Epoch 5/10 1.365465   0.250000     0.5  0.333333\n",
            " Epoch 6/10 1.290568   0.250000     0.5  0.333333\n",
            " Epoch 7/10 1.226007   0.250000     0.5  0.333333\n",
            " Epoch 8/10 1.162358   0.250000     0.5  0.333333\n",
            " Epoch 9/10 1.107923   0.250000     0.5  0.333333\n",
            "Epoch 10/10 1.051664   0.250000     0.5  0.333333\n",
            "                precision    recall  f1-score   support\n",
            "\n",
            "      B-PERSON       0.00      0.00      0.00         1\n",
            "      I-PERSON       0.00      0.00      0.00         0\n",
            "    B-LOCATION       0.00      0.00      0.00         1\n",
            "B-ORGANIZATION       0.00      0.00      0.00         1\n",
            "             O       0.50      1.00      0.67         3\n",
            "\n",
            "      accuracy                           0.50         6\n",
            "     macro avg       0.10      0.20      0.13         6\n",
            "  weighted avg       0.25      0.50      0.33         6\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "VVneEo1Ygo2v"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "9CU9Qp5ugoz-"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "m9bsMovcgox8"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "-5-ErtI0gou6"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "qAZWIPZ9gosZ"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "rpenB8bDgopn"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "c4j_rWc9gom9"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "Mg1R4n2Ygoke"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "LemxYPend6X1"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "LZXLa4KWd6U7"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "pT2qxBR9d6SR"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "1UvYkxq1d6O5"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "5BEpFEOiTF-a"
      },
      "execution_count": 7,
      "outputs": []
    }
  ]
}