{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [], "metadata": { "id": "62TB1_OCUVfz" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "i0hQIwu8UVc0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Sample Azerbaijani sentences with entity labels (PERSON, LOCATION, ORGANIZATION)\n", "sentences = [\n", " [\"İlham\", \"Əliyev\", \"Bakıda\", \"BMT-nin\", \"konfransında\", \"iştirak\", \"etdi\"],\n", " [\"Leyla\", \"Gəncə\", \"şəhərində\", \"Azərsun\", \"şirkətində\", \"işləyir\"],\n", " [\"Rəşad\", \"Sumqayıt\", \"şəhərinə\", \"səyahət\", \"etdi\"],\n", " [\"Nigar\", \"və\", \"Zaur\", \"İstanbulda\", \"Türk Hava Yolları\", \"ofisində\", \"görüşdülər\"],\n", " [\"Samir\", \"Bakıda\", \"BP\", \"şirkətinə\", \"işə\", \"daxil\", \"oldu\"]\n", "]\n", "\n", "labels = [\n", " [\"B-PERSON\", \"I-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\", \"O\"],\n", " [\"B-PERSON\", \"B-LOCATION\", \"O\", \"B-ORGANIZATION\", \"O\", \"O\"],\n", " [\"B-PERSON\", \"B-LOCATION\", \"O\", \"O\", \"O\"],\n", " [\"B-PERSON\", \"O\", \"B-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\"],\n", " [\"B-PERSON\", \"B-LOCATION\", \"B-ORGANIZATION\", \"O\", \"O\", \"O\", \"O\"]\n", "]\n", "\n", "# Create vocabulary and label mappings\n", "all_words = [word for sentence in sentences for word in sentence]\n", "unique_words = set(all_words)\n", "word_to_idx = {word: idx for idx, word in enumerate(unique_words, 1)}\n", "word_to_idx[\"\"] = 0 # Unknown token\n", "\n", "# Map labels to integers\n", "label_to_idx = {\"B-PERSON\": 0, \"I-PERSON\": 1, \"B-LOCATION\": 2, \"B-ORGANIZATION\": 3, \"O\": 4}\n", "idx_to_label = {idx: label for label, idx in label_to_idx.items()}\n" ], "metadata": { "id": "RoZCdhnaTryk" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Split data into training and validation sets (80% train, 20% validation)\n", "train_sentences, val_sentences, train_labels, val_labels = train_test_split(\n", " sentences, labels, test_size=0.2, random_state=42\n", ")\n" ], "metadata": { "id": "WrpBPRFvTrvs" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.nn.utils.rnn import pad_sequence\n", "\n", "class NERDataset(Dataset):\n", " def __init__(self, sentences, labels, word_to_idx, label_to_idx):\n", " self.sentences = sentences\n", " self.labels = labels\n", " self.word_to_idx = word_to_idx\n", " self.label_to_idx = label_to_idx\n", "\n", " def __len__(self):\n", " return len(self.sentences)\n", "\n", " def __getitem__(self, idx):\n", " words = self.sentences[idx]\n", " tags = self.labels[idx]\n", "\n", " word_idxs = [self.word_to_idx.get(word, self.word_to_idx[\"\"]) for word in words]\n", " tag_idxs = [self.label_to_idx[tag] for tag in tags]\n", "\n", " return torch.tensor(word_idxs, dtype=torch.long), torch.tensor(tag_idxs, dtype=torch.long)\n", "\n", "def pad_collate(batch):\n", " (sentences, labels) = zip(*batch)\n", " sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word_to_idx[\"\"])\n", " labels_padded = pad_sequence(labels, 
"    return sentences_padded, labels_padded\n", "\n", "# Create DataLoader instances for train and validation\n", "train_dataset = NERDataset(train_sentences, train_labels, word_to_idx, label_to_idx)\n", "val_dataset = NERDataset(val_sentences, val_labels, word_to_idx, label_to_idx)\n", "\n", "train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=pad_collate)\n", "val_loader = DataLoader(val_dataset, batch_size=1, collate_fn=pad_collate)\n" ], "metadata": { "id": "KFbd0e77gpEh" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "import torch.nn as nn\n", "\n", "class BiLSTM_NER(nn.Module):\n", "    def __init__(self, vocab_size, tagset_size, embedding_dim=64, hidden_dim=128):\n", "        super(BiLSTM_NER, self).__init__()\n", "        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", "        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)\n", "        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)\n", "\n", "    def forward(self, sentence):\n", "        embeds = self.embedding(sentence)\n", "        lstm_out, _ = self.lstm(embeds)\n", "        tag_space = self.hidden2tag(lstm_out)\n", "        tag_scores = torch.log_softmax(tag_space, dim=2)\n", "        return tag_scores\n" ], "metadata": { "id": "i096tTXPgpB5" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "from sklearn.metrics import classification_report\n", "\n", "def train_model(model, train_loader, val_loader, num_epochs=10):\n", "    # Initialize lists to collect metrics for each epoch\n", "    epoch_list, loss_list, precision_list, recall_list, f1_list = [], [], [], [], []\n", "\n", "    loss_function = nn.NLLLoss(ignore_index=-100)  # The model returns log-probabilities, so NLLLoss is the matching criterion; -100 ignores padding\n", "    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)\n", "\n", "    # Training loop with metric tracking\n", "    for epoch in range(1, num_epochs + 1):\n", "        model.train()  # Set model to training mode\n", "        total_loss = 0\n", "\n", "        # Training phase\n", "        for sentence, tags in train_loader:\n", "            model.zero_grad()\n", "            tag_scores = model(sentence)\n", "\n", "            # Reshape to match the dimensions required by NLLLoss\n", "            tag_scores = tag_scores.view(-1, tag_scores.shape[-1])\n", "            tags = tags.view(-1)\n", "\n", "            loss = loss_function(tag_scores, tags)\n", "            loss.backward()\n", "            optimizer.step()\n", "            total_loss += loss.item()\n", "\n", "        avg_loss = total_loss / len(train_loader)\n", "\n", "        # Evaluation phase\n", "        true_labels, predicted_labels = evaluate_model(model, val_loader, idx_to_label)\n", "        report = classification_report(true_labels, predicted_labels, labels=list(label_to_idx.keys()), zero_division=0, output_dict=True)\n", "\n", "        # Retrieve metrics\n", "        precision = report['weighted avg']['precision']\n", "        recall = report['weighted avg']['recall']\n", "        f1_score = report['weighted avg']['f1-score']\n", "\n", "        # Append metrics to lists\n", "        epoch_list.append(f\"Epoch {epoch}/{num_epochs}\")\n", "        loss_list.append(avg_loss)\n", "        precision_list.append(precision)\n", "        recall_list.append(recall)\n", "        f1_list.append(f1_score)\n", "\n", "    # Create a DataFrame with the collected metrics\n", "    df = pd.DataFrame({\n", "        \"Epoch\": epoch_list,\n", "        \"Loss\": loss_list,\n", "        \"Precision\": precision_list,\n", "        \"Recall\": recall_list,\n", "        \"F1-score\": f1_list\n", "    })\n", "\n", "    # Display the DataFrame\n", "    print(\"\\nTraining Progress\")\n",
"    print(df.to_string(index=False))\n", "    return df\n" ], "metadata": { "id": "cB2Qsvv0go-9" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "def evaluate_model(model, data_loader, idx_to_label):\n", "    all_predictions = []\n", "    all_true_labels = []\n", "\n", "    model.eval()  # Set the model to evaluation mode\n", "    with torch.no_grad():\n", "        for sentences, labels in data_loader:\n", "            # Make predictions\n", "            tag_scores = model(sentences)\n", "            predictions = torch.argmax(tag_scores, dim=2)\n", "\n", "            for pred, true in zip(predictions, labels):\n", "                pred = pred.cpu().numpy()\n", "                true = true.cpu().numpy()\n", "\n", "                # Remove padding (-100) for accurate evaluation\n", "                true = [t for t in true if t != -100]\n", "                pred = pred[:len(true)]\n", "\n", "                all_predictions.extend([idx_to_label[p] for p in pred])\n", "                all_true_labels.extend([idx_to_label[t] for t in true])\n", "\n", "    return all_true_labels, all_predictions\n" ], "metadata": { "id": "lh4HFt20go8T" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialize the model\n", "vocab_size = len(word_to_idx)\n", "tagset_size = len(label_to_idx)\n", "model = BiLSTM_NER(vocab_size, tagset_size)\n", "\n", "# Train the model and display training progress\n", "training_progress_df = train_model(model, train_loader, val_loader)\n", "\n", "# Final evaluation on the validation set\n", "true_labels, predicted_labels = evaluate_model(model, val_loader, idx_to_label)\n", "print(classification_report(true_labels, predicted_labels, labels=list(label_to_idx.keys()), zero_division=0))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YH6j-0n7go5a", "outputId": "25936497-94b0-4691-86e1-b44162c89005" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Training Progress\n", " Epoch Loss Precision Recall F1-score\n", " Epoch 1/10 1.616464 0.333333 0.5 0.396825\n", " Epoch 2/10 1.577114 0.250000 0.5 0.333333\n", " Epoch 3/10 1.519056 0.250000 0.5 0.333333\n", " Epoch 4/10 1.438615 0.250000 0.5 0.333333\n", " Epoch 5/10 1.365465 0.250000 0.5 0.333333\n", " Epoch 6/10 1.290568 0.250000 0.5 0.333333\n", " Epoch 7/10 1.226007 0.250000 0.5 0.333333\n", " Epoch 8/10 1.162358 0.250000 0.5 0.333333\n", " Epoch 9/10 1.107923 0.250000 0.5 0.333333\n", "Epoch 10/10 1.051664 0.250000 0.5 0.333333\n", " precision recall f1-score support\n", "\n", " B-PERSON 0.00 0.00 0.00 1\n", " I-PERSON 0.00 0.00 0.00 0\n", " B-LOCATION 0.00 0.00 0.00 1\n", "B-ORGANIZATION 0.00 0.00 0.00 1\n", " O 0.50 1.00 0.67 3\n", "\n", " accuracy 0.50 6\n", " macro avg 0.10 0.20 0.13 6\n", " weighted avg 0.25 0.50 0.33 6\n", "\n" ] } ] }
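, { "cell_type": "code", "source": [ "# Illustrative usage sketch (assumes the cells above have been run): tag a new sentence\n", "# with the trained model. Words outside the toy vocabulary fall back to index 0, the\n", "# padding/unknown index, so tags predicted for unseen words are not meaningful.\n", "def predict_tags(model, words, word_to_idx, idx_to_label):\n", "    model.eval()\n", "    with torch.no_grad():\n", "        idxs = torch.tensor([[word_to_idx.get(w, 0) for w in words]], dtype=torch.long)\n", "        scores = model(idxs)  # (1, seq_len, tagset_size) log-probabilities\n", "        preds = torch.argmax(scores, dim=2)[0].tolist()\n", "    return list(zip(words, [idx_to_label[p] for p in preds]))\n", "\n", "# Example with words that appear in the training sentences\n", "print(predict_tags(model, [\"Samir\", \"Bakıda\", \"işləyir\"], word_to_idx, idx_to_label))\n" ], "metadata": {}, "execution_count": null, "outputs": [] }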
"metadata": { "id": "Mg1R4n2Ygoke" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "LemxYPend6X1" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "LZXLa4KWd6U7" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "pT2qxBR9d6SR" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "1UvYkxq1d6O5" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "5BEpFEOiTF-a" }, "execution_count": 7, "outputs": [] } ] }