{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7c413f6e-62bf-43e5-91b5-f68761a90bef",
   "metadata": {},
   "source": [
    "# 1. Objective"
   ]
  },
  {
   "cell_type": "raw",
   "id": "be82cf3f-4671-480d-aaca-c4f238e3810d",
   "metadata": {},
   "source": [
    "Text Classification using RNN"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65506456-317f-4ef6-af5c-99ffa1006551",
   "metadata": {},
   "source": [
    "# 2. Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "11b56a45",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 206
    },
    "id": "11b56a45",
    "outputId": "e0a3e575-bbac-4482-bb49-14d1168be7c7",
    "papermill": {
     "duration": 5.049477,
     "end_time": "2024-08-08T15:25:12.643518",
     "exception": false,
     "start_time": "2024-08-08T15:25:07.594041",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# All notebook dependencies are imported once, in this cell.\n",
    "\n",
    "# Standard library\n",
    "import time\n",
    "from collections import Counter\n",
    "\n",
    "# Third-party: data handling, splitting, label encoding, tokenization\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import WordLevel\n",
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "\n",
    "# Third-party: PyTorch\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "from torch.nn.utils.rnn import pad_sequence\n",
    "from torch.utils.data import DataLoader, TensorDataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a54d59b-e63a-4272-80f3-28bbacee8fef",
   "metadata": {},
   "source": [
    "## GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a7725648-ea0f-464e-a643-a5540ab96550",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'NVIDIA GeForce RTX 4050 Laptop GPU'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the CUDA device name when a GPU is usable. get_device_name() raises on\n",
    "# machines without CUDA, so guard it to keep Restart & Run All working there.\n",
    "torch.cuda.get_device_name() if torch.cuda.is_available() else 'CUDA not available; running on CPU'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cb79f2d",
   "metadata": {
    "id": "5cb79f2d",
    "papermill": {
     "duration":
0.009874, "end_time": "2024-08-08T15:25:12.665020", "exception": false, "start_time": "2024-08-08T15:25:12.655146", "status": "completed" }, "tags": [] }, "source": [ "## 3. Initial Data Exploration and Cleaning" ] }, { "cell_type": "code", "execution_count": 3, "id": "48f9564d-b1b7-4c91-a0c8-603b3f253576", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Unnamed: 0 | \n", "0 | \n", "a | \n", "
|---|---|---|---|
| 0 | \n", "0 | \n", "Thyroid_Cancer | \n", "Thyroid surgery in children in a single insti... | \n", "
| 1 | \n", "1 | \n", "Thyroid_Cancer | \n", "\" The adopted strategy was the same as that us... | \n", "
| 2 | \n", "2 | \n", "Thyroid_Cancer | \n", "coronary arterybypass grafting thrombosis ï¬b... | \n", "
| 3 | \n", "3 | \n", "Thyroid_Cancer | \n", "Solitary plasmacytoma SP of the skull is an u... | \n", "
| 4 | \n", "4 | \n", "Thyroid_Cancer | \n", "This study aimed to investigate serum matrix ... | \n", "