Spaces:
Build error
Build error
Upload 5 files
Browse files- .gitattributes +1 -0
- alldata_1_for_kaggle.csv +3 -0
- app.py +209 -0
- requirements.txt +7 -0
- rnn_classification_model_weights.pth +3 -0
- text_classification_using_rnn.ipynb +1039 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
alldata_1_for_kaggle.csv filter=lfs diff=lfs merge=lfs -text
|
alldata_1_for_kaggle.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c3b56772f9f0940ab5ba4c7c8c85afff6e75b06463428483867278e03b614d9
|
| 3 |
+
size 189221599
|
app.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import numpy as np
|
| 7 |
+
import os
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
from tokenizers.models import WordLevel
|
| 12 |
+
from tokenizers.pre_tokenizers import Whitespace
|
| 13 |
+
from collections import Counter
|
| 14 |
+
import torch
|
| 15 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
st.set_page_config(layout="centered")

# Inject custom CSS: page background, a translucent overlay behind the content,
# and text/label styling. The <style> tags are kept at column 0 so Markdown
# does not treat the block as an indented code block.
st.markdown("""
<style>
    .stApp {
        background-image: url("");
        background-size: cover;
        background-position: center;
        background-repeat: no-repeat;
        height: auto;  /* Allows the page to expand for scrolling */
        overflow: auto;  /* Enables scrolling if the page content overflows */
        /* position: relative */
    }

    /* Adjust opacity of overlay to make content more visible */
    .stApp::before {
        content: "";
        position: absolute;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
        background-color: rgba(255, 255, 255, 0.8);  /* Slightly higher opacity */
        z-index: -1;
    }

    /* Ensure content appears above the overlay */
    .stApp > * {
        position: relative;
        z-index: 2;
    }

    /* Ensure the dataframe is visible */
    .dataframe {
        background-color: rgba(255, 255, 255, 0.9) !important;
        z-index: 3;
    }

    /* Style text elements for better visibility */
    h1, h3, span, div {
        text-shadow: 1px 1px 2px rgba(255, 255, 255, 0.2);
    }

    /* Custom CSS for select box heading */
    div.stSelectbox > label {
        color: #000000 !important;  /* Change to your desired color */
        /* background-color: black !important; */  /* Background color of the dropdown */
        font-size: 24px !important;  /* Change font size */
        font-weight: bold !important;  /* Make text bold */
    }

    /* Custom CSS for image caption */
    .custom-caption {
        color: #000000 !important;  /* Change to your desired color */
        font-size: 24px !important;  /* Optional: Change font size */
        text-align: center;  /* Center-align the caption */
    }

    .stMainBlockContainer {
        background-color: white !important;  /* Background color of the dropdown */
    }

    .stTextArea{
        color: #000000 !important
    }

</style>
""", unsafe_allow_html=True)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Custom title styling functions
|
| 92 |
+
def colored_title(text, color):
    """Render ``text`` as an ``<h1>`` heading styled with the CSS ``color``.

    Both arguments are interpolated into raw HTML without escaping, so only
    trusted strings should be passed.
    """
    heading_html = f"<h1 style='color: {color};'>{text}</h1>"
    st.markdown(heading_html, unsafe_allow_html=True)
|
| 94 |
+
|
| 95 |
+
def colored_subheader(text, color):
    """Render ``text`` as an ``<h3>`` sub-heading styled with the CSS ``color``.

    Both arguments are interpolated into raw HTML without escaping, so only
    trusted strings should be passed.
    """
    subheader_html = f"<h3 style='color: {color};'>{text}</h3>"
    st.markdown(subheader_html, unsafe_allow_html=True)
|
| 97 |
+
|
| 98 |
+
def colored_text(text, color):
    """Render ``text`` inside a ``<span>`` styled with the CSS ``color``.

    Both arguments are interpolated into raw HTML without escaping, so only
    trusted strings should be passed.
    """
    span_html = f"<span style='color: {color};'>{text}</span>"
    st.markdown(span_html, unsafe_allow_html=True)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# Model hyperparameters. They size the layers that the saved checkpoint is
# loaded into, so they have to agree with the shapes stored in that file.
embedding_dim = 128  # width of each token embedding vector
hidden_units = 128   # size of the RNN hidden state
num_classes = 3      # number of output classes
|
| 105 |
+
|
| 106 |
+
class RNNModel(nn.Module):
    """Text classifier: embedding -> single-layer RNN -> linear -> softmax.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_units: Size of the RNN hidden state.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # No ``dropout`` argument: nn.RNN applies dropout only between stacked
        # layers, so with the default single layer it does nothing except emit
        # a UserWarning. Parameters and state_dict keys are unaffected.
        self.rnn = nn.RNN(embedding_dim, hidden_units, batch_first=True)
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, indexed as (batch, time).

        Returns:
            Tensor of shape (batch, num_classes); each row sums to 1.
        """
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Classify from the hidden state at the final time step.
        last_step = output[:, -1, :]
        logits = self.fc(last_step)
        return F.softmax(logits, dim=1)
|
| 119 |
+
|
| 120 |
+
@st.cache_resource
def _load_model_cached(vocab_size):
    """Build the network and load the checkpoint weights; raise on failure.

    Only successful loads are cached: an exception propagates to the caller
    instead of being stored as a result.
    """
    model = RNNModel(vocab_size, embedding_dim, hidden_units, num_classes)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs.
    state_dict = torch.load(
        'rnn_classification_model_weights.pth',
        map_location=torch.device('cpu'),
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()
    return model


def load_model(vocab_size):
    """Return the trained model in eval mode, or ``None`` if loading fails.

    Args:
        vocab_size: Embedding-table size used to construct the network.

    Returns:
        The loaded ``RNNModel``, or ``None`` after reporting the error in the
        UI. A failed attempt is not cached, so a later call retries.
    """
    try:
        return _load_model_cached(vocab_size)
    except Exception as e:  # UI boundary: report the problem instead of crashing
        st.error(f"Error loading model: {str(e)}")
        return None
|
| 132 |
+
|
| 133 |
+
@st.cache_data
def load_data():
    """Read the CSV, build the vocabulary, and encode every text.

    Returns:
        A tuple ``(padded, texts, class_mapping, vocab_size)``:
        ``padded`` is a 2-D integer tensor with one left-padded row of token
        ids per text; ``texts`` is the array of raw strings; ``class_mapping``
        maps a class index to its label string; ``vocab_size`` is the number
        of entries in the tokenizer vocabulary.
    """
    df = pd.read_csv("alldata_1_for_kaggle.csv", encoding='latin1')
    df = df.rename(columns={'0': 'labels', 'a': 'text'})
    texts = df['text'].values

    # NOTE(review): unique() yields labels in order of first appearance in the
    # file; confirm this matches the index order the model was trained with.
    unique_classes = df['labels'].unique()
    class_mapping = {i: f"{idx}" for i, idx in enumerate(unique_classes)}

    # Tokenization (basic whitespace tokenizer), used only for word counting.
    def tokenize(text):
        return text.lower().split()

    # Count word frequencies over the whole corpus.
    word_counts = Counter()
    for text in texts:
        word_counts.update(tokenize(text))

    # Most frequent words get the lowest indices; 0 and 1 are reserved.
    sorted_words = [word for word, _ in word_counts.most_common()]
    vocab = {"<pad>": 0, "<unk>": 1}
    vocab.update({word: idx + 2 for idx, word in enumerate(sorted_words)})

    tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))
    tokenizer.pre_tokenizer = Whitespace()

    # Convert texts to sequences of token ids.
    def text_to_sequence(texts):
        return [tokenizer.encode(text.lower()).ids for text in texts]

    # Explicit integer dtype: torch.tensor([]) would otherwise be float32,
    # which cannot be used as embedding indices.
    sequences = [
        torch.tensor(seq, dtype=torch.long) for seq in text_to_sequence(texts)
    ]

    # pad_sequence pads on the right, so reverse each sequence, pad, then flip
    # the time axis back: the result is left-padded with tokens in order.
    reversed_sequences = [seq.flip(0) for seq in sequences]
    padded_reversed = pad_sequence(
        reversed_sequences, batch_first=True, padding_value=0
    )
    X_train_pad = padded_reversed.flip(1)
    vocab_size = len(tokenizer.get_vocab())

    return X_train_pad, texts, class_mapping, vocab_size
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def main():
    """Streamlit entry point: pick a row, show its text, predict its class."""
    colored_title("Text Classification using RNN", "black")

    # Encoded inputs, raw texts, index->label mapping and vocabulary size.
    padded_inputs, texts, class_mapping, vocab_size = load_data()

    # Let the user choose which row of the dataset to classify.
    colored_subheader("Select a Row for Prediction:", "black")
    selected_index = st.selectbox(
        "Select a row", options=range(len(texts)), index=0
    )

    colored_text("Selected Text:", "black")
    st.text_area(
        "Text Content:",
        value=texts[selected_index],
        height=150,
        disabled=True,
    )

    # Nothing more to do until the user asks for a prediction.
    if not st.button("Predict"):
        return

    model = load_model(vocab_size)
    if model is None:
        return

    with torch.no_grad():
        # Index with a list so the batch dimension is kept.
        output = model(padded_inputs[[selected_index]])
        predicted_class = torch.argmax(output, dim=1).item()

    # Display prediction result
    colored_subheader("Prediction Results:", "green")
    colored_text(f"Predicted Class: {class_mapping[predicted_class]}", "green")


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.30.0
|
| 2 |
+
pandas==2.1.4
|
| 3 |
+
torch==2.2.0
|
| 4 |
+
torchvision==0.17.0
|
| 5 |
+
numpy==1.26.3
|
| 6 |
+
scikit-learn==1.3.2
|
| 7 |
+
tokenizers==0.15.1
|
rnn_classification_model_weights.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61a89b017c0a640b6cc429d433a47a01801da0e66d284219fbf77123048ad74e
|
| 3 |
+
size 115119704
|
text_classification_using_rnn.ipynb
ADDED
|
@@ -0,0 +1,1039 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "7c413f6e-62bf-43e5-91b5-f68761a90bef",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# 1. Objective"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "raw",
|
| 13 |
+
"id": "be82cf3f-4671-480d-aaca-c4f238e3810d",
|
| 14 |
+
"metadata": {},
|
| 15 |
+
"source": [
|
| 16 |
+
"Text Classification using RNN"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "markdown",
|
| 21 |
+
"id": "65506456-317f-4ef6-af5c-99ffa1006551",
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"source": [
|
| 24 |
+
"# 2. Imports"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": 18,
|
| 30 |
+
"id": "11b56a45",
|
| 31 |
+
"metadata": {
|
| 32 |
+
"colab": {
|
| 33 |
+
"base_uri": "https://localhost:8080/",
|
| 34 |
+
"height": 206
|
| 35 |
+
},
|
| 36 |
+
"id": "11b56a45",
|
| 37 |
+
"outputId": "e0a3e575-bbac-4482-bb49-14d1168be7c7",
|
| 38 |
+
"papermill": {
|
| 39 |
+
"duration": 5.049477,
|
| 40 |
+
"end_time": "2024-08-08T15:25:12.643518",
|
| 41 |
+
"exception": false,
|
| 42 |
+
"start_time": "2024-08-08T15:25:07.594041",
|
| 43 |
+
"status": "completed"
|
| 44 |
+
},
|
| 45 |
+
"tags": []
|
| 46 |
+
},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"#read the data\n",
|
| 50 |
+
"import pandas as pd\n",
|
| 51 |
+
"import time\n",
|
| 52 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 53 |
+
"from tokenizers import Tokenizer\n",
|
| 54 |
+
"from tokenizers.models import WordLevel\n",
|
| 55 |
+
"from tokenizers.pre_tokenizers import Whitespace\n",
|
| 56 |
+
"from collections import Counter\n",
|
| 57 |
+
"import torch\n",
|
| 58 |
+
"from torch.nn.utils.rnn import pad_sequence\n",
|
| 59 |
+
"import torch\n",
|
| 60 |
+
"import torch.nn.functional as F\n",
|
| 61 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 62 |
+
"import torch\n",
|
| 63 |
+
"import torch.nn as nn\n",
|
| 64 |
+
"import torch.nn.functional as F\n",
|
| 65 |
+
"import torch.optim as optim\n",
|
| 66 |
+
"from torch.utils.data import DataLoader, TensorDataset\n"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "markdown",
|
| 71 |
+
"id": "5a54d59b-e63a-4272-80f3-28bbacee8fef",
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"source": [
|
| 74 |
+
"## GPU"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": 2,
|
| 80 |
+
"id": "a7725648-ea0f-464e-a643-a5540ab96550",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"outputs": [
|
| 83 |
+
{
|
| 84 |
+
"data": {
|
| 85 |
+
"text/plain": [
|
| 86 |
+
"'NVIDIA GeForce RTX 4050 Laptop GPU'"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
"execution_count": 2,
|
| 90 |
+
"metadata": {},
|
| 91 |
+
"output_type": "execute_result"
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"source": [
|
| 95 |
+
"torch.cuda.get_device_name()"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "markdown",
|
| 100 |
+
"id": "5cb79f2d",
|
| 101 |
+
"metadata": {
|
| 102 |
+
"id": "5cb79f2d",
|
| 103 |
+
"papermill": {
|
| 104 |
+
"duration": 0.009874,
|
| 105 |
+
"end_time": "2024-08-08T15:25:12.665020",
|
| 106 |
+
"exception": false,
|
| 107 |
+
"start_time": "2024-08-08T15:25:12.655146",
|
| 108 |
+
"status": "completed"
|
| 109 |
+
},
|
| 110 |
+
"tags": []
|
| 111 |
+
},
|
| 112 |
+
"source": [
|
| 113 |
+
"## 3.Initial Data Exploration and Cleaning"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": 3,
|
| 119 |
+
"id": "48f9564d-b1b7-4c91-a0c8-603b3f253576",
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [
|
| 122 |
+
{
|
| 123 |
+
"data": {
|
| 124 |
+
"text/html": [
|
| 125 |
+
"<div>\n",
|
| 126 |
+
"<style scoped>\n",
|
| 127 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 128 |
+
" vertical-align: middle;\n",
|
| 129 |
+
" }\n",
|
| 130 |
+
"\n",
|
| 131 |
+
" .dataframe tbody tr th {\n",
|
| 132 |
+
" vertical-align: top;\n",
|
| 133 |
+
" }\n",
|
| 134 |
+
"\n",
|
| 135 |
+
" .dataframe thead th {\n",
|
| 136 |
+
" text-align: right;\n",
|
| 137 |
+
" }\n",
|
| 138 |
+
"</style>\n",
|
| 139 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 140 |
+
" <thead>\n",
|
| 141 |
+
" <tr style=\"text-align: right;\">\n",
|
| 142 |
+
" <th></th>\n",
|
| 143 |
+
" <th>Unnamed: 0</th>\n",
|
| 144 |
+
" <th>0</th>\n",
|
| 145 |
+
" <th>a</th>\n",
|
| 146 |
+
" </tr>\n",
|
| 147 |
+
" </thead>\n",
|
| 148 |
+
" <tbody>\n",
|
| 149 |
+
" <tr>\n",
|
| 150 |
+
" <th>0</th>\n",
|
| 151 |
+
" <td>0</td>\n",
|
| 152 |
+
" <td>Thyroid_Cancer</td>\n",
|
| 153 |
+
" <td>Thyroid surgery in children in a single insti...</td>\n",
|
| 154 |
+
" </tr>\n",
|
| 155 |
+
" <tr>\n",
|
| 156 |
+
" <th>1</th>\n",
|
| 157 |
+
" <td>1</td>\n",
|
| 158 |
+
" <td>Thyroid_Cancer</td>\n",
|
| 159 |
+
" <td>\" The adopted strategy was the same as that us...</td>\n",
|
| 160 |
+
" </tr>\n",
|
| 161 |
+
" <tr>\n",
|
| 162 |
+
" <th>2</th>\n",
|
| 163 |
+
" <td>2</td>\n",
|
| 164 |
+
" <td>Thyroid_Cancer</td>\n",
|
| 165 |
+
" <td>coronary arterybypass grafting thrombosis ï¬b...</td>\n",
|
| 166 |
+
" </tr>\n",
|
| 167 |
+
" <tr>\n",
|
| 168 |
+
" <th>3</th>\n",
|
| 169 |
+
" <td>3</td>\n",
|
| 170 |
+
" <td>Thyroid_Cancer</td>\n",
|
| 171 |
+
" <td>Solitary plasmacytoma SP of the skull is an u...</td>\n",
|
| 172 |
+
" </tr>\n",
|
| 173 |
+
" <tr>\n",
|
| 174 |
+
" <th>4</th>\n",
|
| 175 |
+
" <td>4</td>\n",
|
| 176 |
+
" <td>Thyroid_Cancer</td>\n",
|
| 177 |
+
" <td>This study aimed to investigate serum matrix ...</td>\n",
|
| 178 |
+
" </tr>\n",
|
| 179 |
+
" </tbody>\n",
|
| 180 |
+
"</table>\n",
|
| 181 |
+
"</div>"
|
| 182 |
+
],
|
| 183 |
+
"text/plain": [
|
| 184 |
+
" Unnamed: 0 0 \\\n",
|
| 185 |
+
"0 0 Thyroid_Cancer \n",
|
| 186 |
+
"1 1 Thyroid_Cancer \n",
|
| 187 |
+
"2 2 Thyroid_Cancer \n",
|
| 188 |
+
"3 3 Thyroid_Cancer \n",
|
| 189 |
+
"4 4 Thyroid_Cancer \n",
|
| 190 |
+
"\n",
|
| 191 |
+
" a \n",
|
| 192 |
+
"0 Thyroid surgery in children in a single insti... \n",
|
| 193 |
+
"1 \" The adopted strategy was the same as that us... \n",
|
| 194 |
+
"2 coronary arterybypass grafting thrombosis ï¬b... \n",
|
| 195 |
+
"3 Solitary plasmacytoma SP of the skull is an u... \n",
|
| 196 |
+
"4 This study aimed to investigate serum matrix ... "
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
"execution_count": 3,
|
| 200 |
+
"metadata": {},
|
| 201 |
+
"output_type": "execute_result"
|
| 202 |
+
}
|
| 203 |
+
],
|
| 204 |
+
"source": [
|
| 205 |
+
"df=pd.read_csv(\"alldata_1_for_kaggle.csv\",encoding='latin1')\n",
|
| 206 |
+
"df.head()"
|
| 207 |
+
]
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"cell_type": "code",
|
| 211 |
+
"execution_count": 4,
|
| 212 |
+
"id": "b525ffae",
|
| 213 |
+
"metadata": {
|
| 214 |
+
"colab": {
|
| 215 |
+
"base_uri": "https://localhost:8080/"
|
| 216 |
+
},
|
| 217 |
+
"id": "b525ffae",
|
| 218 |
+
"outputId": "dc7c2843-0900-426d-afdc-a667cdaec5eb",
|
| 219 |
+
"papermill": {
|
| 220 |
+
"duration": 0.037312,
|
| 221 |
+
"end_time": "2024-08-08T15:25:12.711939",
|
| 222 |
+
"exception": false,
|
| 223 |
+
"start_time": "2024-08-08T15:25:12.674627",
|
| 224 |
+
"status": "completed"
|
| 225 |
+
},
|
| 226 |
+
"tags": []
|
| 227 |
+
},
|
| 228 |
+
"outputs": [
|
| 229 |
+
{
|
| 230 |
+
"name": "stdout",
|
| 231 |
+
"output_type": "stream",
|
| 232 |
+
"text": [
|
| 233 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
| 234 |
+
"RangeIndex: 7570 entries, 0 to 7569\n",
|
| 235 |
+
"Data columns (total 3 columns):\n",
|
| 236 |
+
" # Column Non-Null Count Dtype \n",
|
| 237 |
+
"--- ------ -------------- ----- \n",
|
| 238 |
+
" 0 Unnamed: 0 7570 non-null int64 \n",
|
| 239 |
+
" 1 0 7570 non-null object\n",
|
| 240 |
+
" 2 a 7570 non-null object\n",
|
| 241 |
+
"dtypes: int64(1), object(2)\n",
|
| 242 |
+
"memory usage: 177.6+ KB\n"
|
| 243 |
+
]
|
| 244 |
+
}
|
| 245 |
+
],
|
| 246 |
+
"source": [
|
| 247 |
+
"df.info()"
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "code",
|
| 252 |
+
"execution_count": 5,
|
| 253 |
+
"id": "40635443",
|
| 254 |
+
"metadata": {
|
| 255 |
+
"colab": {
|
| 256 |
+
"base_uri": "https://localhost:8080/"
|
| 257 |
+
},
|
| 258 |
+
"id": "40635443",
|
| 259 |
+
"outputId": "de366877-cc8d-4366-bd77-b92645ffc2b4",
|
| 260 |
+
"papermill": {
|
| 261 |
+
"duration": 0.287576,
|
| 262 |
+
"end_time": "2024-08-08T15:25:13.030383",
|
| 263 |
+
"exception": false,
|
| 264 |
+
"start_time": "2024-08-08T15:25:12.742807",
|
| 265 |
+
"status": "completed"
|
| 266 |
+
},
|
| 267 |
+
"tags": []
|
| 268 |
+
},
|
| 269 |
+
"outputs": [
|
| 270 |
+
{
|
| 271 |
+
"data": {
|
| 272 |
+
"text/plain": [
|
| 273 |
+
"np.int64(0)"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
"execution_count": 5,
|
| 277 |
+
"metadata": {},
|
| 278 |
+
"output_type": "execute_result"
|
| 279 |
+
}
|
| 280 |
+
],
|
| 281 |
+
"source": [
|
| 282 |
+
"#Check Duplicated vals\n",
|
| 283 |
+
"df.duplicated().sum()"
|
| 284 |
+
]
|
| 285 |
+
},
|
| 286 |
+
{
|
| 287 |
+
"cell_type": "code",
|
| 288 |
+
"execution_count": 6,
|
| 289 |
+
"id": "f48819da",
|
| 290 |
+
"metadata": {
|
| 291 |
+
"id": "f48819da",
|
| 292 |
+
"papermill": {
|
| 293 |
+
"duration": 0.019039,
|
| 294 |
+
"end_time": "2024-08-08T15:25:13.061571",
|
| 295 |
+
"exception": false,
|
| 296 |
+
"start_time": "2024-08-08T15:25:13.042532",
|
| 297 |
+
"status": "completed"
|
| 298 |
+
},
|
| 299 |
+
"tags": []
|
| 300 |
+
},
|
| 301 |
+
"outputs": [],
|
| 302 |
+
"source": [
|
| 303 |
+
"#rename cols\n",
|
| 304 |
+
"df = df.rename(columns={'0': 'labels', 'a': 'text'})"
|
| 305 |
+
]
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"cell_type": "code",
|
| 309 |
+
"execution_count": 7,
|
| 310 |
+
"id": "ecb539f3",
|
| 311 |
+
"metadata": {
|
| 312 |
+
"colab": {
|
| 313 |
+
"base_uri": "https://localhost:8080/",
|
| 314 |
+
"height": 209
|
| 315 |
+
},
|
| 316 |
+
"id": "ecb539f3",
|
| 317 |
+
"outputId": "9d7a45f0-840d-4185-8c72-0caf92e261e5",
|
| 318 |
+
"papermill": {
|
| 319 |
+
"duration": 0.023086,
|
| 320 |
+
"end_time": "2024-08-08T15:25:13.095199",
|
| 321 |
+
"exception": false,
|
| 322 |
+
"start_time": "2024-08-08T15:25:13.072113",
|
| 323 |
+
"status": "completed"
|
| 324 |
+
},
|
| 325 |
+
"tags": []
|
| 326 |
+
},
|
| 327 |
+
"outputs": [
|
| 328 |
+
{
|
| 329 |
+
"data": {
|
| 330 |
+
"text/plain": [
|
| 331 |
+
"labels\n",
|
| 332 |
+
"Thyroid_Cancer 2810\n",
|
| 333 |
+
"Colon_Cancer 2580\n",
|
| 334 |
+
"Lung_Cancer 2180\n",
|
| 335 |
+
"Name: count, dtype: int64"
|
| 336 |
+
]
|
| 337 |
+
},
|
| 338 |
+
"execution_count": 7,
|
| 339 |
+
"metadata": {},
|
| 340 |
+
"output_type": "execute_result"
|
| 341 |
+
}
|
| 342 |
+
],
|
| 343 |
+
"source": [
|
| 344 |
+
"df['labels'].value_counts()"
|
| 345 |
+
]
|
| 346 |
+
},
|
| 347 |
+
{
|
| 348 |
+
"cell_type": "code",
|
| 349 |
+
"execution_count": 8,
|
| 350 |
+
"id": "e85ba39f-6029-47eb-8c43-d68870e73be7",
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"outputs": [
|
| 353 |
+
{
|
| 354 |
+
"name": "stdout",
|
| 355 |
+
"output_type": "stream",
|
| 356 |
+
"text": [
|
| 357 |
+
"{0: 'Thyroid_Cancer', 1: 'Colon_Cancer', 2: 'Lung_Cancer'}\n"
|
| 358 |
+
]
|
| 359 |
+
}
|
| 360 |
+
],
|
| 361 |
+
"source": [
|
| 362 |
+
"unique_classes = (df['labels'].unique()) # Ensure consistent order\n",
|
| 363 |
+
"class_mapping = {i: f\"{idx}\" for i,idx in enumerate(unique_classes)}\n",
|
| 364 |
+
"\n",
|
| 365 |
+
"print(class_mapping)\n"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "code",
|
| 370 |
+
"execution_count": 9,
|
| 371 |
+
"id": "86b8333a",
|
| 372 |
+
"metadata": {
|
| 373 |
+
"id": "86b8333a",
|
| 374 |
+
"papermill": {
|
| 375 |
+
"duration": 0.017608,
|
| 376 |
+
"end_time": "2024-08-08T15:25:13.143258",
|
| 377 |
+
"exception": false,
|
| 378 |
+
"start_time": "2024-08-08T15:25:13.125650",
|
| 379 |
+
"status": "completed"
|
| 380 |
+
},
|
| 381 |
+
"tags": []
|
| 382 |
+
},
|
| 383 |
+
"outputs": [],
|
| 384 |
+
"source": [
|
| 385 |
+
"texts = df['text'].values\n",
|
| 386 |
+
"labels = df['labels'].values"
|
| 387 |
+
]
|
| 388 |
+
},
|
| 389 |
+
{
|
| 390 |
+
"cell_type": "markdown",
|
| 391 |
+
"id": "796aa449",
|
| 392 |
+
"metadata": {
|
| 393 |
+
"id": "796aa449",
|
| 394 |
+
"papermill": {
|
| 395 |
+
"duration": 0.009915,
|
| 396 |
+
"end_time": "2024-08-08T15:25:13.163996",
|
| 397 |
+
"exception": false,
|
| 398 |
+
"start_time": "2024-08-08T15:25:13.154081",
|
| 399 |
+
"status": "completed"
|
| 400 |
+
},
|
| 401 |
+
"tags": []
|
| 402 |
+
},
|
| 403 |
+
"source": [
|
| 404 |
+
"## 4.Spliting the data"
|
| 405 |
+
]
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"cell_type": "code",
|
| 409 |
+
"execution_count": 10,
|
| 410 |
+
"id": "78cd4ab3",
|
| 411 |
+
"metadata": {
|
| 412 |
+
"colab": {
|
| 413 |
+
"base_uri": "https://localhost:8080/"
|
| 414 |
+
},
|
| 415 |
+
"id": "78cd4ab3",
|
| 416 |
+
"outputId": "14a16d84-74ef-43be-cc22-1adcb1d5eea3",
|
| 417 |
+
"papermill": {
|
| 418 |
+
"duration": 1.105941,
|
| 419 |
+
"end_time": "2024-08-08T15:25:14.280223",
|
| 420 |
+
"exception": false,
|
| 421 |
+
"start_time": "2024-08-08T15:25:13.174282",
|
| 422 |
+
"status": "completed"
|
| 423 |
+
},
|
| 424 |
+
"tags": []
|
| 425 |
+
},
|
| 426 |
+
"outputs": [
|
| 427 |
+
{
|
| 428 |
+
"name": "stdout",
|
| 429 |
+
"output_type": "stream",
|
| 430 |
+
"text": [
|
| 431 |
+
"Dimensions of X_train : (6056,)\n",
|
| 432 |
+
"Dimensions of X_test : (1514,)\n",
|
| 433 |
+
"Dimensions of y_train : (6056,)\n",
|
| 434 |
+
"Dimensions of y_test : (1514,)\n"
|
| 435 |
+
]
|
| 436 |
+
}
|
| 437 |
+
],
|
| 438 |
+
"source": [
|
| 439 |
+
"X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42,shuffle=True,stratify=labels)\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"print(\"Dimensions of X_train :\", X_train.shape)\n",
|
| 442 |
+
"print(\"Dimensions of X_test :\", X_test.shape)\n",
|
| 443 |
+
"print(\"Dimensions of y_train :\", y_train.shape)\n",
|
| 444 |
+
"print(\"Dimensions of y_test :\", y_test.shape)"
|
| 445 |
+
]
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"cell_type": "markdown",
|
| 449 |
+
"id": "6597d366",
|
| 450 |
+
"metadata": {
|
| 451 |
+
"id": "6597d366",
|
| 452 |
+
"papermill": {
|
| 453 |
+
"duration": 0.010665,
|
| 454 |
+
"end_time": "2024-08-08T15:25:14.321468",
|
| 455 |
+
"exception": false,
|
| 456 |
+
"start_time": "2024-08-08T15:25:14.310803",
|
| 457 |
+
"status": "completed"
|
| 458 |
+
},
|
| 459 |
+
"tags": []
|
| 460 |
+
},
|
| 461 |
+
"source": [
|
| 462 |
+
"## 5.Text Tokenization and Sequence Conversion\n"
|
| 463 |
+
]
|
| 464 |
+
},
|
| 465 |
+
{
|
| 466 |
+
"cell_type": "code",
|
| 467 |
+
"execution_count": 11,
|
| 468 |
+
"id": "heM2tow_QwB-",
|
| 469 |
+
"metadata": {
|
| 470 |
+
"colab": {
|
| 471 |
+
"base_uri": "https://localhost:8080/"
|
| 472 |
+
},
|
| 473 |
+
"id": "heM2tow_QwB-",
|
| 474 |
+
"outputId": "95c07961-806f-4061-9b00-16c0854369f2"
|
| 475 |
+
},
|
| 476 |
+
"outputs": [
|
| 477 |
+
{
|
| 478 |
+
"name": "stdout",
|
| 479 |
+
"output_type": "stream",
|
| 480 |
+
"text": [
|
| 481 |
+
"[1373, 327, 6, 33362, 8237, 50, 7, 865, 199, 8, 2, 167, 12848, 5989, 16, 1929, 4, 12, 2, 17591, 106, 278, 16, 5, 768, 4, 2, 6365, 10870, 16, 5, 308, 1500, 644, 5989, 16, 64577, 22, 4619, 105, 204996, 2576, 211, 1239, 124, 20, 91, 64, 25, 1436, 2, 2277, 1593, 26178, 1955, 4, 55, 26, 79, 8, 632, 483, 1304, 72611, 25, 3, 28127, 35898, 4, 4, 1474, 128, 178, 64578, 237, 2, 160, 2, 2576, 1593, 12, 419, 1955, 4, 12, 204997, 13, 570, 64, 25, 3, 7940, 7153, 204998, 1639, 4, 10926, 2086, 59, 3555, 5765, 30, 33, 340, 115, 204999, 5989, 17, 2877, 1923, 7, 2277, 1593, 3, 1929, 1634, 39818, 8, 632, 483, 30, 2725, 14, 2, 211, 1593, 501, 2877, 17, 129, 26, 1159, 13, 1851, 2, 25, 3, 1639, 822, 1134, 7738, 3, 2, 17, 40968, 4, 2877, 17, 11, 185, 8, 205000, 39819, 4017, 15679, 1775, 75, 22721, 1039, 29, 342, 444, 15679, 1775, 75, 19, 7, 349, 32430, 4, 1039, 9, 239, 2, 60, 260, 1368, 14, 46, 3, 2, 2877, 12168, 1702, 521, 2, 25, 3, 1639, 4, 5765, 663, 46, 8, 205001, 2, 25, 3, 213, 3, 2, 2277, 28127, 107, 7, 536, 46, 32430, 4, 1039, 995, 5, 7, 704, 560, 3, 255, 198, 79, 8, 2, 79287, 3, 1572, 240, 35898, 21086, 31027, 3876, 4, 19718, 74, 32, 14472, 228, 3, 1639, 179, 1134, 24, 172, 3183, 2, 333, 3, 77744, 5, 2720, 2, 2576, 1593, 3, 1929, 5, 434, 673, 57, 22, 1074, 24327, 2, 587, 3, 37, 1639, 3818, 822, 172, 547, 6, 760, 205002, 3, 205003, 8319, 79288, 1199, 205004, 78, 31028, 205005, 76, 1403, 24328, 34776, 3681, 228, 3, 1639, 4, 318, 3, 5306, 1496, 442, 198, 79, 2663, 2277, 1593, 3, 1572, 16, 5, 24689, 8001, 17, 72, 205006, 1781, 205007, 31029, 253, 16162, 154, 3, 818, 205008, 15475, 67, 844, 1266, 321, 455, 1047, 79288, 23, 12, 29, 9481, 773, 180, 2, 586, 3, 9764, 254, 869, 807, 7176, 3489, 108, 283, 7821, 5, 224, 354, 430, 2, 8095, 4, 619, 22, 7669, 1749, 3155, 49, 899, 32, 11547, 2, 1233, 4, 117, 1959, 23206, 205009, 205010, 40969, 1016, 3, 29394, 139, 1038, 3196, 1674, 10137, 205011, 119, 4, 2131, 6232, 205012, 5714, 1573, 205013, 205014, 21, 14079, 717, 3, 612, 164, 1038, 7472, 72, 
1781, 18511, 455, 72, 42493, 1918, 2, 158, 33, 8418, 56, 1076, 1918, 205015, 3, 2576, 131, 5, 2277, 1572, 205016, 16, 552, 12, 2, 21719, 106, 278, 16, 601, 1169, 4, 5, 17441, 16, 771, 774, 14, 377, 252, 121, 3, 552, 502, 26, 196, 697, 971, 4, 377, 970, 502, 1562, 21, 1572, 16, 161, 7237, 205017, 1718, 1929, 22, 2, 106, 278, 635, 2, 557, 106, 601, 16, 3, 205018, 1084, 5504, 743, 16, 90, 118, 53, 12, 2, 17591, 106, 278, 16, 5, 19911, 2, 6365, 106, 278, 16, 5, 308, 5989, 240, 22, 1495, 15, 1500, 644, 64577, 19, 205019, 205020, 5989, 240, 33, 7, 246, 5617, 6, 2425, 205021, 33, 7, 99, 152, 3, 183, 4, 7, 881, 82, 152, 3, 842, 205022, 4, 205023, 33, 52, 75559, 105, 315, 169, 8, 2, 2277, 4, 2576, 1593, 635, 2, 106, 2641, 2, 2576, 1593, 3, 88, 1929, 1607, 2, 1283, 501, 381, 229, 2577, 32431, 4, 7, 109, 116, 3, 79289, 23, 1593, 205024, 9, 1507, 2351, 12, 419, 1955, 4, 50, 7, 83, 2971, 113, 588, 90, 118, 2277, 1495, 160, 33, 7, 3544, 216, 82, 73, 6, 2576, 160, 181, 740, 1923, 632, 483, 22, 1955, 4, 87, 4285, 205025, 14, 9270, 4, 1622, 6, 2824, 1052, 377, 3, 79289, 6264, 387, 3, 2, 1507, 131, 33, 405, 406, 4, 29, 216, 881, 82, 152, 3, 1373, 327, 6, 33362, 8237, 50, 7, 865, 199, 8, 2, 7782, 88, 1929, 2, 102, 81, 3, 1929, 3162, 6, 2, 338, 7238, 1201, 3, 2, 205026, 1277, 33, 52, 196, 8, 79290, 606, 16, 309, 30, 205027, 7, 24, 763, 136, 3, 79290, 5989, 16, 13, 16435, 2, 9530, 205028, 5989, 24, 259, 8001, 6, 33362, 47, 4480, 24, 280, 1612, 160, 5, 17592, 110, 14, 2299, 54, 9, 21422, 205029, 4, 205030, 987, 506, 320, 6, 2, 2277, 1593, 3, 88, 64577, 90, 118, 2, 38701, 7, 2351, 320, 6, 205031, 24, 1718, 8, 2718, 1201, 3, 205032, 90, 118, 34, 12, 79, 8, 405, 266, 90, 118, 2, 211, 439, 5664, 7, 36, 2065, 7, 35899, 1593, 12, 205033, 5, 7, 378, 31, 11852, 571, 14, 386, 178, 205034, 2028, 19718, 12, 238, 27, 99, 71, 5, 35899, 1572, 205035, 5, 1201, 3, 632, 483, 4, 12, 79, 8, 247, 358, 12849, 4, 1182, 555, 3, 2, 58, 2, 31, 44, 603, 14, 102, 25, 3, 19718, 55, 2457, 2, 25, 3, 66, 386, 159, 
85, 15, 36, 59, 3876, 205036, 44, 402, 15, 3876, 34, 12, 402, 6, 26, 79, 8, 2, 79287, 3, 1929, 205037, 234, 14, 3876, 1536, 6, 2, 1120, 751, 921, 3, 2, 64, 75560, 63, 86, 31912, 64, 205038, 38702, 15, 25741, 4, 21086, 276, 711, 91, 25, 2, 31, 143, 5146, 101, 21086, 4, 25741, 647, 2, 304, 3, 2, 2277, 1593, 3, 1929, 5, 443, 4, 16964, 2, 2576, 1593, 3, 1929, 12, 79, 8, 2, 25, 3, 2, 1120, 205039, 2086, 59, 3555, 5765, 34777, 257, 59, 8002, 4, 7940, 7153, 245, 2900, 1639, 90, 118, 2, 228, 3, 1639, 8, 29, 205040, 7822, 2, 25, 3, 19718, 4, 783, 632, 483, 5, 443, 2, 1623, 114, 3, 1639, 142, 5, 1844, 12, 7670, 2176, 107, 2600, 3, 1639, 5, 1572, 240, 12, 7, 2971, 305, 745, 101, 5, 23553, 5, 443, 57, 472, 14, 1639, 2485, 85, 15, 27459, 55, 647, 483, 783, 367, 172, 764, 1080, 4, 1480, 222, 5, 1572, 16, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 16, 309, 8419, 2818, 367, 172, 8, 1474, 128, 178, 64578, 547, 50, 52, 48, 44332, 6, 760, 10719, 1500, 644, 205041, 128, 314, 2, 342, 547, 301, 35, 33, 2, 338, 587, 5, 205042, 2, 3742, 3, 23, 31, 10, 6, 526, 226, 2, 228, 3, 1639, 4, 318, 3, 5306, 5, 2, 8001, 3187, 4, 2, 24689, 8001, 16436, 502, 205043, 25, 3, 54, 275, 5, 2720, 2, 35899, 204, 3, 1572, 6997, 764, 54, 14, 11, 79, 8, 2, 205044, 1005, 3, 1572, 24329, 4, 205045, 2486, 79291, 110, 1415, 21, 44333, 11, 48, 5, 47, 57, 26548, 11, 9410, 332, 6, 7, 12023, 27, 1, 1, 68, 180, 7, 205046, 7460, 480, 747, 4, 1519, 72612, 1323, 10772, 356, 205047, 3, 2, 8001, 4480, 24, 280, 8237, 79292, 4, 2, 35900, 24, 259, 11, 1090, 13, 2583, 1025, 27, 7, 400, 3, 253, 5773, 2, 6000, 2162, 16041, 3, 17592, 2486, 79291, 110, 23, 31, 15031, 6, 205048, 17271, 5, 2, 3159, 9, 2, 227, 4, 108, 3, 666, 924, 3, 205049, 36, 2370, 11, 503, 4109, 4, 2, 924, 11, 4855, 162, 2, 269, 3, 5584, 10, 842, 28, 545, 19, 162, 17271, 13, 62, 477, 205050, 1447, 13, 2487, 40971, 4, 11225, 6, 2, 975, 5827, 164, 199, 3081, 20, 10720, 251, 10, 468, 13, 27460, 1097, 4, 205051, 236, 34, 2, 1052, 4, 36, 11, 2291, 
227, 3, 1030, 6, 2113, 32432, 10, 56, 1743, 6, 2, 924, 145, 2, 1240, 2, 755, 10, 438, 13, 22722, 3, 2754, 64579, 673, 227, 860, 205052, 13630, 8001, 3187, 17, 4, 93, 3, 2, 24689, 16436, 8237, 4, 11091, 11, 964, 5, 5, 1, 5364, 76, 1766, 6213, 5364, 76, 354, 2819, 2170, 8, 8956, 79293, 258, 15, 391, 340, 2, 17, 11, 22185, 27, 7, 421, 205054, 4, 2, 2946, 11, 3710, 3273, 128, 354, 925, 166, 239, 2, 64580, 24, 259, 50, 52, 23207, 37, 1152, 6294, 5334, 19719, 42, 2, 24689, 16436, 48, 5, 2, 369, 31, 33, 52, 340, 1304, 9, 117, 569, 6, 652, 2780, 5, 3590, 4207, 652, 7872, 162, 3563, 5, 30488, 6424, 205055, 652, 160, 162, 1642, 5307, 5, 40972, 110, 90, 2722, 2, 8237, 55, 44, 652, 160, 935, 5788, 1025, 9, 303, 205056, 3187, 4, 2, 24689, 16436, 8237, 4, 11091, 11, 3563, 6, 10611, 23208, 590, 354, 236, 34, 2, 17, 11, 671, 8, 7, 258, 205057, 3382, 3, 7, 1596, 3, 2819, 4, 1, 5364, 76, 19394, 128, 354, 9, 77, 14886, 11, 182, 1377, 6, 549, 8363, 10773, 2845, 2, 303, 4567, 205059, 15679, 1775, 75, 7, 1639, 4017, 22721, 1039, 15679, 1775, 75, 29, 1474, 128, 178, 64578, 444, 19, 7, 349, 3, 1702, 4, 1039, 12480, 9, 4, 1428, 2, 496, 3, 2, 573, 11, 2776, 124, 20, 3208, 205060, 3, 79294, 205061, 141, 3, 1702, 4, 1039, 20, 2, 8001, 17, 10, 392, 13, 3979, 3, 205062, 79294, 2029, 15, 391, 340, 13, 23, 666, 4046, 203, 133, 1329, 2, 24, 21423, 11, 1793, 2288, 8, 3400, 7177, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 205063, 1146, 236, 34, 2, 17, 11, 1295, 9, 466, 8, 2952, 4, 205064, 52551, 1146, 2, 15476, 17, 11, 1013, 8, 1, 1775, 192, 3606, 1, 1775, 1, 5, 9067, 205067, 2595, 4, 1631, 475, 17846, 215, 10, 1008, 37, 34778, 1802, 211, 119, 409, 15, 391, 340, 2, 25, 3, 315, 54, 10, 503, 8, 1631, 1000, 28818, 1351, 847, 1841, 37, 1608, 14, 11, 1415, 5178, 205068, 2054, 2, 54, 1157, 8, 2, 13926, 116, 3, 2, 1608, 22, 1776, 64581, 310, 1057, 92, 157, 215, 15679, 1775, 165, 10, 4504, 6, 1741, 37, 205069, 1741, 765, 700, 4135, 2054, 2, 1257, 3, 2, 1741, 10, 103, 37, 2, 52552, 4221, 
4398, 1119, 20545, 4135, 2054, 8, 15679, 1775, 1, 4, 15679, 1775, 75, 1608, 5, 7, 157, 334, 3, 15679, 1775, 192, 5, 7, 17847, 1631, 416, 68039, 2054, 1257, 10, 3011, 13, 4398, 1119, 1644, 205071, 2750, 3, 7, 76, 72614, 451, 13, 1398, 3, 6967, 27, 1, 1, 68, 9, 205072, 27, 1, 1, 68, 9, 76, 4, 2558, 27, 1, 1, 68, 9, 76, 34, 3471, 1852, 205073, 2, 873, 71, 11, 1449, 6, 15679, 3087, 2209, 25, 503, 13, 2, 205074, 2, 2051, 1398, 29395, 9, 15679, 3087, 2209, 4, 2, 873, 7628, 29395, 9, 2, 218, 54, 27461, 5, 1829, 205075, 762, 64582, 762, 42, 10, 103, 15, 391, 340, 2, 24, 10288, 11, 5002, 5, 7370, 77745, 9961, 194, 3980, 3298, 1161, 2170, 205076, 2588, 444, 8213, 4, 2046, 205077, 3141, 3582, 2331, 9902, 4603, 10, 19148, 4, 2, 6968, 10, 3358, 6, 3943, 367, 12686, 205078, 11, 2001, 37, 2, 10774, 2542, 59, 194, 700, 205079, 205080, 6, 3642, 122, 11, 296, 4, 16601, 2, 59, 2648, 11, 2387, 205081, 205082, 77746, 6487, 1415, 21, 4135, 2054, 4, 2204, 11548, 15679, 1775, 75, 205083, 6858, 12582, 640, 37, 205084, 2464, 2, 205085, 2799, 5, 22723, 2254, 17593, 590, 20313, 9669, 4, 205086, 258, 3418, 7322, 9, 466, 27, 1219, 931, 2, 1931, 11, 205087, 27, 1, 1, 68, 8, 2, 188, 411, 3151, 5, 9326, 2172, 258, 3418, 205088, 11, 1415, 21, 1729, 18512, 4, 11, 1242, 390, 402, 202, 4, 298, 1634, 24, 280, 13, 446, 42, 721, 6, 108, 5, 618, 1949, 2, 619, 3, 2, 411, 1157, 8, 91, 13926, 994, 22, 98, 5, 2407, 205089, 4428, 661, 9, 2604, 96, 8957, 5, 9669, 2, 1931, 11, 671, 2663, 9327, 19, 8958, 793, 411, 9, 466, 27, 1219, 72615, 5699, 11, 3568, 37, 2, 10786, 56245, 446, 7018, 1364, 2337, 205091, 205092, 1054, 11, 2851, 27, 28, 15679, 1775, 75, 4, 28819, 5, 40973, 218, 6620, 205093, 5, 7, 205094, 9, 466, 2, 1054, 11, 2659, 6, 22724, 6, 1219, 931, 205095, 105, 9669, 9, 466, 2, 188, 562, 48, 5, 23, 31, 1157, 8, 205096, 4, 12766, 994, 22, 1776, 5, 2407, 92, 2, 188, 562, 11, 205097, 8067, 4060, 4877, 17848, 923, 9, 2559, 19, 356, 188, 562, 11853, 931, 9, 466, 2483, 11149, 8067, 10, 48, 9, 205098, 323, 10, 
103, 9, 28, 5585, 27, 1219, 931, 37, 205099, 8067, 2404, 11, 10612, 5, 9067, 1519, 10927, 5, 6412, 2952, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 205100, 5, 13146, 4, 205101, 93, 4961, 23209, 2, 401, 4, 1335, 205102, 5, 2, 205103, 7473, 450, 11, 103, 5, 3954, 4, 2, 60, 22, 238, 15, 2, 209, 65, 205104, 341, 11, 103, 37, 2811, 205105, 564, 724, 37, 3012, 1382, 8, 8772, 5364, 76, 19, 37069, 5364, 76, 6052, 795, 9, 64, 25, 1022, 205106, 20, 2, 7628, 480, 2051, 15679, 1, 464, 250, 14, 11, 804, 21, 2676, 6, 15679, 3087, 2209, 71, 646, 563, 1313, 2, 123, 3, 731, 10, 205107, 3, 27459, 4, 22721, 20, 2, 799, 4, 2468, 205108, 3187, 4, 2877, 7239, 8001, 3187, 4, 2877, 17, 8237, 4, 11091, 11, 185, 8, 549, 2845, 205109, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 15, 26179, 38, 1078, 28, 3074, 126, 10, 56, 467, 5, 2, 2468, 3, 2, 8001, 3187, 17, 8, 205111, 126, 10, 7, 467, 5, 2, 2468, 3, 2, 8237, 4, 11091, 17, 162, 6042, 1702, 38, 6457, 4, 18321, 4, 12480, 38, 2771, 4, 25742, 2, 17, 8364, 83, 205112, 205113, 4, 24330, 2, 3523, 17, 3, 2, 1572, 126, 10, 7, 560, 697, 116, 3, 8001, 3187, 17, 185, 8, 1039, 4, 12480, 162, 73, 6, 2, 19912, 8, 1702, 676, 19, 8, 2845, 38, 30489, 126, 10, 44, 7, 560, 5, 2, 116, 205114, 17, 162, 185, 8, 1702, 4, 12480, 162, 73, 6, 2, 2845, 185, 17, 205115, 126, 10, 56, 109, 560, 5, 2, 116, 3, 8237, 17, 5, 224, 3, 2, 25743, 38, 29396, 29, 1096, 234, 2, 1103, 3, 6680, 17, 5, 2, 185, 169, 1794, 560, 5, 24, 116, 73, 6, 2, 2845, 63, 129, 26, 201, 6, 1103, 3, 205116, 102, 483, 3, 205117, 3, 1639, 228, 4, 342, 318, 20, 2, 25, 205118, 17594, 386, 159, 1639, 5765, 4, 8002, 695, 7, 114, 5, 2, 5346, 3, 52553, 1593, 3, 1572, 16, 90, 118, 57, 1447, 13, 64583, 33, 115, 205119, 228, 3, 1639, 8, 4749, 318, 3, 342, 809, 5, 144, 88, 5989, 17, 828, 2, 1685, 3, 2, 1639, 4017, 5, 13631, 31, 30, 613, 2, 156, 3, 2, 1639, 4017, 1702, 4, 29, 342, 444, 205120, 2, 25, 3, 2576, 1120, 159, 5, 2, 8001, 3187, 17, 4, 2, 
205121, 25, 71, 9, 2, 218, 54, 5, 23, 31, 22, 98, 9, 7, 670, 670, 4, 670, 22725, 9, 2, 3187, 8237, 4, 11091, 17, 1057, 1829, 4, 2407, 3238, 219, 9, 33363, 2, 670, 64, 4, 59, 71, 22, 98, 5, 2, 641, 420, 3, 2, 205122, 8, 1702, 102, 2, 25, 3, 1639, 5, 2, 8001, 3187, 38, 1487, 193, 1375, 64584, 24, 259, 7, 320, 156, 10, 968, 5, 8237, 38, 1997, 193, 1375, 4, 584, 4, 11091, 38, 2606, 193, 1375, 4, 205123, 280, 46, 8, 1039, 301, 35, 764, 2, 25, 3, 1639, 5, 2, 8001, 205124, 1487, 1375, 4, 584, 19, 8237, 38, 1997, 1375, 4, 584, 17, 111, 126, 10, 7, 293, 247, 5, 1639, 59, 5, 2, 11091, 17, 38, 2606, 1375, 4, 584, 46, 8, 101, 1702, 4, 1039, 12480, 12408, 25, 71, 3, 1639, 268, 5, 2, 8001, 3187, 17, 111, 126, 10, 56, 247, 697, 59, 71, 126, 10, 56, 247, 5, 268, 25, 5, 2, 11091, 17, 111, 126, 10, 205125, 247, 5, 2, 59, 71, 38, 2606, 193, 1375, 4, 584, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 12244, 2468, 4, 799, 3, 8001, 3187, 4, 2877, 17, 2, 8001, 3187, 1323, 4, 2877, 17, 8237, 19395, 4, 11091, 1650, 11, 185, 30490, 2845, 104, 27459, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 2, 1812, 11, 103, 5, 205126, 2, 250, 98, 22, 209, 789, 3, 104, 65, 2061, 13227, 3012, 1382, 10, 103, 451, 13, 8772, 5364, 76, 6052, 170, 2227, 79295, 4136, 472, 109, 339, 40, 1781, 205127, 64, 4, 59, 25, 5, 2, 8001, 3187, 17, 185, 8, 1702, 40974, 38, 1487, 456, 1375, 4, 8159, 107, 2, 25, 10, 102, 5, 2, 8237, 4, 64585, 38, 1997, 456, 1375, 4, 8159, 4, 2606, 456, 1375, 4, 8159, 27, 2, 59, 123, 46, 8, 1039, 205128, 59, 5, 2, 3187, 17, 111, 2, 71, 11, 570, 5, 2, 8237, 17, 79296, 2, 25, 3, 5765, 5, 2, 8001, 3187, 17, 111, 53, 102, 2, 25, 205129, 2, 11091, 17, 27, 2, 59, 205130, 3, 2, 8001, 3187, 17, 8, 1702, 1039, 19, 12480, 301, 35, 247, 2, 12338, 3, 8002, 111, 126, 10, 29, 247, 5, 2, 59, 71, 51, 46, 8, 1702, 205131, 162, 73, 6, 2, 2845, 185, 63, 38, 1487, 720, 1375, 4, 8520, 5, 8237, 4, 64585, 126, 10, 29, 7258, 578, 3, 8002, 59, 13, 37, 2, 536, 
12480, 46, 38, 1997, 1375, 4, 8520, 4, 2606, 1375, 4, 8520, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 16, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 12244, 25, 3, 2576, 541, 5, 8001, 3187, 4, 2877, 17, 2, 8001, 3187, 205132, 8237, 79297, 4, 11091, 79298, 14700, 8, 549, 2845, 104, 27459, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 2832, 133, 79299, 10, 103, 6, 2693, 64, 25, 7, 70, 205133, 446, 762, 42, 10, 48, 6, 1200, 59, 71, 7, 70, 16965, 4, 205134, 4670, 1023, 36974, 3, 96, 59, 2974, 10, 422, 7, 70, 205135, 64, 25, 10, 1449, 6, 15679, 3087, 2209, 4, 64, 11639, 22, 3981, 15, 4559, 374, 6, 2, 2845, 104, 1257, 10, 1755, 3189, 71, 9, 1639, 5, 2845, 8237, 696, 205136, 480, 2051, 15679, 1, 464, 225, 3, 10, 2326, 34, 12, 132, 69, 2, 918, 7628, 464, 249, 9, 1639, 5, 14, 24, 205137, 1812, 3, 64, 4, 59, 32, 11, 103, 4, 22, 98, 15, 209, 65, 2061, 13227, 3012, 1382, 5044, 451, 13, 8772, 5364, 76, 6052, 170, 2227, 8, 9765, 4136, 472, 109, 339, 40, 1781, 205138, 3, 1702, 4, 1039, 20, 2, 809, 4, 25, 71, 3, 205139, 156, 3, 1702, 4, 2, 342, 444, 1039, 10, 392, 20, 2, 25, 4, 809, 3, 342, 5, 2, 8001, 3187, 4, 2877, 17, 125, 2, 349, 3, 79296, 2, 25, 3, 342, 268, 5, 2, 8001, 3187, 17, 107, 49, 166, 926, 270, 2, 59, 71, 38, 1342, 193, 456, 4, 720, 126, 10, 56, 2277, 809, 40975, 5, 2, 8001, 3187, 17, 4, 2143, 3, 2, 926, 84, 224, 156, 20, 2, 809, 71, 38, 1342, 456, 2, 25, 3, 342, 5, 2, 2877, 17, 2377, 8, 7, 560, 205140, 17, 4, 29, 247, 5, 11091, 17, 38, 2362, 193, 456, 4, 1375, 4, 38, 3275, 193, 456, 4, 1375, 68040, 2, 4480, 24, 280, 84, 2277, 809, 3, 2, 342, 39820, 4, 46, 8, 1039, 270, 2, 39820, 71, 5, 8237, 38, 2362, 456, 4, 720, 4, 11091, 38, 3275, 456, 31913, 34, 792, 14, 2, 1039, 46, 10, 205141, 3, 2, 1639, 4017, 4, 318, 342, 809, 20, 2600, 3, 205142, 103, 13, 64583, 33, 115, 14, 318, 3, 2, 342, 8, 4749, 228, 3, 1639, 142, 18972, 144, 88, 5989, 17, 21, 7, 632, 167038, 1132, 6, 7, 11854, 1422, 1005, 8, 
2, 5374, 205143, 4, 2, 1029, 3, 31030, 4, 35901, 241, 30, 18513, 6, 68041, 2, 8001, 3187, 4, 2, 2877, 17, 502, 31914, 83, 21, 7, 2277, 1132, 6, 10613, 205144, 1132, 162, 185, 8, 1702, 4, 1039, 2, 10289, 32, 12, 115, 5, 38, 4, 2, 59, 25, 32, 12, 115, 5, 38, 9, 79300, 3187, 17, 126, 10, 7, 560, 5, 25, 3, 35898, 20546, 4, 21086, 8, 205145, 38, 2005, 193, 456, 4, 8159, 4, 38, 2510, 193, 456, 4, 1375, 2, 59, 9, 35898, 10, 205146, 2, 8001, 3187, 17, 2, 25, 3, 205147, 4, 31027, 102, 8, 1702, 205148, 8, 1039, 4, 12480, 38, 2005, 720, 1375, 584, 4, 8520, 4, 38, 2510, 720, 4, 584, 2, 205149, 956, 35, 4013, 39, 2, 205150, 57426, 4, 205151, 2114, 4, 8838, 59, 372, 13, 47, 166, 54, 276, 53, 12, 35, 402, 34, 2310, 12, 205152, 27, 2, 59, 123, 126, 10, 7, 560, 5, 25, 3, 31030, 5, 2, 64580, 17, 38, 2005, 15896, 4, 38, 2510, 193, 4, 8159, 5, 2, 8237, 17, 2, 25, 71, 3, 205153, 64586, 8, 315, 926, 10, 320, 6, 2, 8001, 3187, 17, 38, 2601, 79301, 14080, 3190, 205154, 8, 2, 6366, 3, 35898, 59, 34, 10, 238, 5, 2, 8237, 17, 10397, 25, 270, 8, 1702, 4, 12480, 46, 5, 2, 11091, 17, 2, 25, 501, 64586, 10, 320, 6, 8237, 8, 2, 6366, 3, 31027, 38, 3148, 205155, 46, 79302, 270, 2, 25, 3, 31027, 2, 59, 71, 9, 2, 49, 2, 64586, 10, 11701, 2, 268, 123, 8, 2, 6366, 3, 20546, 31030, 4, 31027, 20546, 89, 9996, 5, 25, 8, 1039, 4, 12480, 46, 4, 31027, 34, 89, 9996, 5, 25, 8, 1039, 38, 5242, 79301, 5, 2, 8237, 4, 11091, 17, 31030, 4047, 10, 296, 107, 2, 59, 71, 11, 570, 21, 49, 166, 205156, 2601, 15896, 38, 3148, 15896, 4, 38, 3190, 193, 8520, 38, 5242, 193, 8520, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 12244, 25, 4, 809, 3, 1474, 128, 178, 245, 2, 8001, 3187, 205157, 8237, 205158, 4, 11091, 205159, 14700, 8, 549, 2845, 104, 27459, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 2832, 133, 79299, 10, 103, 6, 2693, 64, 25, 7, 70, 271, 446, 762, 42, 10, 48, 6, 1200, 59, 71, 7, 70, 205160, 4, 2, 79303, 1023, 36974, 3, 96, 59, 2974, 10, 422, 205161, 
70, 4, 205162, 4, 1375, 205163, 10, 35, 249, 5, 2, 8001, 205164, 259, 64, 25, 10, 1449, 6, 15679, 3087, 2209, 4, 64, 4, 59, 22, 3981, 15, 4559, 374, 6, 2, 2845, 104, 205165, 3, 64, 4, 59, 32, 11, 103, 4, 22, 98, 15, 209, 65, 2061, 13227, 3012, 1382, 10, 103, 451, 1, 5364, 76, 6052, 170, 2227, 8, 9765, 4136, 472, 109, 339, 40, 1781, 205167, 3, 1639, 228, 4, 342, 318, 20, 25, 38703, 159, 3876, 4, 19718, 4, 2, 1824, 205168, 8, 632, 205169, 19718, 50, 52, 2524, 5, 2, 167, 3, 632, 483, 205170, 240, 4, 228, 3, 1639, 12, 115, 6, 7822, 2, 25, 3, 19718, 30, 241, 613, 2, 141, 3, 1639, 228, 4, 342, 318, 20, 2, 25, 3, 19718, 5, 8001, 3187, 4, 2877, 17, 74, 60, 1368, 14, 1702, 205171, 59, 237, 2, 3187, 4, 8237, 17, 179, 2, 268, 4, 59, 10, 33364, 2, 11091, 17, 21, 1702, 327, 46, 8, 1039, 15, 184, 15, 12480, 270, 2, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 16, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 12244, 64, 25, 3, 28127, 2, 8001, 3187, 205172, 8237, 205173, 4, 11091, 79304, 11, 185, 30490, 2845, 104, 27459, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 2832, 205174, 42, 10, 103, 6, 2693, 64, 25, 7, 70, 79304, 64, 25, 10, 1449, 6, 15679, 3087, 2209, 13331, 3981, 15, 4559, 374, 6, 2, 2845, 104, 3954, 1812, 3, 64, 71, 11, 103, 4, 27461, 15, 209, 65, 2061, 13227, 3012, 1382, 10, 103, 451, 13, 8772, 5364, 76, 6052, 170, 2227, 79295, 4136, 472, 109, 339, 40, 1781, 205175, 3, 19718, 5, 2, 8001, 3187, 38, 3749, 193, 1375, 4, 584, 8237, 38, 5161, 193, 1375, 4, 584, 79292, 17, 38, 5324, 193, 1375, 4, 205176, 178, 3876, 12, 568, 59, 79, 8, 88, 1572, 205177, 5, 35899, 541, 90, 118, 126, 10, 7, 560, 5, 2, 25, 3, 3876, 697, 8001, 3187, 17, 8, 49, 926, 38, 3749, 456, 1375, 4, 8159, 5, 2, 8237, 17, 2, 25, 3, 3876, 10, 186, 4, 46, 8, 1702, 102, 117, 25, 107, 46, 79305, 4, 12480, 270, 117, 25, 38, 5161, 456, 1375, 4, 8159, 2, 25, 3, 3876, 5, 64585, 102, 27, 2, 268, 123, 8, 1702, 4, 12480, 926, 107, 2, 59, 12687, 
270, 162, 73, 6, 2, 2845, 104, 38, 5324, 456, 1375, 4, 205178, 25, 3, 25741, 7, 64, 79, 8, 2, 2277, 64, 25, 1573, 10, 44, 392, 5, 2, 8001, 3187, 4, 2, 24689, 17, 9, 2, 64580, 4, 8237, 17, 126, 10, 7, 560, 5, 2, 25, 3, 25741, 5, 17, 185, 79305, 4, 12480, 38, 3749, 4, 5161, 720, 1375, 4, 8520, 9, 11091, 1039, 4, 12480, 926, 205179, 59, 38, 5324, 1375, 4, 205180, 3, 25741, 19718, 4, 3876, 237, 160, 2706, 205181, 8237, 11091, 4, 35900, 205182, 8237, 4, 11091, 22, 291, 6, 26, 3, 2, 2277, 211, 1593, 179, 35900, 26549, 291, 6, 26, 3, 2, 2576, 211, 1593, 3, 1572, 16, 17, 241, 205183, 6, 1845, 2, 5, 434, 25, 3, 25741, 19718, 4, 3876, 237, 2, 79306, 35899, 1201, 3, 160, 9615, 21, 8001, 8237, 4, 11091, 17, 39821, 6, 2, 25, 5, 160, 9615, 21, 2, 2576, 35900, 17, 49, 166, 4856, 198, 11, 955, 237, 2, 79306, 1201, 3, 160, 598, 21, 79300, 8237, 4, 11091, 17, 38, 7941, 9566, 12024, 13810, 23554, 4, 18322, 2016, 7, 246, 25, 7598, 5, 2, 9903, 1201, 3, 2, 8001, 8237, 4, 11091, 160, 38, 7941, 205184, 13810, 23554, 4, 1, 1, 8839, 126, 10, 186, 6, 56, 323, 5, 2, 35900, 17, 9, 205186, 4, 3876, 38, 10186, 15343, 4, 205187, 634, 3, 1929, 105, 315, 1239, 50, 2381, 5, 2, 216, 113, 575, 3, 2, 58, 8, 2, 2277, 1593, 1234, 7, 1835, 588, 162, 73, 6, 52553, 1593, 2, 211, 428, 275, 5, 2, 167, 3, 47, 205188, 35, 1694, 907, 107, 2, 114, 3, 213, 1120, 159, 12, 907, 205189, 7873, 29, 223, 114, 5, 2, 228, 3, 2, 2576, 281, 54, 90, 118, 205190, 7873, 7, 114, 5, 2, 167, 3, 2, 2277, 1593, 3, 1929, 5, 207, 30491, 144, 88, 5989, 17, 571, 14, 228, 3, 1639, 8, 29, 4017, 1157, 68042, 3, 367, 172, 231, 2, 342, 131, 55, 6425, 17, 21, 7, 205191, 1132, 6, 7, 83, 11854, 1422, 1132, 349, 205192, 342, 547, 4, 1639, 3818, 571, 1438, 60, 390, 213, 205193, 5, 434, 15, 184, 15, 390, 66, 240, 124, 20, 47, 57, 30, 4176, 9904, 226, 2, 8001, 3187, 4, 2, 24689, 8001, 24, 280, 14, 22, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 16, 72, 1781, 18511, 455, 72, 18133, 3, 2576, 131, 5, 2277, 1572, 12244, 59, 25, 3, 
28127, 2, 8001, 3187, 205194, 8237, 79297, 4, 11091, 79298, 11, 185, 8, 549, 2845, 104, 27459, 1702, 15679, 1775, 75, 22721, 1039, 15679, 1775, 75, 19, 1702, 4, 1039, 12480, 9, 670, 446, 762, 42, 10, 48, 6, 1200, 59, 71, 4845, 8214, 271, 4, 2, 79303, 1023, 36974, 3, 96, 59, 2974, 10, 422, 205195, 70, 4, 205196, 59, 71, 22, 3981, 15, 4559, 374, 6, 2, 2845, 205197, 1812, 3, 59, 32, 11, 103, 4, 22, 98, 15, 209, 65, 2061, 13227, 3012, 1382, 10, 103, 451, 13, 8772, 5364, 1, 170, 2227, 8, 9765, 4136, 472, 109, 339, 40, 1781, 205199, 1304, 15, 7, 2277, 1593, 3, 552, 129, 11226, 6, 7, 2576, 205200, 204, 162, 185, 8, 7, 1639, 4017, 822, 29, 342, 444, 15, 23, 129, 922, 17272, 588, 3, 2, 14570, 8001, 3187, 17, 162, 3563, 5, 258, 9037, 319, 54, 14, 22, 2231, 2, 2277, 1593, 5, 23, 31, 46, 3, 47, 17, 8, 2, 1639, 4017, 205201, 2, 25, 3, 1639, 15, 184, 15, 8002, 111, 35, 5765, 5, 19720, 88, 5989, 17, 46, 8, 2, 1639, 4017, 603, 29, 247, 5, 2, 25, 3, 2, 386, 178, 5765, 4, 23, 12, 5423, 6, 1983, 12, 968, 5, 74, 31, 47, 72616, 26, 201, 6, 2, 24, 204, 645, 30, 22, 37, 9530, 17, 2, 205202, 17, 89, 29, 997, 5, 2, 25, 3, 1639, 4, 5765, 162, 185, 79302, 107, 2, 25, 3, 8002, 5, 47, 4480, 280, 10, 35, 704, 5, 40976, 280, 57, 13, 1512, 33, 115, 14, 33, 115, 14, 8002, 4, 5765, 40977, 1639, 228, 6, 3260, 2, 483, 3, 7, 2277, 1572, 16, 1593, 6, 7, 205203, 2576, 1593, 66, 57, 33, 44, 1154, 2, 25, 3, 5765, 1635, 167, 3, 2, 2576, 1593, 3, 5989, 16, 309, 5, 74, 57, 9182, 1702, 301, 433, 5, 2, 483, 3, 2, 24689, 17, 124, 935, 3528, 295, 4, 380, 2, 25, 3, 1639, 15, 184, 15, 5765, 101, 3, 34, 22, 205204, 6, 3260, 2576, 483, 3, 1572, 16, 24, 205205, 231, 2, 1639, 131, 12, 992, 9, 2, 128, 2757, 4, 3734, 483, 3, 4208, 4, 66, 144, 423, 309, 4, 16, 17, 309, 205206, 231, 2, 342, 245, 778, 7, 114, 5, 24, 172, 2, 28127, 695, 29, 42494, 5, 2, 483, 3, 423, 17, 4, 97, 21422, 1411, 22, 238, 27, 97, 946, 3, 483, 2, 144, 1572, 50, 166, 97, 946, 3, 24690, 47, 946, 22, 2350, 13, 2, 25, 3, 21086, 20546, 4, 35901, 21086, 
17132, 5, 7, 2109, 3, 2277, 17, 14, 22, 2713, 6, 695, 7, 114, 5, 6070, 15, 184, 205207, 663, 66, 2277, 17, 4, 3523, 17, 5, 2, 144, 205208, 20546, 2, 25, 3, 35901, 12, 3119, 6, 2, 106, 1422, 17, 205209, 17, 47, 483, 946, 3, 2, 1572, 22, 1937, 13, 1572, 25744, 508, 2550, 55, 1562, 5, 224, 3, 2, 97, 24, 324, 3, 2, 1572, 2600, 3, 21086, 12, 968, 5, 2, 497, 3, 2, 1422, 160, 4, 117, 25, 3162, 6, 405, 266, 21086, 663, 2, 25, 3, 35901, 12, 3119, 6, 1422, 1572, 240, 4, 117, 25, 12, 79, 8, 878, 266, 7, 31, 77747, 64583, 89, 14, 144, 88, 5989, 17, 5, 763, 2299, 21086, 205210, 2, 25, 3, 31030, 4, 35901, 228, 3, 1639, 5, 47, 17, 380, 2600, 3, 31030, 4, 270, 2, 25, 3, 21086, 23, 156, 10, 77748, 162]\n"
|
| 482 |
+
]
|
| 483 |
+
}
|
| 484 |
+
],
|
| 485 |
+
"source": [
|
| 486 |
+
"def tokenize(text):\n",
|
| 487 |
+
" return text.lower().split()\n",
|
| 488 |
+
"\n",
|
| 489 |
+
"word_counts = Counter()\n",
|
| 490 |
+
"for text in X_train:\n",
|
| 491 |
+
" word_counts.update(tokenize(text))\n",
|
| 492 |
+
"\n",
|
| 493 |
+
"sorted_words = [word for word, _ in word_counts.most_common()]\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"vocab = {\"<pad>\": 0, \"<unk>\": 1}\n",
|
| 496 |
+
"vocab.update({word: idx + 2 for idx, word in enumerate(sorted_words)})\n",
|
| 497 |
+
"\n",
|
| 498 |
+
"tokenizer = Tokenizer(WordLevel(vocab, unk_token=\"<unk>\"))\n",
|
| 499 |
+
"tokenizer.pre_tokenizer = Whitespace()\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"def text_to_sequence(texts):\n",
|
| 502 |
+
" return [tokenizer.encode(text.lower()).ids for text in texts]\n",
|
| 503 |
+
"\n",
|
| 504 |
+
"X_train_seq = text_to_sequence(X_train)\n",
|
| 505 |
+
"X_test_seq = text_to_sequence(X_test)\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"# Display the first sequence\n",
|
| 508 |
+
"print(X_train_seq[0])\n"
|
| 509 |
+
]
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"cell_type": "markdown",
|
| 513 |
+
"id": "a5fbedc8",
|
| 514 |
+
"metadata": {
|
| 515 |
+
"id": "a5fbedc8",
|
| 516 |
+
"papermill": {
|
| 517 |
+
"duration": 0.011214,
|
| 518 |
+
"end_time": "2024-08-08T15:26:35.539595",
|
| 519 |
+
"exception": false,
|
| 520 |
+
"start_time": "2024-08-08T15:26:35.528381",
|
| 521 |
+
"status": "completed"
|
| 522 |
+
},
|
| 523 |
+
"tags": []
|
| 524 |
+
},
|
| 525 |
+
"source": [
|
| 526 |
+
"## 6.Sequence Padding and Length Adjustment"
|
| 527 |
+
]
|
| 528 |
+
},
|
| 529 |
+
{
|
| 530 |
+
"cell_type": "code",
|
| 531 |
+
"execution_count": 12,
|
| 532 |
+
"id": "j8v22pCcMkKW",
|
| 533 |
+
"metadata": {
|
| 534 |
+
"colab": {
|
| 535 |
+
"base_uri": "https://localhost:8080/"
|
| 536 |
+
},
|
| 537 |
+
"id": "j8v22pCcMkKW",
|
| 538 |
+
"outputId": "c6f7c315-fe5a-436e-9b3c-de0e32aec153"
|
| 539 |
+
},
|
| 540 |
+
"outputs": [
|
| 541 |
+
{
|
| 542 |
+
"name": "stdout",
|
| 543 |
+
"output_type": "stream",
|
| 544 |
+
"text": [
|
| 545 |
+
"Padded Training Sequences:\n",
|
| 546 |
+
"tensor([[ 0, 0, 0, ..., 10, 77748, 162],\n",
|
| 547 |
+
" [ 0, 0, 0, ..., 67, 1040, 1383],\n",
|
| 548 |
+
" [ 0, 0, 0, ..., 4300, 1, 1383],\n",
|
| 549 |
+
" ...,\n",
|
| 550 |
+
" [ 0, 0, 0, ..., 2942, 2742, 1383],\n",
|
| 551 |
+
" [ 0, 0, 0, ..., 98495, 14944, 58599],\n",
|
| 552 |
+
" [ 0, 0, 0, ..., 28107, 1040, 29357]])\n",
|
| 553 |
+
"\n",
|
| 554 |
+
"Padded Testing Sequences:\n",
|
| 555 |
+
"tensor([[ 0, 0, 0, ..., 19, 1729, 1383],\n",
|
| 556 |
+
" [ 0, 0, 0, ..., 32, 42, 1383],\n",
|
| 557 |
+
" [ 0, 0, 0, ..., 15124, 6281, 3539],\n",
|
| 558 |
+
" ...,\n",
|
| 559 |
+
" [ 0, 0, 0, ..., 86, 8647, 1383],\n",
|
| 560 |
+
" [ 0, 0, 0, ..., 4, 217, 194767],\n",
|
| 561 |
+
" [ 0, 0, 0, ..., 155, 4, 3127]])\n"
|
| 562 |
+
]
|
| 563 |
+
}
|
| 564 |
+
],
|
| 565 |
+
"source": [
|
| 566 |
+
"max_len = max([len(seq) for seq in X_train_seq])\n",
|
| 567 |
+
"\n",
|
| 568 |
+
"X_train_seq = [torch.tensor(seq) for seq in X_train_seq]\n",
|
| 569 |
+
"X_test_seq = [torch.tensor(seq) for seq in X_test_seq]\n",
|
| 570 |
+
"\n",
|
| 571 |
+
"X_train_pad = pad_sequence(X_train_seq, batch_first=True, padding_value=0,padding_side=\"left\")\n",
|
| 572 |
+
"X_test_pad = pad_sequence(X_test_seq, batch_first=True, padding_value=0,padding_side=\"left\")\n",
|
| 573 |
+
"\n",
|
| 574 |
+
"# Output\n",
|
| 575 |
+
"print(\"Padded Training Sequences:\")\n",
|
| 576 |
+
"print(X_train_pad)\n",
|
| 577 |
+
"\n",
|
| 578 |
+
"print(\"\\nPadded Testing Sequences:\")\n",
|
| 579 |
+
"print(X_test_pad)\n"
|
| 580 |
+
]
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"cell_type": "markdown",
|
| 584 |
+
"id": "f0281378",
|
| 585 |
+
"metadata": {
|
| 586 |
+
"execution": {
|
| 587 |
+
"iopub.execute_input": "2024-08-08T13:32:07.711479Z",
|
| 588 |
+
"iopub.status.busy": "2024-08-08T13:32:07.710553Z",
|
| 589 |
+
"iopub.status.idle": "2024-08-08T13:32:07.717413Z",
|
| 590 |
+
"shell.execute_reply": "2024-08-08T13:32:07.716327Z",
|
| 591 |
+
"shell.execute_reply.started": "2024-08-08T13:32:07.711437Z"
|
| 592 |
+
},
|
| 593 |
+
"id": "f0281378",
|
| 594 |
+
"papermill": {
|
| 595 |
+
"duration": 0.011855,
|
| 596 |
+
"end_time": "2024-08-08T15:26:37.668786",
|
| 597 |
+
"exception": false,
|
| 598 |
+
"start_time": "2024-08-08T15:26:37.656931",
|
| 599 |
+
"status": "completed"
|
| 600 |
+
},
|
| 601 |
+
"tags": []
|
| 602 |
+
},
|
| 603 |
+
"source": [
|
| 604 |
+
"## 7.One-Hot Encoding"
|
| 605 |
+
]
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"cell_type": "code",
|
| 609 |
+
"execution_count": 14,
|
| 610 |
+
"id": "pojEi95sTJrq",
|
| 611 |
+
"metadata": {
|
| 612 |
+
"colab": {
|
| 613 |
+
"base_uri": "https://localhost:8080/"
|
| 614 |
+
},
|
| 615 |
+
"id": "pojEi95sTJrq",
|
| 616 |
+
"outputId": "6e0564f1-cf89-4208-ce3b-31cef684e8bf"
|
| 617 |
+
},
|
| 618 |
+
"outputs": [
|
| 619 |
+
{
|
| 620 |
+
"name": "stdout",
|
| 621 |
+
"output_type": "stream",
|
| 622 |
+
"text": [
|
| 623 |
+
"One-Hot Encoded Training Labels:\n",
|
| 624 |
+
"tensor([[1, 0, 0],\n",
|
| 625 |
+
" [0, 0, 1],\n",
|
| 626 |
+
" [0, 1, 0],\n",
|
| 627 |
+
" ...,\n",
|
| 628 |
+
" [0, 1, 0],\n",
|
| 629 |
+
" [1, 0, 0],\n",
|
| 630 |
+
" [0, 0, 1]])\n",
|
| 631 |
+
"\n",
|
| 632 |
+
"One-Hot Encoded Testing Labels:\n",
|
| 633 |
+
"tensor([[0, 1, 0],\n",
|
| 634 |
+
" [0, 1, 0],\n",
|
| 635 |
+
" [0, 0, 1],\n",
|
| 636 |
+
" ...,\n",
|
| 637 |
+
" [0, 1, 0],\n",
|
| 638 |
+
" [0, 0, 1],\n",
|
| 639 |
+
" [0, 0, 1]])\n"
|
| 640 |
+
]
|
| 641 |
+
}
|
| 642 |
+
],
|
| 643 |
+
"source": [
|
| 644 |
+
"label_encoder = LabelEncoder()\n",
|
| 645 |
+
"y_train_ = label_encoder.fit_transform(y_train)\n",
|
| 646 |
+
"y_test_ = label_encoder.transform(y_test)\n",
|
| 647 |
+
"\n",
|
| 648 |
+
"# Convert to PyTorch tensors\n",
|
| 649 |
+
"y_train_tensor = torch.tensor(y_train_)\n",
|
| 650 |
+
"y_test_tensor = torch.tensor(y_test_)\n",
|
| 651 |
+
"num_classes = 3\n",
|
| 652 |
+
"\n",
|
| 653 |
+
"y_train_cat = F.one_hot(y_train_tensor, num_classes=num_classes)\n",
|
| 654 |
+
"y_test_cat = F.one_hot(y_test_tensor, num_classes=num_classes)\n",
|
| 655 |
+
"\n",
|
| 656 |
+
"# Output\n",
|
| 657 |
+
"print(\"One-Hot Encoded Training Labels:\")\n",
|
| 658 |
+
"print(y_train_cat)\n",
|
| 659 |
+
"\n",
|
| 660 |
+
"print(\"\\nOne-Hot Encoded Testing Labels:\")\n",
|
| 661 |
+
"print(y_test_cat)\n"
|
| 662 |
+
]
|
| 663 |
+
},
|
| 664 |
+
{
|
| 665 |
+
"cell_type": "markdown",
|
| 666 |
+
"id": "a8eb740b",
|
| 667 |
+
"metadata": {
|
| 668 |
+
"id": "a8eb740b",
|
| 669 |
+
"papermill": {
|
| 670 |
+
"duration": 0.012226,
|
| 671 |
+
"end_time": "2024-08-08T15:26:37.776293",
|
| 672 |
+
"exception": false,
|
| 673 |
+
"start_time": "2024-08-08T15:26:37.764067",
|
| 674 |
+
"status": "completed"
|
| 675 |
+
},
|
| 676 |
+
"tags": []
|
| 677 |
+
},
|
| 678 |
+
"source": [
|
| 679 |
+
"## 8. RNN Architecture"
|
| 680 |
+
]
|
| 681 |
+
},
|
| 682 |
+
{
|
| 683 |
+
"cell_type": "code",
|
| 684 |
+
"execution_count": 15,
|
| 685 |
+
"id": "a23abe59-d803-4a3d-b4be-fe07ca1aa250",
|
| 686 |
+
"metadata": {},
|
| 687 |
+
"outputs": [
|
| 688 |
+
{
|
| 689 |
+
"name": "stdout",
|
| 690 |
+
"output_type": "stream",
|
| 691 |
+
"text": [
|
| 692 |
+
"RNNModel(\n",
|
| 693 |
+
" (embedding): Embedding(224577, 128)\n",
|
| 694 |
+
" (rnn): RNN(128, 128, batch_first=True, dropout=0.2)\n",
|
| 695 |
+
" (fc): Linear(in_features=128, out_features=3, bias=True)\n",
|
| 696 |
+
")\n"
|
| 697 |
+
]
|
| 698 |
+
},
|
| 699 |
+
{
|
| 700 |
+
"name": "stderr",
|
| 701 |
+
"output_type": "stream",
|
| 702 |
+
"text": [
|
| 703 |
+
"/home/aravind/myenv/lib/python3.12/site-packages/torch/nn/modules/rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
|
| 704 |
+
" warnings.warn(\n"
|
| 705 |
+
]
|
| 706 |
+
}
|
| 707 |
+
],
|
| 708 |
+
"source": [
|
| 709 |
+
"vocab_size = len(tokenizer.get_vocab()) \n",
|
| 710 |
+
"embedding_dim = 128 \n",
|
| 711 |
+
"hidden_units = 128 \n",
|
| 712 |
+
"num_classes = 3 \n",
|
| 713 |
+
"max_len = max_len\n",
|
| 714 |
+
"\n",
|
| 715 |
+
"class RNNModel(nn.Module):\n",
|
| 716 |
+
" def __init__(self, vocab_size, embedding_dim, hidden_units, num_classes):\n",
|
| 717 |
+
" super(RNNModel, self).__init__()\n",
|
| 718 |
+
" self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
|
| 719 |
+
" self.rnn = nn.RNN(embedding_dim, hidden_units, batch_first=True, dropout=0.2)\n",
|
| 720 |
+
" self.fc = nn.Linear(hidden_units, num_classes)\n",
|
| 721 |
+
"\n",
|
| 722 |
+
" def forward(self, x):\n",
|
| 723 |
+
" x = self.embedding(x)\n",
|
| 724 |
+
" output, _ = self.rnn(x)\n",
|
| 725 |
+
" x = output[:, -1, :] \n",
|
| 726 |
+
" x = self.fc(x)\n",
|
| 727 |
+
" return F.softmax(x, dim=1)\n",
|
| 728 |
+
"\n",
|
| 729 |
+
"\n",
|
| 730 |
+
"model = RNNModel(vocab_size, embedding_dim, hidden_units, num_classes)\n",
|
| 731 |
+
"\n",
|
| 732 |
+
"print(model)"
|
| 733 |
+
]
|
| 734 |
+
},
|
| 735 |
+
{
|
| 736 |
+
"cell_type": "markdown",
|
| 737 |
+
"id": "99b14295",
|
| 738 |
+
"metadata": {
|
| 739 |
+
"id": "99b14295",
|
| 740 |
+
"papermill": {
|
| 741 |
+
"duration": 0.012565,
|
| 742 |
+
"end_time": "2024-08-08T15:26:38.661353",
|
| 743 |
+
"exception": false,
|
| 744 |
+
"start_time": "2024-08-08T15:26:38.648788",
|
| 745 |
+
"status": "completed"
|
| 746 |
+
},
|
| 747 |
+
"tags": []
|
| 748 |
+
},
|
| 749 |
+
"source": [
|
| 750 |
+
"## 9. Compile the Model"
|
| 751 |
+
]
|
| 752 |
+
},
|
| 753 |
+
{
|
| 754 |
+
"cell_type": "code",
|
| 755 |
+
"execution_count": 16,
|
| 756 |
+
"id": "QGbg8QVzulyP",
|
| 757 |
+
"metadata": {
|
| 758 |
+
"id": "QGbg8QVzulyP"
|
| 759 |
+
},
|
| 760 |
+
"outputs": [
|
| 761 |
+
{
|
| 762 |
+
"name": "stdout",
|
| 763 |
+
"output_type": "stream",
|
| 764 |
+
"text": [
|
| 765 |
+
"cuda\n"
|
| 766 |
+
]
|
| 767 |
+
},
|
| 768 |
+
{
|
| 769 |
+
"data": {
|
| 770 |
+
"text/plain": [
|
| 771 |
+
"RNNModel(\n",
|
| 772 |
+
" (embedding): Embedding(224577, 128)\n",
|
| 773 |
+
" (rnn): RNN(128, 128, batch_first=True, dropout=0.2)\n",
|
| 774 |
+
" (fc): Linear(in_features=128, out_features=3, bias=True)\n",
|
| 775 |
+
")"
|
| 776 |
+
]
|
| 777 |
+
},
|
| 778 |
+
"execution_count": 16,
|
| 779 |
+
"metadata": {},
|
| 780 |
+
"output_type": "execute_result"
|
| 781 |
+
}
|
| 782 |
+
],
|
| 783 |
+
"source": [
|
| 784 |
+
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
|
| 785 |
+
"criterion = nn.CrossEntropyLoss()\n",
|
| 786 |
+
"def accuracy(y_pred, y_true):\n",
|
| 787 |
+
" _, predicted = torch.max(y_pred, 1) # Get class with max probability\n",
|
| 788 |
+
" correct = (predicted == y_true).sum().item()\n",
|
| 789 |
+
" return correct / y_true.size(0)\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 792 |
+
"print(device)\n",
|
| 793 |
+
"model.to(device)"
|
| 794 |
+
]
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"cell_type": "markdown",
|
| 798 |
+
"id": "44bd65c1-b36a-4204-8073-cad73ee815a6",
|
| 799 |
+
"metadata": {},
|
| 800 |
+
"source": [
|
| 801 |
+
"## 10. Training"
|
| 802 |
+
]
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"cell_type": "code",
|
| 806 |
+
"execution_count": 20,
|
| 807 |
+
"id": "Sdzi9huhvfK4",
|
| 808 |
+
"metadata": {
|
| 809 |
+
"id": "Sdzi9huhvfK4"
|
| 810 |
+
},
|
| 811 |
+
"outputs": [
|
| 812 |
+
{
|
| 813 |
+
"name": "stdout",
|
| 814 |
+
"output_type": "stream",
|
| 815 |
+
"text": [
|
| 816 |
+
"Epoch 1/10, Loss: 0.6014, Accuracy: 0.9510\n",
|
| 817 |
+
"Epoch 2/10, Loss: 0.5979, Accuracy: 0.9541\n",
|
| 818 |
+
"Epoch 3/10, Loss: 0.5966, Accuracy: 0.9549\n",
|
| 819 |
+
"Epoch 4/10, Loss: 0.5944, Accuracy: 0.9571\n",
|
| 820 |
+
"Epoch 5/10, Loss: 0.5965, Accuracy: 0.9554\n",
|
| 821 |
+
"Epoch 6/10, Loss: 0.5992, Accuracy: 0.9521\n",
|
| 822 |
+
"Epoch 7/10, Loss: 0.5936, Accuracy: 0.9577\n",
|
| 823 |
+
"Epoch 8/10, Loss: 0.5927, Accuracy: 0.9587\n",
|
| 824 |
+
"Epoch 9/10, Loss: 0.5924, Accuracy: 0.9589\n",
|
| 825 |
+
"Epoch 10/10, Loss: 0.5924, Accuracy: 0.9589\n",
|
| 826 |
+
"Time taken for training 115.47041082382202 sec\n"
|
| 827 |
+
]
|
| 828 |
+
}
|
| 829 |
+
],
|
| 830 |
+
"source": [
|
| 831 |
+
"\n",
|
| 832 |
+
"y_train_labels = torch.argmax(y_train_cat, dim=1) # Convert one-hot to class index\n",
|
| 833 |
+
"\n",
|
| 834 |
+
"# Convert to TensorDataset\n",
|
| 835 |
+
"train_dataset = TensorDataset(X_train_pad, y_train_labels)\n",
|
| 836 |
+
"train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n",
|
| 837 |
+
"\n",
|
| 838 |
+
"# Training loop\n",
|
| 839 |
+
"num_epochs = 10\n",
|
| 840 |
+
"t0 = time.time()\n",
|
| 841 |
+
"\n",
|
| 842 |
+
"for epoch in range(num_epochs):\n",
|
| 843 |
+
" model.train() # Set to training mode\n",
|
| 844 |
+
" running_loss = 0.0\n",
|
| 845 |
+
" correct_preds = 0\n",
|
| 846 |
+
" total_samples = 0\n",
|
| 847 |
+
"\n",
|
| 848 |
+
" for inputs, labels in train_loader:\n",
|
| 849 |
+
" inputs, labels = inputs.to(device), labels.to(device)\n",
|
| 850 |
+
"\n",
|
| 851 |
+
" optimizer.zero_grad() # Reset gradients\n",
|
| 852 |
+
" outputs = model(inputs) # Forward pass\n",
|
| 853 |
+
"\n",
|
| 854 |
+
" loss = criterion(outputs, labels) # Compute loss\n",
|
| 855 |
+
" loss.backward() # Backpropagation\n",
|
| 856 |
+
" optimizer.step() # Update weights\n",
|
| 857 |
+
"\n",
|
| 858 |
+
" # Compute accuracy\n",
|
| 859 |
+
" _, predicted = torch.max(outputs, 1)\n",
|
| 860 |
+
" correct_preds += (predicted == labels).sum().item()\n",
|
| 861 |
+
" total_samples += labels.size(0)\n",
|
| 862 |
+
"\n",
|
| 863 |
+
" running_loss += loss.item()\n",
|
| 864 |
+
"\n",
|
| 865 |
+
" # Compute epoch loss and accuracy\n",
|
| 866 |
+
" epoch_loss = running_loss / len(train_loader)\n",
|
| 867 |
+
" epoch_acc = correct_preds / total_samples\n",
|
| 868 |
+
"\n",
|
| 869 |
+
" print(f\"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}\")\n",
|
| 870 |
+
"print(\"Time taken for training\",time.time()-t0,\"sec\")\n"
|
| 871 |
+
]
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"cell_type": "markdown",
|
| 875 |
+
"id": "92b2ba57",
|
| 876 |
+
"metadata": {
|
| 877 |
+
"id": "92b2ba57",
|
| 878 |
+
"papermill": {
|
| 879 |
+
"duration": 0.076899,
|
| 880 |
+
"end_time": "2024-08-08T15:32:26.322204",
|
| 881 |
+
"exception": false,
|
| 882 |
+
"start_time": "2024-08-08T15:32:26.245305",
|
| 883 |
+
"status": "completed"
|
| 884 |
+
},
|
| 885 |
+
"tags": []
|
| 886 |
+
},
|
| 887 |
+
"source": [
|
| 888 |
+
"## 11. Saving Model Weights"
|
| 889 |
+
]
|
| 890 |
+
},
|
| 891 |
+
{
|
| 892 |
+
"cell_type": "code",
|
| 893 |
+
"execution_count": 21,
|
| 894 |
+
"id": "44159482-1028-427d-9155-7e97473a3834",
|
| 895 |
+
"metadata": {},
|
| 896 |
+
"outputs": [
|
| 897 |
+
{
|
| 898 |
+
"name": "stdout",
|
| 899 |
+
"output_type": "stream",
|
| 900 |
+
"text": [
|
| 901 |
+
"Model weights saved to rnn_classification_model_weights.pth\n"
|
| 902 |
+
]
|
| 903 |
+
}
|
| 904 |
+
],
|
| 905 |
+
"source": [
|
| 906 |
+
"model_weights_path = 'rnn_classification_model_weights.pth'\n",
|
| 907 |
+
"\n",
|
| 908 |
+
"torch.save(model.state_dict(), model_weights_path)\n",
|
| 909 |
+
"\n",
|
| 910 |
+
"print(f\"Model weights saved to {model_weights_path}\")"
|
| 911 |
+
]
|
| 912 |
+
},
|
| 913 |
+
{
|
| 914 |
+
"cell_type": "code",
|
| 915 |
+
"execution_count": 22,
|
| 916 |
+
"id": "67399f88-0c26-4915-9d6f-96385e010912",
|
| 917 |
+
"metadata": {},
|
| 918 |
+
"outputs": [],
|
| 919 |
+
"source": [
|
| 920 |
+
"try:\n",
|
| 921 |
+
" state_dict = torch.load('rnn_classification_model_weights.pth', map_location=torch.device('cuda'))\n",
|
| 922 |
+
" model.load_state_dict(state_dict)\n",
|
| 923 |
+
" model.eval()\n",
|
| 924 |
+
"except Exception as e:\n",
|
| 925 |
+
" st.error(f\"Error loading model: {str(e)}\")"
|
| 926 |
+
]
|
| 927 |
+
},
|
| 928 |
+
{
|
| 929 |
+
"cell_type": "code",
|
| 930 |
+
"execution_count": 23,
|
| 931 |
+
"id": "df87cdce-251a-4e77-9317-9faaf3895593",
|
| 932 |
+
"metadata": {},
|
| 933 |
+
"outputs": [
|
| 934 |
+
{
|
| 935 |
+
"data": {
|
| 936 |
+
"text/plain": [
|
| 937 |
+
"RNNModel(\n",
|
| 938 |
+
" (embedding): Embedding(224577, 128)\n",
|
| 939 |
+
" (rnn): RNN(128, 128, batch_first=True, dropout=0.2)\n",
|
| 940 |
+
" (fc): Linear(in_features=128, out_features=3, bias=True)\n",
|
| 941 |
+
")"
|
| 942 |
+
]
|
| 943 |
+
},
|
| 944 |
+
"execution_count": 23,
|
| 945 |
+
"metadata": {},
|
| 946 |
+
"output_type": "execute_result"
|
| 947 |
+
}
|
| 948 |
+
],
|
| 949 |
+
"source": [
|
| 950 |
+
"model.to(device)"
|
| 951 |
+
]
|
| 952 |
+
},
|
| 953 |
+
{
|
| 954 |
+
"cell_type": "code",
|
| 955 |
+
"execution_count": 24,
|
| 956 |
+
"id": "xQB-r-EqSIQE",
|
| 957 |
+
"metadata": {
|
| 958 |
+
"colab": {
|
| 959 |
+
"base_uri": "https://localhost:8080/",
|
| 960 |
+
"height": 442
|
| 961 |
+
},
|
| 962 |
+
"id": "xQB-r-EqSIQE",
|
| 963 |
+
"outputId": "8c3514f9-06c1-4483-9ab8-21ea7dfa05bf"
|
| 964 |
+
},
|
| 965 |
+
"outputs": [
|
| 966 |
+
{
|
| 967 |
+
"name": "stdout",
|
| 968 |
+
"output_type": "stream",
|
| 969 |
+
"text": [
|
| 970 |
+
"Thyroid surgery in children in a single institution from Osama Ibrahim Almosallama Ali Aseerib Ahmed Alhumaida Ali S AlZahranic Saif Alsobhib Saud AlShanafeybFrom the aDepartment of Surgery College of Medicine Qassim University Buraidah Al Qassim Saudi Arabia bDepartment of Surgery King Faisal Specialist Hospital and Research Center Riyadh Saudi Arabia cDepartment of Medicine King Faisal Specialist Hospital and Research Center Riyadh Saudi Arabia Correspondence Dr Osama Ibrahim Almosallam Department of Surgery College of Medicine Qassim University PO Box Buraidah Al Qassim Saudi Arabia osama_iaahotmailcom ORCID orcid0000000290367564 Citation Almosallam OI Aseeri A Alhumaid A AlZahrani AS Alsobhi S AlShanafey S Thyroid surgery in children in a single institution from Ann Saudi Med Received January Accepted May Published August Copyright Copyright Annals of Saudi Medicine Saudi Arabia This is an access under the Creative Commons AttributionNonCommercialNoDerivatives International License CC BYNCND The details of which can be accessed at httpcreativecommons licensesbyncnd40Funding NoneBACKGROUND Data on thyroid surgery in children are scarceOBJECTIVE Analyze outcome data on thyroid surgery in a pediatric populationDESIGN Medical record reviewSETTING Tertiary health care institutionPATIENTS AND METHODS We collected demographic and clinical data on patients years or younger who had thyroid surgery in the period to Descriptive data are presentedMAIN OUTCOME MEASURES Indications for thyroidectomy thyroid pathology complications length of stay and radioactive iodine treatment and recurrencesSAMPLE SIZE RESULTS Of patients who underwent thyroidectomy procedures were females and the mean age at operation was years and were associated with multiple endocrine neoplasia type There was no history of radiation exposure Eightyone patients had fine needle aspiration FNA which correlated with the final histopathology in of cases Sixtysix patients had malignant cancer papillary of 
patients who had neck dissection had lymph node metastasis and had distant metastases to the lung Procedures included total thyroidectomy hemithyroidectomy completion and subtotal thyroidectomy Twentythree patients developed hypocalcemia permanent and had unilateral recurrent laryngeal nerve injury permanent Patients were followed up for a mean duration of months median months Of patients with thyroid cancer received radioactive iodine and had recurrence Malignancy is the commonest indication for thyroid surgery in children and FNA is highly diagnostic Hypocalcemia and recurrent laryngeal nerve injury are significant complications The recurrence rate in thyroid cancer is LIMITATIONS RetrospectiveCONFLICT OF INTEREST Noneoriginal ANN SAUDI MED JULYAUGUST WWWANNSAUDIMEDNET 0cThyroid diseases requiring surgery are relatively uncommon in children compared to adults The prevalence of palpable thyroid nodules in children ranges from to Sporadic welldifferentiated thyroid cancer is the most common endocrine malignancy in children accounting for of pediatric cancers in the prepubertal age group and up to of cancers in adolescents aged year2 The most common indication for thyroid surgery in children varies among published studies but thyroidectomy for malignant conditions is rising38 Data in children throughout the world are relatively scarce The objective of this study was to analyze the clinical data and outcome of thyroid surgery in a large series of children treated at a single center at King Faisal Specialist Hospital and Research Center KFSHRC in RiyadhPATIENT AND METHODS With the approval of the Institutional Review Board IRB at KFSHRC the medical records of all patients years old and younger who underwent a thyroid surgery between and were retrospectively reviewed We elected to include patients up to the year to ensure a reasonable followup period Patients for the study were identified by a search of the operating room log for all procedures involving the thyroid 
gland for the specified age groupDemographic data clinical features and surgical outcomes were collected Specific data that were obtained included age at operation gender family history presenting symptoms history of radiation exposure presence of multiple endocrine neoplasia type MEN thyroid function test presence and size of thyroid nodules by ultrasound presence of lymph nodes metastasis or distant metastasis fine needle aspiration FNA cytology surgical procedure final histopathology and length of followup Outcomes analyzed were postoperative complications including transient or permanent hypocalcemia transient or permanent recurrent laryngeal nerve paralysis wound infection and hematoma length of stay and radioactive iodine treatment and recurrences Thyroid procedures in this series included hemithyroidectomy subtotal total and completion thyroidectomy Surgeries were performed by either an endocrine adult surgeon or a pediatric surgeon No intraoperative nerve monitoring was used Early in the series procedures were performed by adult endocrine surgeons but lately a combined approach was adopted where pediatric surgeons and adult endocrine surgeons collaborated in such cases proceduresthe normal range in our laboratory regardless of symptoms Transient hypocalcemia was identified if it lasted for less than months while permanent hypocalcemia was considered if the serum calcium level remained below normal range and the patient continued on calcium supplementation after months of the surgery All patients with a family history of MEN underwent genetic testing of the RET protooncogene to confirm the diagnosis All patients who underwent completion thyroidectomy had a preoperative and postoperative vocal cords assessment at the Otolaryngology clinic Descriptive data were generated and comparisons were conducted using the t test for continuous data and the chisquare or Fisher exact tests for proportionsRESULTSBetween and patients underwent surgical procedures patients 
underwent two procedures for thyroid disease at our institution Eighty patients were females The mean age at operation was years median years range years The most common indication for thyroidectomy was thyroid nodule which was present in of cases Table The mean SD size of thyroid nodules was mm There were cases associated with MEN syndromes The final pathology in two patients with MEN syndrome showed medullary thyroid cancer MTC while the remaining patients had prophylactic procedures before development of MTC None of the patients had a history of radiation exposure Eightyone patients FNA which correlated with the final histopathology in of cases There were three cases of toxic adenoma and one case of Graves disease which did not require FNA The remaining cases underwent FNA at another institution and FNA was not repeated at our institution or they came for completion thyroidectomy with documented pathology for malignancy after they had their first surgery in another hospitalThe most common diagnoses included papillary thyroid cancer and multinodular goiter or colloid Table Indications for thyroidectomy in patients IndicationNodulen MEN prophylaxisHyperthyroidismMultinodular goiterCompletion thyroidectomy Hypocalcemia was defined by calcium levels below Data are number original PEDIATRIC THYROID SURGERYANN SAUDI MED JULYAUGUST WWWANNSAUDIMEDNET 0cnodule Table Surgical procedures included total thyroidectomy hemithyroidectomy completion thyroidectomy and subtotal thyroidectomy Neck dissection was performed in patients Operative complications were observed in patients The most common complication was hypocalcemia transient permanent and Table Thyroid pathology in the patientsPathologyn BenignNormal thyroid tissueColloid noduleCystAdenomaThyroiditisGraves diseaseThyroid cancerPapillaryFollicularMedullaryHurthleAnaplasticTotalData are number Table Benign and malignant lesions in patientsBenignn37Malignantn66 P value Age meanyearsGender malefemalePresence of 
noduleHypocalcemiaRecurrent laryngeal nerve palsyBleedinghematomaWound infectionTracheal injuryOverall complicationsMean length of stay daysMEN recurrent laryngeal nerve palsy transient permanent all were unilateral Table Of patients with malignant lesions had lymph node metastasis and patients had distant metastases to the lung None of the patients developed postoperative bleeding wound infection or tracheal injury Patients were followed up for a mean of months median range months radioactive iodine treatment was delivered to patients with malignant lesions patients had recurrences were local recurrences and were local and distant recurrences to the lung Three cases received radioactive iodine RAI before and after recurrence One case was low risk before recurrence so did not receive RAI until after recurrence One case had medullary thyroid cancer so did not receive RAI In the remaining five cases there was no clear data whether those patients received RAI before or only after a recurrence All local recurrences underwent resection except for one patient who was lost follow up There was no mortality in this study DISCUSSIONThe most common indication for thyroidectomy in this series was thyroid nodule which correlates with previously published reports in the pediatric population35 Children with thyroid nodules have an estimated fourfold higher risk of developing thyroid cancer compared to adults910 The high incidence of malignancy in this series suggests children with a thyroid nodule should be carefully evaluatedFNA is a valuablemethod for preoperative evaluation of thyroid nodules However there are limitations on the routine use of FNA in children including the need for sedation sampling errors and the limited availability of experienced cytopathologists11 Many previous studies reported high sensitivity and specificity of FNA in evaluating thyroid nodule in children1114 which correlate with our findingsOur data showed lymph node metastasis in of thyroid cancer 
cases which supports the notion that children with thyroid cancer frequently present with more extensive disease than adults Lymphnode involvement at diagnosis is seen in to of children compared with to of adults with differentiated thyroid cancer1523 Because our hospital is the largest referral center in Saudi Arabia especially for oncology cases this may explain the large number of lymph node and distant metastasis In this cohortThe most common complication reported after thyroidectomy in children is hypoparathyroidism with an incidence ranging between to which original PEDIATRIC THYROID SURGERYANN SAUDI MED JULYAUGUST WWWANNSAUDIMEDNET 0ccorresponds with our results of which are reported as hypocalcemia in Table One study found that total thyroidectomy central and bilateral neck dissection Graves disease and malignancy were risk factors for hypocalcemia after thyroid surgery3 In this cohort postoperative hypocalcemia was noted more in malignant cases but it failed to reach statistical significance Moreover there was no significant difference between benign and malignant cases in terms of mean age gender distribution recurrent laryngeal nerve injury or overall complications a finding that was reported previously26 Multiple studies in recent years have found an inverse relationship between surgeon volume and complication rates2728 but similar data in the pediatric population is lacking One study found that highvolume endocrine surgeons have better outcomes and shorter lengths of stay and lower costs after thyroidectomy and parathyroidectomy in children compared to pediatric surgeons general surgeons or otolaryngologists29 Scheumann and colleagues also concluded that a collaborative approach between pediatric and endocrine surgeons would have better outcomes This has led other authors to suggest that a combined approach with endocrine and pediatric surgeons in addition to pediatric endocrinologists may optimize the care of children with surgical thyroid disease 
given the low number of pediatric patients4 Our data do not allow for comparisons of different approaches given the late adoption of the combined approach The recurrence rate for thyroid cancer in children after thyroidectomy has varied widely in reported studies ranging from to while it was in this cohort Only a few studies explored the predictors of recurrence Lymph node involvement multiple nodules male gender younger age histologic subtype and advanced tumor stage were risk factors associated with recurrence17233033 In this study of patients with malignant lesions received RAI Although there are conflicting data regarding the indications of postoperative RAI treatment in lowrisk patients the current recommendation is that lowrisk patients can be treated without RAI3436There are some limitations to this study The retrospective nature may affect the validity and quality of the data The small number of cases in some categories did not enable us to compare groups and explore predictors relative to these factors On the other hand this study adds to the scarce data on thyroid surgery in pediatric age group Malignancy is the commonest indication for thyroid surgery in children and FNA is highly diagnostic Hypocalcemia and recurrent laryngeal nerve injury are significant complications Cancerrelated death is extremely rare but recurrence is not uncommon and a significant number of patients with malignant cases received RAI treatmentoriginal PEDIATRIC THYROID SURGERYANN SAUDI MED JULYAUGUST WWWANNSAUDIMEDNET 0cREFERENCES Trowbridge FL Matovinovic J McLaren GD Nichaman MZ Iodine and goiter in children Pediatrics Ries LAG Melbert D Krapcho M Stinchcomb DG Howlader N Horner MJ et al SEER Cancer Statistics Review Bethesda National Cancer Institute Based on November SEER data submission Chen Y[h] Masiakos PT Gaz RD Hodin RA Parangi S Randolph GW et al Pediatric thyroidectomy in a high volume thyroid surgery center Risk factors for postoperative hypocalcemia J Pediatr Surg 
Aug5081316 Wood JH Partrick DA Barham HP Bensard DD Travers HS Bruny JL et al Pediatric thyroidectomy a collaborative surgical approach J Pediatr Surg May4658238 Scholz S Smith JR Chaignaud B Shamberger RC Huang SA Thyroid surgery at Childrens Hospital Boston a 35year singleinstitution experience J Pediatr Surg Mar46343742 Josefson J Zimmerman D Thyroid nodules and cancers in children Pediatr Endocrinol Rev Sep611423 Hameed R Zacharin MR Changing face of paediatric and adolescent thyroid cancer J Paediatr Child Health LugoVicente H Ortiz VN Irizarry H Camps JI Pagán V Pediatric thyroid nodules management in the era of fine needle aspirationJ Pediatr Surg Mussa A De Andrea M Motta M Mormile A Palestini N Corrias A Predictors of Malignancy in Children with Thyroid Nodules J Pediatr Oct167488692 Amirazodi E Propst EJ Chung CT Parra DA Wasserman JD Pediatric thyroid FNA biopsy Outcomes and impact on management over years at a tertiary care center Cancer Cytopathol Partyka KL Huang EC2 Cramer HM Chen S Wu HH Histologic and clinical followup of thyroid fineneedle aspirates in pediatric patients Cancer Cytopathol Sinha CK Decoppi P Pierro A Brain C Hindmarsh P Butler G et al Thyroid Surgery in Children Clinical Outcomes Eur J Pediatr Surg Oct2554259 Kundel A Thompson GB Richards ML Qiu LX Cai Y Schwenk FW et al Pediatric Endocrine Surgery A 20Year Experience at the Mayo Clinic J Clin Endocrinol Metab February Jiang W Newbury RO Newfield RS Pediatric thyroid surgery and management of thyroid nodulesan institutional experience features and over a 10year period Int J Pediatr Endocrinol Burke JF Sippel RS Chen H Evolution of Pediatric Thyroid Surgery at a Tertiary Medical Center Surg Res AlQahtani KH Tunio MA Al Asiri M Aljohani NJ Bayoumi Y Riaz K et al Clinicopathological treatment outcomes of differentiated thyroid cancer in Saudi children and adults J Otolaryngol Head Neck Surg Nov Kluijfhout WP van Beek DJ Verrijn Stuart AA Lodewijk L Valk GD Van der Zee DC et al 
Postoperative Complications After Prophylactic Thyroidectomy for Very Young Patients With Multiple Endocrine Neoplasia Type Medicine Baltimore 20159429e1108 Raval MV Browne M Chin AC Zimmerman D Angelos P Reynolds M Total thyroidectomy for benign disease in the pediatric patientfeasible and safe J Pediatr Surg Stavrakis AI Ituarte PH Ko CY Yeh MW Surgeon volume as a predictor of outcomes in inpatient and outpatient endocrine surgery Surgery Sosa JA Bowman HM Tielsch JM Powe NR Gordon TA Udelsman R The importance of surgeon experience for clinical and economic outcomes from thyroidectomy Ann Surg Tuggle CT Roman SA Wang TS Boudourakis L Thomas D Udelsman R et al Pediatric endocrine surgery Who is operating on our children Surgery Dec144686977 Park S Jeong JS Ryu HR Lee C Park JH Kang S et al Differentiated Thyroid Carcinoma of Children and Adolescents27Year Experience in the Yonsei University Health System J Korean Med Sci Palmer BA Zarroug AE Poley RN Kollars JP Moir CR Papillary thyroid carcinoma in children risk factors and complications of disease recurrence J Pediatr Surg Wada N Sugino K Mimura T Nagahama M Kitagawa W Shibuya H et al Pediatric differentiated thyroid carcinoma in stage I risk factor analysis for disease free survival BMC Cancer D Danese Gardini A Farsetti A Sciacchitano S Andreoli M Pontecorvi A Thyroid carcinoma in children and adolescents Eur J Pediatr Astl J Chovanec M Lukes P Katra R Dvorakova M Vlcek P et al Thyroid carcinoma surgery in children and adolescents years experience surgery of pediatric thyroid lymph node metastases carcinoma Int J Pediatr Otorhinolaryngol Chaukar DA Rangarajan V Nair N Nadkarni MS Pai PS Dcruz AK et al Pediatric thyroid cancer J Surg Oncol Dzodic R Buta M Markovic I Gavrilo D Matovic M Milovanovic Z et al Surgical management of welldifferentiated thyroid carcinoma in children and adolescents years of experience of a single institution in Serbia Endocr J Scheumann GF Gimm O Wegener G Hundeshagen H Dralle H 
Prognostic significance and surgical management of locoregional in papillary thyroid cancer World J Surg Shi RL Qu N Yang SW Tumor size interpretation for predicting cervical lymph node metastasis using a differentiated thyroid cancer risk model Onco Targets Ther Zimmerman D Hay ID Gough IR Goellner JR Ryan JJ Grant CS et al Papillary thyroid carcinoma in children and adults longterm followup of patients conservatively treated at one institution during three decades Surgery Collini P Mattavelli F Pellegrinelli A Barisella M Ferrari A Massimino M Papillary carcinoma of the thyroid gland of childhood and adolescence Morphologic subtypes biologic behavior and prognosis a clinicopathologic study of sporadic cases treated at a single institution during a 30year period Am J Surg Pathol BorsonChazot Causeret S Lifante JC Augros M Berger N Peix JL Predictive factors for recurrence from a series of children and adolescents with differentiated thyroid cancer World J Surg Baumgarten HD Bauer AJ Isaza A MostoufiMoab S Kazahaya K Adzick NS Surgical management of pediatric thyroid disease Complication rates after thyroidectomy at the Childrens Hospital of Philadelphia highvolume Pediatric Thyroid Center Journal of pediatric surgery Oct Kurzawinski TR De Coppi P Thyroidectomy in Children InPediatric Surgery pp Springer Berlin Heidelberg Francis G Waguespack SG Bauer AJ Angelog P Benvenga S et al Management Guidelines for Children with Thyroid Nodules and Differentiated Thyroid Cancer The American Thyroid Association Guidelines Task Force on Pediatric Thyroid Cancer THYROID Volume Number original PEDIATRIC THYROID SURGERYANN SAUDI MED JULYAUGUST WWWANNSAUDIMEDNET 0c'\n"
|
| 971 |
+
]
|
| 972 |
+
}
|
| 973 |
+
],
|
| 974 |
+
"source": [
|
| 975 |
+
"print(texts[0])"
|
| 976 |
+
]
|
| 977 |
+
},
|
| 978 |
+
{
|
| 979 |
+
"cell_type": "code",
|
| 980 |
+
"execution_count": null,
|
| 981 |
+
"id": "7727274b-19a0-4506-be03-8f9977bff825",
|
| 982 |
+
"metadata": {},
|
| 983 |
+
"outputs": [],
|
| 984 |
+
"source": []
|
| 985 |
+
}
|
| 986 |
+
],
|
| 987 |
+
"metadata": {
|
| 988 |
+
"accelerator": "GPU",
|
| 989 |
+
"colab": {
|
| 990 |
+
"gpuType": "T4",
|
| 991 |
+
"provenance": []
|
| 992 |
+
},
|
| 993 |
+
"kaggle": {
|
| 994 |
+
"accelerator": "gpu",
|
| 995 |
+
"dataSources": [
|
| 996 |
+
{
|
| 997 |
+
"datasetId": 2389764,
|
| 998 |
+
"sourceId": 4033428,
|
| 999 |
+
"sourceType": "datasetVersion"
|
| 1000 |
+
}
|
| 1001 |
+
],
|
| 1002 |
+
"isGpuEnabled": true,
|
| 1003 |
+
"isInternetEnabled": true,
|
| 1004 |
+
"language": "python",
|
| 1005 |
+
"sourceType": "notebook"
|
| 1006 |
+
},
|
| 1007 |
+
"kernelspec": {
|
| 1008 |
+
"display_name": "Python 3 (ipykernel)",
|
| 1009 |
+
"language": "python",
|
| 1010 |
+
"name": "python3"
|
| 1011 |
+
},
|
| 1012 |
+
"language_info": {
|
| 1013 |
+
"codemirror_mode": {
|
| 1014 |
+
"name": "ipython",
|
| 1015 |
+
"version": 3
|
| 1016 |
+
},
|
| 1017 |
+
"file_extension": ".py",
|
| 1018 |
+
"mimetype": "text/x-python",
|
| 1019 |
+
"name": "python",
|
| 1020 |
+
"nbconvert_exporter": "python",
|
| 1021 |
+
"pygments_lexer": "ipython3",
|
| 1022 |
+
"version": "3.12.3"
|
| 1023 |
+
},
|
| 1024 |
+
"papermill": {
|
| 1025 |
+
"default_parameters": {},
|
| 1026 |
+
"duration": 458.547826,
|
| 1027 |
+
"end_time": "2024-08-08T15:32:43.096410",
|
| 1028 |
+
"environment_variables": {},
|
| 1029 |
+
"exception": null,
|
| 1030 |
+
"input_path": "__notebook__.ipynb",
|
| 1031 |
+
"output_path": "__notebook__.ipynb",
|
| 1032 |
+
"parameters": {},
|
| 1033 |
+
"start_time": "2024-08-08T15:25:04.548584",
|
| 1034 |
+
"version": "2.5.0"
|
| 1035 |
+
}
|
| 1036 |
+
},
|
| 1037 |
+
"nbformat": 4,
|
| 1038 |
+
"nbformat_minor": 5
|
| 1039 |
+
}
|