PranavReddy18 committed on
Commit
e2328ed
·
verified ·
1 Parent(s): 9632a88

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +67 -0
  2. model.h5 +3 -0
  3. requirements.txt +7 -0
  4. spam.ipynb +869 -0
  5. tokenizer.pkl +3 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tensorflow as tf
3
+ from tensorflow.keras.preprocessing.text import Tokenizer
4
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
5
+
6
# Load the trained model and the fitted tokenizer that ship alongside this app.
@st.cache_resource
def load_model_and_tokenizer(model_path='model.h5', tokenizer_path='tokenizer.pkl'):
    """Load the Keras model and the pickled tokenizer used at training time.

    Streamlit re-executes the whole script on every widget interaction, so the
    result is cached with ``st.cache_resource``: the files are read from disk
    once per distinct pair of paths instead of on every rerun. The cached
    objects are shared between reruns, so callers should treat them as
    read-only.

    Args:
        model_path: Path of the saved model; defaults to ``'model.h5'``.
        tokenizer_path: Path of the pickled tokenizer; defaults to
            ``'tokenizer.pkl'``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    import pickle

    model = tf.keras.models.load_model(model_path)

    # SECURITY: unpickling can execute arbitrary code. Only load a tokenizer
    # file that was produced by the training notebook and shipped with the app,
    # never one supplied by a user.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)

    return model, tokenizer
17
+
18
# Turn raw user text into the fixed-length integer input fed to the model.
def preprocess_input(texts, tokenizer, maxlen=50):
    """Tokenize ``texts`` and pad the result to ``maxlen`` entries.

    Args:
        texts: Texts to encode; handed to ``tokenizer.texts_to_sequences``.
        tokenizer: Object providing ``texts_to_sequences``.
        maxlen: Target length handed to ``pad_sequences``; defaults to 50.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts, padded at
        the end (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=maxlen,
        padding='post',
    )
22
+
23
# Prediction function
def _decode_prediction(scores, spam_index=1, threshold=0.5):
    """Map one row of model output to ``(label, spam_probability)``.

    Args:
        scores: One prediction row (indexable, supports ``len``).
        spam_index: Class index that means "spam" in a multi-class row.
        threshold: Cut-off applied when the row holds a single score.

    Returns:
        ``("spam" | "ham", probability the model assigns to spam)``.
    """
    if len(scores) == 1:
        # Single sigmoid unit: the value itself is the spam probability.
        spam_probability = float(scores[0])
        label = "spam" if spam_probability > threshold else "ham"
        return label, spam_probability

    # Softmax head: position i is the probability of class i. The training
    # notebook encodes ham as 0 and spam as 1, so index 0 is the HAM score;
    # thresholding it (as the previous code did) inverted every label.
    winner = max(range(len(scores)), key=lambda i: scores[i])
    label = "spam" if winner == spam_index else "ham"
    return label, float(scores[spam_index])


def predict_text(model, tokenizer, sample_texts, maxlen=50):
    """Classify each text in ``sample_texts`` as spam or ham.

    Args:
        model: Object whose ``predict`` returns one score row per input.
        tokenizer: Tokenizer forwarded to ``preprocess_input``.
        sample_texts: Sequence of texts to classify.
        maxlen: Padding length forwarded to ``preprocess_input``.

    Returns:
        A list with one dict per input text, holding the keys ``"Text"``,
        ``"Predicted Label"`` (``"spam"`` or ``"ham"``) and
        ``"Prediction Confidence"`` (the spam probability, formatted with
        four decimals).
    """
    # Nothing to classify: skip the model call, which is not meaningful for
    # an empty batch.
    if len(sample_texts) == 0:
        return []

    X_predict = preprocess_input(sample_texts, tokenizer, maxlen)
    predictions = model.predict(X_predict)

    results = []
    for text, pred in zip(sample_texts, predictions):
        label, spam_probability = _decode_prediction(pred)
        results.append({
            "Text": text,
            "Predicted Label": label,
            "Prediction Confidence": f"{spam_probability:.4f}",
        })
    return results
37
+
38
# Streamlit App Interface
def main():
    """Render the classifier page and show a prediction when requested.

    Draws the title, the description and a text area, then classifies the
    entered text once the "Predict" button is pressed. Empty or
    whitespace-only input is rejected with an error message instead of being
    sent to the model.
    """
    st.title('Spam vs Ham Text Classifier')
    st.markdown("""
    This is a simple Streamlit app that predicts whether a given text is **Spam** or **Ham** using a pre-trained model.
    """)

    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer()

    # Text input
    text_input = st.text_area("Enter the text you want to classify:")

    # Nothing to do until the user presses the button.
    if not st.button("Predict"):
        return

    # A text made only of spaces/newlines is truthy but carries no words, so
    # strip it before deciding whether there is anything to classify.
    if not text_input or not text_input.strip():
        st.error("Please enter some text to classify.")
        return

    # Get the prediction and display the result
    for result in predict_text(model, tokenizer, [text_input]):
        st.write(f"**Text**: {result['Text']}")
        st.write(f"**Predicted Label**: {result['Predicted Label']}")
        st.write(f"**Prediction Confidence**: {result['Prediction Confidence']}")


# Run the app
if __name__ == "__main__":
    main()
model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70c5de9a0ae2f929728bbc60eeaab72b9da027efb1f64981395ac9a1687b4041
3
+ size 12831032
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ tensorflow==2.17.0
2
+ scikit-learn==1.5.1
3
+ pandas==2.1.4
4
+ numpy==1.23.5
5
+ matplotlib==3.7.0
6
+ seaborn==0.13.2
7
+ streamlit==1.37.1
spam.ipynb ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd \n",
10
+ "import numpy as np \n",
11
+ "import seaborn as sns\n",
12
+ "import matplotlib.pyplot as plt"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 3,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "df=pd.read_csv(\"C:\\\\Users\\\\saipr\\\\Downloads\\\\spam (1).csv\",encoding='ISO-8859-1')"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 4,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>v1</th>\n",
51
+ " <th>v2</th>\n",
52
+ " <th>Unnamed: 2</th>\n",
53
+ " <th>Unnamed: 3</th>\n",
54
+ " <th>Unnamed: 4</th>\n",
55
+ " </tr>\n",
56
+ " </thead>\n",
57
+ " <tbody>\n",
58
+ " <tr>\n",
59
+ " <th>0</th>\n",
60
+ " <td>ham</td>\n",
61
+ " <td>Go until jurong point, crazy.. Available only ...</td>\n",
62
+ " <td>NaN</td>\n",
63
+ " <td>NaN</td>\n",
64
+ " <td>NaN</td>\n",
65
+ " </tr>\n",
66
+ " <tr>\n",
67
+ " <th>1</th>\n",
68
+ " <td>ham</td>\n",
69
+ " <td>Ok lar... Joking wif u oni...</td>\n",
70
+ " <td>NaN</td>\n",
71
+ " <td>NaN</td>\n",
72
+ " <td>NaN</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>2</th>\n",
76
+ " <td>spam</td>\n",
77
+ " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
78
+ " <td>NaN</td>\n",
79
+ " <td>NaN</td>\n",
80
+ " <td>NaN</td>\n",
81
+ " </tr>\n",
82
+ " <tr>\n",
83
+ " <th>3</th>\n",
84
+ " <td>ham</td>\n",
85
+ " <td>U dun say so early hor... U c already then say...</td>\n",
86
+ " <td>NaN</td>\n",
87
+ " <td>NaN</td>\n",
88
+ " <td>NaN</td>\n",
89
+ " </tr>\n",
90
+ " <tr>\n",
91
+ " <th>4</th>\n",
92
+ " <td>ham</td>\n",
93
+ " <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
94
+ " <td>NaN</td>\n",
95
+ " <td>NaN</td>\n",
96
+ " <td>NaN</td>\n",
97
+ " </tr>\n",
98
+ " </tbody>\n",
99
+ "</table>\n",
100
+ "</div>"
101
+ ],
102
+ "text/plain": [
103
+ " v1 v2 Unnamed: 2 \\\n",
104
+ "0 ham Go until jurong point, crazy.. Available only ... NaN \n",
105
+ "1 ham Ok lar... Joking wif u oni... NaN \n",
106
+ "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n",
107
+ "3 ham U dun say so early hor... U c already then say... NaN \n",
108
+ "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n",
109
+ "\n",
110
+ " Unnamed: 3 Unnamed: 4 \n",
111
+ "0 NaN NaN \n",
112
+ "1 NaN NaN \n",
113
+ "2 NaN NaN \n",
114
+ "3 NaN NaN \n",
115
+ "4 NaN NaN "
116
+ ]
117
+ },
118
+ "execution_count": 4,
119
+ "metadata": {},
120
+ "output_type": "execute_result"
121
+ }
122
+ ],
123
+ "source": [
124
+ "df.head()"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 6,
130
+ "metadata": {},
131
+ "outputs": [
132
+ {
133
+ "data": {
134
+ "text/plain": [
135
+ "(5572, 5)"
136
+ ]
137
+ },
138
+ "execution_count": 6,
139
+ "metadata": {},
140
+ "output_type": "execute_result"
141
+ }
142
+ ],
143
+ "source": [
144
+ "df.shape"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 8,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "df=df[['v1','v2']]"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 9,
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "data": {
163
+ "text/html": [
164
+ "<div>\n",
165
+ "<style scoped>\n",
166
+ " .dataframe tbody tr th:only-of-type {\n",
167
+ " vertical-align: middle;\n",
168
+ " }\n",
169
+ "\n",
170
+ " .dataframe tbody tr th {\n",
171
+ " vertical-align: top;\n",
172
+ " }\n",
173
+ "\n",
174
+ " .dataframe thead th {\n",
175
+ " text-align: right;\n",
176
+ " }\n",
177
+ "</style>\n",
178
+ "<table border=\"1\" class=\"dataframe\">\n",
179
+ " <thead>\n",
180
+ " <tr style=\"text-align: right;\">\n",
181
+ " <th></th>\n",
182
+ " <th>v1</th>\n",
183
+ " <th>v2</th>\n",
184
+ " </tr>\n",
185
+ " </thead>\n",
186
+ " <tbody>\n",
187
+ " <tr>\n",
188
+ " <th>0</th>\n",
189
+ " <td>ham</td>\n",
190
+ " <td>Go until jurong point, crazy.. Available only ...</td>\n",
191
+ " </tr>\n",
192
+ " <tr>\n",
193
+ " <th>1</th>\n",
194
+ " <td>ham</td>\n",
195
+ " <td>Ok lar... Joking wif u oni...</td>\n",
196
+ " </tr>\n",
197
+ " <tr>\n",
198
+ " <th>2</th>\n",
199
+ " <td>spam</td>\n",
200
+ " <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>3</th>\n",
204
+ " <td>ham</td>\n",
205
+ " <td>U dun say so early hor... U c already then say...</td>\n",
206
+ " </tr>\n",
207
+ " <tr>\n",
208
+ " <th>4</th>\n",
209
+ " <td>ham</td>\n",
210
+ " <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>...</th>\n",
214
+ " <td>...</td>\n",
215
+ " <td>...</td>\n",
216
+ " </tr>\n",
217
+ " <tr>\n",
218
+ " <th>5567</th>\n",
219
+ " <td>spam</td>\n",
220
+ " <td>This is the 2nd time we have tried 2 contact u...</td>\n",
221
+ " </tr>\n",
222
+ " <tr>\n",
223
+ " <th>5568</th>\n",
224
+ " <td>ham</td>\n",
225
+ " <td>Will Ì_ b going to esplanade fr home?</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>5569</th>\n",
229
+ " <td>ham</td>\n",
230
+ " <td>Pity, * was in mood for that. So...any other s...</td>\n",
231
+ " </tr>\n",
232
+ " <tr>\n",
233
+ " <th>5570</th>\n",
234
+ " <td>ham</td>\n",
235
+ " <td>The guy did some bitching but I acted like i'd...</td>\n",
236
+ " </tr>\n",
237
+ " <tr>\n",
238
+ " <th>5571</th>\n",
239
+ " <td>ham</td>\n",
240
+ " <td>Rofl. Its true to its name</td>\n",
241
+ " </tr>\n",
242
+ " </tbody>\n",
243
+ "</table>\n",
244
+ "<p>5572 rows × 2 columns</p>\n",
245
+ "</div>"
246
+ ],
247
+ "text/plain": [
248
+ " v1 v2\n",
249
+ "0 ham Go until jurong point, crazy.. Available only ...\n",
250
+ "1 ham Ok lar... Joking wif u oni...\n",
251
+ "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
252
+ "3 ham U dun say so early hor... U c already then say...\n",
253
+ "4 ham Nah I don't think he goes to usf, he lives aro...\n",
254
+ "... ... ...\n",
255
+ "5567 spam This is the 2nd time we have tried 2 contact u...\n",
256
+ "5568 ham Will Ì_ b going to esplanade fr home?\n",
257
+ "5569 ham Pity, * was in mood for that. So...any other s...\n",
258
+ "5570 ham The guy did some bitching but I acted like i'd...\n",
259
+ "5571 ham Rofl. Its true to its name\n",
260
+ "\n",
261
+ "[5572 rows x 2 columns]"
262
+ ]
263
+ },
264
+ "execution_count": 9,
265
+ "metadata": {},
266
+ "output_type": "execute_result"
267
+ }
268
+ ],
269
+ "source": [
270
+ "df"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 13,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})\n"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 14,
285
+ "metadata": {},
286
+ "outputs": [
287
+ {
288
+ "data": {
289
+ "text/plain": [
290
+ "<Axes: xlabel='v1', ylabel='count'>"
291
+ ]
292
+ },
293
+ "execution_count": 14,
294
+ "metadata": {},
295
+ "output_type": "execute_result"
296
+ },
297
+ {
298
+ "data": {
299
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAGxCAYAAACDV6ltAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy88F64QAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAjV0lEQVR4nO3dfXBU5d2H8e+akBgwORJIdl2NFMdIwSBtoxOWqYICEdqQOnZEGxrpiKCi0C1QKGWs0PFJCo6ANZUC1eILitNafKk0EluJLxDA1IyCSNWmNQxZEnXZAMYNhvP8YTnjsogYkpwN9/WZ2Rn33l8292Em5pqzZzce27ZtAQAAGOwMtzcAAADgNoIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPGS3d5AT3HkyBHt3btX6enp8ng8bm8HAACcBNu2deDAAfn9fp1xxpefByKITtLevXuVk5Pj9jYAAEAHNDQ06LzzzvvSx10NooULF2rRokUxa16vV6FQSNLnVbdo0SKtWrVK4XBYBQUF+t3vfqeLL77YmY9Go5ozZ46eeOIJtba2avTo0XrggQdiDjocDmvmzJl69tlnJUnFxcW6//77dfbZZ5/0XtPT0yV9/g+akZHR0UMGAADdqKWlRTk5Oc7v8S/j+hmiiy++WC+++KJzPykpyfnvJUuWaOnSpVqzZo0uuugi3X333Ro7dqx2797tHFgwGNRzzz2ndevWqV+/fpo9e7aKiopUW1vrPFdJSYn27NmjyspKSdK0adNUWlqq55577qT3efRlsoyMDIIIAIAe5qsud3E9iJKTk+Xz+eLWbdvW8uXLtWDBAl177bWSpIcfflher1ePP/64brnlFkUiET344IN69NFHNWbMGEnSY489ppycHL344ou6+uqrtWvXLlVWVqqmpkYFBQWSpNWrVysQCGj37t0aNGhQ9x0sAABISK6/y+zdd9+V3+/XwIEDdcMNN+jf//63JKm+vl6hUEiFhYXObGpqqkaOHKnNmzdLkmpra3X48OGYGb/fr7y8PGdmy5YtsizLiSFJGj58uCzLcmaOJxqNqqWlJeYGAABOT64GUUFBgR555BG98MILWr16tUKhkEaMGKGPPvrIuY7I6/XGfM0XrzEKhUJKSUlR3759TziTnZ0d972zs7OdmeMpLy+XZVnOjQuqAQA4fbkaROPHj9cPf/hDDR06VGPGjNHzzz8v6fOXxo469jU/27a/8nXAY2eON/9VzzN//nxFIhHn1tDQcFLHBAAAeh7XXzL7oj59+mjo0KF69913neuKjj2L09TU5Jw18vl8amtrUzgcPuHMvn374r5Xc3Nz3NmnL0pNTXUuoOZCagAATm8JFUTRaFS7du3SOeeco4EDB8rn86mqqsp5vK2tTdXV1RoxYoQkKT8/X7169YqZaWxs1I4dO5yZQCCgSCSibdu2OTNbt25VJBJxZgAAgNlcfZfZnDlzNGHCBJ1//vlqamrS3XffrZaWFk2ePFkej0fBYFBlZWXKzc1Vbm6uysrK1Lt3b5WUlEiSLMvSlClTNHv2bPXr10+ZmZmaM2eO8xKcJA0ePFjjxo3T1KlTtXLlSkmfv+2+qKiId5gBAABJLgfRnj179KMf/UgffvihsrKyNHz4cNXU1GjAgAGSpLlz56q1tVXTp093Pphx48aNMR+utGzZMiUnJ2vixInOBzOuWbMm5vOM1q5dq5kzZzrvRisuLlZFRUX3HiwAAEhYHtu2bbc30RO0tLTIsixFIhGuJwIAoIc42d/fCXUNEQAAgBsIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPFc/hwjx8n/+iNtbABJO7T03ur0FAKc5zhABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEA
ADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMF7CBFF5ebk8Ho+CwaCzZtu2Fi5cKL/fr7S0NI0aNUo7d+6M+bpoNKoZM2aof//+6tOnj4qLi7Vnz56YmXA4rNLSUlmWJcuyVFpaqv3793fDUQEAgJ4gIYJo+/btWrVqlS655JKY9SVLlmjp0qWqqKjQ9u3b5fP5NHbsWB04cMCZCQaDWr9+vdatW6dXX31VBw8eVFFRkdrb252ZkpIS1dXVqbKyUpWVlaqrq1NpaWm3HR8AAEhsrgfRwYMHNWnSJK1evVp9+/Z11m3b1vLly7VgwQJde+21ysvL08MPP6xPPvlEjz/+uCQpEonowQcf1L333qsxY8bo29/+th577DG99dZbevHFFyVJu3btUmVlpf7whz8oEAgoEAho9erV+utf/6rdu3e7cswAACCxuB5Et99+u77//e9rzJgxMev19fUKhUIqLCx01lJTUzVy5Eht3rxZklRbW6vDhw/HzPj9fuXl5TkzW7ZskWVZKigocGaGDx8uy7KcmeOJRqNqaWmJuQEAgNNTspvffN26dfrnP/+p7du3xz0WCoUkSV6vN2bd6/Xqv//9rzOTkpISc2bp6MzRrw+FQsrOzo57/uzsbGfmeMrLy7Vo0aKvd0AAAKBHcu0MUUNDg37605/qscce05lnnvmlcx6PJ+a+bdtxa8c6duZ481/1PPPnz1ckEnFuDQ0NJ/yeAACg53ItiGpra9XU1KT8/HwlJycrOTlZ1dXV+u1vf6vk5GTnzNCxZ3Gampqcx3w+n9ra2hQOh084s2/fvrjv39zcHHf26YtSU1OVkZERcwMAAKcn14Jo9OjReuutt1RXV+fcLr30Uk2aNEl1dXW64IIL5PP5VFVV5XxNW1ubqqurNWLECElSfn6+evXqFTPT2NioHTt2ODOBQECRSETbtm1zZrZu3apIJOLMAAAAs7l2DVF6erry8vJi1vr06aN+/fo568FgUGVlZcrNzVVubq7KysrUu3dvlZSUSJIsy9KUKVM0e/Zs9evXT5mZmZozZ46GDh3qXKQ9ePBgjRs3TlOnTtXKlSslSdOmTVNRUZEGDRrUjUcMAAASlasXVX+VuXPnqrW1VdOnT1c4HFZBQYE2btyo9PR0Z2bZsmVKTk7WxIkT1draqtGjR2vNmjVKSkpyZtauXauZM2c670YrLi5WRUVFtx8PAABITB7btm23N9ETtLS0yLIsRSKRLr2eKP/nj3TZcwM9Ve09N7q9BQA91Mn+/nb9c4gAAADcRhABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADA
eAQRAAAwHkEEAACMRxABAADjuRpEK1as0CWXXKKMjAxlZGQoEAjob3/7m/O4bdtauHCh/H6/0tLSNGrUKO3cuTPmOaLRqGbMmKH+/furT58+Ki4u1p49e2JmwuGwSktLZVmWLMtSaWmp9u/f3x2HCAAAegBXg+i8887Tb37zG73++ut6/fXXddVVV+kHP/iBEz1LlizR0qVLVVFRoe3bt8vn82ns2LE6cOCA8xzBYFDr16/XunXr9Oqrr+rgwYMqKipSe3u7M1NSUqK6ujpVVlaqsrJSdXV1Ki0t7fbjBQAAiclj27bt9ia+KDMzU/fcc49uuukm+f1+BYNBzZs3T9LnZ4O8Xq8WL16sW265RZFIRFlZWXr00Ud1/fXXS5L27t2rnJwcbdiwQVdffbV27dqlIUOGqKamRgUFBZKkmpoaBQIBvfPOOxo0aNBJ7aulpUWWZSkSiSgjI6NrDl5S/s8f6bLnBnqq2ntudHsLAHqok/39nTDXELW3t2vdunU6dOiQAoGA6uvrFQqFVFhY6MykpqZq5MiR2rx5sySptrZWhw8fjpnx+/3Ky8tzZrZs2SLLspwYkqThw4fLsixn5nii0ahaWlpibgAA4PTkehC99dZbOuuss5Samqpbb71V69ev15AhQxQKhSRJXq83Zt7r9TqPhUIhpaSkqG/fviecyc7Ojvu+2dnZzszxlJeXO9ccWZalnJycUzpOAACQuFwPokGDBqmurk41NTW67bbbNHnyZL399tvO4x6PJ2betu24tWMdO3O8+a96nvnz5ysSiTi3hoaGkz0kAADQw7geRCkpKbrwwgt16aWXqry8XMOGDdN9990nn88nSXFncZqampyzRj6fT21tbQqHwyec2bdvX9z3bW5ujjv79EWpqanOu9+O3gAAwOnJ9SA6lm3bikajGjhwoHw+n6qqqpzH2traVF1drREjRkiS8vPz1atXr5iZxsZG7dixw5kJBAKKRCLatm2bM7N161ZFIhFnBgAAmC3ZzW/+y1/+UuPHj1dOTo4OHDigdevWadOmTaqsrJTH41EwGFRZWZlyc3OVm5ursrIy9e7dWyUlJZIky7I0ZcoUzZ49W/369VNmZqbmzJmjoUOHasyYMZKkwYMHa9y4cZo6dapWrlwpSZo2bZqKiopO+h1mAADg9OZqEO3bt0+lpaVqbGyUZVm65JJLVFlZqbFjx0qS5s6dq9bWVk2fPl3hcFgFBQXauHGj0tPTnedYtmyZkpOTNXHiRLW2tmr06NFas2aNkpKSnJm1a9dq5syZzrvRiouLVVFR0b0HCwAAElbCfQ5RouJziAD38DlEADqqx30OEQAAgFsIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGC8DgXRVVddpf3798ett7S06KqrrjrVPQEAAHSrDgXRpk2b1NbWFrf+6aef6pVXXjnlTQEAAHSnr/XX7t98803nv99++22FQiHnfnt7uyorK3Xuued23u4AAAC6wdcKom9961vyeDzyeDzHfWksLS1N999/f6dtDgAAoDt8rSCqr6+Xbdu64IILtG3bNmVlZTmPpaSkKDs7W0lJSZ2+SQAAgK70tYJowIABkqQjR450yWYAAADc8LWC6Iv+9a9/adOmTWpqaooLpF/96lenvDEAAIDu0qEgWr16tW677Tb1799fPp9PHo/Heczj8RBEAACgR+lQEN199936v//7P82bN6+z9wMAANDtOvQ5ROFwWNddd11n7wUAAMAVHQqi6667Ths3buzsvQAAALiiQy+ZXXjhhbrzzjtVU1OjoUOHqlevXjGPz5w5s1M2BwAA0B06FESrVq3SWWedperqalVXV8c85vF4CCIAANCjdCiI6uvrO3sfAAAArunQNUQAAACnkw6dIbrppptO+PhDDz3Uoc0AAAC4oUNBFA6HY+4fPnxYO3bs0P79+4/7R18BAAASWYeCaP369XFr
R44c0fTp03XBBRec8qYAAAC6U6ddQ3TGGWfoZz/7mZYtW9ZZTwkAANAtOvWi6vfff1+fffZZZz4lAABAl+vQS2azZs2KuW/bthobG/X8889r8uTJnbIxAACA7tKhIHrjjTdi7p9xxhnKysrSvffe+5XvQAMAAEg0HQqil156qbP3AQAA4JoOBdFRzc3N2r17tzwejy666CJlZWV11r4AAAC6TYcuqj506JBuuukmnXPOObriiit0+eWXy+/3a8qUKfrkk086e48AAABdqkNBNGvWLFVXV+u5557T/v37tX//fj3zzDOqrq7W7NmzO3uPAAAAXapDL5k99dRT+vOf/6xRo0Y5a9/73veUlpamiRMnasWKFZ21PwAAgC7XoTNEn3zyibxeb9x6dnY2L5kBAIAep0NBFAgEdNddd+nTTz911lpbW7Vo0SIFAoFO2xwAAEB36NBLZsuXL9f48eN13nnnadiwYfJ4PKqrq1Nqaqo2btzY2XsEAADoUh0KoqFDh+rdd9/VY489pnfeeUe2beuGG27QpEmTlJaW1tl7BAAA6FIdCqLy8nJ5vV5NnTo1Zv2hhx5Sc3Oz5s2b1ymbAwAA6A4duoZo5cqV+uY3vxm3fvHFF+v3v//9KW8KAACgO3UoiEKhkM4555y49aysLDU2Np7ypgAAALpTh4IoJydHr732Wtz6a6+9Jr/ff8qbAgAA6E4duobo5ptvVjAY1OHDh3XVVVdJkv7+979r7ty5fFI1AADocToURHPnztXHH3+s6dOnq62tTZJ05plnat68eZo/f36nbhAAAKCrdSiIPB6PFi9erDvvvFO7du1SWlqacnNzlZqa2tn7AwAA6HIdCqKjzjrrLF122WWdtRcAAABXdOiiagAAgNMJQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADCeq0FUXl6uyy67TOnp6crOztY111yj3bt3x8zYtq2FCxfK7/crLS1No0aN0s6dO2NmotGoZsyYof79+6tPnz4qLi7Wnj17YmbC4bBKS0tlWZYsy1Jpaan279/f1YcIAAB6AFeDqLq6WrfffrtqampUVVWlzz77TIWFhTp06JAzs2TJEi1dulQVFRXavn27fD6fxo4dqwMHDjgzwWBQ69ev17p16/Tqq6/q4MGDKioqUnt7uzNTUlKiuro6VVZWqrKyUnV1dSotLe3W4wUAAInJY9u27fYmjmpublZ2draqq6t1xRVXyLZt+f1+BYNBzZs3T9LnZ4O8Xq8WL16sW265RZFIRFlZWXr00Ud1/fXXS5L27t2rnJwcbdiwQVdffbV27dqlIUOGqKamRgUFBZKkmpoaBQIBvfPOOxo0aNBX7q2lpUWWZSkSiSgjI6PL/g3yf/5Ilz030FPV3nOj21sA0EOd7O/vhLqGKBKJSJIyMzMlSfX19QqFQiosLHRmUlNTNXLkSG3evFmSVFtbq8OHD8fM+P1+5eXlOTNbtmyRZVlODEnS8OHDZVmWM3OsaDSqlpaWmBsAADg9JUwQ2batWbNm6bvf/a7y8vIkSaFQSJLk9XpjZr1er/NYKBRSSkqK+vbte8KZ7OzsuO+ZnZ3tzByrvLzcud7Isizl5OSc2gECAICElTBBdMcdd+jNN9/UE088EfeYx+OJuW/bdtzasY6dOd78iZ5n/vz5ikQizq2hoeFkDgMAAPRACRFEM2bM0LPPPquXXnpJ5513nrPu8/kkKe4sTlNTk3PWyOfzqa2tTeFw+IQz+/bti/u+zc3NcWefjkpNTVVGRkbMDQAAnJ5cDSLbtnXHHXfoL3/5i/7xj39o4MCBMY8PHDhQPp9PVVVVzlpbW5uqq6s1YsQISVJ+fr569eoVM9PY2KgdO3Y4M4FAQJFIRNu2bXNmtm7dqkgk4swAAABzJbv5zW+//XY9/vjjeuaZZ5Senu6cCbIsS2lpafJ4PAoG
gyorK1Nubq5yc3NVVlam3r17q6SkxJmdMmWKZs+erX79+ikzM1Nz5szR0KFDNWbMGEnS4MGDNW7cOE2dOlUrV66UJE2bNk1FRUUn9Q4zAABwenM1iFasWCFJGjVqVMz6H//4R/3kJz+RJM2dO1etra2aPn26wuGwCgoKtHHjRqWnpzvzy5YtU3JysiZOnKjW1laNHj1aa9asUVJSkjOzdu1azZw503k3WnFxsSoqKrr2AAEAQI+QUJ9DlMj4HCLAPXwOEYCO6pGfQwQAAOAGgggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABjP1SB6+eWXNWHCBPn9fnk8Hj399NMxj9u2rYULF8rv9ystLU2jRo3Szp07Y2ai0ahmzJih/v37q0+fPiouLtaePXtiZsLhsEpLS2VZlizLUmlpqfbv39/FRwcAAHoKV4Po0KFDGjZsmCoqKo77+JIlS7R06VJVVFRo+/bt8vl8Gjt2rA4cOODMBINBrV+/XuvWrdOrr76qgwcPqqioSO3t7c5MSUmJ6urqVFlZqcrKStXV1am0tLTLjw8AAPQMHtu2bbc3IUkej0fr16/XNddcI+nzs0N+v1/BYFDz5s2T9PnZIK/Xq8WLF+uWW25RJBJRVlaWHn30UV1//fWSpL179yonJ0cbNmzQ1VdfrV27dmnIkCGqqalRQUGBJKmmpkaBQEDvvPOOBg0adFL7a2lpkWVZikQiysjI6Px/gP/J//kjXfbcQE9Ve8+Nbm8BQA91sr+/E/Yaovr6eoVCIRUWFjprqampGjlypDZv3ixJqq2t1eHDh2Nm/H6/8vLynJktW7bIsiwnhiRp+PDhsizLmTmeaDSqlpaWmBsAADg9JWwQhUIhSZLX641Z93q9zmOhUEgpKSnq27fvCWeys7Pjnj87O9uZOZ7y8nLnmiPLspSTk3NKxwMAABJXwgbRUR6PJ+a+bdtxa8c6duZ481/1PPPnz1ckEnFuDQ0NX3PnAACgp0jYIPL5fJIUdxanqanJOWvk8/nU1tamcDh8wpl9+/bFPX9zc3Pc2acvSk1NVUZGRswNAACcnhI2iAYOHCifz6eqqipnra2tTdXV1RoxYoQkKT8/X7169YqZaWxs1I4dO5yZQCCgSCSibdu2OTNbt25VJBJxZgAAgNmS3fzmBw8e1Hvvvefcr6+vV11dnTIzM3X++ecrGAyqrKxMubm5ys3NVVlZmXr37q2SkhJJkmVZmjJlimbPnq1+/fopMzNTc+bM0dChQzVmzBhJ0uDBgzVu3DhNnTpVK1eulCRNmzZNRUVFJ/0OMwAAcHpzNYhef/11XXnllc79WbNmSZImT56sNWvWaO7cuWptbdX06dMVDodVUFCgjRs3Kj093fmaZcuWKTk5WRMnTlRra6tGjx6tNWvWKCkpyZlZu3atZs6c6bwbrbi4+Es/+wgAAJgnYT6HKNHxOUSAe/gcIgAd1eM/hwgAAKC7EEQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwnqt/3BUATMLfKgTi
JcrfKuQMEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjEcQAQAA4xFEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjGdUED3wwAMaOHCgzjzzTOXn5+uVV15xe0sAACABGBNETz75pILBoBYsWKA33nhDl19+ucaPH68PPvjA7a0BAACXGRNES5cu1ZQpU3TzzTdr8ODBWr58uXJycrRixQq3twYAAFyW7PYGukNbW5tqa2v1i1/8Ima9sLBQmzdvPu7XRKNRRaNR534kEpEktbS0dN1GJbVHW7v0+YGeqKt/7roLP99AvK7++T76/LZtn3DOiCD68MMP1d7eLq/XG7Pu9XoVCoWO+zXl5eVatGhR3HpOTk6X7BHAl7Puv9XtLQDoIt31833gwAFZlvWljxsRREd5PJ6Y+7Ztx60dNX/+fM2aNcu5f+TIEX388cfq16/fl34NTh8tLS3KyclRQ0ODMjIy3N4OgE7Ez7dZbNvWgQMH5Pf7TzhnRBD1799fSUlJcWeDmpqa4s4aHZWamqrU1NSYtbPPPrurtogElZGRwf8wgdMUP9/mONGZoaOMuKg6JSVF+fn5qqqqilmvqqrSiBEjXNoVAABIFEacIZKkWbNmqbS0VJdeeqkCgYBWrVqlDz74QLfeyrUJAACYzpgguv766/XRRx/p17/+tRobG5WXl6cNGzZowIABbm8NCSg1NVV33XVX3MumAHo+fr5xPB77q96HBgAAcJoz4hoiAACAEyGIAACA8QgiAABgPIIIAAAYjyACjvHAAw9o4MCBOvPMM5Wfn69XXnnF7S0B6AQvv/yyJkyYIL/fL4/Ho6efftrtLSGBEETAFzz55JMKBoNasGCB3njjDV1++eUaP368PvjgA7e3BuAUHTp0SMOGDVNFRYXbW0EC4m33wBcUFBToO9/5jlasWOGsDR48WNdcc43Ky8td3BmAzuTxeLR+/Xpdc801bm8FCYIzRMD/tLW1qba2VoWFhTHrhYWF2rx5s0u7AgB0B4II+J8PP/xQ7e3tcX/w1+v1xv1hYADA6YUgAo7h8Xhi7tu2HbcGADi9EETA//Tv319JSUlxZ4OamprizhoBAE4vBBHwPykpKcrPz1dVVVXMelVVlUaMGOHSrgAA3cGYv3YPnIxZs2aptLRUl156qQKBgFatWqUPPvhAt956q9tbA3CKDh48qPfee8+5X19fr7q6OmVmZur88893cWdIBLztHjjGAw88oCVLlqixsVF5eXlatmyZrrjiCre3BeAUbdq0SVdeeWXc+uTJk7VmzZru3xASCkEEAACMxzVEAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQBIamxsVElJiQYNGqQzzjhDwWDQ7S0B6EYEEQBIikajysrK0oIFCzRs2DC3twOgmxFEAIywcuVKnXvuuTpy5EjMenFxsSZPnqxvfOMbuu+++3TjjTfKsiyXdgnALQQRACNcd911+vDDD/XSSy85a+FwWC+88IImTZrk4s4AJAKCCIARMjMzNW7cOD3++OPO2p/+9CdlZmZq9OjRLu4MQCIgiAAYY9KkSXrqqacUjUYlSWvXrtUNN9ygpKQkl3cGwG0EEQBjTJgwQUeOHNHzzz+vhoYGvfLKK/rxj3/s9rYAJIBktzcAAN0lLS1N1157
rdauXav33ntPF110kfLz893eFoAEQBABMMqkSZM0YcIE7dy5M+7sUF1dnSTp4MGDam5uVl1dnVJSUjRkyBAXdgqgO3ls27bd3gQAdJf29nbl5OSosbFR77//vi644ALnMY/HEzc/YMAA/ec//+nGHQJwA0EEAACMx0XVAADAeAQRAAAwHkEEAACMRxABAADjEUQAAMB4BBEAADAeQQQAAIxHEAEAAOMRRAAAwHgEEQAAMB5BBAAAjPf/aqPUYg6cnwwAAAAASUVORK5CYII=",
300
+ "text/plain": [
301
+ "<Figure size 640x480 with 1 Axes>"
302
+ ]
303
+ },
304
+ "metadata": {},
305
+ "output_type": "display_data"
306
+ }
307
+ ],
308
+ "source": [
309
+ "sns.countplot(x=df['v1'])"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 15,
315
+ "metadata": {},
316
+ "outputs": [
317
+ {
318
+ "data": {
319
+ "text/plain": [
320
+ "v1 0\n",
321
+ "v2 0\n",
322
+ "dtype: int64"
323
+ ]
324
+ },
325
+ "execution_count": 15,
326
+ "metadata": {},
327
+ "output_type": "execute_result"
328
+ }
329
+ ],
330
+ "source": [
331
+ "df.isnull().sum()"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": 16,
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "X=df['v2']\n",
341
+ "y=df['v1']"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 17,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "from sklearn.model_selection import train_test_split\n",
351
+ "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 18,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "import tensorflow as tf\n",
361
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
362
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
363
+ "tokenizer = Tokenizer(oov_token=\"<OOV>\")\n",
364
+ "tokenizer.fit_on_texts(X_train)\n"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 19,
370
+ "metadata": {},
371
+ "outputs": [
372
+ {
373
+ "data": {
374
+ "text/plain": [
375
+ "7466"
376
+ ]
377
+ },
378
+ "execution_count": 19,
379
+ "metadata": {},
380
+ "output_type": "execute_result"
381
+ }
382
+ ],
383
+ "source": [
384
+ "len(tokenizer.word_index)+1"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 20,
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "voc_size=7466"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 21,
399
+ "metadata": {},
400
+ "outputs": [],
401
+ "source": [
402
+ "# Convert text to sequences of integers\n",
403
+ "X_train = tokenizer.texts_to_sequences(X_train)\n",
404
+ "X_test = tokenizer.texts_to_sequences(X_test)\n"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": 22,
410
+ "metadata": {},
411
+ "outputs": [
412
+ {
413
+ "name": "stdout",
414
+ "output_type": "stream",
415
+ "text": [
416
+ "Max Length: 189\n",
417
+ "Min Length: 0\n",
418
+ "Average Length: 15.866923076923078\n"
419
+ ]
420
+ }
421
+ ],
422
+ "source": [
423
+ "import numpy as np\n",
424
+ "essay_lengths = [len(essay) for essay in X_train]\n",
425
+ "print(f\"Max Length: {max(essay_lengths)}\")\n",
426
+ "print(f\"Min Length: {min(essay_lengths)}\")\n",
427
+ "print(f\"Average Length: {np.mean(essay_lengths)}\")\n"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 24,
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": [
436
+ "max_length = 50 # Set max length (adjust based on your data)\n",
437
+ "X_train = pad_sequences(X_train, maxlen=max_length, padding='post')\n",
438
+ "X_test = pad_sequences(X_test, maxlen=max_length, padding='post')\n"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "code",
443
+ "execution_count": 25,
444
+ "metadata": {},
445
+ "outputs": [
446
+ {
447
+ "name": "stderr",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "c:\\Users\\saipr\\anaconda3\\keras\\src\\layers\\core\\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
451
+ " warnings.warn(\n"
452
+ ]
453
+ }
454
+ ],
455
+ "source": [
456
+ "from tensorflow.keras.layers import Embedding,LSTM,GRU,SimpleRNN,Embedding,Dense\n",
457
+ "from tensorflow.keras.models import Sequential\n",
458
+ "from tensorflow.keras.regularizers import l2\n",
459
+ "\n",
460
+ "model=Sequential()\n",
461
+ "model.add(Embedding(input_dim=voc_size, output_dim=128, input_length=600))\n",
462
+ "model.add(LSTM(84, activation='tanh', kernel_regularizer=l2(0.005), return_sequences=True))\n",
463
+ "model.add(LSTM(64,activation='tanh',kernel_regularizer=l2(0.005)))\n",
464
+ "model.add(Dense(6, activation='softmax')) # Output layer: 6-unit softmax; only classes 0 (ham) and 1 (spam) occur in the labels\n"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 26,
470
+ "metadata": {},
471
+ "outputs": [],
472
+ "source": [
473
+ "model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": 27,
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": [
482
+ "from keras.callbacks import EarlyStopping\n",
483
+ "\n",
484
+ "# Define early stopping\n",
485
+ "early_stopping = EarlyStopping(\n",
486
+ " monitor='val_loss', # Metric to monitor\n",
487
+ " patience=3, # Number of epochs with no improvement after which training will stop\n",
488
+ " restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored metric\n",
489
+ ")\n"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 28,
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "Epoch 1/10\n",
502
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 43ms/step - accuracy: 0.8430 - loss: 1.5452 - val_accuracy: 0.8684 - val_loss: 0.5334\n",
503
+ "Epoch 2/10\n",
504
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 40ms/step - accuracy: 0.8625 - loss: 0.5024 - val_accuracy: 0.8684 - val_loss: 0.4096\n",
505
+ "Epoch 3/10\n",
506
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 40ms/step - accuracy: 0.8531 - loss: 0.4347 - val_accuracy: 0.8989 - val_loss: 0.4201\n",
507
+ "Epoch 4/10\n",
508
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 39ms/step - accuracy: 0.9168 - loss: 0.3396 - val_accuracy: 0.9671 - val_loss: 0.1950\n",
509
+ "Epoch 5/10\n",
510
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 40ms/step - accuracy: 0.9804 - loss: 0.1364 - val_accuracy: 0.9779 - val_loss: 0.1315\n",
511
+ "Epoch 6/10\n",
512
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 39ms/step - accuracy: 0.9882 - loss: 0.0891 - val_accuracy: 0.9785 - val_loss: 0.1169\n",
513
+ "Epoch 7/10\n",
514
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 40ms/step - accuracy: 0.9161 - loss: 0.3179 - val_accuracy: 0.8684 - val_loss: 0.4311\n",
515
+ "Epoch 8/10\n",
516
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 39ms/step - accuracy: 0.8766 - loss: 0.4133 - val_accuracy: 0.9348 - val_loss: 0.3198\n",
517
+ "Epoch 9/10\n",
518
+ "\u001b[1m122/122\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 39ms/step - accuracy: 0.9573 - loss: 0.2205 - val_accuracy: 0.9611 - val_loss: 0.1988\n"
519
+ ]
520
+ },
521
+ {
522
+ "data": {
523
+ "text/plain": [
524
+ "<keras.src.callbacks.history.History at 0x1f4af1560e0>"
525
+ ]
526
+ },
527
+ "execution_count": 28,
528
+ "metadata": {},
529
+ "output_type": "execute_result"
530
+ }
531
+ ],
532
+ "source": [
533
+ "model.fit(X_train,y_train,epochs=10,validation_data=(X_test,y_test),callbacks=[early_stopping])"
534
+ ]
535
+ },
536
+ {
537
+ "cell_type": "code",
538
+ "execution_count": 29,
539
+ "metadata": {},
540
+ "outputs": [
541
+ {
542
+ "name": "stdout",
543
+ "output_type": "stream",
544
+ "text": [
545
+ "\u001b[1m53/53\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 24ms/step - accuracy: 0.9814 - loss: 0.1061\n",
546
+ "Test Loss: 0.11691754311323166\n",
547
+ "Test Accuracy: 0.9784688949584961\n"
548
+ ]
549
+ }
550
+ ],
551
+ "source": [
552
+ "# Evaluate the model on the test data\n",
553
+ "test_loss, test_accuracy = model.evaluate(X_test, y_test)\n",
554
+ "\n",
555
+ "# Print the results\n",
556
+ "print(f'Test Loss: {test_loss}')\n",
557
+ "print(f'Test Accuracy: {test_accuracy}')\n"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 55,
563
+ "metadata": {},
564
+ "outputs": [
565
+ {
566
+ "name": "stderr",
567
+ "output_type": "stream",
568
+ "text": [
569
+ "WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
570
+ ]
571
+ }
572
+ ],
573
+ "source": [
574
+ "# Save the trained model\n",
575
+ "model.save('model.h5')\n",
576
+ "\n",
577
+ "# Save the tokenizer\n",
578
+ "import pickle\n",
579
+ "with open('tokenizer.pkl', 'wb') as f:\n",
580
+ " pickle.dump(tokenizer, f)\n"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": 58,
586
+ "metadata": {},
587
+ "outputs": [],
588
+ "source": [
589
+ "from tensorflow.keras.models import load_model\n",
590
+ "import pickle\n",
591
+ "\n",
592
+ "# Function to load the model and tokenizer\n",
593
+ "def load_model_and_tokenizer(model_path='model.h5', tokenizer_path='tokenizer.pkl'):\n",
594
+ " model = load_model(model_path)\n",
595
+ " with open(tokenizer_path, 'rb') as f:\n",
596
+ " tokenizer = pickle.load(f)\n",
597
+ " return model, tokenizer\n"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": 61,
603
+ "metadata": {},
604
+ "outputs": [
605
+ {
606
+ "name": "stderr",
607
+ "output_type": "stream",
608
+ "text": [
609
+ "WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n"
610
+ ]
611
+ },
612
+ {
613
+ "name": "stdout",
614
+ "output_type": "stream",
615
+ "text": [
616
+ "\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 243ms/step\n",
617
+ "Text: Congrats! You have won a free ticket to the concert!\n",
618
+ "Predicted Label: ham\n",
619
+ "Prediction Confidence: 0.0243\n",
620
+ "\n",
621
+ "Text: Hey, let's grab coffee tomorrow. What time works for you?\n",
622
+ "Predicted Label: spam\n",
623
+ "Prediction Confidence: 0.9839\n",
624
+ "\n",
625
+ "Text: You have an important meeting with the CEO tomorrow!\n",
626
+ "Predicted Label: spam\n",
627
+ "Prediction Confidence: 0.9839\n",
628
+ "\n",
629
+ "Text: Hey, just checking in. How are you doing?\n",
630
+ "Predicted Label: spam\n",
631
+ "Prediction Confidence: 0.9839\n",
632
+ "\n"
633
+ ]
634
+ }
635
+ ],
636
+ "source": [
637
+ "# Function to make predictions and classify as \"spam\" or \"ham\"\n",
638
+ "def predict_text(model, tokenizer, sample_texts, maxlen=50):\n",
639
+ " # Preprocess the input text (tokenize and pad sequences)\n",
640
+ " X_predict = tokenizer.texts_to_sequences(sample_texts)\n",
641
+ " X_predict = pad_sequences(X_predict, maxlen=maxlen, padding='post')\n",
642
+ "\n",
643
+ " # Make predictions using the trained model\n",
644
+ " predictions = model.predict(X_predict)\n",
645
+ "\n",
646
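+ "    # Classify each text as 'spam' or 'ham' from the first model output, pred[0]. NOTE(review): the recorded outputs label a prize message 'ham' and everyday messages 'spam', and a later cell decodes y_pred with np.argmax(axis=1); confirm pred[0] is the spam score and not the ham score.\n",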
647
+ " results = []\n",
648
+ " for text, pred in zip(sample_texts, predictions):\n",
649
+ " label = \"spam\" if pred[0] > 0.5 else \"ham\" # Threshold of 0.5 for binary classification\n",
650
+ " results.append({\n",
651
+ " \"Text\": text,\n",
652
+ " \"Predicted Label\": label,\n",
653
+ " \"Prediction Confidence\": f\"{pred[0]:.4f}\"\n",
654
+ " })\n",
655
+ " return results\n",
656
+ "\n",
657
+ "# Example usage\n",
658
+ "sample_texts = [\n",
659
+ " \"Congrats! You have won a free ticket to the concert!\",\n",
660
+ " \"Hey, let's grab coffee tomorrow. What time works for you?\",\n",
661
+ " \"You have an important meeting with the CEO tomorrow!\",\n",
662
+ " \"Hey, just checking in. How are you doing?\"\n",
663
+ "]\n",
664
+ "\n",
665
+ "# Load the saved model and tokenizer\n",
666
+ "model, tokenizer = load_model_and_tokenizer()\n",
667
+ "\n",
668
+ "# Predict on sample texts\n",
669
+ "predictions = predict_text(model, tokenizer, sample_texts)\n",
670
+ "\n",
671
+ "# Print prediction results\n",
672
+ "for result in predictions:\n",
673
+ " print(f\"Text: {result['Text']}\")\n",
674
+ " print(f\"Predicted Label: {result['Predicted Label']}\")\n",
675
+ " print(f\"Prediction Confidence: {result['Prediction Confidence']}\\n\")\n",
676
+ "\n"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": 79,
682
+ "metadata": {},
683
+ "outputs": [
684
+ {
685
+ "name": "stdout",
686
+ "output_type": "stream",
687
+ "text": [
688
+ "\u001b[1m53/53\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 14ms/step\n"
689
+ ]
690
+ }
691
+ ],
692
+ "source": [
693
+ "from sklearn.metrics import confusion_matrix\n",
694
+ "import numpy as np\n",
695
+ "\n",
696
+ "# Assuming y_test is already in binary format (0 or 1)\n",
697
+ "# y_test needs no np.argmax; np.argmax is applied only to the model output y_pred (next cell)\n",
698
+ "\n",
699
+ "# Get predictions from the model (output will likely be probabilities)\n",
700
+ "y_pred = model.predict(X_test)\n",
701
+ "\n"
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "code",
706
+ "execution_count": 80,
707
+ "metadata": {},
708
+ "outputs": [],
709
+ "source": [
710
+ "# Convert the predicted probabilities to binary labels (0 or 1)\n",
711
+ "y_pred_labels = np.argmax(y_pred, axis=1)\n",
712
+ "    # np.argmax over axis=1 assumes y_pred has one score column per class\n"
713
+ "\n"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": 77,
719
+ "metadata": {},
720
+ "outputs": [
721
+ {
722
+ "data": {
723
+ "text/plain": [
724
+ "array([0, 0, 0, ..., 0, 0, 0], dtype=int64)"
725
+ ]
726
+ },
727
+ "execution_count": 77,
728
+ "metadata": {},
729
+ "output_type": "execute_result"
730
+ }
731
+ ],
732
+ "source": [
733
+ "y_pred_labels"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": 78,
739
+ "metadata": {},
740
+ "outputs": [
741
+ {
742
+ "data": {
743
+ "text/plain": [
744
+ "4014 1\n",
745
+ "1697 0\n",
746
+ "2374 0\n",
747
+ "2529 0\n",
748
+ "2794 0\n",
749
+ " ..\n",
750
+ "991 0\n",
751
+ "224 1\n",
752
+ "1233 0\n",
753
+ "1366 0\n",
754
+ "2627 0\n",
755
+ "Name: v1, Length: 1672, dtype: int32"
756
+ ]
757
+ },
758
+ "execution_count": 78,
759
+ "metadata": {},
760
+ "output_type": "execute_result"
761
+ }
762
+ ],
763
+ "source": [
764
+ "y_test"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "code",
769
+ "execution_count": 81,
770
+ "metadata": {},
771
+ "outputs": [
772
+ {
773
+ "name": "stdout",
774
+ "output_type": "stream",
775
+ "text": [
776
+ "Confusion Matrix:\n",
777
+ "[[1441 11]\n",
778
+ " [ 25 195]]\n"
779
+ ]
780
+ }
781
+ ],
782
+ "source": [
783
+ "# Compute the confusion matrix using the binary labels\n",
784
+ "cm = confusion_matrix(y_test, y_pred_labels)\n",
785
+ "\n",
786
+ "# Print the confusion matrix\n",
787
+ "print(\"Confusion Matrix:\")\n",
788
+ "print(cm)\n"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": 82,
794
+ "metadata": {},
795
+ "outputs": [
796
+ {
797
+ "name": "stdout",
798
+ "output_type": "stream",
799
+ "text": [
800
+ "TensorFlow version: 2.17.0\n",
801
+ "scikit-learn version: 1.5.1\n",
802
+ "Pandas version: 2.1.4\n",
803
+ "NumPy version: 1.23.5\n",
804
+ "Matplotlib version: 3.7.0\n",
805
+ "Seaborn version: 0.13.2\n",
806
+ "Streamlit version: 1.37.1\n"
807
+ ]
808
+ }
809
+ ],
810
+ "source": [
811
+ "# Check TensorFlow version\n",
812
+ "import tensorflow as tf\n",
813
+ "print(f\"TensorFlow version: {tf.__version__}\")\n",
814
+ "\n",
815
+ "# Check scikit-learn version\n",
816
+ "import sklearn\n",
817
+ "print(f\"scikit-learn version: {sklearn.__version__}\")\n",
818
+ "\n",
819
+ "# Check pandas version\n",
820
+ "import pandas as pd\n",
821
+ "print(f\"Pandas version: {pd.__version__}\")\n",
822
+ "\n",
823
+ "# Check NumPy version\n",
824
+ "import numpy as np\n",
825
+ "print(f\"NumPy version: {np.__version__}\")\n",
826
+ "\n",
827
+ "# Check Matplotlib version\n",
828
+ "import matplotlib\n",
829
+ "print(f\"Matplotlib version: {matplotlib.__version__}\")\n",
830
+ "\n",
831
+ "# Check Seaborn version\n",
832
+ "import seaborn as sns\n",
833
+ "print(f\"Seaborn version: {sns.__version__}\")\n",
834
+ "\n",
835
+ "# Check Streamlit version\n",
836
+ "import streamlit as st\n",
837
+ "print(f\"Streamlit version: {st.__version__}\")\n"
838
+ ]
839
+ },
840
+ {
841
+ "cell_type": "code",
842
+ "execution_count": null,
843
+ "metadata": {},
844
+ "outputs": [],
845
+ "source": []
846
+ }
847
+ ],
848
+ "metadata": {
849
+ "kernelspec": {
850
+ "display_name": "base",
851
+ "language": "python",
852
+ "name": "python3"
853
+ },
854
+ "language_info": {
855
+ "codemirror_mode": {
856
+ "name": "ipython",
857
+ "version": 3
858
+ },
859
+ "file_extension": ".py",
860
+ "mimetype": "text/x-python",
861
+ "name": "python",
862
+ "nbconvert_exporter": "python",
863
+ "pygments_lexer": "ipython3",
864
+ "version": "3.10.9"
865
+ }
866
+ },
867
+ "nbformat": 4,
868
+ "nbformat_minor": 2
869
+ }
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:359759b8182a943c97f4c24cd3673a262e82c79699f9a3b408d2913b974d4180
3
+ size 289219