sujoy0011 committed on
Commit
80dc37e
·
verified ·
1 Parent(s): 7ff8fd1

Upload 4 files

Browse files
Files changed (4) hide show
  1. Twitter_sentiment_Analysis.ipynb +1190 -0
  2. app.py +89 -0
  3. model.pkl +3 -0
  4. vectorizer.pkl +3 -0
Twitter_sentiment_Analysis.ipynb ADDED
@@ -0,0 +1,1190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import pandas as pd\n",
11
+ "import re\n",
12
+ "from nltk.corpus import stopwords\n",
13
+ "from nltk.stem.porter import PorterStemmer\n",
14
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
15
+ "from sklearn.model_selection import train_test_split\n",
16
+ "from sklearn.linear_model import LogisticRegression\n",
17
+ "from sklearn.metrics import accuracy_score\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "metadata": {},
24
+ "outputs": [
25
+ {
26
+ "name": "stderr",
27
+ "output_type": "stream",
28
+ "text": [
29
+ "[nltk_data] Downloading package stopwords to\n",
30
+ "[nltk_data] C:\\Users\\KIIT\\AppData\\Roaming\\nltk_data...\n",
31
+ "[nltk_data] Package stopwords is already up-to-date!\n"
32
+ ]
33
+ },
34
+ {
35
+ "data": {
36
+ "text/plain": [
37
+ "True"
38
+ ]
39
+ },
40
+ "execution_count": 2,
41
+ "metadata": {},
42
+ "output_type": "execute_result"
43
+ }
44
+ ],
45
+ "source": [
46
+ "import nltk\n",
47
+ "nltk.download('stopwords')"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stdout",
57
+ "output_type": "stream",
58
+ "text": [
59
+ "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n"
60
+ ]
61
+ }
62
+ ],
63
+ "source": [
64
+ "print(stopwords.words('english'))"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 5,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "dataset = pd.read_csv(\"training.1600000.processed.noemoticon.csv\" , encoding= 'ISO-8859-1')"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 6,
79
+ "metadata": {},
80
+ "outputs": [
81
+ {
82
+ "data": {
83
+ "text/html": [
84
+ "<div>\n",
85
+ "<style scoped>\n",
86
+ " .dataframe tbody tr th:only-of-type {\n",
87
+ " vertical-align: middle;\n",
88
+ " }\n",
89
+ "\n",
90
+ " .dataframe tbody tr th {\n",
91
+ " vertical-align: top;\n",
92
+ " }\n",
93
+ "\n",
94
+ " .dataframe thead th {\n",
95
+ " text-align: right;\n",
96
+ " }\n",
97
+ "</style>\n",
98
+ "<table border=\"1\" class=\"dataframe\">\n",
99
+ " <thead>\n",
100
+ " <tr style=\"text-align: right;\">\n",
101
+ " <th></th>\n",
102
+ " <th>0</th>\n",
103
+ " <th>1467810369</th>\n",
104
+ " <th>Mon Apr 06 22:19:45 PDT 2009</th>\n",
105
+ " <th>NO_QUERY</th>\n",
106
+ " <th>_TheSpecialOne_</th>\n",
107
+ " <th>@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D</th>\n",
108
+ " </tr>\n",
109
+ " </thead>\n",
110
+ " <tbody>\n",
111
+ " <tr>\n",
112
+ " <th>0</th>\n",
113
+ " <td>0</td>\n",
114
+ " <td>1467810672</td>\n",
115
+ " <td>Mon Apr 06 22:19:49 PDT 2009</td>\n",
116
+ " <td>NO_QUERY</td>\n",
117
+ " <td>scotthamilton</td>\n",
118
+ " <td>is upset that he can't update his Facebook by ...</td>\n",
119
+ " </tr>\n",
120
+ " <tr>\n",
121
+ " <th>1</th>\n",
122
+ " <td>0</td>\n",
123
+ " <td>1467810917</td>\n",
124
+ " <td>Mon Apr 06 22:19:53 PDT 2009</td>\n",
125
+ " <td>NO_QUERY</td>\n",
126
+ " <td>mattycus</td>\n",
127
+ " <td>@Kenichan I dived many times for the ball. Man...</td>\n",
128
+ " </tr>\n",
129
+ " <tr>\n",
130
+ " <th>2</th>\n",
131
+ " <td>0</td>\n",
132
+ " <td>1467811184</td>\n",
133
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
134
+ " <td>NO_QUERY</td>\n",
135
+ " <td>ElleCTF</td>\n",
136
+ " <td>my whole body feels itchy and like its on fire</td>\n",
137
+ " </tr>\n",
138
+ " <tr>\n",
139
+ " <th>3</th>\n",
140
+ " <td>0</td>\n",
141
+ " <td>1467811193</td>\n",
142
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
143
+ " <td>NO_QUERY</td>\n",
144
+ " <td>Karoli</td>\n",
145
+ " <td>@nationwideclass no, it's not behaving at all....</td>\n",
146
+ " </tr>\n",
147
+ " <tr>\n",
148
+ " <th>4</th>\n",
149
+ " <td>0</td>\n",
150
+ " <td>1467811372</td>\n",
151
+ " <td>Mon Apr 06 22:20:00 PDT 2009</td>\n",
152
+ " <td>NO_QUERY</td>\n",
153
+ " <td>joy_wolf</td>\n",
154
+ " <td>@Kwesidei not the whole crew</td>\n",
155
+ " </tr>\n",
156
+ " </tbody>\n",
157
+ "</table>\n",
158
+ "</div>"
159
+ ],
160
+ "text/plain": [
161
+ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n",
162
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
163
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
164
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
165
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
166
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
167
+ "\n",
168
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
169
+ "0 is upset that he can't update his Facebook by ... \n",
170
+ "1 @Kenichan I dived many times for the ball. Man... \n",
171
+ "2 my whole body feels itchy and like its on fire \n",
172
+ "3 @nationwideclass no, it's not behaving at all.... \n",
173
+ "4 @Kwesidei not the whole crew "
174
+ ]
175
+ },
176
+ "execution_count": 6,
177
+ "metadata": {},
178
+ "output_type": "execute_result"
179
+ }
180
+ ],
181
+ "source": [
182
+ "dataset.head()"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 8,
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "col_names = ['target' , 'id' , 'date' , 'flag' , 'user' , 'text']\n",
192
+ "dataset.columns = col_names"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 9,
198
+ "metadata": {},
199
+ "outputs": [
200
+ {
201
+ "data": {
202
+ "text/html": [
203
+ "<div>\n",
204
+ "<style scoped>\n",
205
+ " .dataframe tbody tr th:only-of-type {\n",
206
+ " vertical-align: middle;\n",
207
+ " }\n",
208
+ "\n",
209
+ " .dataframe tbody tr th {\n",
210
+ " vertical-align: top;\n",
211
+ " }\n",
212
+ "\n",
213
+ " .dataframe thead th {\n",
214
+ " text-align: right;\n",
215
+ " }\n",
216
+ "</style>\n",
217
+ "<table border=\"1\" class=\"dataframe\">\n",
218
+ " <thead>\n",
219
+ " <tr style=\"text-align: right;\">\n",
220
+ " <th></th>\n",
221
+ " <th>target</th>\n",
222
+ " <th>id</th>\n",
223
+ " <th>date</th>\n",
224
+ " <th>flag</th>\n",
225
+ " <th>user</th>\n",
226
+ " <th>text</th>\n",
227
+ " </tr>\n",
228
+ " </thead>\n",
229
+ " <tbody>\n",
230
+ " <tr>\n",
231
+ " <th>0</th>\n",
232
+ " <td>0</td>\n",
233
+ " <td>1467810672</td>\n",
234
+ " <td>Mon Apr 06 22:19:49 PDT 2009</td>\n",
235
+ " <td>NO_QUERY</td>\n",
236
+ " <td>scotthamilton</td>\n",
237
+ " <td>is upset that he can't update his Facebook by ...</td>\n",
238
+ " </tr>\n",
239
+ " <tr>\n",
240
+ " <th>1</th>\n",
241
+ " <td>0</td>\n",
242
+ " <td>1467810917</td>\n",
243
+ " <td>Mon Apr 06 22:19:53 PDT 2009</td>\n",
244
+ " <td>NO_QUERY</td>\n",
245
+ " <td>mattycus</td>\n",
246
+ " <td>@Kenichan I dived many times for the ball. Man...</td>\n",
247
+ " </tr>\n",
248
+ " <tr>\n",
249
+ " <th>2</th>\n",
250
+ " <td>0</td>\n",
251
+ " <td>1467811184</td>\n",
252
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
253
+ " <td>NO_QUERY</td>\n",
254
+ " <td>ElleCTF</td>\n",
255
+ " <td>my whole body feels itchy and like its on fire</td>\n",
256
+ " </tr>\n",
257
+ " <tr>\n",
258
+ " <th>3</th>\n",
259
+ " <td>0</td>\n",
260
+ " <td>1467811193</td>\n",
261
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
262
+ " <td>NO_QUERY</td>\n",
263
+ " <td>Karoli</td>\n",
264
+ " <td>@nationwideclass no, it's not behaving at all....</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>4</th>\n",
268
+ " <td>0</td>\n",
269
+ " <td>1467811372</td>\n",
270
+ " <td>Mon Apr 06 22:20:00 PDT 2009</td>\n",
271
+ " <td>NO_QUERY</td>\n",
272
+ " <td>joy_wolf</td>\n",
273
+ " <td>@Kwesidei not the whole crew</td>\n",
274
+ " </tr>\n",
275
+ " </tbody>\n",
276
+ "</table>\n",
277
+ "</div>"
278
+ ],
279
+ "text/plain": [
280
+ " target id date flag user \\\n",
281
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
282
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
283
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
284
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
285
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
286
+ "\n",
287
+ " text \n",
288
+ "0 is upset that he can't update his Facebook by ... \n",
289
+ "1 @Kenichan I dived many times for the ball. Man... \n",
290
+ "2 my whole body feels itchy and like its on fire \n",
291
+ "3 @nationwideclass no, it's not behaving at all.... \n",
292
+ "4 @Kwesidei not the whole crew "
293
+ ]
294
+ },
295
+ "execution_count": 9,
296
+ "metadata": {},
297
+ "output_type": "execute_result"
298
+ }
299
+ ],
300
+ "source": [
301
+ "dataset.head()"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": 10,
307
+ "metadata": {},
308
+ "outputs": [
309
+ {
310
+ "data": {
311
+ "text/plain": [
312
+ "(1599999, 6)"
313
+ ]
314
+ },
315
+ "execution_count": 10,
316
+ "metadata": {},
317
+ "output_type": "execute_result"
318
+ }
319
+ ],
320
+ "source": [
321
+ "dataset.shape"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "code",
326
+ "execution_count": 11,
327
+ "metadata": {},
328
+ "outputs": [
329
+ {
330
+ "data": {
331
+ "text/plain": [
332
+ "target 0\n",
333
+ "id 0\n",
334
+ "date 0\n",
335
+ "flag 0\n",
336
+ "user 0\n",
337
+ "text 0\n",
338
+ "dtype: int64"
339
+ ]
340
+ },
341
+ "execution_count": 11,
342
+ "metadata": {},
343
+ "output_type": "execute_result"
344
+ }
345
+ ],
346
+ "source": [
347
+ "# Count missing values in each column\n",
348
+ "dataset.isnull().sum()"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 12,
354
+ "metadata": {},
355
+ "outputs": [
356
+ {
357
+ "data": {
358
+ "text/plain": [
359
+ "target\n",
360
+ "4 800000\n",
361
+ "0 799999\n",
362
+ "Name: count, dtype: int64"
363
+ ]
364
+ },
365
+ "execution_count": 12,
366
+ "metadata": {},
367
+ "output_type": "execute_result"
368
+ }
369
+ ],
370
+ "source": [
371
+ "# Class distribution of the target labels\n",
372
+ "dataset['target'].value_counts()"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 13,
378
+ "metadata": {},
379
+ "outputs": [],
380
+ "source": [
381
+ "# Relabel targets: 0 (negative) stays 0 and 4 (positive) becomes 1\n",
382
+ "dataset['target'] = dataset['target'].map({0:0 , 4:1})"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 15,
388
+ "metadata": {},
389
+ "outputs": [
390
+ {
391
+ "data": {
392
+ "text/plain": [
393
+ "target\n",
394
+ "1 800000\n",
395
+ "0 799999\n",
396
+ "Name: count, dtype: int64"
397
+ ]
398
+ },
399
+ "execution_count": 15,
400
+ "metadata": {},
401
+ "output_type": "execute_result"
402
+ }
403
+ ],
404
+ "source": [
405
+ "dataset['target'].value_counts()"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 16,
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "# Stemming\n",
415
+ "\n",
416
+ "stremmer = PorterStemmer()\n",
417
+ "\n",
418
+ "def stemming(content):\n",
419
+ " stemmed_content = re.sub('[^a-zA-Z]',' ',content) # replace every non-letter character with a space\n",
420
+ " stemmed_content = stemmed_content.lower()\n",
421
+ " stemmed_content = stemmed_content.split()\n",
422
+ " stemmed_content = [stremmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]\n",
423
+ " stemmed_content = ' '.join(stemmed_content)\n",
424
+ " return stemmed_content"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 17,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "dataset['text'] = dataset['text'].apply(stemming)"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": 18,
439
+ "metadata": {},
440
+ "outputs": [
441
+ {
442
+ "data": {
443
+ "text/html": [
444
+ "<div>\n",
445
+ "<style scoped>\n",
446
+ " .dataframe tbody tr th:only-of-type {\n",
447
+ " vertical-align: middle;\n",
448
+ " }\n",
449
+ "\n",
450
+ " .dataframe tbody tr th {\n",
451
+ " vertical-align: top;\n",
452
+ " }\n",
453
+ "\n",
454
+ " .dataframe thead th {\n",
455
+ " text-align: right;\n",
456
+ " }\n",
457
+ "</style>\n",
458
+ "<table border=\"1\" class=\"dataframe\">\n",
459
+ " <thead>\n",
460
+ " <tr style=\"text-align: right;\">\n",
461
+ " <th></th>\n",
462
+ " <th>target</th>\n",
463
+ " <th>id</th>\n",
464
+ " <th>date</th>\n",
465
+ " <th>flag</th>\n",
466
+ " <th>user</th>\n",
467
+ " <th>text</th>\n",
468
+ " </tr>\n",
469
+ " </thead>\n",
470
+ " <tbody>\n",
471
+ " <tr>\n",
472
+ " <th>0</th>\n",
473
+ " <td>0</td>\n",
474
+ " <td>1467810672</td>\n",
475
+ " <td>Mon Apr 06 22:19:49 PDT 2009</td>\n",
476
+ " <td>NO_QUERY</td>\n",
477
+ " <td>scotthamilton</td>\n",
478
+ " <td>upset updat facebook text might cri result sch...</td>\n",
479
+ " </tr>\n",
480
+ " <tr>\n",
481
+ " <th>1</th>\n",
482
+ " <td>0</td>\n",
483
+ " <td>1467810917</td>\n",
484
+ " <td>Mon Apr 06 22:19:53 PDT 2009</td>\n",
485
+ " <td>NO_QUERY</td>\n",
486
+ " <td>mattycus</td>\n",
487
+ " <td>kenichan dive mani time ball manag save rest g...</td>\n",
488
+ " </tr>\n",
489
+ " <tr>\n",
490
+ " <th>2</th>\n",
491
+ " <td>0</td>\n",
492
+ " <td>1467811184</td>\n",
493
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
494
+ " <td>NO_QUERY</td>\n",
495
+ " <td>ElleCTF</td>\n",
496
+ " <td>whole bodi feel itchi like fire</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>3</th>\n",
500
+ " <td>0</td>\n",
501
+ " <td>1467811193</td>\n",
502
+ " <td>Mon Apr 06 22:19:57 PDT 2009</td>\n",
503
+ " <td>NO_QUERY</td>\n",
504
+ " <td>Karoli</td>\n",
505
+ " <td>nationwideclass behav mad see</td>\n",
506
+ " </tr>\n",
507
+ " <tr>\n",
508
+ " <th>4</th>\n",
509
+ " <td>0</td>\n",
510
+ " <td>1467811372</td>\n",
511
+ " <td>Mon Apr 06 22:20:00 PDT 2009</td>\n",
512
+ " <td>NO_QUERY</td>\n",
513
+ " <td>joy_wolf</td>\n",
514
+ " <td>kwesidei whole crew</td>\n",
515
+ " </tr>\n",
516
+ " </tbody>\n",
517
+ "</table>\n",
518
+ "</div>"
519
+ ],
520
+ "text/plain": [
521
+ " target id date flag user \\\n",
522
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
523
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
524
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n",
525
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n",
526
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n",
527
+ "\n",
528
+ " text \n",
529
+ "0 upset updat facebook text might cri result sch... \n",
530
+ "1 kenichan dive mani time ball manag save rest g... \n",
531
+ "2 whole bodi feel itchi like fire \n",
532
+ "3 nationwideclass behav mad see \n",
533
+ "4 kwesidei whole crew "
534
+ ]
535
+ },
536
+ "execution_count": 18,
537
+ "metadata": {},
538
+ "output_type": "execute_result"
539
+ }
540
+ ],
541
+ "source": [
542
+ "dataset.head()"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": 19,
548
+ "metadata": {},
549
+ "outputs": [],
550
+ "source": [
551
+ "x = dataset['text']\n",
552
+ "y = dataset['target']"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": 20,
558
+ "metadata": {},
559
+ "outputs": [],
560
+ "source": [
561
+ "# splitting the dataset\n",
562
+ "x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": 21,
568
+ "metadata": {},
569
+ "outputs": [],
570
+ "source": [
571
+ "# convert textual data to numerical data\n",
572
+ "vectorizer = TfidfVectorizer()\n",
573
+ "x_train = vectorizer.fit_transform(x_train)\n",
574
+ "x_test = vectorizer.transform(x_test)"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 22,
580
+ "metadata": {},
581
+ "outputs": [
582
+ {
583
+ "name": "stdout",
584
+ "output_type": "stream",
585
+ "text": [
586
+ " (0, 145591)\t0.48328892862950174\n",
587
+ " (0, 384310)\t0.38648598535906226\n",
588
+ " (0, 160355)\t0.18966194768681632\n",
589
+ " (0, 422796)\t0.4213995220282958\n",
590
+ " (0, 246262)\t0.516206150117446\n",
591
+ " (0, 393595)\t0.18633353695642413\n",
592
+ " (0, 149660)\t0.12602103676347354\n",
593
+ " (0, 150562)\t0.187752051036393\n",
594
+ " (0, 443991)\t0.22625223143666687\n",
595
+ " (1, 172128)\t0.6067414559564506\n",
596
+ " (1, 418051)\t0.7948992424350689\n",
597
+ " (2, 406965)\t0.6931768888241752\n",
598
+ " (2, 275790)\t0.3769717187165907\n",
599
+ " (2, 290673)\t0.24841016587340456\n",
600
+ " (2, 150650)\t0.20986098127991223\n",
601
+ " (2, 42279)\t0.5211994648067829\n",
602
+ " (3, 175231)\t0.30748407834013664\n",
603
+ " (3, 89478)\t0.5137960384023271\n",
604
+ " (3, 135304)\t0.18399221471225605\n",
605
+ " (3, 292469)\t0.3352332134067401\n",
606
+ " (3, 399931)\t0.21912347276618377\n",
607
+ " (3, 317428)\t0.5137960384023271\n",
608
+ " (3, 175234)\t0.4280552121498152\n",
609
+ " (4, 408579)\t0.14704998873675024\n",
610
+ " (4, 300289)\t0.2058593651486058\n",
611
+ " :\t:\n",
612
+ " (1279995, 101591)\t0.8081360486674279\n",
613
+ " (1279995, 248952)\t0.5889958631808858\n",
614
+ " (1279996, 277402)\t0.6930282733228941\n",
615
+ " (1279996, 133848)\t0.34541074396262944\n",
616
+ " (1279996, 435543)\t0.2695787059712405\n",
617
+ " (1279996, 230940)\t0.28709000004756496\n",
618
+ " (1279996, 384176)\t0.22284929416293517\n",
619
+ " (1279996, 168384)\t0.22632455016071848\n",
620
+ " (1279996, 445127)\t0.19037698208802128\n",
621
+ " (1279996, 170080)\t0.2583579928589749\n",
622
+ " (1279996, 408579)\t0.2035510397723402\n",
623
+ " (1279997, 22582)\t0.40592321055556474\n",
624
+ " (1279997, 407667)\t0.4517041173506153\n",
625
+ " (1279997, 365896)\t0.34128528334674657\n",
626
+ " (1279997, 78807)\t0.20434235294380243\n",
627
+ " (1279997, 318283)\t0.48408216042272795\n",
628
+ " (1279997, 278738)\t0.20662639845796468\n",
629
+ " (1279997, 31095)\t0.1879300266675478\n",
630
+ " (1279997, 267587)\t0.18767777014427442\n",
631
+ " (1279997, 334582)\t0.19548006690275818\n",
632
+ " (1279997, 243236)\t0.23915227399663266\n",
633
+ " (1279997, 241760)\t0.17315132700092342\n",
634
+ " (1279998, 360147)\t0.7967059461608392\n",
635
+ " (1279998, 393318)\t0.47775281405037406\n",
636
+ " (1279998, 150849)\t0.37015116374112683\n"
637
+ ]
638
+ }
639
+ ],
640
+ "source": [
641
+ "print(x_train)"
642
+ ]
643
+ },
644
+ {
645
+ "cell_type": "code",
646
+ "execution_count": 23,
647
+ "metadata": {},
648
+ "outputs": [
649
+ {
650
+ "name": "stderr",
651
+ "output_type": "stream",
652
+ "text": [
653
+ "C:\\Users\\KIIT\\AppData\\Roaming\\Python\\Python312\\site-packages\\sklearn\\linear_model\\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
654
+ "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
655
+ "\n",
656
+ "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
657
+ " https://scikit-learn.org/stable/modules/preprocessing.html\n",
658
+ "Please also refer to the documentation for alternative solver options:\n",
659
+ " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
660
+ " n_iter_i = _check_optimize_result(\n"
661
+ ]
662
+ },
663
+ {
664
+ "data": {
665
+ "text/html": [
666
+ "<style>#sk-container-id-1 {\n",
667
+ " /* Definition of color scheme common for light and dark mode */\n",
668
+ " --sklearn-color-text: black;\n",
669
+ " --sklearn-color-line: gray;\n",
670
+ " /* Definition of color scheme for unfitted estimators */\n",
671
+ " --sklearn-color-unfitted-level-0: #fff5e6;\n",
672
+ " --sklearn-color-unfitted-level-1: #f6e4d2;\n",
673
+ " --sklearn-color-unfitted-level-2: #ffe0b3;\n",
674
+ " --sklearn-color-unfitted-level-3: chocolate;\n",
675
+ " /* Definition of color scheme for fitted estimators */\n",
676
+ " --sklearn-color-fitted-level-0: #f0f8ff;\n",
677
+ " --sklearn-color-fitted-level-1: #d4ebff;\n",
678
+ " --sklearn-color-fitted-level-2: #b3dbfd;\n",
679
+ " --sklearn-color-fitted-level-3: cornflowerblue;\n",
680
+ "\n",
681
+ " /* Specific color for light theme */\n",
682
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
683
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
684
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
685
+ " --sklearn-color-icon: #696969;\n",
686
+ "\n",
687
+ " @media (prefers-color-scheme: dark) {\n",
688
+ " /* Redefinition of color scheme for dark theme */\n",
689
+ " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
690
+ " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
691
+ " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
692
+ " --sklearn-color-icon: #878787;\n",
693
+ " }\n",
694
+ "}\n",
695
+ "\n",
696
+ "#sk-container-id-1 {\n",
697
+ " color: var(--sklearn-color-text);\n",
698
+ "}\n",
699
+ "\n",
700
+ "#sk-container-id-1 pre {\n",
701
+ " padding: 0;\n",
702
+ "}\n",
703
+ "\n",
704
+ "#sk-container-id-1 input.sk-hidden--visually {\n",
705
+ " border: 0;\n",
706
+ " clip: rect(1px 1px 1px 1px);\n",
707
+ " clip: rect(1px, 1px, 1px, 1px);\n",
708
+ " height: 1px;\n",
709
+ " margin: -1px;\n",
710
+ " overflow: hidden;\n",
711
+ " padding: 0;\n",
712
+ " position: absolute;\n",
713
+ " width: 1px;\n",
714
+ "}\n",
715
+ "\n",
716
+ "#sk-container-id-1 div.sk-dashed-wrapped {\n",
717
+ " border: 1px dashed var(--sklearn-color-line);\n",
718
+ " margin: 0 0.4em 0.5em 0.4em;\n",
719
+ " box-sizing: border-box;\n",
720
+ " padding-bottom: 0.4em;\n",
721
+ " background-color: var(--sklearn-color-background);\n",
722
+ "}\n",
723
+ "\n",
724
+ "#sk-container-id-1 div.sk-container {\n",
725
+ " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
726
+ " but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
727
+ " so we also need the `!important` here to be able to override the\n",
728
+ " default hidden behavior on the sphinx rendered scikit-learn.org.\n",
729
+ " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
730
+ " display: inline-block !important;\n",
731
+ " position: relative;\n",
732
+ "}\n",
733
+ "\n",
734
+ "#sk-container-id-1 div.sk-text-repr-fallback {\n",
735
+ " display: none;\n",
736
+ "}\n",
737
+ "\n",
738
+ "div.sk-parallel-item,\n",
739
+ "div.sk-serial,\n",
740
+ "div.sk-item {\n",
741
+ " /* draw centered vertical line to link estimators */\n",
742
+ " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
743
+ " background-size: 2px 100%;\n",
744
+ " background-repeat: no-repeat;\n",
745
+ " background-position: center center;\n",
746
+ "}\n",
747
+ "\n",
748
+ "/* Parallel-specific style estimator block */\n",
749
+ "\n",
750
+ "#sk-container-id-1 div.sk-parallel-item::after {\n",
751
+ " content: \"\";\n",
752
+ " width: 100%;\n",
753
+ " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
754
+ " flex-grow: 1;\n",
755
+ "}\n",
756
+ "\n",
757
+ "#sk-container-id-1 div.sk-parallel {\n",
758
+ " display: flex;\n",
759
+ " align-items: stretch;\n",
760
+ " justify-content: center;\n",
761
+ " background-color: var(--sklearn-color-background);\n",
762
+ " position: relative;\n",
763
+ "}\n",
764
+ "\n",
765
+ "#sk-container-id-1 div.sk-parallel-item {\n",
766
+ " display: flex;\n",
767
+ " flex-direction: column;\n",
768
+ "}\n",
769
+ "\n",
770
+ "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
771
+ " align-self: flex-end;\n",
772
+ " width: 50%;\n",
773
+ "}\n",
774
+ "\n",
775
+ "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
776
+ " align-self: flex-start;\n",
777
+ " width: 50%;\n",
778
+ "}\n",
779
+ "\n",
780
+ "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
781
+ " width: 0;\n",
782
+ "}\n",
783
+ "\n",
784
+ "/* Serial-specific style estimator block */\n",
785
+ "\n",
786
+ "#sk-container-id-1 div.sk-serial {\n",
787
+ " display: flex;\n",
788
+ " flex-direction: column;\n",
789
+ " align-items: center;\n",
790
+ " background-color: var(--sklearn-color-background);\n",
791
+ " padding-right: 1em;\n",
792
+ " padding-left: 1em;\n",
793
+ "}\n",
794
+ "\n",
795
+ "\n",
796
+ "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
797
+ "clickable and can be expanded/collapsed.\n",
798
+ "- Pipeline and ColumnTransformer use this feature and define the default style\n",
799
+ "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
800
+ "*/\n",
801
+ "\n",
802
+ "/* Pipeline and ColumnTransformer style (default) */\n",
803
+ "\n",
804
+ "#sk-container-id-1 div.sk-toggleable {\n",
805
+ " /* Default theme specific background. It is overwritten whether we have a\n",
806
+ " specific estimator or a Pipeline/ColumnTransformer */\n",
807
+ " background-color: var(--sklearn-color-background);\n",
808
+ "}\n",
809
+ "\n",
810
+ "/* Toggleable label */\n",
811
+ "#sk-container-id-1 label.sk-toggleable__label {\n",
812
+ " cursor: pointer;\n",
813
+ " display: block;\n",
814
+ " width: 100%;\n",
815
+ " margin-bottom: 0;\n",
816
+ " padding: 0.5em;\n",
817
+ " box-sizing: border-box;\n",
818
+ " text-align: center;\n",
819
+ "}\n",
820
+ "\n",
821
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
822
+ " /* Arrow on the left of the label */\n",
823
+ " content: \"▸\";\n",
824
+ " float: left;\n",
825
+ " margin-right: 0.25em;\n",
826
+ " color: var(--sklearn-color-icon);\n",
827
+ "}\n",
828
+ "\n",
829
+ "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
830
+ " color: var(--sklearn-color-text);\n",
831
+ "}\n",
832
+ "\n",
833
+ "/* Toggleable content - dropdown */\n",
834
+ "\n",
835
+ "#sk-container-id-1 div.sk-toggleable__content {\n",
836
+ " max-height: 0;\n",
837
+ " max-width: 0;\n",
838
+ " overflow: hidden;\n",
839
+ " text-align: left;\n",
840
+ " /* unfitted */\n",
841
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
842
+ "}\n",
843
+ "\n",
844
+ "#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
845
+ " /* fitted */\n",
846
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
847
+ "}\n",
848
+ "\n",
849
+ "#sk-container-id-1 div.sk-toggleable__content pre {\n",
850
+ " margin: 0.2em;\n",
851
+ " border-radius: 0.25em;\n",
852
+ " color: var(--sklearn-color-text);\n",
853
+ " /* unfitted */\n",
854
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
855
+ "}\n",
856
+ "\n",
857
+ "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
858
+ " /* unfitted */\n",
859
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
860
+ "}\n",
861
+ "\n",
862
+ "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
863
+ " /* Expand drop-down */\n",
864
+ " max-height: 200px;\n",
865
+ " max-width: 100%;\n",
866
+ " overflow: auto;\n",
867
+ "}\n",
868
+ "\n",
869
+ "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
870
+ " content: \"▾\";\n",
871
+ "}\n",
872
+ "\n",
873
+ "/* Pipeline/ColumnTransformer-specific style */\n",
874
+ "\n",
875
+ "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
876
+ " color: var(--sklearn-color-text);\n",
877
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
878
+ "}\n",
879
+ "\n",
880
+ "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
881
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
882
+ "}\n",
883
+ "\n",
884
+ "/* Estimator-specific style */\n",
885
+ "\n",
886
+ "/* Colorize estimator box */\n",
887
+ "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
888
+ " /* unfitted */\n",
889
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
890
+ "}\n",
891
+ "\n",
892
+ "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
893
+ " /* fitted */\n",
894
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
895
+ "}\n",
896
+ "\n",
897
+ "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
898
+ "#sk-container-id-1 div.sk-label label {\n",
899
+ " /* The background is the default theme color */\n",
900
+ " color: var(--sklearn-color-text-on-default-background);\n",
901
+ "}\n",
902
+ "\n",
903
+ "/* On hover, darken the color of the background */\n",
904
+ "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
905
+ " color: var(--sklearn-color-text);\n",
906
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
907
+ "}\n",
908
+ "\n",
909
+ "/* Label box, darken color on hover, fitted */\n",
910
+ "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
911
+ " color: var(--sklearn-color-text);\n",
912
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
913
+ "}\n",
914
+ "\n",
915
+ "/* Estimator label */\n",
916
+ "\n",
917
+ "#sk-container-id-1 div.sk-label label {\n",
918
+ " font-family: monospace;\n",
919
+ " font-weight: bold;\n",
920
+ " display: inline-block;\n",
921
+ " line-height: 1.2em;\n",
922
+ "}\n",
923
+ "\n",
924
+ "#sk-container-id-1 div.sk-label-container {\n",
925
+ " text-align: center;\n",
926
+ "}\n",
927
+ "\n",
928
+ "/* Estimator-specific */\n",
929
+ "#sk-container-id-1 div.sk-estimator {\n",
930
+ " font-family: monospace;\n",
931
+ " border: 1px dotted var(--sklearn-color-border-box);\n",
932
+ " border-radius: 0.25em;\n",
933
+ " box-sizing: border-box;\n",
934
+ " margin-bottom: 0.5em;\n",
935
+ " /* unfitted */\n",
936
+ " background-color: var(--sklearn-color-unfitted-level-0);\n",
937
+ "}\n",
938
+ "\n",
939
+ "#sk-container-id-1 div.sk-estimator.fitted {\n",
940
+ " /* fitted */\n",
941
+ " background-color: var(--sklearn-color-fitted-level-0);\n",
942
+ "}\n",
943
+ "\n",
944
+ "/* on hover */\n",
945
+ "#sk-container-id-1 div.sk-estimator:hover {\n",
946
+ " /* unfitted */\n",
947
+ " background-color: var(--sklearn-color-unfitted-level-2);\n",
948
+ "}\n",
949
+ "\n",
950
+ "#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
951
+ " /* fitted */\n",
952
+ " background-color: var(--sklearn-color-fitted-level-2);\n",
953
+ "}\n",
954
+ "\n",
955
+ "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
956
+ "\n",
957
+ "/* Common style for \"i\" and \"?\" */\n",
958
+ "\n",
959
+ ".sk-estimator-doc-link,\n",
960
+ "a:link.sk-estimator-doc-link,\n",
961
+ "a:visited.sk-estimator-doc-link {\n",
962
+ " float: right;\n",
963
+ " font-size: smaller;\n",
964
+ " line-height: 1em;\n",
965
+ " font-family: monospace;\n",
966
+ " background-color: var(--sklearn-color-background);\n",
967
+ " border-radius: 1em;\n",
968
+ " height: 1em;\n",
969
+ " width: 1em;\n",
970
+ " text-decoration: none !important;\n",
971
+ " margin-left: 1ex;\n",
972
+ " /* unfitted */\n",
973
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
974
+ " color: var(--sklearn-color-unfitted-level-1);\n",
975
+ "}\n",
976
+ "\n",
977
+ ".sk-estimator-doc-link.fitted,\n",
978
+ "a:link.sk-estimator-doc-link.fitted,\n",
979
+ "a:visited.sk-estimator-doc-link.fitted {\n",
980
+ " /* fitted */\n",
981
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
982
+ " color: var(--sklearn-color-fitted-level-1);\n",
983
+ "}\n",
984
+ "\n",
985
+ "/* On hover */\n",
986
+ "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
987
+ ".sk-estimator-doc-link:hover,\n",
988
+ "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
989
+ ".sk-estimator-doc-link:hover {\n",
990
+ " /* unfitted */\n",
991
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
992
+ " color: var(--sklearn-color-background);\n",
993
+ " text-decoration: none;\n",
994
+ "}\n",
995
+ "\n",
996
+ "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
997
+ ".sk-estimator-doc-link.fitted:hover,\n",
998
+ "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
999
+ ".sk-estimator-doc-link.fitted:hover {\n",
1000
+ " /* fitted */\n",
1001
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1002
+ " color: var(--sklearn-color-background);\n",
1003
+ " text-decoration: none;\n",
1004
+ "}\n",
1005
+ "\n",
1006
+ "/* Span, style for the box shown on hovering the info icon */\n",
1007
+ ".sk-estimator-doc-link span {\n",
1008
+ " display: none;\n",
1009
+ " z-index: 9999;\n",
1010
+ " position: relative;\n",
1011
+ " font-weight: normal;\n",
1012
+ " right: .2ex;\n",
1013
+ " padding: .5ex;\n",
1014
+ " margin: .5ex;\n",
1015
+ " width: min-content;\n",
1016
+ " min-width: 20ex;\n",
1017
+ " max-width: 50ex;\n",
1018
+ " color: var(--sklearn-color-text);\n",
1019
+ " box-shadow: 2pt 2pt 4pt #999;\n",
1020
+ " /* unfitted */\n",
1021
+ " background: var(--sklearn-color-unfitted-level-0);\n",
1022
+ " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
1023
+ "}\n",
1024
+ "\n",
1025
+ ".sk-estimator-doc-link.fitted span {\n",
1026
+ " /* fitted */\n",
1027
+ " background: var(--sklearn-color-fitted-level-0);\n",
1028
+ " border: var(--sklearn-color-fitted-level-3);\n",
1029
+ "}\n",
1030
+ "\n",
1031
+ ".sk-estimator-doc-link:hover span {\n",
1032
+ " display: block;\n",
1033
+ "}\n",
1034
+ "\n",
1035
+ "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
1036
+ "\n",
1037
+ "#sk-container-id-1 a.estimator_doc_link {\n",
1038
+ " float: right;\n",
1039
+ " font-size: 1rem;\n",
1040
+ " line-height: 1em;\n",
1041
+ " font-family: monospace;\n",
1042
+ " background-color: var(--sklearn-color-background);\n",
1043
+ " border-radius: 1rem;\n",
1044
+ " height: 1rem;\n",
1045
+ " width: 1rem;\n",
1046
+ " text-decoration: none;\n",
1047
+ " /* unfitted */\n",
1048
+ " color: var(--sklearn-color-unfitted-level-1);\n",
1049
+ " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
1050
+ "}\n",
1051
+ "\n",
1052
+ "#sk-container-id-1 a.estimator_doc_link.fitted {\n",
1053
+ " /* fitted */\n",
1054
+ " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
1055
+ " color: var(--sklearn-color-fitted-level-1);\n",
1056
+ "}\n",
1057
+ "\n",
1058
+ "/* On hover */\n",
1059
+ "#sk-container-id-1 a.estimator_doc_link:hover {\n",
1060
+ " /* unfitted */\n",
1061
+ " background-color: var(--sklearn-color-unfitted-level-3);\n",
1062
+ " color: var(--sklearn-color-background);\n",
1063
+ " text-decoration: none;\n",
1064
+ "}\n",
1065
+ "\n",
1066
+ "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
1067
+ " /* fitted */\n",
1068
+ " background-color: var(--sklearn-color-fitted-level-3);\n",
1069
+ "}\n",
1070
+ "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression()</pre></div> </div></div></div></div>"
1071
+ ],
1072
+ "text/plain": [
1073
+ "LogisticRegression()"
1074
+ ]
1075
+ },
1076
+ "execution_count": 23,
1077
+ "metadata": {},
1078
+ "output_type": "execute_result"
1079
+ }
1080
+ ],
1081
+ "source": [
1082
+ "# Training the model\n",
1083
+ "model = LogisticRegression()\n",
1084
+ "model.fit(x_train , y_train)"
1085
+ ]
1086
+ },
1087
+ {
1088
+ "cell_type": "code",
1089
+ "execution_count": 24,
1090
+ "metadata": {},
1091
+ "outputs": [
1092
+ {
1093
+ "name": "stdout",
1094
+ "output_type": "stream",
1095
+ "text": [
1096
+ "0.775615625\n"
1097
+ ]
1098
+ }
1099
+ ],
1100
+ "source": [
1101
+ "# Testing the model\n",
1102
+ "y_pred = model.predict(x_test)\n",
1103
+ "print(accuracy_score(y_test , y_pred))"
1104
+ ]
1105
+ },
1106
+ {
1107
+ "cell_type": "code",
1108
+ "execution_count": 25,
1109
+ "metadata": {},
1110
+ "outputs": [],
1111
+ "source": [
1112
+ "# Function to predict the sentiment\n",
1113
+ "def predict_sentiment(text):\n",
1114
+ " text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z\n",
1115
+ " text = text.lower()\n",
1116
+ " text = text.split() \n",
1117
+ " text = [stremmer.stem(word) for word in text if not word in stopwords.words('english')]\n",
1118
+ " text = ' '.join(text)\n",
1119
+ " text = [text]\n",
1120
+ " text = vectorizer.transform(text) \n",
1121
+ " sentiment = model.predict(text)\n",
1122
+ " if sentiment == 0:\n",
1123
+ " return \"Negative\"\n",
1124
+ " else:\n",
1125
+ " return \"Positive\""
1126
+ ]
1127
+ },
1128
+ {
1129
+ "cell_type": "code",
1130
+ "execution_count": 26,
1131
+ "metadata": {},
1132
+ "outputs": [
1133
+ {
1134
+ "name": "stdout",
1135
+ "output_type": "stream",
1136
+ "text": [
1137
+ "Negative\n",
1138
+ "Positive\n"
1139
+ ]
1140
+ }
1141
+ ],
1142
+ "source": [
1143
+ "# Testing the model\n",
1144
+ "print(predict_sentiment(\"I hate you\"))\n",
1145
+ "print(predict_sentiment(\"I love you\"))"
1146
+ ]
1147
+ },
1148
+ {
1149
+ "cell_type": "code",
1150
+ "execution_count": 27,
1151
+ "metadata": {},
1152
+ "outputs": [],
1153
+ "source": [
1154
+ "# Save the model\n",
1155
+ "import pickle\n",
1156
+ "pickle.dump(model , open('model.pkl' , 'wb'))"
1157
+ ]
1158
+ },
1159
+ {
1160
+ "cell_type": "code",
1161
+ "execution_count": 29,
1162
+ "metadata": {},
1163
+ "outputs": [],
1164
+ "source": [
1165
+ "pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))"
1166
+ ]
1167
+ }
1168
+ ],
1169
+ "metadata": {
1170
+ "kernelspec": {
1171
+ "display_name": "Python 3",
1172
+ "language": "python",
1173
+ "name": "python3"
1174
+ },
1175
+ "language_info": {
1176
+ "codemirror_mode": {
1177
+ "name": "ipython",
1178
+ "version": 3
1179
+ },
1180
+ "file_extension": ".py",
1181
+ "mimetype": "text/x-python",
1182
+ "name": "python",
1183
+ "nbconvert_exporter": "python",
1184
+ "pygments_lexer": "ipython3",
1185
+ "version": "3.12.4"
1186
+ }
1187
+ },
1188
+ "nbformat": 4,
1189
+ "nbformat_minor": 2
1190
+ }
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pickle
3
+ import re
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from nltk.corpus import stopwords
6
+ import nltk
7
+ from ntscraper import Nitter
8
+
9
+ # Download stopwords once, using Streamlit's caching
10
@st.cache_resource
def load_stopwords():
    """Download the NLTK stop-word corpus and return its English word list.

    Decorated with ``st.cache_resource`` so the download and lookup are
    performed once and the result is reused by later calls.

    Returns:
        The value of ``stopwords.words('english')``.
    """
    nltk.download('stopwords')
    english_words = stopwords.words('english')
    return english_words
14
+
15
+ # Load model and vectorizer once
16
@st.cache_resource
def load_model_and_vectorizer():
    """Unpickle the trained classifier and its text vectorizer.

    Both artifacts are read from relative paths (``model.pkl`` and
    ``vectorizer.pkl``), model first. Decorated with ``st.cache_resource``
    so the files are only read once.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    # NOTE: unpickling runs arbitrary code embedded in the file, so these
    # artifacts must come from a trusted source.
    loaded = []
    for artifact_path in ('model.pkl', 'vectorizer.pkl'):
        with open(artifact_path, 'rb') as artifact_file:
            loaded.append(pickle.load(artifact_file))
    model, vectorizer = loaded
    return model, vectorizer
23
+
24
+ # Define sentiment prediction function
25
def predict_sentiment(text, model, vectorizer, stop_words):
    """Classify ``text`` as ``"Negative"`` or ``"Positive"``.

    The text is reduced to lowercase alphabetic tokens, stop words are
    removed, and the remaining tokens are re-joined into one document that
    is passed through ``vectorizer.transform`` and then ``model.predict``.

    Args:
        text: Raw input string (e.g. a tweet).
        model: Object exposing ``predict(features)`` that returns an indexable
            sequence of labels, one per document.
        vectorizer: Object exposing ``transform(list_of_documents)``.
        stop_words: Iterable of lowercase words to drop before vectorizing.

    Returns:
        ``"Negative"`` when the predicted label for the document equals 0,
        otherwise ``"Positive"``.
    """
    # NOTE(review): predict_sentiment in Twitter_sentiment_Analysis.ipynb also
    # stems every token before vectorizing; this function does not. If the
    # vectorizer was fitted on stemmed text, the same stemming must be applied
    # here -- confirm against the training pipeline.

    # Set membership is O(1); the stop-word list would be scanned per token.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    # Replace every non-letter with a space, then normalise case and tokenize.
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    cleaned = ' '.join(token for token in tokens if token not in stop_set)

    features = vectorizer.transform([cleaned])

    # Exactly one document was submitted, so read its label explicitly rather
    # than comparing the whole prediction container against 0.
    label = model.predict(features)[0]
    return "Negative" if label == 0 else "Positive"
38
+
39
+ # Initialize Nitter scraper
40
@st.cache_resource
def initialize_scraper():
    """Construct the Nitter scraper used to fetch tweets.

    Decorated with ``st.cache_resource`` so a single scraper instance is
    built and then reused.

    Returns:
        A ``Nitter`` object created with ``log_level=1``.
    """
    nitter_client = Nitter(log_level=1)
    return nitter_client
43
+
44
+ # Function to create a colored card
45
def create_card(tweet_text, sentiment):
    """Build the HTML snippet for one sentiment-coloured tweet card.

    Args:
        tweet_text: Text of the tweet; treated as untrusted input.
        sentiment: Sentiment label; ``"Positive"`` yields a green card, any
            other value yields a red one.

    Returns:
        An HTML string containing the label and the tweet text, both
        HTML-escaped.
    """
    # Local import keeps this block self-contained.
    from html import escape

    color = "green" if sentiment == "Positive" else "red"

    # The card is rendered with unsafe_allow_html=True, so any markup inside a
    # scraped tweet would otherwise be injected into the page verbatim.
    safe_text = escape(str(tweet_text))
    safe_sentiment = escape(str(sentiment))

    card_html = f"""
    <div style="background-color: {color}; padding: 10px; border-radius: 5px; margin: 10px 0;">
        <h5 style="color: white;">{safe_sentiment} Sentiment</h5>
        <p style="color: white;">{safe_text}</p>
    </div>
    """
    return card_html
54
+
55
+ # Main app logic
56
def main():
    """Render the Streamlit page and run sentiment analysis on user input.

    Two modes are offered through a select box:

    * "Input text": classify free text typed by the user.
    * "Get tweets from user": fetch up to five tweets for a username through
      the scraper and show each one as a coloured sentiment card.
    """
    st.title("Twitter Sentiment Analysis")

    # Load stopwords, model, vectorizer, and scraper only once
    stop_words = load_stopwords()
    model, vectorizer = load_model_and_vectorizer()
    scraper = initialize_scraper()

    # User input: either text input or Twitter username
    option = st.selectbox("Choose an option", ["Input text", "Get tweets from user"])

    if option == "Input text":
        text_input = st.text_area("Enter text to analyze sentiment")
        if st.button("Analyze"):
            if not text_input.strip():
                # A blank document would still get an (arbitrary) label.
                st.warning("Please enter some text to analyze.")
            else:
                sentiment = predict_sentiment(text_input, model, vectorizer, stop_words)
                st.write(f"Sentiment: {sentiment}")

    elif option == "Get tweets from user":
        username = st.text_input("Enter Twitter username")
        if st.button("Fetch Tweets"):
            username = username.strip()
            if not username:
                st.warning("Please enter a Twitter username.")
            else:
                tweets_data = scraper.get_tweets(username, mode='user', number=5)
                # Treat a non-dict result, a missing 'tweets' key and an empty
                # list the same way, so the user always gets feedback.
                tweets = tweets_data.get('tweets') if isinstance(tweets_data, dict) else None
                if tweets:
                    for tweet in tweets:
                        tweet_text = tweet['text']  # Access the text of the tweet
                        sentiment = predict_sentiment(tweet_text, model, vectorizer, stop_words)

                        # Create and display the colored card for the tweet
                        card_html = create_card(tweet_text, sentiment)
                        st.markdown(card_html, unsafe_allow_html=True)
                else:
                    st.write("No tweets found or an error occurred.")


# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68f9b697c666945be1754523f1a9009c96ef21527243a17524f95aaf8620b8b8
3
+ size 3687998
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11ac9f2bf806dfe7ff5976f837b176078352f728240b629a12692e48a1df628b
3
+ size 14777888