vikranth1111 committed on
Commit
eaf5291
·
1 Parent(s): 4d7e4f0

Upload 2 files

Browse files
Disaster_Detection_From_Tweets_using_ML.ipynb ADDED
@@ -0,0 +1,913 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "id": "MxvHGzzsDnoa",
11
+ "outputId": "17e475f1-0b63-4e12-f71b-b6eb6ba91065"
12
+ },
13
+ "outputs": [
14
+ {
15
+ "ename": "ModuleNotFoundError",
16
+ "evalue": "No module named 'sklearn'",
17
+ "output_type": "error",
18
+ "traceback": [
19
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
20
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
21
+ "\u001b[1;32m/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb Cell 1\u001b[0m line \u001b[0;36m1\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m get_ipython()\u001b[39m.\u001b[39mrun_line_magic(\u001b[39m'\u001b[39m\u001b[39mmatplotlib\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39minline\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmetrics\u001b[39;00m \u001b[39mimport\u001b[39;00m accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mfeature_extraction\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtext\u001b[39;00m \u001b[39mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m <a 
href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_selection\u001b[39;00m \u001b[39mimport\u001b[39;00m train_test_split,cross_val_score\n",
22
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "import pandas as pd\n",
28
+ "import numpy as np\n",
29
+ "import itertools\n",
30
+ "import seaborn as sns\n",
31
+ "import nltk, re, string\n",
32
+ "from string import punctuation\n",
33
+ "from nltk.corpus import stopwords\n",
34
+ "import matplotlib.pyplot as plt\n",
35
+ "%matplotlib inline\n",
36
+ "from sklearn.metrics import accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score\n",
37
+ "\n",
38
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
39
+ "from sklearn.model_selection import train_test_split,cross_val_score\n",
40
+ "#machine learning\n",
41
+ "from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression\n",
42
+ "# machine learning\n",
43
+ "from sklearn.naive_bayes import MultinomialNB,GaussianNB\n",
44
+ "nltk.download('stopwords')\n",
45
+ "nltk.download('punkt')\n",
46
+ "nltk.download('wordnet')\n",
47
+ "nltk.download('omw-1.4')"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "pip install sklearn"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {
63
+ "colab": {
64
+ "base_uri": "https://localhost:8080/",
65
+ "height": 206
66
+ },
67
+ "id": "-xCY2mIQD5_x",
68
+ "outputId": "b9158464-4ece-4715-efa7-ec1042a28e68"
69
+ },
70
+ "outputs": [],
71
+ "source": [
72
+ "df = pd.read_csv('//Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/disaster_tweets.csv')\n",
73
+ "df.head()"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "colab": {
81
+ "base_uri": "https://localhost:8080/"
82
+ },
83
+ "id": "Gd_1_5TYFHDD",
84
+ "outputId": "3e19f261-8433-446f-8840-1c6af98e3c6e"
85
+ },
86
+ "outputs": [],
87
+ "source": [
88
+ "df.info()"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "markdown",
93
+ "metadata": {
94
+ "id": "01yWUhW-FEd4"
95
+ },
96
+ "source": [
97
+ "## Target Distribution"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {
104
+ "colab": {
105
+ "base_uri": "https://localhost:8080/",
106
+ "height": 353
107
+ },
108
+ "id": "TNxbrllBE7Rw",
109
+ "outputId": "7bc5683c-d881-4c5f-80f5-aba6236916ad"
110
+ },
111
+ "outputs": [],
112
+ "source": [
113
+ "sns.set_style(\"dark\")\n",
114
+ "sns.countplot(df.target)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {
121
+ "colab": {
122
+ "base_uri": "https://localhost:8080/",
123
+ "height": 206
124
+ },
125
+ "id": "S3MpxtWuFL-4",
126
+ "outputId": "16249244-930d-4eef-91f3-058944c6b918"
127
+ },
128
+ "outputs": [],
129
+ "source": [
130
+ "# creating a new column that stores the character length of each tweet\n",
131
+ "df['length'] = df['text'].apply(len)\n",
132
+ "df.head()"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": null,
138
+ "metadata": {
139
+ "colab": {
140
+ "base_uri": "https://localhost:8080/",
141
+ "height": 283
142
+ },
143
+ "id": "U3U3T9QPFYFP",
144
+ "outputId": "a0350f22-0ba2-4f81-e984-cc5e8696a3b7"
145
+ },
146
+ "outputs": [],
147
+ "source": [
148
+ "df['length'].plot(bins=50, kind='hist')"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "metadata": {
155
+ "colab": {
156
+ "base_uri": "https://localhost:8080/"
157
+ },
158
+ "id": "s6eau4zHFh6S",
159
+ "outputId": "a7e8cd58-05f4-42d2-f85d-88d84fd35775"
160
+ },
161
+ "outputs": [],
162
+ "source": [
163
+ "df.length.describe()"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {
170
+ "colab": {
171
+ "base_uri": "https://localhost:8080/",
172
+ "height": 54
173
+ },
174
+ "id": "2UGUXOu_FmTV",
175
+ "outputId": "454767e7-18d5-4601-b563-3954d39fc503"
176
+ },
177
+ "outputs": [],
178
+ "source": [
179
+ "df[df['length'] == 157]['text'].iloc[0]"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "metadata": {
186
+ "colab": {
187
+ "base_uri": "https://localhost:8080/",
188
+ "height": 343
189
+ },
190
+ "id": "pTDW0qiZFpNq",
191
+ "outputId": "56e487c9-ef89-46ea-e393-280b9a81e0c9"
192
+ },
193
+ "outputs": [],
194
+ "source": [
195
+ "df.hist(column='length', by='target', bins=50,figsize=(10,4))"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "metadata": {
202
+ "colab": {
203
+ "base_uri": "https://localhost:8080/"
204
+ },
205
+ "id": "IJYv-A-oG8fk",
206
+ "outputId": "aaba1ebb-aac7-4162-cfd3-77e75299c255"
207
+ },
208
+ "outputs": [],
209
+ "source": [
210
+ "stop = set(stopwords.words('english'))\n",
211
+ "punctuation = list(string.punctuation)\n",
212
+ "stop.update(punctuation)\n",
213
+ "\n",
214
+ "# Removing stop words and punctuation tokens, which are unnecessary, from the tweet text\n",
215
+ "def remove_stopwords(text):\n",
216
+ " final_text = []\n",
217
+ " for i in text.split():\n",
218
+ " if i.strip().lower() not in stop:\n",
219
+ " final_text.append(i.strip())\n",
220
+ " return \" \".join(final_text)\n",
221
+ "\n",
222
+ "df_1 = df[df['target']==1]\n",
223
+ "df_0 = df[df['target']==0]\n",
224
+ "df_1['text']=df_1['text'].apply(remove_stopwords)\n",
225
+ "df_0['text']=df_0['text'].apply(remove_stopwords)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "markdown",
230
+ "metadata": {
231
+ "id": "lIaKop1n4Vr6"
232
+ },
233
+ "source": [
234
+ "## Plotting wordcloud of Disaster Tweets"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": null,
240
+ "metadata": {
241
+ "colab": {
242
+ "base_uri": "https://localhost:8080/",
243
+ "height": 606
244
+ },
245
+ "id": "Qkxm4Gl_Hdg9",
246
+ "outputId": "c707ac2a-5fc5-4abe-eb08-ae6e91e20414"
247
+ },
248
+ "outputs": [],
249
+ "source": [
250
+ "from wordcloud import WordCloud\n",
251
+ "plt.figure(figsize = (20,20)) # Text that is Disaster tweets\n",
252
+ "wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(\" \".join(df_1.text))\n",
253
+ "plt.imshow(wc , interpolation = 'bilinear')\n"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "markdown",
258
+ "metadata": {
259
+ "id": "RFybOU0d4hMn"
260
+ },
261
+ "source": [
262
+ "## Plotting wordcloud of Normal Tweets"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "metadata": {
269
+ "colab": {
270
+ "base_uri": "https://localhost:8080/",
271
+ "height": 606
272
+ },
273
+ "id": "88A5_Es3HyrZ",
274
+ "outputId": "94752358-6917-4640-c953-84228d453cc3"
275
+ },
276
+ "outputs": [],
277
+ "source": [
278
+ "plt.figure(figsize = (20,20)) # Text that is Normal Tweets\n",
279
+ "wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(\" \".join(df_0.text))\n",
280
+ "plt.imshow(wc , interpolation = 'bilinear')"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "markdown",
285
+ "metadata": {
286
+ "id": "G4xpJIDhLeZ7"
287
+ },
288
+ "source": [
289
+ "## Data Cleaning and Preparation"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": null,
295
+ "metadata": {
296
+ "id": "czAn1C9hLcrS"
297
+ },
298
+ "outputs": [],
299
+ "source": [
300
+ "from nltk.stem import WordNetLemmatizer\n",
301
+ "lemma = WordNetLemmatizer()\n",
302
+ "#creating list of possible stopwords from nltk library\n",
303
+ "stop = stopwords.words('english')\n",
304
+ "\n",
305
+ "def cleanTweet(txt):\n",
306
+ " # lowercasing\n",
307
+ " txt = txt.lower()\n",
308
+ " # tokenization\n",
309
+ " words = nltk.word_tokenize(txt)\n",
310
+ " # removing stopwords & lemmatizing the words\n",
311
+ " words = ' '.join([lemma.lemmatize(word) for word in words if word not in (stop)])\n",
312
+ " text = \"\".join(words)\n",
313
+ " # removing non-alphabetic characters\n",
314
+ " txt = re.sub('[^a-z]',' ',text)\n",
315
+ " return txt \n"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "markdown",
320
+ "metadata": {
321
+ "id": "t9K2uFC65CjP"
322
+ },
323
+ "source": [
324
+ "## Applying Clean Tweet Function on Tweets Text"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": null,
330
+ "metadata": {
331
+ "colab": {
332
+ "base_uri": "https://localhost:8080/",
333
+ "height": 206
334
+ },
335
+ "id": "_j712nT9Ma4L",
336
+ "outputId": "8e494e08-f0f9-453f-f12c-147d8b55baa4"
337
+ },
338
+ "outputs": [],
339
+ "source": [
340
+ "df['cleaned_tweets'] = df['text'].apply(cleanTweet)\n",
341
+ "df.head()"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "markdown",
346
+ "metadata": {
347
+ "id": "-z-24scU5NgM"
348
+ },
349
+ "source": [
350
+ "## Creating Feature & Target Variables"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "metadata": {
357
+ "id": "T7UA_KKNIAK6"
358
+ },
359
+ "outputs": [],
360
+ "source": [
361
+ "y = df.target\n",
362
+ "X=df.cleaned_tweets"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "metadata": {
369
+ "id": "IOkKFRGwIXrr"
370
+ },
371
+ "outputs": [],
372
+ "source": [
373
+ "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,stratify=y, random_state=0)"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "markdown",
378
+ "metadata": {
379
+ "id": "KTixc5ua6WPb"
380
+ },
381
+ "source": [
382
+ "## TF-IDF Vectorizer - Bi-Gram"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": null,
388
+ "metadata": {
389
+ "id": "QcOwtLDiIcna"
390
+ },
391
+ "outputs": [],
392
+ "source": [
393
+ "tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))\n",
394
+ "tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)\n",
395
+ "tfidf_test_2 = tfidf_vectorizer.transform(X_test)"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "markdown",
400
+ "metadata": {
401
+ "id": "ET9odcJ46z3M"
402
+ },
403
+ "source": [
404
+ "## Multinomial Naive Bayes"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "metadata": {
411
+ "colab": {
412
+ "base_uri": "https://localhost:8080/"
413
+ },
414
+ "id": "EyAtlCrMIigj",
415
+ "outputId": "8739c74f-97cb-44bb-a5c9-02380d817047"
416
+ },
417
+ "outputs": [],
418
+ "source": [
419
+ "## Model Fitting\n",
420
+ "mnb_tf = MultinomialNB()\n",
421
+ "mnb_tf.fit(tfidf_train_2, y_train)\n",
422
+ "\n"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {
428
+ "id": "iP06hRJV80he"
429
+ },
430
+ "source": [
431
+ "## 10-Fold Cross Validation"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "execution_count": null,
437
+ "metadata": {
438
+ "colab": {
439
+ "base_uri": "https://localhost:8080/"
440
+ },
441
+ "id": "3dl2qwK_80Lg",
442
+ "outputId": "a29fee5d-1a00-4bcc-88cd-d9854a31fb8b"
443
+ },
444
+ "outputs": [],
445
+ "source": [
446
+ "from sklearn import model_selection\n",
447
+ "\n",
448
+ "kfold = model_selection.KFold(n_splits=10)\n",
449
+ "scoring = 'accuracy'\n",
450
+ "\n",
451
+ "acc_mnb2 = cross_val_score(estimator = mnb_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)\n",
452
+ "acc_mnb2.mean()"
453
+ ]
454
+ },
455
+ {
456
+ "cell_type": "markdown",
457
+ "metadata": {
458
+ "id": "VEhlOemY9o3v"
459
+ },
460
+ "source": [
461
+ "## Model Prediction on the Test Set"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": null,
467
+ "metadata": {
468
+ "colab": {
469
+ "base_uri": "https://localhost:8080/",
470
+ "height": 333
471
+ },
472
+ "id": "XMu-aOsJ9cMA",
473
+ "outputId": "c3c45740-cba3-4595-c8c4-a8a33ae41536"
474
+ },
475
+ "outputs": [],
476
+ "source": [
477
+ "pred_mnb2 = mnb_tf.predict(tfidf_test_2)\n",
478
+ "CM=confusion_matrix(y_test,pred_mnb2)\n",
479
+ "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
480
+ "\n",
481
+ "TN = CM[0][0]\n",
482
+ "FN = CM[1][0]\n",
483
+ "TP = CM[1][1]\n",
484
+ "FP = CM[0][1]\n",
485
+ "specificity = TN/(TN+FP)\n",
486
+ "\n",
487
+ "acc= accuracy_score(y_test, pred_mnb2)\n",
488
+ "\n",
489
+ "prec = precision_score(y_test, pred_mnb2)\n",
490
+ "rec = recall_score(y_test, pred_mnb2)\n",
491
+ "f1 = f1_score(y_test, pred_mnb2)\n",
492
+ "\n",
493
+ "\n",
494
+ "model_results =pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Bigram',acc, prec,rec,specificity, f1]],\n",
495
+ " columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
496
+ "\n",
497
+ "model_results"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "metadata": {
503
+ "id": "zAuAQhIh63sj"
504
+ },
505
+ "source": [
506
+ "## Passive Aggressive Classifier"
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "code",
511
+ "execution_count": null,
512
+ "metadata": {
513
+ "colab": {
514
+ "base_uri": "https://localhost:8080/"
515
+ },
516
+ "id": "o_aydxKyPnk_",
517
+ "outputId": "8a05104c-6b65-4c3c-a6bf-139c251518c5"
518
+ },
519
+ "outputs": [],
520
+ "source": [
521
+ "pass_tf = PassiveAggressiveClassifier()\n",
522
+ "pass_tf.fit(tfidf_train_2, y_train)"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "markdown",
527
+ "metadata": {
528
+ "id": "wIriBQmj-qsi"
529
+ },
530
+ "source": [
531
+ "## 10-Fold Cross Validation"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": null,
537
+ "metadata": {
538
+ "colab": {
539
+ "base_uri": "https://localhost:8080/"
540
+ },
541
+ "id": "5Ir7B5fe-vht",
542
+ "outputId": "19833836-7e14-4b96-a31c-d207564045dd"
543
+ },
544
+ "outputs": [],
545
+ "source": [
546
+ "\n",
547
+ "kfold = model_selection.KFold(n_splits=10)\n",
548
+ "scoring = 'accuracy'\n",
549
+ "\n",
550
+ "acc_pass2 = cross_val_score(estimator = pass_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)\n",
551
+ "acc_pass2.mean()"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "markdown",
556
+ "metadata": {
557
+ "id": "duOtJaGH-6Dl"
558
+ },
559
+ "source": [
560
+ "## Model Prediction"
561
+ ]
562
+ },
563
+ {
564
+ "cell_type": "code",
565
+ "execution_count": null,
566
+ "metadata": {
567
+ "colab": {
568
+ "base_uri": "https://localhost:8080/",
569
+ "height": 360
570
+ },
571
+ "id": "aV2OjmZv_Dat",
572
+ "outputId": "502445e1-c2ff-459b-8222-e529cd551be1"
573
+ },
574
+ "outputs": [],
575
+ "source": [
576
+ "pred_pass2 = pass_tf.predict(tfidf_test_2)\n",
577
+ "CM=confusion_matrix(y_test,pred_pass2)\n",
578
+ "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
579
+ "\n",
580
+ "acc = accuracy_score(y_test, pred_pass2)\n",
581
+ "prec = precision_score(y_test, pred_pass2)\n",
582
+ "rec = recall_score(y_test, pred_pass2)\n",
583
+ "f1 = f1_score(y_test, pred_pass2)\n",
584
+ "\n",
585
+ "results =pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Bigram',acc, prec,rec,specificity, f1]],\n",
586
+ " columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
587
+ "results = model_results.append(results, ignore_index = True)\n",
588
+ "results"
589
+ ]
590
+ },
591
+ {
592
+ "cell_type": "markdown",
593
+ "metadata": {
594
+ "id": "4kzfq6Vu6bj5"
595
+ },
596
+ "source": [
597
+ "## TF-IDF Vectorizer - Tri Gram"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": null,
603
+ "metadata": {
604
+ "id": "SHDeA4FfQnhu"
605
+ },
606
+ "outputs": [],
607
+ "source": [
608
+ "tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))\n",
609
+ "tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)\n",
610
+ "tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "markdown",
615
+ "metadata": {
616
+ "id": "6-7gipjN6-Sc"
617
+ },
618
+ "source": [
619
+ "## Multinomial Naive Bayes - Tri Gram"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "metadata": {
626
+ "colab": {
627
+ "base_uri": "https://localhost:8080/"
628
+ },
629
+ "id": "mooomowf6i5S",
630
+ "outputId": "e4b287da-ba29-45b1-ddde-986f131f4ee6"
631
+ },
632
+ "outputs": [],
633
+ "source": [
634
+ "mnb_tf3 = MultinomialNB()\n",
635
+ "mnb_tf3.fit(tfidf_train_3, y_train)"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "markdown",
640
+ "metadata": {
641
+ "id": "lWJ9ehn4_5Mk"
642
+ },
643
+ "source": [
644
+ "## 10-Fold Cross Validation"
645
+ ]
646
+ },
647
+ {
648
+ "cell_type": "code",
649
+ "execution_count": null,
650
+ "metadata": {
651
+ "colab": {
652
+ "base_uri": "https://localhost:8080/"
653
+ },
654
+ "id": "31vc8MHq_8Kh",
655
+ "outputId": "7f59800a-2e29-46c0-dcfd-62ec95b3c690"
656
+ },
657
+ "outputs": [],
658
+ "source": [
659
+ "kfold = model_selection.KFold(n_splits=10)\n",
660
+ "scoring = 'accuracy'\n",
661
+ "\n",
662
+ "acc_mnb3 = cross_val_score(estimator = mnb_tf, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)\n",
663
+ "acc_mnb3.mean()"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "markdown",
668
+ "metadata": {
669
+ "id": "vXcrlM9AAFUW"
670
+ },
671
+ "source": [
672
+ "## Model Prediction"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "code",
677
+ "execution_count": null,
678
+ "metadata": {
679
+ "colab": {
680
+ "base_uri": "https://localhost:8080/",
681
+ "height": 394
682
+ },
683
+ "id": "4Gm5Vakt_7v2",
684
+ "outputId": "9190b92b-9a7b-4845-ba53-423cfd6bbb26"
685
+ },
686
+ "outputs": [],
687
+ "source": [
688
+ "pred_mnb3 = mnb_tf3.predict(tfidf_test_3)\n",
689
+ "CM=confusion_matrix(y_test,pred_mnb3)\n",
690
+ "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
691
+ "\n",
692
+ "acc = accuracy_score(y_test, pred_mnb3)\n",
693
+ "prec = precision_score(y_test, pred_mnb3)\n",
694
+ "rec = recall_score(y_test, pred_mnb3)\n",
695
+ "f1 = f1_score(y_test, pred_mnb3)\n",
696
+ "\n",
697
+ "mod_results =pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Trigram',acc, prec,rec,specificity, f1]],\n",
698
+ " columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
699
+ "results = results.append(mod_results, ignore_index = True)\n",
700
+ "results"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "markdown",
705
+ "metadata": {
706
+ "id": "olvJkW1f7XhU"
707
+ },
708
+ "source": [
709
+ "## Passive Aggressive Classifier - Tri Gram"
710
+ ]
711
+ },
712
+ {
713
+ "cell_type": "code",
714
+ "execution_count": null,
715
+ "metadata": {
716
+ "colab": {
717
+ "base_uri": "https://localhost:8080/"
718
+ },
719
+ "id": "vQGkFat27GKm",
720
+ "outputId": "4368ca5e-9f84-47d6-d1fe-a6b79c1cf552"
721
+ },
722
+ "outputs": [],
723
+ "source": [
724
+ "pass_tf3 = PassiveAggressiveClassifier()\n",
725
+ "pass_tf3.fit(tfidf_train_3, y_train)\n",
726
+ "\n",
727
+ "## cross validation\n",
728
+ "kfold = model_selection.KFold(n_splits=10)\n",
729
+ "scoring = 'accuracy'\n",
730
+ "\n",
731
+ "acc_pass3 = cross_val_score(estimator = pass_tf3, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)\n",
732
+ "acc_pass3.mean()"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "code",
737
+ "execution_count": null,
738
+ "metadata": {
739
+ "colab": {
740
+ "base_uri": "https://localhost:8080/",
741
+ "height": 423
742
+ },
743
+ "id": "QGvVFjCo7g9e",
744
+ "outputId": "f685155a-59a6-44c7-9e0e-c261d2e9c5d9"
745
+ },
746
+ "outputs": [],
747
+ "source": [
748
+ "pred_pass3 = pass_tf3.predict(tfidf_test_3)\n",
749
+ "CM=confusion_matrix(y_test,pred_pass3)\n",
750
+ "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
751
+ "\n",
752
+ "acc = accuracy_score(y_test, pred_pass3)\n",
753
+ "prec = precision_score(y_test, pred_pass3)\n",
754
+ "rec = recall_score(y_test, pred_pass3)\n",
755
+ "f1 = f1_score(y_test, pred_pass3)\n",
756
+ "\n",
757
+ "mod1_results =pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Trigram',acc, prec,rec,specificity, f1]],\n",
758
+ " columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
759
+ "results = results.append(mod1_results, ignore_index = True)\n",
760
+ "results"
761
+ ]
762
+ },
763
+ {
764
+ "cell_type": "markdown",
765
+ "metadata": {
766
+ "id": "6BIjKOjyB3Bi"
767
+ },
768
+ "source": [
769
+ "## Most Informative Features"
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "code",
774
+ "execution_count": null,
775
+ "metadata": {
776
+ "id": "zip4Fbfo7koR"
777
+ },
778
+ "outputs": [],
779
+ "source": [
780
+ "def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):\n",
781
+ " \"\"\"\n",
782
+ " See: https://stackoverflow.com/a/26980472\n",
783
+ " \n",
784
+ " Identify most important features if given a vectorizer and binary classifier. Set n to the number\n",
785
+ " of weighted features you would like to show. (Note: current implementation merely prints and does not \n",
786
+ " return top classes.)\n",
787
+ " \"\"\"\n",
788
+ "\n",
789
+ " class_labels = classifier.classes_\n",
790
+ " feature_names = vectorizer.get_feature_names_out()\n",
791
+ " topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]\n",
792
+ " topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]\n",
793
+ "\n",
794
+ " for coef, feat in topn_class1:\n",
795
+ " print(class_labels[0], coef, feat)\n",
796
+ "\n",
797
+ " print()\n",
798
+ "\n",
799
+ " for coef, feat in reversed(topn_class2):\n",
800
+ " print(class_labels[1], coef, feat)"
801
+ ]
802
+ },
803
+ {
804
+ "cell_type": "code",
805
+ "execution_count": null,
806
+ "metadata": {
807
+ "colab": {
808
+ "base_uri": "https://localhost:8080/"
809
+ },
810
+ "id": "0_LZODtNB7TW",
811
+ "outputId": "bd9941bd-5dec-45cd-bb93-739e1299e9be"
812
+ },
813
+ "outputs": [],
814
+ "source": [
815
+ "most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)"
816
+ ]
817
+ },
818
+ {
819
+ "cell_type": "code",
820
+ "execution_count": null,
821
+ "metadata": {
822
+ "colab": {
823
+ "base_uri": "https://localhost:8080/"
824
+ },
825
+ "id": "slO0gzyHCBJD",
826
+ "outputId": "998697f5-38d4-4726-bbf6-28fd7e00b511"
827
+ },
828
+ "outputs": [],
829
+ "source": [
830
+ "most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)"
831
+ ]
832
+ },
833
+ {
834
+ "cell_type": "markdown",
835
+ "metadata": {
836
+ "id": "fJg_mWAGDsLM"
837
+ },
838
+ "source": [
839
+ "## Sample prediction"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": null,
845
+ "metadata": {
846
+ "colab": {
847
+ "base_uri": "https://localhost:8080/"
848
+ },
849
+ "id": "12swn6EUCiIz",
850
+ "outputId": "b2375378-7a66-4cc2-95f3-ff302ca92666"
851
+ },
852
+ "outputs": [],
853
+ "source": [
854
+ "sentences = [\n",
855
+ " \"Just happened a terrible car crash\",\n",
856
+ " \"Heard about #earthquake is different cities, stay safe everyone.\",\n",
857
+ " \"No I don't like cold!\",\n",
858
+ " \"@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?\"\n",
859
+ " ]\n",
860
+ "\n",
861
+ "tfidf_trigram = tfidf_vectorizer_3.transform(sentences)\n",
862
+ "\n",
863
+ "\n",
864
+ "predictions = pass_tf3.predict(tfidf_trigram)\n",
865
+ "\n",
866
+ "for text, label in zip(sentences, predictions):\n",
867
+ " if label==1:\n",
868
+ " target=\"Disaster Tweet\"\n",
869
+ " print(\"text:\", text, \"\\nClass:\", target)\n",
870
+ " print()\n",
871
+ " else:\n",
872
+ " target=\"Normal Tweet\"\n",
873
+ " print(\"text:\", text, \"\\nClass:\", target)\n",
874
+ " print()"
875
+ ]
876
+ },
877
+ {
878
+ "cell_type": "code",
879
+ "execution_count": null,
880
+ "metadata": {
881
+ "id": "EUBaz8aAE2ko"
882
+ },
883
+ "outputs": [],
884
+ "source": []
885
+ }
886
+ ],
887
+ "metadata": {
888
+ "colab": {
889
+ "collapsed_sections": [],
890
+ "name": "Disaster Detection From Tweets using ML.ipynb",
891
+ "provenance": []
892
+ },
893
+ "gpuClass": "standard",
894
+ "kernelspec": {
895
+ "display_name": "Python 3",
896
+ "name": "python3"
897
+ },
898
+ "language_info": {
899
+ "codemirror_mode": {
900
+ "name": "ipython",
901
+ "version": 3
902
+ },
903
+ "file_extension": ".py",
904
+ "mimetype": "text/x-python",
905
+ "name": "python",
906
+ "nbconvert_exporter": "python",
907
+ "pygments_lexer": "ipython3",
908
+ "version": "3.12.0"
909
+ }
910
+ },
911
+ "nbformat": 4,
912
+ "nbformat_minor": 0
913
+ }
disaster_tweets.csv ADDED
The diff for this file is too large to render. See raw diff