vikranth1111 commited on
Commit
469b201
·
1 Parent(s): b95f34b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +501 -0
app.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[6]:
5
+
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+ import itertools
10
+ import seaborn as sns
11
+ import nltk, re, string
12
+ from string import punctuation
13
+ from nltk.corpus import stopwords
14
+ import matplotlib.pyplot as plt
15
# The inline backend only exists inside IPython/Jupyter. When this file runs
# as a plain script, get_ipython is undefined, so skip the magic instead of
# crashing with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
16
+ from sklearn.metrics import accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score
17
+
18
+ from sklearn.feature_extraction.text import TfidfVectorizer
19
+ from sklearn.model_selection import train_test_split,cross_val_score
20
+ #machine learning
21
+ from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression
22
+ # machine learning
23
+ from sklearn.naive_bayes import MultinomialNB,GaussianNB
24
+ nltk.download('stopwords')
25
+ nltk.download('punkt')
26
+ nltk.download('wordnet')
27
+ nltk.download('omw-1.4')
28
+
29
+
30
+ # In[20]:
31
+
32
+
33
# Dependencies are installed outside this script (for example through a
# requirements.txt file):
#   wordcloud pandas numpy seaborn nltk matplotlib scikit-learn
# A bare `pip install ...` line is IPython shell syntax and is a SyntaxError
# when the file is executed as a regular Python module.
46
+
47
+
48
+ # In[9]:
49
+
50
+
51
import ssl

import nltk

# Fetch only the corpora this script uses. nltk.download() with no arguments
# opens NLTK's interactive downloader, which blocks a non-interactive run.
# NOTE(review): this cell used to replace ssl._create_default_https_context
# with an unverified context, which turns off TLS certificate checks for every
# HTTPS request made by the process. If downloads fail with certificate
# errors, repair the local CA bundle rather than disabling verification.
for _resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)
56
+
57
+
58
+ # In[10]:
59
+
60
+
61
+ df = pd.read_csv('disaster_tweets.csv')
62
+ df.head()
63
+
64
+
65
+ # In[11]:
66
+
67
+
68
+ df.info()
69
+
70
+
71
+ # ## Target Distribution
72
+
73
+ # In[12]:
74
+
75
+
76
sns.set_style("dark")
# Pass the column by keyword: in seaborn >= 0.12 the only positional
# parameter of countplot is `data`, so a bare Series is no longer used as x.
sns.countplot(x=df.target)
78
+
79
+
80
+ # In[13]:
81
+
82
+
83
+ # creating a new column that stores the length of each tweet
84
+ df['length'] = df['text'].apply(len)
85
+ df.head()
86
+
87
+
88
+ # In[14]:
89
+
90
+
91
+ df['length'].plot(bins=50, kind='hist')
92
+
93
+
94
+ # In[15]:
95
+
96
+
97
+ df.length.describe()
98
+
99
+
100
+ # In[16]:
101
+
102
+
103
+ df[df['length'] == 157]['text'].iloc[0]
104
+
105
+
106
+ # In[17]:
107
+
108
+
109
+ df.hist(column='length', by='target', bins=50,figsize=(10,4))
110
+
111
+
112
+ # In[18]:
113
+
114
+
115
+ stop = set(stopwords.words('english'))
116
+ punctuation = list(string.punctuation)
117
+ stop.update(punctuation)
118
+
119
+ # Removing unnecessary stop words from the tweet text
120
def remove_stopwords(text, stop_words=None):
    """Return *text* with stop-word tokens removed.

    The text is split on whitespace. A token is dropped when its lower-cased
    form is contained in the stop-word collection; surviving tokens keep their
    original casing and are re-joined with single spaces.

    Args:
        text: String to filter.
        stop_words: Optional collection of lower-case tokens to drop. When
            omitted, the module-level ``stop`` collection is used.

    Returns:
        The filtered text as one space-separated string ("" if nothing is left).
    """
    if stop_words is None:
        # Resolved at call time, so the function follows whatever ``stop`` is
        # currently bound to at module level.
        stop_words = stop
    # str.split() without a separator already discards surrounding whitespace,
    # so tokens need no extra strip().
    return " ".join(
        token for token in text.split() if token.lower() not in stop_words
    )
126
+
127
# Split the tweets by label and strip stop words for the word clouds.
# .copy() gives each subset its own frame, so assigning to 'text' below does
# not raise pandas' SettingWithCopyWarning.
df_1 = df[df['target'] == 1].copy()
df_0 = df[df['target'] == 0].copy()
df_1['text'] = df_1['text'].apply(remove_stopwords)
df_0['text'] = df_0['text'].apply(remove_stopwords)
131
+
132
+
133
+ # ## Plotting wordcloud of Disaster Tweets
134
+
135
+ # In[21]:
136
+
137
+
138
+ from wordcloud import WordCloud
139
+ plt.figure(figsize = (20,20)) # Text that is Disaster tweets
140
+ wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(" ".join(df_1.text))
141
+ plt.imshow(wc , interpolation = 'bilinear')
142
+
143
+
144
+ # ## Plotting wordcloud of Normal Tweets
145
+
146
+ # In[22]:
147
+
148
+
149
+ plt.figure(figsize = (20,20)) # Text that is Normal Tweets
150
+ wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(" ".join(df_0.text))
151
+ plt.imshow(wc , interpolation = 'bilinear')
152
+
153
+
154
+ # ## Data Cleaning and Preparation
155
+
156
+ # In[23]:
157
+
158
+
159
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
# NLTK's English stop words, stored as a set: cleanTweet tests membership once
# per token, which is a linear scan on the list that stopwords.words() returns.
stop = set(stopwords.words('english'))
163
+
164
def cleanTweet(txt):
    """Normalise one tweet before vectorisation.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``. Tokens
    found in the module-level ``stop`` collection are discarded, the rest are
    lemmatised with ``lemma`` and joined with single spaces. Finally every
    character outside ``a-z`` is replaced by a space.

    Args:
        txt: Raw tweet text.

    Returns:
        The cleaned text.
    """
    kept = []
    for token in nltk.word_tokenize(txt.lower()):
        if token in stop:
            continue
        kept.append(lemma.lemmatize(token))
    # The substitution runs on the joined string, so token separators stay
    # spaces and digits/punctuation become spaces as well.
    return re.sub('[^a-z]', ' ', ' '.join(kept))
175
+
176
+
177
+ # ## Applying Clean Tweet Function on Tweets Text
178
+
179
+ # In[24]:
180
+
181
+
182
+ df['cleaned_tweets'] = df['text'].apply(cleanTweet)
183
+ df.head()
184
+
185
+
186
+ # ## Creating Feature & Target Variables
187
+
188
+ # In[25]:
189
+
190
+
191
+ y = df.target
192
+ X=df.cleaned_tweets
193
+
194
+
195
+ # In[26]:
196
+
197
+
198
+ X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,stratify=y, random_state=0)
199
+
200
+
201
+ # ## TF-IDF Vectorizer - Bi-Gram
202
+
203
+ # In[27]:
204
+
205
+
206
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))
207
+ tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
208
+ tfidf_test_2 = tfidf_vectorizer.transform(X_test)
209
+
210
+
211
+ # ## Multinomial Naive Bayes
212
+
213
+ # In[28]:
214
+
215
+
216
+ ## Model Fitting
217
+ mnb_tf = MultinomialNB()
218
+ mnb_tf.fit(tfidf_train_2, y_train)
219
+
220
+
221
+
222
+ # ## 10-Fold Cross Validation
223
+
224
+ # In[29]:
225
+
226
+
227
+ from sklearn import model_selection
228
+
229
+ kfold = model_selection.KFold(n_splits=10)
230
+ scoring = 'accuracy'
231
+
232
+ acc_mnb2 = cross_val_score(estimator = mnb_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)
233
+ acc_mnb2.mean()
234
+
235
+
236
+ # ## Model Prediction Test set
237
+
238
+ # In[30]:
239
+
240
+
241
# Score the bigram Multinomial Naive Bayes model on the held-out split.
pred_mnb2 = mnb_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_mnb2)
sns.heatmap(
    CM,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='',
    xticklabels=['Normal', 'Disaster'],
    yticklabels=['Normal', 'Disaster'],
)

# Rows of the matrix are the true labels, columns the predicted labels.
(TN, FP), (FN, TP) = CM
specificity = TN / (TN + FP)

acc = accuracy_score(y_test, pred_mnb2)
prec = precision_score(y_test, pred_mnb2)
rec = recall_score(y_test, pred_mnb2)
f1 = f1_score(y_test, pred_mnb2)

# First row of the model-comparison table.
model_results = pd.DataFrame(
    [['Multinomial Naive Bayes - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
    columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'],
)

model_results
262
+
263
+
264
+ # ## Passive Aggressive Classifier
265
+
266
+ # In[31]:
267
+
268
+
269
+ pass_tf = PassiveAggressiveClassifier()
270
+ pass_tf.fit(tfidf_train_2, y_train)
271
+
272
+
273
+ # ## 10-Fold Cross Validation
274
+
275
+ # In[32]:
276
+
277
+
278
+ kfold = model_selection.KFold(n_splits=10)
279
+ scoring = 'accuracy'
280
+
281
+ acc_pass2 = cross_val_score(estimator = pass_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)
282
+ acc_pass2.mean()
283
+
284
+
285
+ # ## Model Prediction
286
+
287
+ # In[33]:
288
+
289
+
290
# Score the bigram Passive Aggressive model on the held-out split.
pred_pass2 = pass_tf.predict(tfidf_test_2)
CM = confusion_matrix(y_test, pred_pass2)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])

acc = accuracy_score(y_test, pred_pass2)
prec = precision_score(y_test, pred_pass2)
rec = recall_score(y_test, pred_pass2)
f1 = f1_score(y_test, pred_pass2)
# Specificity = TN / (TN + FP), taken from THIS model's confusion matrix.
# Without this line the table would repeat the value computed for a
# previously evaluated model.
specificity = CM[0][0] / (CM[0][0] + CM[0][1])

results = pd.DataFrame(
    [['Passive Aggressive Classifier - TFIDF-Bigram', acc, prec, rec, specificity, f1]],
    columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table.
results = pd.concat([model_results, results], ignore_index=True)
results
303
+
304
+
305
+ # ## TF-IDF Vectorizer - Tri Gram
306
+
307
+ # In[34]:
308
+
309
+
310
+ tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))
311
+ tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)
312
+ tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)
313
+
314
+
315
+ # ## Multinomial Naive Bayes - Tri Gram
316
+
317
+ # In[35]:
318
+
319
+
320
+ mnb_tf3 = MultinomialNB()
321
+ mnb_tf3.fit(tfidf_train_3, y_train)
322
+
323
+
324
+ # ## 10-fold cross validation
325
+
326
+ # In[36]:
327
+
328
+
329
kfold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'

# Cross-validate the trigram estimator (mnb_tf3), matching the features passed
# in, instead of the bigram model object.
# NOTE(review): cross_val_score fits clones of the estimator, so with both
# models left at default settings the scores should not change - confirm.
acc_mnb3 = cross_val_score(estimator=mnb_tf3, X=tfidf_train_3, y=y_train, cv=kfold, scoring=scoring)
acc_mnb3.mean()
334
+
335
+
336
+ # ## Model Prediction
337
+
338
+ # In[37]:
339
+
340
+
341
# Score the trigram Multinomial Naive Bayes model on the held-out split.
pred_mnb3 = mnb_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_mnb3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])

acc = accuracy_score(y_test, pred_mnb3)
prec = precision_score(y_test, pred_mnb3)
rec = recall_score(y_test, pred_mnb3)
f1 = f1_score(y_test, pred_mnb3)
# Specificity = TN / (TN + FP), taken from THIS model's confusion matrix.
# Without this line the table would repeat the value computed for a
# previously evaluated model.
specificity = CM[0][0] / (CM[0][0] + CM[0][1])

mod_results = pd.DataFrame(
    [['Multinomial Naive Bayes - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
    columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table.
results = pd.concat([results, mod_results], ignore_index=True)
results
354
+
355
+
356
+ # ## Passive Aggressive Classifier - Tri Gram
357
+
358
+ # In[38]:
359
+
360
+
361
+ pass_tf3 = PassiveAggressiveClassifier()
362
+ pass_tf3.fit(tfidf_train_3, y_train)
363
+
364
+ ## cross validation
365
+ kfold = model_selection.KFold(n_splits=10)
366
+ scoring = 'accuracy'
367
+
368
+ acc_pass3 = cross_val_score(estimator = pass_tf3, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)
369
+ acc_pass3.mean()
370
+
371
+
372
+ # In[39]:
373
+
374
+
375
# Score the trigram Passive Aggressive model on the held-out split.
pred_pass3 = pass_tf3.predict(tfidf_test_3)
CM = confusion_matrix(y_test, pred_pass3)
sns.heatmap(CM, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='',
            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])

acc = accuracy_score(y_test, pred_pass3)
prec = precision_score(y_test, pred_pass3)
rec = recall_score(y_test, pred_pass3)
f1 = f1_score(y_test, pred_pass3)
# Specificity = TN / (TN + FP), taken from THIS model's confusion matrix.
# Without this line the table would repeat the value computed for a
# previously evaluated model.
specificity = CM[0][0] / (CM[0][0] + CM[0][1])

mod1_results = pd.DataFrame(
    [['Passive Aggressive Classifier - TFIDF-Trigram', acc, prec, rec, specificity, f1]],
    columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'],
)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same table.
results = pd.concat([results, mod1_results], ignore_index=True)
results
388
+
389
+
390
+ # ## Most Informative Features
391
+
392
+ # In[40]:
393
+
394
+
395
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """Print the n most strongly weighted features for each of two classes.

    See: https://stackoverflow.com/a/26980472

    Features are ranked by a per-feature weight. The n lowest weights are
    printed next to ``classifier.classes_[0]``, then a blank line, then the n
    highest weights (largest first) next to ``classifier.classes_[1]``.

    The weight is ``classifier.coef_[0]`` when the classifier exposes
    ``coef_``. Otherwise it is the per-feature difference
    ``feature_log_prob_[1] - feature_log_prob_[0]``, which lets the function
    work with estimators that only provide ``feature_log_prob_``.

    Args:
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.
        classifier: Fitted binary classifier providing ``classes_`` and either
            ``coef_`` or ``feature_log_prob_``.
        n: Number of features to print per class. Values <= 0 print no
            features (only the separating blank line).

    Returns:
        None. The ranking is only printed.
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()

    weights = getattr(classifier, "coef_", None)
    if weights is not None:
        weights = weights[0]
    else:
        log_prob = classifier.feature_log_prob_
        weights = [pos - neg for neg, pos in zip(log_prob[0], log_prob[1])]

    # Sort once and slice both ends. The n > 0 guard matters because
    # ranked[-0:] would select the whole list rather than nothing.
    ranked = sorted(zip(weights, feature_names))
    lowest = ranked[:n] if n > 0 else []
    highest = ranked[-n:] if n > 0 else []

    for coef, feat in lowest:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(highest):
        print(class_labels[1], coef, feat)
416
+
417
+
418
+ # In[41]:
419
+
420
+
421
+ most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)
422
+
423
+
424
+ # In[42]:
425
+
426
+
427
+ most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)
428
+
429
+
430
+ # ## Sample prediction
431
+
432
+ # In[43]:
433
+
434
+
435
sentences = [
    "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?"
]

# The trigram vectorizer was fitted on cleanTweet() output, so new text goes
# through the same cleaning before it is transformed.
tfidf_trigram = tfidf_vectorizer_3.transform([cleanTweet(sentence) for sentence in sentences])

predictions = pass_tf3.predict(tfidf_trigram)

# Show the original sentence next to its predicted class.
for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if label == 1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()
456
+
457
+
458
+ # In[44]:
459
+
460
+
461
# Dependency installed outside this script (for example through a
# requirements.txt file): gradio
# The original cells ran `pip install gradio` and `pip install gradio
# tensorflow`; a bare `pip install ...` line is a SyntaxError in a regular
# Python module, and no tensorflow import appears in this file.
468
+
469
+
470
+ # In[61]:
471
+
472
+
473
+ import gradio as gr
474
+
475
+
476
+
477
def sample_prediction(inputs):
    """Classify each non-blank line of *inputs* as a disaster or normal tweet.

    Uses the trigram TF-IDF vectorizer and the trigram Passive Aggressive
    classifier defined at module level.

    Args:
        inputs: Text from the Gradio textbox, one tweet per line.

    Returns:
        A ``(results, accuracy)`` tuple. ``results`` is a list holding one
        label string per non-blank input line; ``accuracy`` is the accuracy of
        ``pred_pass3`` against ``y_test`` formatted as a percentage string.
    """
    # Report the accuracy measured on the held-out split rather than a
    # hard-coded figure that is not tied to any evaluation in this file.
    accuracy = f"{accuracy_score(y_test, pred_pass3):.0%}"

    # One tweet per line; blank lines carry no text to classify.
    sentences = [line for line in (inputs or "").split('\n') if line.strip()]
    if not sentences:
        # Nothing to transform or predict for empty input.
        return [], accuracy

    # The vectorizer was fitted on cleanTweet() output, so apply the same
    # cleaning here before transforming.
    cleaned = [cleanTweet(sentence) for sentence in sentences]
    tfidf_trigram = tfidf_vectorizer_3.transform(cleaned)
    predictions = pass_tf3.predict(tfidf_trigram)
    results = [" Disaster Tweet " if prediction == 1 else " Normal Tweet " for prediction in predictions]
    return results, accuracy
487
+
488
+ iface = gr.Interface(
489
+ fn=sample_prediction,
490
+
491
+ inputs=gr.Textbox(label="Enter Sentences (separate by newline)", type="text"),
492
+ outputs=[
493
+ gr.Textbox(label="Results"),
494
+ gr.Textbox(label="Accuracy")
495
+ ],
496
+ title="Tweet Classifier",
497
+ description="Enter multiple sentences (separate by newline) and get predictions."
498
+ )
499
+
500
+ iface.launch(share=True)
501
+