#!/usr/bin/env python
# coding: utf-8
"""Disaster-tweet classification pipeline (exported from a Jupyter notebook).

Steps, in order:
  1. Load the tweets CSV and plot a few exploratory charts.
  2. Clean the tweet text (lowercase, tokenize, drop stopwords, lemmatize).
  3. Build TF-IDF features (1-2 grams and 1-3 grams).
  4. Fit Multinomial Naive Bayes and Passive Aggressive classifiers, run
     10-fold cross-validation, and collect test-set metrics in ``results``.
  5. Print the most informative features and classify sample sentences.
  6. Serve the trigram Passive Aggressive model through a Gradio UI.

Bare expressions such as ``df.head()`` are notebook display cells; they have
no visible effect when the file is run as a plain script.
"""

import itertools
import re
import ssl
import string
from string import punctuation

import gradio as gr
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from wordcloud import WordCloud

# The inline-plot magic only exists inside IPython/Jupyter; skip it elsewhere.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

# Download only the NLTK resources this script uses.
# "punkt_tab" is requested as well because newer NLTK releases load the
# tokenizer tables from it; nltk.download() reports failure by returning
# False rather than raising, so an unknown id is harmless.
# NOTE(review): the notebook also called the interactive ``nltk.download()``
# and replaced ``ssl._create_default_https_context`` with an unverified
# context. Both were dropped: the first blocks non-interactive runs, the
# second disables TLS certificate verification for the whole process.
for _resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(_resource)

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
DATA_PATH = "/home/user/disaster_tweets.csv"

df = pd.read_csv(DATA_PATH)
df.head()
df.info()

# ---------------------------------------------------------------------------
# Target distribution and tweet-length exploration
# ---------------------------------------------------------------------------
sns.set_style("dark")
plt.figure()
# Keyword form: recent seaborn versions no longer accept the data positionally.
sns.countplot(x=df["target"])

# New column storing the character length of each tweet.
df["length"] = df["text"].apply(len)
df.head()

plt.figure()
df["length"].plot(bins=50, kind="hist")

df["length"].describe()

# Look at one tweet of length 157.
# NOTE(review): 157 is presumably the maximum reported by describe() for this
# dataset - confirm; .iloc[0] raises IndexError if no tweet has that length.
df[df["length"] == 157]["text"].iloc[0]

df.hist(column="length", by="target", bins=50, figsize=(10, 4))

# ---------------------------------------------------------------------------
# Word clouds
# ---------------------------------------------------------------------------
# Stopwords plus every punctuation character, used only for the word clouds.
wordcloud_stop = set(stopwords.words("english"))
wordcloud_stop.update(string.punctuation)


def remove_stopwords(text):
    """Return *text* with stopword and single-punctuation tokens removed.

    Tokens are produced by whitespace splitting and compared in lowercase
    against ``wordcloud_stop``; the surviving tokens keep their original case
    and are re-joined with single spaces.
    """
    kept = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in wordcloud_stop
    ]
    return " ".join(kept)


# .copy() so the assignments below modify independent frames instead of
# writing into a filtered view of ``df`` (chained-assignment warning).
df_1 = df[df["target"] == 1].copy()
df_0 = df[df["target"] == 0].copy()
df_1["text"] = df_1["text"].apply(remove_stopwords)
df_0["text"] = df_0["text"].apply(remove_stopwords)

# Word cloud of disaster tweets.
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_1.text))
plt.imshow(wc, interpolation="bilinear")

# Word cloud of normal tweets.
plt.figure(figsize=(20, 20))
wc = WordCloud(max_words=1000, width=1600, height=800).generate(" ".join(df_0.text))
plt.imshow(wc, interpolation="bilinear")

# ---------------------------------------------------------------------------
# Data cleaning and preparation
# ---------------------------------------------------------------------------
lemma = WordNetLemmatizer()
# English stopwords (no punctuation here); a set gives O(1) membership tests.
stop = set(stopwords.words("english"))


def cleanTweet(txt):
    """Normalize one tweet for vectorization.

    The text is lowercased, tokenized with ``nltk.word_tokenize``, stripped of
    English stopwords, lemmatized token by token, re-joined with spaces, and
    finally every character outside ``a-z`` is replaced by a space.

    Args:
        txt: Raw tweet text.

    Returns:
        The cleaned text as a single string.
    """
    tokens = nltk.word_tokenize(txt.lower())
    joined = " ".join(lemma.lemmatize(token) for token in tokens if token not in stop)
    return re.sub(r"[^a-z]", " ", joined)


df["cleaned_tweets"] = df["text"].apply(cleanTweet)
df.head()

# ---------------------------------------------------------------------------
# Features, target and train/test split
# ---------------------------------------------------------------------------
y = df.target
X = df.cleaned_tweets

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=0
)

# ---------------------------------------------------------------------------
# Evaluation helpers
# ---------------------------------------------------------------------------
RESULT_COLUMNS = [
    "Model",
    "Accuracy",
    "Precision",
    "Sensitivity",
    "Specificity",
    "F1 Score",
]


def cross_validated_accuracy(model, features, labels, n_splits=10):
    """Return the per-fold accuracy of *model* under K-fold cross-validation."""
    kfold = model_selection.KFold(n_splits=n_splits)
    return cross_val_score(
        estimator=model, X=features, y=labels, cv=kfold, scoring="accuracy"
    )


def evaluate_on_test(model_name, y_true, y_pred):
    """Plot the confusion matrix and return a one-row metrics DataFrame.

    Specificity is derived from this model's own confusion matrix on every
    call, so each result row carries its own value.

    Args:
        model_name: Label stored in the ``Model`` column.
        y_true: True binary labels (0 = normal, 1 = disaster).
        y_pred: Predicted binary labels.

    Returns:
        A DataFrame with a single row and the columns in ``RESULT_COLUMNS``.
    """
    # labels=[0, 1] pins the matrix layout to [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure()
    sns.heatmap(
        cm,
        cmap="Blues",
        linecolor="black",
        linewidth=1,
        annot=True,
        fmt="",
        xticklabels=["Normal", "Disaster"],
        yticklabels=["Normal", "Disaster"],
    )
    true_neg, false_pos = cm[0][0], cm[0][1]
    specificity = true_neg / (true_neg + false_pos)
    row = [
        model_name,
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        specificity,
        f1_score(y_true, y_pred),
    ]
    return pd.DataFrame([row], columns=RESULT_COLUMNS)


# ---------------------------------------------------------------------------
# TF-IDF vectorizer - bi-gram
# ---------------------------------------------------------------------------
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.8, ngram_range=(1, 2))
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes (bi-gram)
mnb_tf = MultinomialNB()
mnb_tf.fit(tfidf_train_2, y_train)

acc_mnb2 = cross_validated_accuracy(mnb_tf, tfidf_train_2, y_train)
acc_mnb2.mean()

pred_mnb2 = mnb_tf.predict(tfidf_test_2)
model_results = evaluate_on_test(
    "Multinomial Naive Bayes - TFIDF-Bigram", y_test, pred_mnb2
)
model_results

# Passive Aggressive Classifier (bi-gram)
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)

acc_pass2 = cross_validated_accuracy(pass_tf, tfidf_train_2, y_train)
acc_pass2.mean()

pred_pass2 = pass_tf.predict(tfidf_test_2)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results = pd.concat(
    [
        model_results,
        evaluate_on_test(
            "Passive Aggressive Classifier - TFIDF-Bigram", y_test, pred_pass2
        ),
    ],
    ignore_index=True,
)
results

# ---------------------------------------------------------------------------
# TF-IDF vectorizer - tri-gram
# ---------------------------------------------------------------------------
tfidf_vectorizer_3 = TfidfVectorizer(
    stop_words="english", max_df=0.8, ngram_range=(1, 3)
)
tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)

# Multinomial Naive Bayes (tri-gram)
mnb_tf3 = MultinomialNB()
mnb_tf3.fit(tfidf_train_3, y_train)

# Cross-validate the tri-gram estimator itself (the notebook passed mnb_tf).
acc_mnb3 = cross_validated_accuracy(mnb_tf3, tfidf_train_3, y_train)
acc_mnb3.mean()

pred_mnb3 = mnb_tf3.predict(tfidf_test_3)
mod_results = evaluate_on_test(
    "Multinomial Naive Bayes - TFIDF-Trigram", y_test, pred_mnb3
)
results = pd.concat([results, mod_results], ignore_index=True)
results

# Passive Aggressive Classifier (tri-gram)
pass_tf3 = PassiveAggressiveClassifier()
pass_tf3.fit(tfidf_train_3, y_train)

acc_pass3 = cross_validated_accuracy(pass_tf3, tfidf_train_3, y_train)
acc_pass3.mean()

pred_pass3 = pass_tf3.predict(tfidf_test_3)
mod1_results = evaluate_on_test(
    "Passive Aggressive Classifier - TFIDF-Trigram", y_test, pred_pass3
)
results = pd.concat([results, mod1_results], ignore_index=True)
results


# ---------------------------------------------------------------------------
# Most informative features
# ---------------------------------------------------------------------------
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """Print the *n* lowest- and *n* highest-weighted features of a classifier.

    See: https://stackoverflow.com/a/26980472

    Linear models are ranked by ``coef_[0]``. Estimators without ``coef_``
    (``MultinomialNB`` in current scikit-learn) are ranked by
    ``feature_log_prob_[1]``, the positive-class log-probabilities that the
    removed ``coef_`` attribute used to expose for binary problems.

    Args:
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.
        classifier: Fitted binary classifier providing ``classes_``.
        n: Number of features to print per class.

    Returns:
        None. The function only prints; lines for the first class label come
        first (ascending weight), then a blank line, then lines for the second
        class label (descending weight).
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names_out()

    if hasattr(classifier, "coef_"):
        weights = classifier.coef_[0]
    else:
        weights = classifier.feature_log_prob_[1]

    # Sort once and slice both ends.
    ranked = sorted(zip(weights, feature_names))
    lowest = ranked[:n]
    # ranked[-0:] would be the whole list, so n == 0 needs an explicit guard.
    highest = ranked[-n:] if n else []

    for coef, feat in lowest:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(highest):
        print(class_labels[1], coef, feat)


most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)

most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)


# ---------------------------------------------------------------------------
# Sample prediction
# ---------------------------------------------------------------------------
def classify_tweets(sentences):
    """Predict 0/1 labels for raw sentences with the tri-gram PA model.

    The sentences go through ``cleanTweet`` first so that inference sees the
    same preprocessing the vectorizer and model were fitted on.
    """
    cleaned = [cleanTweet(sentence) for sentence in sentences]
    return pass_tf3.predict(tfidf_vectorizer_3.transform(cleaned))


sentences = [
    "Just happened a terrible car crash",
    "Heard about #earthquake is different cities, stay safe everyone.",
    "No I don't like cold!",
    "@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?",
]
predictions = classify_tweets(sentences)

for text, label in zip(sentences, predictions):
    target = "Disaster Tweet" if label == 1 else "Normal Tweet"
    print("text:", text, "\nClass:", target)
    print()

# ---------------------------------------------------------------------------
# Gradio demo
# ---------------------------------------------------------------------------
# Held-out accuracy of the model served below, shown in the UI in place of a
# hard-coded figure.
TEST_ACCURACY = f"{accuracy_score(y_test, pred_pass3):.0%}"


def sample_prediction(inputs):
    """Classify newline-separated sentences for the Gradio interface.

    Args:
        inputs: Text from the input box, one sentence per line. Blank lines
            are ignored.

    Returns:
        A ``(results, accuracy)`` tuple of strings: one predicted class per
        non-blank input line (newline-separated), and the model's test-set
        accuracy as a percentage.
    """
    lines = [line.strip() for line in (inputs or "").splitlines()]
    lines = [line for line in lines if line]
    if not lines:
        return "", TEST_ACCURACY

    labels = classify_tweets(lines)
    verdicts = ["Disaster Tweet" if label == 1 else "Normal Tweet" for label in labels]
    return "\n".join(verdicts), TEST_ACCURACY


iface = gr.Interface(
    fn=sample_prediction,
    inputs=gr.Textbox(label="Enter Sentences (separate by newline)", type="text"),
    outputs=[
        gr.Textbox(label="Results"),
        gr.Textbox(label="Accuracy"),
    ],
    title="Tweet Classifier",
    description="Enter multiple sentences (separate by newline) and get predictions.",
)

iface.launch(share=True)