In [1]:
import re
import nltk
import string
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kurti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def process_tweet(tweet):
    """
    Process tweet function.
    Input:
        tweet: a string containing a tweet
    Returns:
        tweets_clean: a list of words containing the processed tweet

    *Taken from Coursera NLP Specialization Course 1, week 1 programming
    assignment*
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', str(tweet))
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', str(tweet))
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', str(tweet))
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', str(tweet))
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return " ".join(tweets_clean)

In [3]:
# read train data
df = pd.read_csv("../inputs/train.csv")
# shuffle data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# create new column "all_text"
df["all_text"] = df["text"] + df["keyword"].fillna("none") + df["location"].fillna("none")
# split into features and labels
X = df.drop(["text", "keyword", "location", "target"], axis=1)
y = df["target"]

# process tweets
X["all_text"] = X["all_text"].apply(process_tweet)
X.head(10)

Unnamed: 0,id,all_text
0,3796,new weapon caus un-imagin destruct destructionnon
1,3185,f @ing thing gishwh got soak delug go pad tamp...
2,7769,dt rt  ûïthe col polic catch pickpocket liver...
3,191,aftershock back school kick great want thank e...
4,9810,respons trauma children addict develop defens ...
5,7934,look like got caught rainstorm amaz disgust ti...
6,2538,favorit ladi came volunt meet hope join youth ...
7,2611,ux fail emv peopl want insert remov quickli li...
8,9756,can't find ariana grand shirt fuck tragedytrag...
9,6254,murder stori america  ûª first hijack


In [4]:
# create a dictionary mapping predictions to the tweet idx
pred_idx_dict = {}
# initialize kfold
skf = StratifiedKFold(n_splits=5, shuffle=False)
for fold, (train_idx, val_idx) in enumerate(skf.split(X=X, y=y)):
    X_train, X_val = X.loc[train_idx, :], X.loc[val_idx, :]
    y_train, y_val = y[train_idx], y[val_idx]

    # vectorize text and store model
    count_vect = CountVectorizer()
    X_train_vect = count_vect.fit_transform(X_train["all_text"].values)
    X_val_vect = count_vect.transform(X_val["all_text"].values)
    
    # classify predictions
    clf = MultinomialNB()
    clf.fit(X_train_vect, y_train)
    y_preds = clf.predict(X_val_vect)
    
    # idx of tweet mapping to prediction of model
    for idx, key  in enumerate(val_idx):
        pred_idx_dict[key] = y_preds[idx]

In [20]:
# create df with actual and prediction
error_df = X.copy()
error_df.rename(columns={"all_text":"processed_all_text"}, inplace=True)
error_df["all_text"] = df[df["id"] == error_df["id"].values]["all_text"]
error_df["actual"] = y.copy()
error_df["predictions"] = pred_idx_dict.values()

In [21]:
error_df

Unnamed: 0,id,processed_all_text,all_text,actual,predictions
0,3796,new weapon caus un-imagin destruct destructionnon,So you have a new weapon that can cause un-ima...,1,0
1,3185,f @ing thing gishwh got soak delug go pad tamp...,The f$&amp;@ing things I do for #GISHWHES Just...,0,0
2,7769,dt rt  ûïthe col polic catch pickpocket liver...,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1,0
3,191,aftershock back school kick great want thank e...,Aftershock back to school kick off was great. ...,0,0
4,9810,respons trauma children addict develop defens ...,in response to trauma Children of Addicts deve...,0,1
...,...,...,...,...,...
7608,7470,mani obliter server alway like play :D obliter...,@Eganator2000 There aren't many Obliteration s...,0,0
7609,7691,panic attack bc enough money drug alcohol want...,just had a panic attack bc I don't have enough...,0,0
7610,1242,omron hem 712c automat blood pressur monitor s...,Omron HEM-712C Automatic Blood Pressure Monito...,0,1
7611,10862,offici say quarantin place alabama home possib...,Officials say a quarantine is in place at an A...,1,1


In [24]:
# store only the misclassified instances
misclassified_df = error_df[error_df["actual"].values != error_df["predictions"]]
# keep only 100 of the misclassfied instances
misclassified_100 = misclassified_df.sample(n=100, random_state=42)
misclassified_100.head(10)

Unnamed: 0,id,processed_all_text,all_text,actual,predictions
149,1061,ye i'm bleed heart liberal.bleedingl oak tx,@KatRamsland Yes I'm a bleeding heart liberal....,1,0
518,8946,storm came . . fuck coolstormnon,So this storm just came out of no where. .fuck...,1,0
3161,143,car even week got fuck car accid .. mf can't f...,only had a car for not even a week and got in ...,1,0
6624,9044,spacex founder musk structur failur took falcon 9,SpaceX Founder Musk: Structural Failure Took D...,1,0
881,1458,anoth one anoth one still ain't done shit one ...,'I did another one I did another one. You stil...,1,0
4314,10364,router one latest ddo attack weapon,Your Router is One of the Latest DDoS Attack W...,0,1
5399,6188,gov brown allow parol 1976 chowchilla school b...,Gov. Brown allows parole for 1976 Chowchilla s...,0,1
4266,4911,chick masturb guy get explod face,Chick masturbates a guy until she gets explode...,1,0
3959,2112,borrow concern possibl interest rate rise coul...,#Borrowers concerned at possible #interest rat...,0,1
6445,7926,stuck rainstorm stay toward middl road street ...,Stuck in a rainstorm? Stay toward the middle o...,0,1


In [23]:
misclassified_100.to_csv("misclassified_data.csv", index=False)