{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\kurti\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import nltk\n",
"import string\n",
"import numpy as np \n",
"import pandas as pd\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import TweetTokenizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"nltk.download(\"stopwords\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def process_tweet(tweet):\n",
" \"\"\"\n",
" Process tweet function.\n",
" Input:\n",
" tweet: a string containing a tweet\n",
" Returns:\n",
" tweets_clean: a list of words containing the processed tweet\n",
"\n",
" *Taken from Coursera NLP Specialization Course 1, week 1 programming\n",
" assignment*\n",
" \"\"\"\n",
" stemmer = PorterStemmer()\n",
" stopwords_english = stopwords.words('english')\n",
" # remove stock market tickers like $GE\n",
" tweet = re.sub(r'\\$\\w*', '', str(tweet))\n",
" # remove old style retweet text \"RT\"\n",
" tweet = re.sub(r'^RT[\\s]+', '', str(tweet))\n",
" # remove hyperlinks\n",
" tweet = re.sub(r'https?:\\/\\/.*[\\r\\n]*', '', str(tweet))\n",
" # remove hashtags\n",
" # only removing the hash # sign from the word\n",
" tweet = re.sub(r'#', '', str(tweet))\n",
" # tokenize tweets\n",
" tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,\n",
" reduce_len=True)\n",
" tweet_tokens = tokenizer.tokenize(tweet)\n",
"\n",
" tweets_clean = []\n",
" for word in tweet_tokens:\n",
" if (word not in stopwords_english and # remove stopwords\n",
" word not in string.punctuation): # remove punctuation\n",
" # tweets_clean.append(word)\n",
" stem_word = stemmer.stem(word) # stemming word\n",
" tweets_clean.append(stem_word)\n",
"\n",
" return \" \".join(tweets_clean)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" all_text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3796 | \n",
" new weapon caus un-imagin destruct destructionnon | \n",
"
\n",
" \n",
" | 1 | \n",
" 3185 | \n",
" f @ing thing gishwh got soak delug go pad tamp... | \n",
"
\n",
" \n",
" | 2 | \n",
" 7769 | \n",
" dt rt ûïthe col polic catch pickpocket liver... | \n",
"
\n",
" \n",
" | 3 | \n",
" 191 | \n",
" aftershock back school kick great want thank e... | \n",
"
\n",
" \n",
" | 4 | \n",
" 9810 | \n",
" respons trauma children addict develop defens ... | \n",
"
\n",
" \n",
" | 5 | \n",
" 7934 | \n",
" look like got caught rainstorm amaz disgust ti... | \n",
"
\n",
" \n",
" | 6 | \n",
" 2538 | \n",
" favorit ladi came volunt meet hope join youth ... | \n",
"
\n",
" \n",
" | 7 | \n",
" 2611 | \n",
" ux fail emv peopl want insert remov quickli li... | \n",
"
\n",
" \n",
" | 8 | \n",
" 9756 | \n",
" can't find ariana grand shirt fuck tragedytrag... | \n",
"
\n",
" \n",
" | 9 | \n",
" 6254 | \n",
" murder stori america ûª first hijack | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id all_text\n",
"0 3796 new weapon caus un-imagin destruct destructionnon\n",
"1 3185 f @ing thing gishwh got soak delug go pad tamp...\n",
"2 7769 dt rt ûïthe col polic catch pickpocket liver...\n",
"3 191 aftershock back school kick great want thank e...\n",
"4 9810 respons trauma children addict develop defens ...\n",
"5 7934 look like got caught rainstorm amaz disgust ti...\n",
"6 2538 favorit ladi came volunt meet hope join youth ...\n",
"7 2611 ux fail emv peopl want insert remov quickli li...\n",
"8 9756 can't find ariana grand shirt fuck tragedytrag...\n",
"9 6254 murder stori america ûª first hijack"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# read train data\n",
"df = pd.read_csv(\"../inputs/train.csv\")\n",
"# shuffle data\n",
"df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
"# create new column \"all_text\"\n",
"df[\"all_text\"] = df[\"text\"] + df[\"keyword\"].fillna(\"none\") + df[\"location\"].fillna(\"none\")\n",
"# split into features and labels\n",
"X = df.drop([\"text\", \"keyword\", \"location\", \"target\"], axis=1)\n",
"y = df[\"target\"]\n",
"\n",
"# process tweets\n",
"X[\"all_text\"] = X[\"all_text\"].apply(process_tweet)\n",
"X.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# create a dictionary mapping predictions to the tweet idx\n",
"pred_idx_dict = {}\n",
"# initialize kfold\n",
"skf = StratifiedKFold(n_splits=5, shuffle=False)\n",
"for fold, (train_idx, val_idx) in enumerate(skf.split(X=X, y=y)):\n",
" X_train, X_val = X.loc[train_idx, :], X.loc[val_idx, :]\n",
" y_train, y_val = y[train_idx], y[val_idx]\n",
"\n",
" # vectorize text and store model\n",
" count_vect = CountVectorizer()\n",
" X_train_vect = count_vect.fit_transform(X_train[\"all_text\"].values)\n",
" X_val_vect = count_vect.transform(X_val[\"all_text\"].values)\n",
" \n",
" # classify predictions\n",
" clf = MultinomialNB()\n",
" clf.fit(X_train_vect, y_train)\n",
" y_preds = clf.predict(X_val_vect)\n",
" \n",
" # idx of tweet mapping to prediction of model\n",
" for idx, key in enumerate(val_idx):\n",
" pred_idx_dict[key] = y_preds[idx]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# create df with actual and prediction\n",
"error_df = X.copy()\n",
"error_df.rename(columns={\"all_text\":\"processed_all_text\"}, inplace=True)\n",
"error_df[\"all_text\"] = df[df[\"id\"] == error_df[\"id\"].values][\"all_text\"]\n",
"error_df[\"actual\"] = y.copy()\n",
"error_df[\"predictions\"] = pred_idx_dict.values()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" processed_all_text | \n",
" all_text | \n",
" actual | \n",
" predictions | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3796 | \n",
" new weapon caus un-imagin destruct destructionnon | \n",
" So you have a new weapon that can cause un-ima... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3185 | \n",
" f @ing thing gishwh got soak delug go pad tamp... | \n",
" The f$&@ing things I do for #GISHWHES Just... | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 7769 | \n",
" dt rt ûïthe col polic catch pickpocket liver... | \n",
" DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 191 | \n",
" aftershock back school kick great want thank e... | \n",
" Aftershock back to school kick off was great. ... | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 9810 | \n",
" respons trauma children addict develop defens ... | \n",
" in response to trauma Children of Addicts deve... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 7608 | \n",
" 7470 | \n",
" mani obliter server alway like play :D obliter... | \n",
" @Eganator2000 There aren't many Obliteration s... | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 7609 | \n",
" 7691 | \n",
" panic attack bc enough money drug alcohol want... | \n",
" just had a panic attack bc I don't have enough... | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | 7610 | \n",
" 1242 | \n",
" omron hem 712c automat blood pressur monitor s... | \n",
" Omron HEM-712C Automatic Blood Pressure Monito... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 7611 | \n",
" 10862 | \n",
" offici say quarantin place alabama home possib... | \n",
" Officials say a quarantine is in place at an A... | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 7612 | \n",
" 10409 | \n",
" move england five year ago today whirlwind time | \n",
" I moved to England five years ago today. What ... | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
7613 rows × 5 columns
\n",
"
"
],
"text/plain": [
" id processed_all_text \\\n",
"0 3796 new weapon caus un-imagin destruct destructionnon \n",
"1 3185 f @ing thing gishwh got soak delug go pad tamp... \n",
"2 7769 dt rt ûïthe col polic catch pickpocket liver... \n",
"3 191 aftershock back school kick great want thank e... \n",
"4 9810 respons trauma children addict develop defens ... \n",
"... ... ... \n",
"7608 7470 mani obliter server alway like play :D obliter... \n",
"7609 7691 panic attack bc enough money drug alcohol want... \n",
"7610 1242 omron hem 712c automat blood pressur monitor s... \n",
"7611 10862 offici say quarantin place alabama home possib... \n",
"7612 10409 move england five year ago today whirlwind time \n",
"\n",
" all_text actual predictions \n",
"0 So you have a new weapon that can cause un-ima... 1 0 \n",
"1 The f$&@ing things I do for #GISHWHES Just... 0 0 \n",
"2 DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe... 1 0 \n",
"3 Aftershock back to school kick off was great. ... 0 0 \n",
"4 in response to trauma Children of Addicts deve... 0 1 \n",
"... ... ... ... \n",
"7608 @Eganator2000 There aren't many Obliteration s... 0 0 \n",
"7609 just had a panic attack bc I don't have enough... 0 0 \n",
"7610 Omron HEM-712C Automatic Blood Pressure Monito... 0 1 \n",
"7611 Officials say a quarantine is in place at an A... 1 1 \n",
"7612 I moved to England five years ago today. What ... 1 1 \n",
"\n",
"[7613 rows x 5 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"error_df"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" processed_all_text | \n",
" all_text | \n",
" actual | \n",
" predictions | \n",
"
\n",
" \n",
" \n",
" \n",
" | 149 | \n",
" 1061 | \n",
" ye i'm bleed heart liberal.bleedingl oak tx | \n",
" @KatRamsland Yes I'm a bleeding heart liberal.... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 518 | \n",
" 8946 | \n",
" storm came . . fuck coolstormnon | \n",
" So this storm just came out of no where. .fuck... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3161 | \n",
" 143 | \n",
" car even week got fuck car accid .. mf can't f... | \n",
" only had a car for not even a week and got in ... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6624 | \n",
" 9044 | \n",
" spacex founder musk structur failur took falcon 9 | \n",
" SpaceX Founder Musk: Structural Failure Took D... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 881 | \n",
" 1458 | \n",
" anoth one anoth one still ain't done shit one ... | \n",
" 'I did another one I did another one. You stil... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4314 | \n",
" 10364 | \n",
" router one latest ddo attack weapon | \n",
" Your Router is One of the Latest DDoS Attack W... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 5399 | \n",
" 6188 | \n",
" gov brown allow parol 1976 chowchilla school b... | \n",
" Gov. Brown allows parole for 1976 Chowchilla s... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4266 | \n",
" 4911 | \n",
" chick masturb guy get explod face | \n",
" Chick masturbates a guy until she gets explode... | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3959 | \n",
" 2112 | \n",
" borrow concern possibl interest rate rise coul... | \n",
" #Borrowers concerned at possible #interest rat... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 6445 | \n",
" 7926 | \n",
" stuck rainstorm stay toward middl road street ... | \n",
" Stuck in a rainstorm? Stay toward the middle o... | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id processed_all_text \\\n",
"149 1061 ye i'm bleed heart liberal.bleedingl oak tx \n",
"518 8946 storm came . . fuck coolstormnon \n",
"3161 143 car even week got fuck car accid .. mf can't f... \n",
"6624 9044 spacex founder musk structur failur took falcon 9 \n",
"881 1458 anoth one anoth one still ain't done shit one ... \n",
"4314 10364 router one latest ddo attack weapon \n",
"5399 6188 gov brown allow parol 1976 chowchilla school b... \n",
"4266 4911 chick masturb guy get explod face \n",
"3959 2112 borrow concern possibl interest rate rise coul... \n",
"6445 7926 stuck rainstorm stay toward middl road street ... \n",
"\n",
" all_text actual predictions \n",
"149 @KatRamsland Yes I'm a bleeding heart liberal.... 1 0 \n",
"518 So this storm just came out of no where. .fuck... 1 0 \n",
"3161 only had a car for not even a week and got in ... 1 0 \n",
"6624 SpaceX Founder Musk: Structural Failure Took D... 1 0 \n",
"881 'I did another one I did another one. You stil... 1 0 \n",
"4314 Your Router is One of the Latest DDoS Attack W... 0 1 \n",
"5399 Gov. Brown allows parole for 1976 Chowchilla s... 0 1 \n",
"4266 Chick masturbates a guy until she gets explode... 1 0 \n",
"3959 #Borrowers concerned at possible #interest rat... 0 1 \n",
"6445 Stuck in a rainstorm? Stay toward the middle o... 0 1 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# store only the misclassified instances\n",
"misclassified_df = error_df[error_df[\"actual\"].values != error_df[\"predictions\"]]\n",
"# keep only 100 of the misclassfied instances\n",
"misclassified_100 = misclassified_df.sample(n=100, random_state=42)\n",
"misclassified_100.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"misclassified_100.to_csv(\"misclassified_data.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}