{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\kurti\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import nltk\n", "import string\n", "import numpy as np \n", "import pandas as pd\n", "from nltk.corpus import stopwords\n", "from nltk.stem import PorterStemmer\n", "from nltk.tokenize import TweetTokenizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "nltk.download(\"stopwords\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def process_tweet(tweet):\n", " \"\"\"\n", " Process tweet function.\n", " Input:\n", " tweet: a string containing a tweet\n", " Returns:\n", " tweets_clean: a list of words containing the processed tweet\n", "\n", " *Taken from Coursera NLP Specialization Course 1, week 1 programming\n", " assignment*\n", " \"\"\"\n", " stemmer = PorterStemmer()\n", " stopwords_english = stopwords.words('english')\n", " # remove stock market tickers like $GE\n", " tweet = re.sub(r'\\$\\w*', '', str(tweet))\n", " # remove old style retweet text \"RT\"\n", " tweet = re.sub(r'^RT[\\s]+', '', str(tweet))\n", " # remove hyperlinks\n", " tweet = re.sub(r'https?:\\/\\/.*[\\r\\n]*', '', str(tweet))\n", " # remove hashtags\n", " # only removing the hash # sign from the word\n", " tweet = re.sub(r'#', '', str(tweet))\n", " # tokenize tweets\n", " tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,\n", " reduce_len=True)\n", " tweet_tokens = tokenizer.tokenize(tweet)\n", "\n", " tweets_clean = []\n", " for word in tweet_tokens:\n", " if (word not in stopwords_english and # remove stopwords\n", " word not in string.punctuation): # remove punctuation\n", " # tweets_clean.append(word)\n", " stem_word = stemmer.stem(word) # stemming word\n", " tweets_clean.append(stem_word)\n", "\n", " return \" \".join(tweets_clean)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idall_text
03796new weapon caus un-imagin destruct destructionnon
13185f @ing thing gishwh got soak delug go pad tamp...
27769dt rt ‰ ûïthe col polic catch pickpocket liver...
3191aftershock back school kick great want thank e...
49810respons trauma children addict develop defens ...
57934look like got caught rainstorm amaz disgust ti...
62538favorit ladi came volunt meet hope join youth ...
72611ux fail emv peopl want insert remov quickli li...
89756can't find ariana grand shirt fuck tragedytrag...
96254murder stori america ‰ ûª first hijack
\n", "
" ], "text/plain": [ " id all_text\n", "0 3796 new weapon caus un-imagin destruct destructionnon\n", "1 3185 f @ing thing gishwh got soak delug go pad tamp...\n", "2 7769 dt rt ‰ ûïthe col polic catch pickpocket liver...\n", "3 191 aftershock back school kick great want thank e...\n", "4 9810 respons trauma children addict develop defens ...\n", "5 7934 look like got caught rainstorm amaz disgust ti...\n", "6 2538 favorit ladi came volunt meet hope join youth ...\n", "7 2611 ux fail emv peopl want insert remov quickli li...\n", "8 9756 can't find ariana grand shirt fuck tragedytrag...\n", "9 6254 murder stori america ‰ ûª first hijack" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read train data\n", "df = pd.read_csv(\"../inputs/train.csv\")\n", "# shuffle data\n", "df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n", "# create new column \"all_text\"\n", "df[\"all_text\"] = df[\"text\"] + df[\"keyword\"].fillna(\"none\") + df[\"location\"].fillna(\"none\")\n", "# split into features and labels\n", "X = df.drop([\"text\", \"keyword\", \"location\", \"target\"], axis=1)\n", "y = df[\"target\"]\n", "\n", "# process tweets\n", "X[\"all_text\"] = X[\"all_text\"].apply(process_tweet)\n", "X.head(10)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# create a dictionary mapping predictions to the tweet idx\n", "pred_idx_dict = {}\n", "# initialize kfold\n", "skf = StratifiedKFold(n_splits=5, shuffle=False)\n", "for fold, (train_idx, val_idx) in enumerate(skf.split(X=X, y=y)):\n", " X_train, X_val = X.loc[train_idx, :], X.loc[val_idx, :]\n", " y_train, y_val = y[train_idx], y[val_idx]\n", "\n", " # vectorize text and store model\n", " count_vect = CountVectorizer()\n", " X_train_vect = count_vect.fit_transform(X_train[\"all_text\"].values)\n", " X_val_vect = count_vect.transform(X_val[\"all_text\"].values)\n", " \n", " # classify predictions\n", " clf = MultinomialNB()\n", " clf.fit(X_train_vect, y_train)\n", " y_preds = clf.predict(X_val_vect)\n", " \n", " # idx of tweet mapping to prediction of model\n", " for idx, key in enumerate(val_idx):\n", " pred_idx_dict[key] = y_preds[idx]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# create df with actual and prediction\n", "error_df = X.copy()\n", "error_df.rename(columns={\"all_text\":\"processed_all_text\"}, inplace=True)\n", "error_df[\"all_text\"] = df[df[\"id\"] == error_df[\"id\"].values][\"all_text\"]\n", "error_df[\"actual\"] = y.copy()\n", "error_df[\"predictions\"] = pred_idx_dict.values()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idprocessed_all_textall_textactualpredictions
03796new weapon caus un-imagin destruct destructionnonSo you have a new weapon that can cause un-ima...10
13185f @ing thing gishwh got soak delug go pad tamp...The f$&@ing things I do for #GISHWHES Just...00
27769dt rt ‰ ûïthe col polic catch pickpocket liver...DT @georgegalloway: RT @Galloway4Mayor: ‰ÛÏThe...10
3191aftershock back school kick great want thank e...Aftershock back to school kick off was great. ...00
49810respons trauma children addict develop defens ...in response to trauma Children of Addicts deve...01
..................
76087470mani obliter server alway like play :D obliter...@Eganator2000 There aren't many Obliteration s...00
76097691panic attack bc enough money drug alcohol want...just had a panic attack bc I don't have enough...00
76101242omron hem 712c automat blood pressur monitor s...Omron HEM-712C Automatic Blood Pressure Monito...01
761110862offici say quarantin place alabama home possib...Officials say a quarantine is in place at an A...11
761210409move england five year ago today whirlwind timeI moved to England five years ago today. What ...11
\n", "

7613 rows × 5 columns

\n", "
" ], "text/plain": [ " id processed_all_text \\\n", "0 3796 new weapon caus un-imagin destruct destructionnon \n", "1 3185 f @ing thing gishwh got soak delug go pad tamp... \n", "2 7769 dt rt ‰ ûïthe col polic catch pickpocket liver... \n", "3 191 aftershock back school kick great want thank e... \n", "4 9810 respons trauma children addict develop defens ... \n", "... ... ... \n", "7608 7470 mani obliter server alway like play :D obliter... \n", "7609 7691 panic attack bc enough money drug alcohol want... \n", "7610 1242 omron hem 712c automat blood pressur monitor s... \n", "7611 10862 offici say quarantin place alabama home possib... \n", "7612 10409 move england five year ago today whirlwind time \n", "\n", " all_text actual predictions \n", "0 So you have a new weapon that can cause un-ima... 1 0 \n", "1 The f$&@ing things I do for #GISHWHES Just... 0 0 \n", "2 DT @georgegalloway: RT @Galloway4Mayor: ‰ÛÏThe... 1 0 \n", "3 Aftershock back to school kick off was great. ... 0 0 \n", "4 in response to trauma Children of Addicts deve... 0 1 \n", "... ... ... ... \n", "7608 @Eganator2000 There aren't many Obliteration s... 0 0 \n", "7609 just had a panic attack bc I don't have enough... 0 0 \n", "7610 Omron HEM-712C Automatic Blood Pressure Monito... 0 1 \n", "7611 Officials say a quarantine is in place at an A... 1 1 \n", "7612 I moved to England five years ago today. What ... 1 1 \n", "\n", "[7613 rows x 5 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "error_df" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idprocessed_all_textall_textactualpredictions
1491061ye i'm bleed heart liberal.bleedingl oak tx@KatRamsland Yes I'm a bleeding heart liberal....10
5188946storm came . . fuck coolstormnonSo this storm just came out of no where. .fuck...10
3161143car even week got fuck car accid .. mf can't f...only had a car for not even a week and got in ...10
66249044spacex founder musk structur failur took falcon 9SpaceX Founder Musk: Structural Failure Took D...10
8811458anoth one anoth one still ain't done shit one ...'I did another one I did another one. You stil...10
431410364router one latest ddo attack weaponYour Router is One of the Latest DDoS Attack W...01
53996188gov brown allow parol 1976 chowchilla school b...Gov. Brown allows parole for 1976 Chowchilla s...01
42664911chick masturb guy get explod faceChick masturbates a guy until she gets explode...10
39592112borrow concern possibl interest rate rise coul...#Borrowers concerned at possible #interest rat...01
64457926stuck rainstorm stay toward middl road street ...Stuck in a rainstorm? Stay toward the middle o...01
\n", "
" ], "text/plain": [ " id processed_all_text \\\n", "149 1061 ye i'm bleed heart liberal.bleedingl oak tx \n", "518 8946 storm came . . fuck coolstormnon \n", "3161 143 car even week got fuck car accid .. mf can't f... \n", "6624 9044 spacex founder musk structur failur took falcon 9 \n", "881 1458 anoth one anoth one still ain't done shit one ... \n", "4314 10364 router one latest ddo attack weapon \n", "5399 6188 gov brown allow parol 1976 chowchilla school b... \n", "4266 4911 chick masturb guy get explod face \n", "3959 2112 borrow concern possibl interest rate rise coul... \n", "6445 7926 stuck rainstorm stay toward middl road street ... \n", "\n", " all_text actual predictions \n", "149 @KatRamsland Yes I'm a bleeding heart liberal.... 1 0 \n", "518 So this storm just came out of no where. .fuck... 1 0 \n", "3161 only had a car for not even a week and got in ... 1 0 \n", "6624 SpaceX Founder Musk: Structural Failure Took D... 1 0 \n", "881 'I did another one I did another one. You stil... 1 0 \n", "4314 Your Router is One of the Latest DDoS Attack W... 0 1 \n", "5399 Gov. Brown allows parole for 1976 Chowchilla s... 0 1 \n", "4266 Chick masturbates a guy until she gets explode... 1 0 \n", "3959 #Borrowers concerned at possible #interest rat... 0 1 \n", "6445 Stuck in a rainstorm? Stay toward the middle o... 0 1 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# store only the misclassified instances\n", "misclassified_df = error_df[error_df[\"actual\"].values != error_df[\"predictions\"]]\n", "# keep only 100 of the misclassfied instances\n", "misclassified_100 = misclassified_df.sample(n=100, random_state=42)\n", "misclassified_100.head(10)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "misclassified_100.to_csv(\"misclassified_data.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 4 }