{ "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3", "language": "python" }, "language_info": { "name": "python", "version": "3.10.13", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [ { "sourceId": 4140, "sourceType": "datasetVersion", "datasetId": 2477 }, { "sourceId": 8767630, "sourceType": "datasetVersion", "datasetId": 5268480 } ], "dockerImageVersionId": 30733, "isInternetEnabled": true, "language": "python", "sourceType": "notebook", "isGpuEnabled": true } }, "nbformat_minor": 0, "nbformat": 4, "cells": [ { "cell_type": "markdown", "source": [ "## Import Dependencies" ], "metadata": { "id": "OkalWs9I9oxf" } }, { "cell_type": "code", "source": [ "import tensorflow as tf\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "from nltk.corpus import stopwords\n", "from nltk.stem.porter import PorterStemmer\n", "from nltk.stem import WordNetLemmatizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score" ], "metadata": { "id": "bjaPyT8E9AmG", "execution": { "iopub.status.busy": "2024-06-25T14:50:46.705417Z", "iopub.execute_input": "2024-06-25T14:50:46.705780Z", "iopub.status.idle": "2024-06-25T14:50:59.981817Z", "shell.execute_reply.started": "2024-06-25T14:50:46.705751Z", "shell.execute_reply": "2024-06-25T14:50:59.980895Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Data Processing" ], "metadata": { "id": "uL_hVczk-8H5" } }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('stopwords')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yAH9DcJk-TZc", "outputId": "6205729b-893e-4f54-ed66-b474cad9af60" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "#print the stopwords in English\n", "print(stopwords.words('english'))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QcRIpiSD-ibE", "outputId": "6cde0654-654e-4c3f-82e3-8849a51aa76d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\"]\n" } ] }, { "cell_type": "code", "source": [ "twitter_data = pd.read_csv(\"/content/training.1600000.processed.noemoticon.csv\", encoding= 'ISO-8859-1')" ], "metadata": { "id": "wVBUG3Eb-uWJ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1HuuPc2b_yEK", "outputId": "de10fed9-8717-4ed4-f79d-006c7d22a24e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 8, "data": { "text/plain": [ "(1599999, 6)" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "twitter_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 536 }, "id": "EXVv_49H_2va", "outputId": "8b74f60b-e93a-4c18-8370-d751b9a6ca85" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 14, "data": { "text/plain": [ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n", "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF \n", "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli \n", "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf \n", "\n", " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", "0 is upset that he can't update his Facebook by ... \n", "1 @Kenichan I dived many times for the ball. Man... \n", "2 my whole body feels itchy and like its on fire \n", "3 @nationwideclass no, it's not behaving at all.... \n", "4 @Kwesidei not the whole crew " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
201467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
301467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
401467811372Mon Apr 06 22:20:00 PDT 2009NO_QUERYjoy_wolf@Kwesidei not the whole crew
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "twitter_data" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# naming the columns and reading the dataset again\n", "\n", "column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']\n", "twitter_data = pd.read_csv(\"/content/training.1600000.processed.noemoticon.csv\", encoding= 'ISO-8859-1', names=column_names)" ], "metadata": { "id": "T3KVZDHcATjt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 293 }, "id": "nTRi2hsbDdct", "outputId": "ba0aa17e-543c-4bdc-bbf4-8f70af54c6ae" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 10, "data": { "text/plain": [ " target ids date flag \\\n", "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n", "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "\n", " user text \n", "0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n", "1 scotthamilton is upset that he can't update his Facebook by ... \n", "2 mattycus @Kenichan I dived many times for the ball. Man... \n", "3 ElleCTF my whole body feels itchy and like its on fire \n", "4 Karoli @nationwideclass no, it's not behaving at all.... " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetidsdateflagusertext
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "twitter_data" } }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "twitter_data.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "E6sCivycDiFv", "outputId": "a017f8a9-dc12-41f7-8d3f-3b6f1f2e2dab" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 11, "data": { "text/plain": [ "(1600000, 6)" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Check for missing values\n", "\n", "twitter_data.isnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2_2Zq9Z2Dz9m", "outputId": "ac8d1cf9-c8bc-41d1-9d3b-6a2675833739" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 18, "data": { "text/plain": [ "target 0\n", "ids 0\n", "date 0\n", "flag 0\n", "user 0\n", "text 0\n", "dtype: int64" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Check the distribution of target columns\n", "\n", "twitter_data['target'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DS3jzsG8EQLD", "outputId": "6be3c1fc-2963-4af6-b85a-8c64407e7ffd" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 12, "data": { "text/plain": [ "target\n", "0 800000\n", "4 800000\n", "Name: count, dtype: int64" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "# Convert the target \"4\" to \"1\"\n", "\n", "twitter_data.loc[twitter_data['target'] == 4, 'target'] = 1" ], "metadata": { "id": "6IZz89lKGekA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data['target'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "q91SJJ7zIZov", "outputId": "4e2ce776-0e3c-41dc-a92a-228428977e92" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 14, "data": { "text/plain": [ "target\n", "0 800000\n", "1 800000\n", "Name: count, dtype: int64" ] }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "### Lemmatization" ], "metadata": { "id": "li4fKBkaUlFR" } }, { "cell_type": "code", "source": [ "from nltk.corpus import wordnet" ], "metadata": { "id": "bCkReBpZ1I1u" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "nltk.download('wordnet')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mESYXOLqt6ya", "outputId": "f095b03e-ab9e-46d4-ce93-f2c86a72fdc3" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "nltk.download('omw-1.4')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MHzzF70tuJC8", "outputId": "bdcdac3d-863a-4faa-bf3b-84c6458361d1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "nltk.download('averaged_perceptron_tagger')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "izz4kTbN1N9A", "outputId": "4251c8d5-ce37-45d0-8cbc-e4aa0e1aeb5d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /root/nltk_data...\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "lemmatizer= WordNetLemmatizer()" ], "metadata": { "id": "cJOtrvuHuSps" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "lemmatizer= WordNetLemmatizer()\n", "\n", "def get_wordnet_pos(word):\n", " \"\"\"Map POS tag to first character lemmatize() accepts.\"\"\"\n", " tag = nltk.pos_tag([word])[0][1][0].upper()\n", " tag_dict = {\n", " 'J': wordnet.ADJ,\n", " 'N': wordnet.NOUN,\n", " 'V': wordnet.VERB,\n", " 'R': wordnet.ADV\n", " }\n", " return tag_dict.get(tag, wordnet.NOUN)\n", "\n", "def lemmatize_content(content):\n", " lemmatized_content = re.sub('[^a-zA-Z]', ' ', content)\n", " lemmatized_content = lemmatized_content.lower()\n", " lemmatized_content = lemmatized_content.split()\n", " lemmatized_content = [\n", " lemmatizer.lemmatize(word, get_wordnet_pos(word))\n", " for word in lemmatized_content\n", " if word not in stopwords.words('english')\n", " ]\n", " lemmatized_content = ' '.join(lemmatized_content)\n", " return lemmatized_content" ], "metadata": { "id": "riPiX6WF1o2f" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data['lemmatized_text'] = twitter_data['text'].apply(lemmatize_content)" ], "metadata": { "id": "wNR1zl1F2NmH" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data.to_csv('lemmatized_twitter_data.csv', index=False)" ], "metadata": { "id": "HXrl1FZxjkwA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 379 }, "id": "XfuEqjNVk7ZM", "outputId": "e9c94b52-2d6d-4a56-f5d2-10a9a4d45902", "collapsed": true, "jupyter": { "outputs_hidden": true } }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "execution_count": 23, "data": { "text/plain": [ " target ids date flag \\\n", "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n", "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "\n", " user text \\\n", "0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n", "1 scotthamilton is upset that he can't update his Facebook by ... \n", "2 mattycus @Kenichan I dived many times for the ball. Man... \n", "3 ElleCTF my whole body feels itchy and like its on fire \n", "4 Karoli @nationwideclass no, it's not behaving at all.... \n", "\n", " lemmatized_text \n", "0 switchfoot http twitpic com zl awww bummer sho... \n", "1 upset update facebook texting might cry result... \n", "2 kenichan dive many time ball manage save rest ... \n", "3 whole body feel itchy like fire \n", "4 nationwideclass behaving mad see " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetidsdateflagusertextlemmatized_text
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...switchfoot http twitpic com zl awww bummer sho...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...upset update facebook texting might cry result...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...kenichan dive many time ball manage save rest ...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on firewhole body feel itchy like fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....nationwideclass behaving mad see
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "twitter_data" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "### Load lemmatized data" ], "metadata": { "id": "sNiP-Ew1vDVR" } }, { "cell_type": "code", "source": [ "twitter_data= pd.read_csv('/content/drive/MyDrive/Twitter sentiment analysis/deflaut_lemmatized_twitter_data.csv')" ], "metadata": { "id": "tEZS1Srsp1jq", "execution": { "iopub.status.busy": "2024-06-25T14:51:17.904652Z", "iopub.execute_input": "2024-06-25T14:51:17.905888Z", "iopub.status.idle": "2024-06-25T14:51:27.185104Z", "shell.execute_reply.started": "2024-06-25T14:51:17.905838Z", "shell.execute_reply": "2024-06-25T14:51:27.184265Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "twitter_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "gkyXE5XSxVYd", "outputId": "14290935-cef1-42e7-9618-956d638dd15e", "execution": { "iopub.status.busy": "2024-06-25T14:51:31.174111Z", "iopub.execute_input": "2024-06-25T14:51:31.174738Z", "iopub.status.idle": "2024-06-25T14:51:31.197063Z", "shell.execute_reply.started": "2024-06-25T14:51:31.174704Z", "shell.execute_reply": "2024-06-25T14:51:31.196101Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " target ids date flag \\\n", "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n", "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", "\n", " user text \\\n", "0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n", "1 scotthamilton is upset that he can't update his Facebook by ... \n", "2 mattycus @Kenichan I dived many times for the ball. Man... \n", "3 ElleCTF my whole body feels itchy and like its on fire \n", "4 Karoli @nationwideclass no, it's not behaving at all.... \n", "\n", " lemmatized_text \n", "0 switchfoot http twitpic com zl awww bummer sho... \n", "1 upset update facebook texting might cry result... \n", "2 kenichan dive many time ball manage save rest ... \n", "3 whole body feel itchy like fire \n", "4 nationwideclass behaving mad see " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetidsdateflagusertextlemmatized_text
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...switchfoot http twitpic com zl awww bummer sho...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...upset update facebook texting might cry result...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...kenichan dive many time ball manage save rest ...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on firewhole body feel itchy like fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....nationwideclass behaving mad see
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "twitter_data" } }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "twitter_data.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "avvbcoJrxbVa", "outputId": "e5172041-9d60-4dcd-d416-dc1e031e724f", "execution": { "iopub.status.busy": "2024-06-25T14:51:34.793324Z", "iopub.execute_input": "2024-06-25T14:51:34.794041Z", "iopub.status.idle": "2024-06-25T14:51:34.800208Z", "shell.execute_reply.started": "2024-06-25T14:51:34.794010Z", "shell.execute_reply": "2024-06-25T14:51:34.798757Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 4, "output_type": "execute_result", "data": { "text/plain": "(1600000, 7)" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "X = twitter_data['lemmatized_text']\n", "Y = twitter_data['target']" ], "metadata": { "id": "Wx1sKkhfzhQY", "execution": { "iopub.status.busy": "2024-06-25T14:51:40.852792Z", "iopub.execute_input": "2024-06-25T14:51:40.853486Z", "iopub.status.idle": "2024-06-25T14:51:40.857707Z", "shell.execute_reply.started": "2024-06-25T14:51:40.853456Z", "shell.execute_reply": "2024-06-25T14:51:40.856693Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "Y.value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "U0BcjkP52pg9", "outputId": "baea1a2f-85af-4c51-d067-999dd7a2871b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "target\n", "0 800000\n", "1 800000\n", "Name: count, dtype: int64" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "print(twitter_data.dtypes)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NTZXuWlp1SRf", "outputId": "3bf7db1d-65ee-43ec-9646-3a0ade686756" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "target int64\n", "ids int64\n", "date object\n", "flag object\n", "user object\n", "text object\n", "lemmatized_text object\n", "dtype: object\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Vectorize" ], "metadata": { "id": "Y6w5Xg7fznQJ" } }, { "cell_type": "code", "source": [ "sent_length = [len(sentence.split()) for sentence in X if len(sentence.split())>30]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 201 }, "id": "X4v353dAxy07", "outputId": "8107fda1-eb1c-4f45-d2fc-1f28b7e93932", "execution": { "iopub.status.busy": "2024-06-25T14:51:57.694579Z", "iopub.execute_input": "2024-06-25T14:51:57.695283Z", "iopub.status.idle": "2024-06-25T14:51:58.444403Z", "shell.execute_reply.started": "2024-06-25T14:51:57.695252Z", "shell.execute_reply": "2024-06-25T14:51:58.443084Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "output_type": "error", "ename": "AttributeError", "evalue": "'float' object has no attribute 'split'", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msent_length\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentence\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentence\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msent_length\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentence\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentence\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m>\u001b[0m\u001b[0;36m30\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: 'float' object has no attribute 'split'" ] } ] }, { "cell_type": "markdown", "source": [ "Looks like, there are some float values in lemmatized_text." ], "metadata": { "id": "kc3mPNiYz_7l" } }, { "cell_type": "code", "source": [ "# Step 1: Convert floats or non-string values to string\n", "twitter_data['lemmatized_text'] = twitter_data['lemmatized_text'].apply(lambda x: str(x) if isinstance(x, float) else x)" ], "metadata": { "id": "W9rptbdR0dGJ", "execution": { "iopub.status.busy": "2024-06-25T14:52:01.682444Z", "iopub.execute_input": "2024-06-25T14:52:01.682778Z", "iopub.status.idle": "2024-06-25T14:52:02.267217Z", "shell.execute_reply.started": "2024-06-25T14:52:01.682754Z", "shell.execute_reply": "2024-06-25T14:52:02.266398Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "X = twitter_data['lemmatized_text']\n", "Y = twitter_data['target']" ], "metadata": { "id": "R80IPwKL06O8", "execution": { "iopub.status.busy": "2024-06-25T14:52:06.859466Z", "iopub.execute_input": "2024-06-25T14:52:06.859830Z", "iopub.status.idle": "2024-06-25T14:52:06.865603Z", "shell.execute_reply.started": "2024-06-25T14:52:06.859802Z", "shell.execute_reply": "2024-06-25T14:52:06.864392Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "sent_length = [len(sentence.split()) for sentence in X if len(sentence.split())>20]" ], "metadata": { "id": "m22x8AwH0hh2", "execution": { "iopub.status.busy": "2024-06-25T14:52:11.002941Z", "iopub.execute_input": "2024-06-25T14:52:11.003748Z", "iopub.status.idle": "2024-06-25T14:52:12.257863Z", "shell.execute_reply.started": "2024-06-25T14:52:11.003707Z", "shell.execute_reply": "2024-06-25T14:52:12.257010Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "len(sent_length)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H9xgwpbqzUjF", "outputId": "c666717d-6aff-491c-a235-623c4d5f3448", "execution": { "iopub.status.busy": "2024-06-25T14:52:15.612119Z", "iopub.execute_input": "2024-06-25T14:52:15.612992Z", "iopub.status.idle": "2024-06-25T14:52:15.618716Z", "shell.execute_reply.started": "2024-06-25T14:52:15.612958Z", "shell.execute_reply": "2024-06-25T14:52:15.617741Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "2197" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "from tensorflow.keras.layers import TextVectorization" ], "metadata": { "id": "0csw7Rzr1l4_", "execution": { "iopub.status.busy": "2024-06-25T14:52:19.492530Z", "iopub.execute_input": "2024-06-25T14:52:19.492897Z", "iopub.status.idle": "2024-06-25T14:52:19.522369Z", "shell.execute_reply.started": "2024-06-25T14:52:19.492848Z", "shell.execute_reply": "2024-06-25T14:52:19.521677Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "MAX_FEATURES = 75000" ], "metadata": { "id": "ZIoorY7-27Qz", "execution": { "iopub.status.busy": "2024-06-25T14:52:30.301720Z", "iopub.execute_input": "2024-06-25T14:52:30.302106Z", "iopub.status.idle": "2024-06-25T14:52:30.306410Z", "shell.execute_reply.started": "2024-06-25T14:52:30.302062Z", "shell.execute_reply": "2024-06-25T14:52:30.305439Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer = TextVectorization(max_tokens=MAX_FEATURES,\n", " output_sequence_length=20,\n", " output_mode='int')" ], "metadata": { "id": "lM6fOU2t3O-8", "execution": { "iopub.status.busy": "2024-06-25T14:52:33.180929Z", "iopub.execute_input": "2024-06-25T14:52:33.181297Z", "iopub.status.idle": "2024-06-25T14:52:33.921076Z", "shell.execute_reply.started": "2024-06-25T14:52:33.181271Z", "shell.execute_reply": "2024-06-25T14:52:33.920208Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer.adapt(X.values)" ], "metadata": { "id": "LeqOrOJw3Ugs", "execution": { "iopub.status.busy": "2024-06-25T14:52:37.168502Z", "iopub.execute_input": "2024-06-25T14:52:37.169169Z", "iopub.status.idle": "2024-06-25T14:52:43.255211Z", "shell.execute_reply.started": "2024-06-25T14:52:37.169138Z", "shell.execute_reply": "2024-06-25T14:52:43.254431Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer.get_vocabulary()" ], "metadata": { "id": "RvXqSQpM3el2", "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "len(vectorizer.get_vocabulary())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vh-tvW758jvi", "outputId": "1439f68b-ef90-44cb-b10c-f55dd09b4da1", "execution": { "iopub.status.busy": "2024-06-25T14:53:08.164181Z", "iopub.execute_input": "2024-06-25T14:53:08.164782Z", "iopub.status.idle": "2024-06-25T14:53:08.471208Z", "shell.execute_reply.started": "2024-06-25T14:53:08.164753Z", "shell.execute_reply": "2024-06-25T14:53:08.470205Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 17, "output_type": "execute_result", "data": { "text/plain": "75000" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "vectorizer('Hello world, life is great')" ], "metadata": { "id": "9m1PjMwi8uGK", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "6e84b312-39a6-4a24-bf05-b01687189b3d", "execution": { "iopub.status.busy": "2024-06-25T14:53:11.828013Z", "iopub.execute_input": "2024-06-25T14:53:11.828730Z", "iopub.status.idle": "2024-06-25T14:53:13.077033Z", "shell.execute_reply.started": "2024-06-25T14:53:11.828700Z", "shell.execute_reply": "2024-06-25T14:53:13.076085Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 18, "output_type": "execute_result", "data": { "text/plain": "" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "vectorizer('Hey buddy, life is awesome')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PirQse4bWeCY", "outputId": "14e25a78-a2ab-4423-a457-edf20b0d78e9", "execution": { "iopub.status.busy": "2024-06-25T14:53:15.328223Z", "iopub.execute_input": "2024-06-25T14:53:15.328553Z", "iopub.status.idle": "2024-06-25T14:53:15.346786Z", "shell.execute_reply.started": "2024-06-25T14:53:15.328530Z", "shell.execute_reply": "2024-06-25T14:53:15.345927Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 19, "output_type": "execute_result", "data": { "text/plain": "" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "vectorized_text = vectorizer(X.values)" ], "metadata": { "id": "VMEoKm6lXSNW", "execution": { "iopub.status.busy": "2024-06-25T14:53:19.627901Z", "iopub.execute_input": "2024-06-25T14:53:19.628255Z", "iopub.status.idle": "2024-06-25T14:53:23.730808Z", "shell.execute_reply.started": "2024-06-25T14:53:19.628228Z", "shell.execute_reply": "2024-06-25T14:53:23.730033Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorized_text.dtype" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "G-o7uCoWZNUG", "outputId": "8168d0de-37bd-4731-8678-2ece48a484a5", "execution": { "iopub.status.busy": "2024-06-25T14:53:31.269373Z", "iopub.execute_input": "2024-06-25T14:53:31.270310Z", "iopub.status.idle": "2024-06-25T14:53:31.275970Z", "shell.execute_reply.started": "2024-06-25T14:53:31.270269Z", "shell.execute_reply": "2024-06-25T14:53:31.275061Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 21, "output_type": "execute_result", "data": { "text/plain": "tf.int64" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "vectorized_text.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mXYxar2pZUsV", "outputId": "060e9018-c36e-4bbb-eed7-8b4f0beb4e1b", "execution": { "iopub.status.busy": "2024-06-25T14:53:38.231329Z", "iopub.execute_input": "2024-06-25T14:53:38.231712Z", "iopub.status.idle": "2024-06-25T14:53:38.238041Z", "shell.execute_reply.started": "2024-06-25T14:53:38.231683Z", "shell.execute_reply": "2024-06-25T14:53:38.237006Z" }, "trusted": true }, "execution_count": null, "outputs": [ { "execution_count": 22, "output_type": "execute_result", "data": { "text/plain": "TensorShape([1600000, 20])" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Prepare Dataset" ], "metadata": { "id": "PgiE9_Yxulln" } }, { "cell_type": "code", "source": [ "#MCSHBAP - map, chache, shuffle, batch, prefetch from_tensor_slices, list_file\n", "dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))\n", "dataset = dataset.cache()\n", "dataset = dataset.shuffle(1000000)\n", "dataset = dataset.batch(16)\n", "dataset = dataset.prefetch(4)" ], "metadata": { "id": "aEWYamp9Z7nc", "execution": { "iopub.status.busy": "2024-06-25T14:53:50.931596Z", "iopub.execute_input": "2024-06-25T14:53:50.932259Z", "iopub.status.idle": "2024-06-25T14:53:50.960596Z", "shell.execute_reply.started": "2024-06-25T14:53:50.932225Z", "shell.execute_reply": "2024-06-25T14:53:50.959710Z" }, "trusted": true }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "train = dataset.take(int(len(dataset)*.8))\n", "val = dataset.skip(int(len(dataset)*.8)).take(int(len(dataset)*.1))\n", "test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:53:54.228079Z", "iopub.execute_input": "2024-06-25T14:53:54.228960Z", "iopub.status.idle": "2024-06-25T14:53:54.245007Z", "shell.execute_reply.started": "2024-06-25T14:53:54.228922Z", "shell.execute_reply": "2024-06-25T14:53:54.244234Z" }, "trusted": true, "id": "JNqpMDGVulln" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Create Model" ], "metadata": { "id": "gPgfQL0Oulln" } }, { "cell_type": "code", "source": [ "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:53:58.500182Z", "iopub.execute_input": "2024-06-25T14:53:58.501050Z", "iopub.status.idle": "2024-06-25T14:53:58.507412Z", "shell.execute_reply.started": "2024-06-25T14:53:58.501016Z", "shell.execute_reply": "2024-06-25T14:53:58.506548Z" }, "trusted": true, "id": "uf2qEhrBullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = Sequential()\n", "# Create the embedding layer\n", "model.add(Embedding(MAX_FEATURES + 1, 32))\n", "# Bidirectional LSTM Layer with specified parameters\n", "model.add(Bidirectional(LSTM(50, activation='tanh', dropout=0.2, recurrent_dropout=0.2)))\n", "# Dense layer with Dropout\n", "model.add(Dense(128, activation='relu'))\n", "model.add(Dropout(0.1))\n", "# Output layer\n", "model.add(Dense(1, activation='sigmoid'))" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:10.259729Z", "iopub.execute_input": "2024-06-25T14:54:10.260096Z", "iopub.status.idle": "2024-06-25T14:54:10.295468Z", "shell.execute_reply.started": "2024-06-25T14:54:10.260068Z", "shell.execute_reply": "2024-06-25T14:54:10.294744Z" }, "trusted": true, "id": "ddUNfyl3ullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.build(input_shape=(None, None))" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:13.901270Z", "iopub.execute_input": "2024-06-25T14:54:13.902006Z", "iopub.status.idle": "2024-06-25T14:54:14.087392Z", "shell.execute_reply.started": "2024-06-25T14:54:13.901975Z", "shell.execute_reply": "2024-06-25T14:54:14.086630Z" }, "trusted": true, "id": "JnhwdMa9ullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from tensorflow.keras.optimizers import Adam\n", "from tensorflow.keras.losses import BinaryCrossentropy\n", "from tensorflow.keras.models import load_model" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:16.930948Z", "iopub.execute_input": "2024-06-25T14:54:16.931270Z", "iopub.status.idle": "2024-06-25T14:54:16.938622Z", "shell.execute_reply.started": "2024-06-25T14:54:16.931246Z", "shell.execute_reply": "2024-06-25T14:54:16.937714Z" }, "trusted": true, "id": "-IA6WFlpullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from tensorflow.keras.callbacks import ModelCheckpoint\n", "\n", "# Define the filepath pattern for saving the model weights\n", "filepath = \"/kaggle/working/Twitter_epoch_{epoch:02d}_val_loss:{val_loss:.2f}_val_acc:{val_accuracy:.2f}.weights.h5\"\n", "\n", "# Create a list of callbacks\n", "callbacks = [ModelCheckpoint(filepath=filepath,save_weights_only=True,save_freq=\"epoch\")]" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:28.712934Z", "iopub.execute_input": "2024-06-25T14:54:28.713295Z", "iopub.status.idle": "2024-06-25T14:54:28.720603Z", "shell.execute_reply.started": "2024-06-25T14:54:28.713267Z", "shell.execute_reply": "2024-06-25T14:54:28.719758Z" }, "trusted": true, "id": "znphNNi9ullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.compile(optimizer=Adam(learning_rate=0.001),loss=BinaryCrossentropy(), metrics=['accuracy'])" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:34.161816Z", "iopub.execute_input": "2024-06-25T14:54:34.162210Z", "iopub.status.idle": "2024-06-25T14:54:34.176415Z", "shell.execute_reply.started": "2024-06-25T14:54:34.162180Z", "shell.execute_reply": "2024-06-25T14:54:34.175537Z" }, "trusted": true, "id": "hnN4envQullo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.summary()" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:54:36.370257Z", "iopub.execute_input": "2024-06-25T14:54:36.370987Z", "iopub.status.idle": "2024-06-25T14:54:36.390660Z", "shell.execute_reply.started": "2024-06-25T14:54:36.370951Z", "shell.execute_reply": "2024-06-25T14:54:36.389810Z" }, "trusted": true, "id": "aXlzo4D5ullp", "outputId": "378fc4f2-785d-45a1-e7bd-ce14027b7ad3" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": "\u001b[1mModel: \"sequential\"\u001b[0m\n", "text/html": "
Model: \"sequential\"\n
\n" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ embedding (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m2,400,032\u001b[0m │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ bidirectional (\u001b[38;5;33mBidirectional\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m100\u001b[0m) │ \u001b[38;5;34m33,200\u001b[0m │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dense (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m12,928\u001b[0m │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dropout (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dense_1 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m129\u001b[0m │\n└─────────────────────────────────┴────────────────────────┴───────────────┘\n", "text/html": "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n┃ Layer (type)                     Output Shape                  Param # ┃\n┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n│ embedding (Embedding)           │ (None, None, 32)       │     2,400,032 │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ bidirectional (Bidirectional)   │ (None, 100)            │        33,200 │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dense (Dense)                   │ (None, 128)            │        12,928 │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dropout (Dropout)               │ (None, 128)            │             0 │\n├─────────────────────────────────┼────────────────────────┼───────────────┤\n│ dense_1 (Dense)                 │ (None, 1)              │           129 │\n└─────────────────────────────────┴────────────────────────┴───────────────┘\n
\n" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,446,289\u001b[0m (9.33 MB)\n", "text/html": "
 Total params: 2,446,289 (9.33 MB)\n
\n" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,446,289\u001b[0m (9.33 MB)\n", "text/html": "
 Trainable params: 2,446,289 (9.33 MB)\n
\n" }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n", "text/html": "
 Non-trainable params: 0 (0.00 B)\n
\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "history = model.fit(train, epochs=5, validation_data=val, callbacks=callbacks)" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-25T14:55:10.330140Z", "iopub.execute_input": "2024-06-25T14:55:10.330506Z" }, "trusted": true, "id": "ZSTZWH0Hullp", "outputId": "745998f4-13e4-45a9-871d-b60ec1972385" }, "execution_count": null, "outputs": [ { "name": "stdout", "text": "Epoch 1/5\n\u001b[1m72347/80000\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m5:30\u001b[0m 43ms/step - accuracy: 0.7937 - loss: 0.4444", "output_type": "stream" }, { "text": "IOPub message rate exceeded.\nThe notebook server will temporarily stop sending output\nto the client in order to avoid crashing it.\nTo change this limit, set the config variable\n`--NotebookApp.iopub_msg_rate_limit`.\n\nCurrent values:\nNotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\nNotebookApp.rate_limit_window=3.0 (secs)\n\n", "name": "stderr", "output_type": "stream" }, { "name": "stdout", "text": "\u001b[1m80000/80000\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3552s\u001b[0m 44ms/step - accuracy: 0.8271 - loss: 0.3845 - val_accuracy: 0.8192 - val_loss: 0.3976\nEpoch 4/5\n\u001b[1m80000/80000\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3547s\u001b[0m 44ms/step - accuracy: 0.8321 - loss: 0.3761 - val_accuracy: 0.8243 - val_loss: 0.3898\nEpoch 5/5\n\u001b[1m 2284/80000\u001b[0m \u001b[37m━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[1m56:07\u001b[0m 43ms/step - accuracy: 0.8639 - loss: 0.3181", "output_type": "stream" } ] }, { "cell_type": "code", "source": [ "Y.shape" ], "metadata": { "execution": { "iopub.status.busy": "2024-06-23T18:35:56.636125Z", "iopub.execute_input": "2024-06-23T18:35:56.636495Z", "iopub.status.idle": "2024-06-23T18:35:56.644631Z", "shell.execute_reply.started": "2024-06-23T18:35:56.636460Z", "shell.execute_reply": "2024-06-23T18:35:56.643704Z" }, "trusted": true, "id": "bCQlcFbzullp", "outputId": "95a728da-e6d6-4ea0-88ac-e1b20ec6bb01" }, "execution_count": null, "outputs": [ { "execution_count": 37, "output_type": "execute_result", "data": { "text/plain": "(1600000,)" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Make Prediction" ], "metadata": { "id": "e04-q7UB3DGD" } }, { "cell_type": "code", "source": [ "from tensorflow.keras.models import load_model" ], "metadata": { "id": "MKxQgs3m58H-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = load_model('/content/drive/MyDrive/twitter_sentiment_analysis/twitter_sentiment_analysis_epoch4.h5')" ], "metadata": { "id": "HTqzhL298MM1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "input_text = vectorizer(\"6 years later and nothing changes… And he blames the other driver each time.\")" ], "metadata": { "id": "qehiN3QDullp" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "input_text = tf.expand_dims(input_text, axis=0)" ], "metadata": { "id": "MPBPiHeq86fM" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "input_text.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0huBcwIQ88wt", "outputId": "db15e490-f048-4026-ad40-4313bfe2e4a5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "TensorShape([1, 20])" ] }, "metadata": {}, "execution_count": 41 } ] }, { "cell_type": "code", "source": [ "res = model.predict(input_text)" ], "metadata": { "id": "zvzeF7pK45Lg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5d81930a-e6dc-4479-8370-19d289c064d6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1/1 [==============================] - 1s 801ms/step\n" ] } ] }, { "cell_type": "code", "source": [ "res.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-MBu6wRe8jv2", "outputId": "086ac5fc-0aed-4558-802d-a64fa5058e5d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1, 1)" ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "source": [ "res" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nI-ryBTn9CSG", "outputId": "6488c02d-a050-4fd6-8eec-8fe267caf475" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.60668284]], dtype=float32)" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "if res<0.5:\n", " print('Negative')\n", "else:\n", " print('Positive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yIImIbYF9D3d", "outputId": "aadfcdb4-2e79-4f1a-a220-404c7dcf42bd" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Positive\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Save the vectorizer" ], "metadata": { "id": "AxK1G-IXBG39" } }, { "cell_type": "code", "source": [ "\n", "model = tf.keras.models.Sequential()\n", "model.add(tf.keras.Input(shape=(1,), dtype=tf.string))\n", "model.add(vectorizer)" ], "metadata": { "id": "6moK73ijCOTZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.save('/content/drive/MyDrive/twitter_sentiment_analysis/vectorizer_model', save_format='tf')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-YNccjKeDQoD", "outputId": "33ca7c92-63e5-45e0-c258-19f0653b107e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.\n" ] } ] }, { "cell_type": "code", "source": [ "loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/twitter_sentiment_analysis/vectorizer_model')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lWRVvOJZJB-1", "outputId": "fe446de4-35fb-429b-81ef-8cf0ffcfb293" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "WARNING:tensorflow:No training configuration found in save file, so the model was *not* compiled. Compile it manually.\n" ] } ] }, { "cell_type": "code", "source": [ "loaded_vectorizer = loaded_model.layers[0]" ], "metadata": { "id": "IlaNfQAsJWL0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer('Hello world, life is pretty awesome')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eqrPTycoJhkE", "outputId": "d78dc9f3-91a4-4f67-9ec5-10d80a773fe8" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "source": [ "loaded_vectorizer('Hello world, life is pretty awesome')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LiGC60zSJnOr", "outputId": "767d59f5-60e2-4df1-e468-6a3e044906be" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "wBPbQIIeihSt" }, "execution_count": null, "outputs": [] } ] }