Spaces:

vikranth1111
/

cap_fin

No application file

App Files Files Community

vikranth1111 commited on Nov 21, 2023

Commit

eaf5291

1 Parent(s): 4d7e4f0

Upload 2 files

Browse files

Files changed (2) hide show

Disaster_Detection_From_Tweets_using_ML.ipynb +913 -0
disaster_tweets.csv +0 -0

Disaster_Detection_From_Tweets_using_ML.ipynb ADDED Viewed

	@@ -0,0 +1,913 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "MxvHGzzsDnoa",
+        "outputId": "17e475f1-0b63-4e12-f71b-b6eb6ba91065"
+      },
+      "outputs": [
+        {
+          "ename": "ModuleNotFoundError",
+          "evalue": "No module named 'sklearn'",
+          "output_type": "error",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+            "\u001b[1;32m/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb Cell 1\u001b[0m line \u001b[0;36m1\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[1;32m      <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=8'>9</a>\u001b[0m get_ipython()\u001b[39m.\u001b[39mrun_line_magic(\u001b[39m'\u001b[39m\u001b[39mmatplotlib\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39minline\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmetrics\u001b[39;00m \u001b[39mimport\u001b[39;00m  accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score\n\u001b[1;32m     <a href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mfeature_extraction\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtext\u001b[39;00m \u001b[39mimport\u001b[39;00m TfidfVectorizer\n\u001b[1;32m     <a 
href='vscode-notebook-cell:/Users/vikranthbakkashetty/Downloads/disaster_detection_from_tweets-main/Disaster_Detection_From_Tweets_using_ML.ipynb#W0sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_selection\u001b[39;00m \u001b[39mimport\u001b[39;00m train_test_split,cross_val_score\n",
+            "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
+          ]
+        }
+      ],
+      "source": [
+        "import pandas as pd\n",
+        "import numpy as np\n",
+        "import itertools\n",
+        "import seaborn as sns\n",
+        "import nltk, re, string\n",
+        "from string import punctuation\n",
+        "from nltk.corpus import stopwords\n",
+        "import matplotlib.pyplot as plt\n",
+        "%matplotlib inline\n",
+        "from sklearn.metrics import  accuracy_score, f1_score, precision_score,confusion_matrix, recall_score, roc_auc_score\n",
+        "\n",
+        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+        "from sklearn.model_selection import train_test_split,cross_val_score\n",
+        "#machine learning\n",
+        "from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression\n",
+        "# machine learning\n",
+        "from sklearn.naive_bayes import MultinomialNB,GaussianNB\n",
+        "nltk.download('stopwords')\n",
+        "nltk.download('punkt')\n",
+        "nltk.download('wordnet')\n",
+        "nltk.download('omw-1.4')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%pip install scikit-learn"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 206
+        },
+        "id": "-xCY2mIQD5_x",
+        "outputId": "b9158464-4ece-4715-efa7-ec1042a28e68"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the dataset that is stored next to this notebook (a relative path keeps the notebook portable)\n",
+        "df = pd.read_csv('disaster_tweets.csv')\n",
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Gd_1_5TYFHDD",
+        "outputId": "3e19f261-8433-446f-8840-1c6af98e3c6e"
+      },
+      "outputs": [],
+      "source": [
+        "df.info()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "01yWUhW-FEd4"
+      },
+      "source": [
+        "## Target Distribution"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 353
+        },
+        "id": "TNxbrllBE7Rw",
+        "outputId": "7bc5683c-d881-4c5f-80f5-aba6236916ad"
+      },
+      "outputs": [],
+      "source": [
+        "sns.set_style(\"dark\")\n",
+        "# Name the column explicitly: recent seaborn releases treat the first positional argument as `data`\n",
+        "sns.countplot(x='target', data=df)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 206
+        },
+        "id": "S3MpxtWuFL-4",
+        "outputId": "16249244-930d-4eef-91f3-058944c6b918"
+      },
+      "outputs": [],
+      "source": [
+        "# creating a new column that stores the character length of each tweet\n",
+        "df['length'] = df['text'].apply(len)\n",
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 283
+        },
+        "id": "U3U3T9QPFYFP",
+        "outputId": "a0350f22-0ba2-4f81-e984-cc5e8696a3b7"
+      },
+      "outputs": [],
+      "source": [
+        "df['length'].plot(bins=50, kind='hist')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "s6eau4zHFh6S",
+        "outputId": "a7e8cd58-05f4-42d2-f85d-88d84fd35775"
+      },
+      "outputs": [],
+      "source": [
+        "df.length.describe()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 54
+        },
+        "id": "2UGUXOu_FmTV",
+        "outputId": "454767e7-18d5-4601-b563-3954d39fc503"
+      },
+      "outputs": [],
+      "source": [
+        "df[df['length'] == 157]['text'].iloc[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 343
+        },
+        "id": "pTDW0qiZFpNq",
+        "outputId": "56e487c9-ef89-46ea-e393-280b9a81e0c9"
+      },
+      "outputs": [],
+      "source": [
+        "df.hist(column='length', by='target', bins=50,figsize=(10,4))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "IJYv-A-oG8fk",
+        "outputId": "aaba1ebb-aac7-4162-cfd3-77e75299c255"
+      },
+      "outputs": [],
+      "source": [
+        "stop = set(stopwords.words('english'))\n",
+        "punctuation = list(string.punctuation)\n",
+        "stop.update(punctuation)\n",
+        "\n",
+        "# Removing stop words and stand-alone punctuation tokens from the tweets\n",
+        "def remove_stopwords(text):\n",
+        "    \"\"\"Return `text` without the whitespace-separated tokens that appear in `stop`.\n",
+        "\n",
+        "    Each token is lower-cased before the lookup; the tokens that are kept retain\n",
+        "    their original casing and are re-joined with single spaces.\n",
+        "    \"\"\"\n",
+        "    return \" \".join(tok for tok in text.split() if tok.lower() not in stop)\n",
+        "\n",
+        "# Take explicit copies of the filtered rows so the column assignments below act on\n",
+        "# independent frames (assigning into a filtered slice triggers SettingWithCopyWarning).\n",
+        "df_1 = df[df['target']==1].copy()\n",
+        "df_0 = df[df['target']==0].copy()\n",
+        "df_1['text']=df_1['text'].apply(remove_stopwords)\n",
+        "df_0['text']=df_0['text'].apply(remove_stopwords)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "lIaKop1n4Vr6"
+      },
+      "source": [
+        "## Plotting wordcloud of Disaster Tweets"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 606
+        },
+        "id": "Qkxm4Gl_Hdg9",
+        "outputId": "c707ac2a-5fc5-4abe-eb08-ae6e91e20414"
+      },
+      "outputs": [],
+      "source": [
+        "from wordcloud import WordCloud\n",
+        "plt.figure(figsize = (20,20)) # Text that is Disaster tweets\n",
+        "wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(\" \".join(df_1.text))\n",
+        "plt.imshow(wc , interpolation = 'bilinear')\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RFybOU0d4hMn"
+      },
+      "source": [
+        "## Plotting wordcloud of Normal Tweets"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 606
+        },
+        "id": "88A5_Es3HyrZ",
+        "outputId": "94752358-6917-4640-c953-84228d453cc3"
+      },
+      "outputs": [],
+      "source": [
+        "plt.figure(figsize = (20,20)) # Text that is Normal Tweets\n",
+        "wc = WordCloud(max_words = 1000 , width = 1600 , height = 800).generate(\" \".join(df_0.text))\n",
+        "plt.imshow(wc , interpolation = 'bilinear')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "G4xpJIDhLeZ7"
+      },
+      "source": [
+        "## Data Cleaning and Preparation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "czAn1C9hLcrS"
+      },
+      "outputs": [],
+      "source": [
+        "from nltk.stem import WordNetLemmatizer\n",
+        "lemma = WordNetLemmatizer()\n",
+        "# NLTK's English stopwords, held in a set so the per-token membership test in cleanTweet is O(1)\n",
+        "stop = set(stopwords.words('english'))\n",
+        "\n",
+        "def cleanTweet(txt):\n",
+        "    \"\"\"Normalise one tweet before vectorisation.\n",
+        "\n",
+        "    Steps: lowercase -> NLTK word tokenisation -> drop tokens found in `stop` ->\n",
+        "    WordNet lemmatisation -> replace every character outside a-z with a space.\n",
+        "    Returns the cleaned text as a single string.\n",
+        "    \"\"\"\n",
+        "    # lowercasing and tokenization\n",
+        "    tokens = nltk.word_tokenize(txt.lower())\n",
+        "    # removing stopwords & lemmatizing the remaining words\n",
+        "    joined = ' '.join(lemma.lemmatize(tok) for tok in tokens if tok not in stop)\n",
+        "    # removing non-alphabetic characters (each one becomes a space)\n",
+        "    return re.sub('[^a-z]', ' ', joined)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "t9K2uFC65CjP"
+      },
+      "source": [
+        "## Applying Clean Tweet Function on Tweets Text"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 206
+        },
+        "id": "_j712nT9Ma4L",
+        "outputId": "8e494e08-f0f9-453f-f12c-147d8b55baa4"
+      },
+      "outputs": [],
+      "source": [
+        "df['cleaned_tweets'] = df['text'].apply(cleanTweet)\n",
+        "df.head()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-z-24scU5NgM"
+      },
+      "source": [
+        "## Creating Feature & Target Variables"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "T7UA_KKNIAK6"
+      },
+      "outputs": [],
+      "source": [
+        "y = df.target\n",
+        "X=df.cleaned_tweets"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IOkKFRGwIXrr"
+      },
+      "outputs": [],
+      "source": [
+        "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,stratify=y, random_state=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KTixc5ua6WPb"
+      },
+      "source": [
+        "## TF-IDF Vectorizer - Bi-Gram"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "QcOwtLDiIcna"
+      },
+      "outputs": [],
+      "source": [
+        "tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,2))\n",
+        "tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)\n",
+        "tfidf_test_2 = tfidf_vectorizer.transform(X_test)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ET9odcJ46z3M"
+      },
+      "source": [
+        "## Multinomial Naive Bayes"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "EyAtlCrMIigj",
+        "outputId": "8739c74f-97cb-44bb-a5c9-02380d817047"
+      },
+      "outputs": [],
+      "source": [
+        "## Model Fitting\n",
+        "mnb_tf = MultinomialNB()\n",
+        "mnb_tf.fit(tfidf_train_2, y_train)\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "iP06hRJV80he"
+      },
+      "source": [
+        "## 10-Fold Cross Validation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3dl2qwK_80Lg",
+        "outputId": "a29fee5d-1a00-4bcc-88cd-d9854a31fb8b"
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn import model_selection\n",
+        "\n",
+        "kfold = model_selection.KFold(n_splits=10)\n",
+        "scoring = 'accuracy'\n",
+        "\n",
+        "acc_mnb2 = cross_val_score(estimator = mnb_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)\n",
+        "acc_mnb2.mean()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "VEhlOemY9o3v"
+      },
+      "source": [
+        "## Model Prediction on the Test Set"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 333
+        },
+        "id": "XMu-aOsJ9cMA",
+        "outputId": "c3c45740-cba3-4595-c8c4-a8a33ae41536"
+      },
+      "outputs": [],
+      "source": [
+        "pred_mnb2 = mnb_tf.predict(tfidf_test_2)\n",
+        "CM = confusion_matrix(y_test, pred_mnb2)\n",
+        "sns.heatmap(CM, cmap=\"Blues\", linecolor='black', linewidth=1, annot=True, fmt='',\n",
+        "            xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])\n",
+        "\n",
+        "# Flattening the 2x2 matrix row by row yields CM[0][0], CM[0][1], CM[1][0], CM[1][1]\n",
+        "TN, FP, FN, TP = CM.ravel()\n",
+        "specificity = TN/(TN+FP)\n",
+        "\n",
+        "# Test-set metrics for the bi-gram Multinomial Naive Bayes model\n",
+        "acc = accuracy_score(y_test, pred_mnb2)\n",
+        "prec = precision_score(y_test, pred_mnb2)\n",
+        "rec = recall_score(y_test, pred_mnb2)\n",
+        "f1 = f1_score(y_test, pred_mnb2)\n",
+        "\n",
+        "# One-row summary table; later cells add one row per model\n",
+        "model_results = pd.DataFrame(\n",
+        "    [['Multinomial Naive Bayes - TFIDF-Bigram', acc, prec, rec, specificity, f1]],\n",
+        "    columns=['Model', 'Accuracy', 'Precision', 'Sensitivity', 'Specificity', 'F1 Score'],\n",
+        ")\n",
+        "\n",
+        "model_results"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zAuAQhIh63sj"
+      },
+      "source": [
+        "## Passive Aggressive Classifier"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "o_aydxKyPnk_",
+        "outputId": "8a05104c-6b65-4c3c-a6bf-139c251518c5"
+      },
+      "outputs": [],
+      "source": [
+        "# Fix the seed so repeated runs give the same model (the classifier shuffles the training data)\n",
+        "pass_tf = PassiveAggressiveClassifier(random_state=0)\n",
+        "pass_tf.fit(tfidf_train_2, y_train)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wIriBQmj-qsi"
+      },
+      "source": [
+        "## 10-Fold Cross Validation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "5Ir7B5fe-vht",
+        "outputId": "19833836-7e14-4b96-a31c-d207564045dd"
+      },
+      "outputs": [],
+      "source": [
+        "\n",
+        "kfold = model_selection.KFold(n_splits=10)\n",
+        "scoring = 'accuracy'\n",
+        "\n",
+        "acc_pass2 = cross_val_score(estimator = pass_tf, X = tfidf_train_2, y = y_train, cv = kfold,scoring=scoring)\n",
+        "acc_pass2.mean()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "duOtJaGH-6Dl"
+      },
+      "source": [
+        "## Model Prediction"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 360
+        },
+        "id": "aV2OjmZv_Dat",
+        "outputId": "502445e1-c2ff-459b-8222-e529cd551be1"
+      },
+      "outputs": [],
+      "source": [
+        "pred_pass2 = pass_tf.predict(tfidf_test_2)\n",
+        "CM=confusion_matrix(y_test,pred_pass2)\n",
+        "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
+        "\n",
+        "# Derive specificity from this model's own confusion matrix rather than reusing a value computed for another model\n",
+        "TN, FP, FN, TP = CM.ravel()\n",
+        "specificity = TN/(TN+FP)\n",
+        "\n",
+        "acc = accuracy_score(y_test, pred_pass2)\n",
+        "prec = precision_score(y_test, pred_pass2)\n",
+        "rec = recall_score(y_test, pred_pass2)\n",
+        "f1 = f1_score(y_test, pred_pass2)\n",
+        "\n",
+        "results =pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Bigram',acc, prec,rec,specificity, f1]],\n",
+        "               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
+        "# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the rows instead\n",
+        "results = pd.concat([model_results, results], ignore_index = True)\n",
+        "results"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4kzfq6Vu6bj5"
+      },
+      "source": [
+        "## TF-IDF Vectorizer - Tri Gram"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "SHDeA4FfQnhu"
+      },
+      "outputs": [],
+      "source": [
+        "tfidf_vectorizer_3 = TfidfVectorizer(stop_words='english', max_df=0.8, ngram_range=(1,3))\n",
+        "tfidf_train_3 = tfidf_vectorizer_3.fit_transform(X_train)\n",
+        "tfidf_test_3 = tfidf_vectorizer_3.transform(X_test)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6-7gipjN6-Sc"
+      },
+      "source": [
+        "## Multinomial Naive Bayes - Tri Gram"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "mooomowf6i5S",
+        "outputId": "e4b287da-ba29-45b1-ddde-986f131f4ee6"
+      },
+      "outputs": [],
+      "source": [
+        "mnb_tf3 = MultinomialNB()\n",
+        "mnb_tf3.fit(tfidf_train_3, y_train)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "lWJ9ehn4_5Mk"
+      },
+      "source": [
+        "## 10-Fold Cross Validation"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "31vc8MHq_8Kh",
+        "outputId": "7f59800a-2e29-46c0-dcfd-62ec95b3c690"
+      },
+      "outputs": [],
+      "source": [
+        "kfold = model_selection.KFold(n_splits=10)\n",
+        "scoring = 'accuracy'\n",
+        "\n",
+        "# Evaluate the tri-gram estimator (mnb_tf3) on the tri-gram features\n",
+        "acc_mnb3 = cross_val_score(estimator = mnb_tf3, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)\n",
+        "acc_mnb3.mean()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vXcrlM9AAFUW"
+      },
+      "source": [
+        "## Model Prediction"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 394
+        },
+        "id": "4Gm5Vakt_7v2",
+        "outputId": "9190b92b-9a7b-4845-ba53-423cfd6bbb26"
+      },
+      "outputs": [],
+      "source": [
+        "pred_mnb3 = mnb_tf3.predict(tfidf_test_3)\n",
+        "CM=confusion_matrix(y_test,pred_mnb3)\n",
+        "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
+        "\n",
+        "# Derive specificity from this model's own confusion matrix rather than reusing a value computed for another model\n",
+        "TN, FP, FN, TP = CM.ravel()\n",
+        "specificity = TN/(TN+FP)\n",
+        "\n",
+        "acc = accuracy_score(y_test, pred_mnb3)\n",
+        "prec = precision_score(y_test, pred_mnb3)\n",
+        "rec = recall_score(y_test, pred_mnb3)\n",
+        "f1 = f1_score(y_test, pred_mnb3)\n",
+        "\n",
+        "mod_results =pd.DataFrame([['Multinomial Naive Bayes - TFIDF-Trigram',acc, prec,rec,specificity, f1]],\n",
+        "               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
+        "# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the rows instead\n",
+        "results = pd.concat([results, mod_results], ignore_index = True)\n",
+        "results"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "olvJkW1f7XhU"
+      },
+      "source": [
+        "## Passive Aggressive Classifier - Tri Gram"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "vQGkFat27GKm",
+        "outputId": "4368ca5e-9f84-47d6-d1fe-a6b79c1cf552"
+      },
+      "outputs": [],
+      "source": [
+        "# Fix the seed so repeated runs give the same model (the classifier shuffles the training data)\n",
+        "pass_tf3 = PassiveAggressiveClassifier(random_state=0)\n",
+        "pass_tf3.fit(tfidf_train_3, y_train)\n",
+        "\n",
+        "## cross validation\n",
+        "kfold = model_selection.KFold(n_splits=10)\n",
+        "scoring = 'accuracy'\n",
+        "\n",
+        "acc_pass3 = cross_val_score(estimator = pass_tf3, X = tfidf_train_3, y = y_train, cv = kfold,scoring=scoring)\n",
+        "acc_pass3.mean()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 423
+        },
+        "id": "QGvVFjCo7g9e",
+        "outputId": "f685155a-59a6-44c7-9e0e-c261d2e9c5d9"
+      },
+      "outputs": [],
+      "source": [
+        "pred_pass3 = pass_tf3.predict(tfidf_test_3)\n",
+        "CM=confusion_matrix(y_test,pred_pass3)\n",
+        "sns.heatmap(CM,cmap= \"Blues\", linecolor = 'black' , linewidth = 1 , annot = True, fmt='' , xticklabels = ['Normal', 'Disaster'] , yticklabels = ['Normal', 'Disaster'])\n",
+        "\n",
+        "# Derive specificity from this model's own confusion matrix rather than reusing a value computed for another model\n",
+        "TN, FP, FN, TP = CM.ravel()\n",
+        "specificity = TN/(TN+FP)\n",
+        "\n",
+        "acc = accuracy_score(y_test, pred_pass3)\n",
+        "prec = precision_score(y_test, pred_pass3)\n",
+        "rec = recall_score(y_test, pred_pass3)\n",
+        "f1 = f1_score(y_test, pred_pass3)\n",
+        "\n",
+        "mod1_results =pd.DataFrame([['Passive Aggressive Classifier - TFIDF-Trigram',acc, prec,rec,specificity, f1]],\n",
+        "               columns = ['Model', 'Accuracy','Precision', 'Sensitivity','Specificity', 'F1 Score'])\n",
+        "# DataFrame.append no longer exists in pandas 2.x; pd.concat stacks the rows instead\n",
+        "results = pd.concat([results, mod1_results], ignore_index = True)\n",
+        "results"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6BIjKOjyB3Bi"
+      },
+      "source": [
+        "## Most Informative Features"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "zip4Fbfo7koR"
+      },
+      "outputs": [],
+      "source": [
+        "def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):\n",
+        "    \"\"\"\n",
+        "    See: https://stackoverflow.com/a/26980472\n",
+        "\n",
+        "    Print the n lowest-weighted and the n highest-weighted features of a fitted binary classifier.\n",
+        "\n",
+        "    vectorizer: fitted vectorizer exposing get_feature_names_out().\n",
+        "    classifier: fitted binary classifier exposing classes_ and either coef_ (linear models)\n",
+        "        or feature_log_prob_ (naive Bayes models).\n",
+        "    n: number of features printed per class.\n",
+        "\n",
+        "    Note: the function only prints; it returns None.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    class_labels = classifier.classes_\n",
+        "    feature_names = vectorizer.get_feature_names_out()\n",
+        "\n",
+        "    if hasattr(classifier, 'coef_'):\n",
+        "        weights = classifier.coef_[0]\n",
+        "    else:\n",
+        "        # Naive Bayes models in current scikit-learn have no coef_; use the per-feature\n",
+        "        # log-odds (class 1 vs class 0) as the signed weight instead.\n",
+        "        weights = classifier.feature_log_prob_[1] - classifier.feature_log_prob_[0]\n",
+        "\n",
+        "    # Sort once, ascending by weight: the head leans towards class 0, the tail towards class 1\n",
+        "    ranked = sorted(zip(weights, feature_names))\n",
+        "\n",
+        "    for coef, feat in ranked[:n]:\n",
+        "        print(class_labels[0], coef, feat)\n",
+        "\n",
+        "    print()\n",
+        "\n",
+        "    for coef, feat in reversed(ranked[-n:]):\n",
+        "        print(class_labels[1], coef, feat)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "0_LZODtNB7TW",
+        "outputId": "bd9941bd-5dec-45cd-bb93-739e1299e9be"
+      },
+      "outputs": [],
+      "source": [
+        "most_informative_feature_for_binary_classification(tfidf_vectorizer_3, pass_tf3, n=10)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "slO0gzyHCBJD",
+        "outputId": "998697f5-38d4-4726-bbf6-28fd7e00b511"
+      },
+      "outputs": [],
+      "source": [
+        "most_informative_feature_for_binary_classification(tfidf_vectorizer, mnb_tf, n=10)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fJg_mWAGDsLM"
+      },
+      "source": [
+        "## Sample prediction"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "12swn6EUCiIz",
+        "outputId": "b2375378-7a66-4cc2-95f3-ff302ca92666"
+      },
+      "outputs": [],
+      "source": [
+        "sentences = [\n",
+        "    \"Just happened a terrible car crash\",\n",
+        "    \"Heard about #earthquake is different cities, stay safe everyone.\",\n",
+        "    \"No I don't like cold!\",\n",
+        "    \"@RosieGray Now in all sincerety do you think the UN would move to Israel if there was a fraction of a chance of being annihilated?\"\n",
+        "  ]\n",
+        "\n",
+        "# Run the same cleaning that was applied to the training text before vectorising,\n",
+        "# so the model sees inputs in the form it was trained on\n",
+        "cleaned_sentences = [cleanTweet(sentence) for sentence in sentences]\n",
+        "tfidf_trigram = tfidf_vectorizer_3.transform(cleaned_sentences)\n",
+        "\n",
+        "predictions = pass_tf3.predict(tfidf_trigram)\n",
+        "\n",
+        "# Show the original (uncleaned) sentence next to its predicted class\n",
+        "for text, label in zip(sentences, predictions):\n",
+        "    target = \"Disaster Tweet\" if label==1 else \"Normal Tweet\"\n",
+        "    print(\"text:\", text, \"\\nClass:\", target)\n",
+        "    print()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "EUBaz8aAE2ko"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Disaster Detection From Tweets using ML.ipynb",
+      "provenance": []
+    },
+    "gpuClass": "standard",
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

disaster_tweets.csv ADDED Viewed

The diff for this file is too large to render. See raw diff