# --- Imports (all in one cell, stdlib -> third-party) ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib

# --- Data load ---
# Airline-tweets dataset; only the tweet text and its sentiment label are used.
# NOTE(review): 'Tweets.csv' is a relative path -- document its provenance
# (source, download date) in a markdown cell so the run is reproducible.
df = pd.read_csv('Tweets.csv')
df = df[['text', 'airline_sentiment']]

# --- Data cleaning ---
# Inspect how many labels are missing before dropping them.
print(df['airline_sentiment'].isnull().sum())

df = df.dropna(subset=['airline_sentiment'])

# Encode the string labels as integers. Series.map returns NaN for any key
# that is NOT in the dict, which would silently corrupt y downstream with no
# error -- so the mapping is named explicitly and verified right after.
SENTIMENT_MAP = {'positive': 2, 'neutral': 1, 'negative': 0}
df['airline_sentiment'] = df['airline_sentiment'].map(SENTIMENT_MAP)
assert df['airline_sentiment'].notna().all(), (
    f"unexpected sentiment label found; expected one of {sorted(SENTIMENT_MAP)}"
)
# --- Train/test split ---
X = df['text']
y = df['airline_sentiment']
# class_weight='balanced' in the SVC below signals that the classes are
# imbalanced, so the split is stratified to keep class proportions the same
# in train and test; an unstratified split can skew the evaluation.
# (stratify=y is the only change -- split sizes and seed are unchanged.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# --- Model training ---
# Pipeline: TF-IDF features (unigrams + bigrams, vocabulary capped at 10k)
# feeding a linear-kernel SVM. Vectorizer is inside the pipeline, so it is
# fitted only on training folds -- no train/test leakage.
# NOTE(review): stop_words='english' removes negation words such as "not"
# and "no", which often carry the sentiment signal -- consider dropping it
# and comparing evaluation scores.
# NOTE(review): probability=True adds an expensive internal cross-validation
# during fit; keep it only if predict_proba is actually needed downstream.
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')),
    ('svm', svm.SVC(kernel='linear', probability=True, class_weight='balanced')),
])
Pipeline(steps=[('tfidf',\n",
" TfidfVectorizer(max_features=10000, ngram_range=(1, 2),\n",
" stop_words='english')),\n",
" ('svm',\n",
" SVC(class_weight='balanced', kernel='linear',\n",
" probability=True))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('tfidf',\n",
" TfidfVectorizer(max_features=10000, ngram_range=(1, 2),\n",
" stop_words='english')),\n",
" ('svm',\n",
" SVC(class_weight='balanced', kernel='linear',\n",
" probability=True))])TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
SVC(class_weight='balanced', kernel='linear', probability=True)