Spaces:
Sleeping
Sleeping
Upload prml_project (1).py
Browse files- prml_project (1).py +410 -0
prml_project (1).py
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""PRML_project.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ
|
| 8 |
+
|
| 9 |
+
## Downloading & preparing the Dataset
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
import warnings
|
| 15 |
+
from sklearn.model_selection import train_test_split
|
| 16 |
+
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
|
| 17 |
+
import re
|
| 18 |
+
import string
|
| 19 |
+
from sklearn.linear_model import LogisticRegression
|
| 20 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 21 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 22 |
+
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
|
| 23 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 24 |
+
from xgboost import XGBClassifier
|
| 25 |
+
from lightgbm import LGBMClassifier
|
| 26 |
+
from sklearn.svm import SVC
|
| 27 |
+
# Ignore FutureWarning messages
|
| 28 |
+
warnings.simplefilter(action='ignore', category=FutureWarning)
|
| 29 |
+
|
| 30 |
+
import os
|
| 31 |
+
import sys
|
| 32 |
+
from tempfile import NamedTemporaryFile
|
| 33 |
+
from urllib.request import urlopen
|
| 34 |
+
from urllib.parse import unquote, urlparse
|
| 35 |
+
from urllib.error import HTTPError
|
| 36 |
+
from zipfile import ZipFile
|
| 37 |
+
import tarfile
|
| 38 |
+
import shutil
|
| 39 |
+
|
| 40 |
+
CHUNK_SIZE = 40960
|
| 41 |
+
DATA_SOURCE_MAPPING = 'sentiment-analysis-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F989445%2F1808590%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240418%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240418T100202Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D37697dd0d9910676a3f12986b24306fc3726be4de82536c784ffb79deff0ba33d8973d6d612a53bcf9ed39bd7ad8a1d69bb34c42a34c7d6cffee6dd3048a9ef68f047745664f48ea6f3773a1f263129a6f78d48923235cc363b4081daadea014b0958575bf8376d565858404a8b1be7e5f317bdd9f5823ce4777f0b7052445c648bcda039294c804978828087705abe4416a6f9a0e0743388667017128a5ab2ef5ab2dade0d40d1659f4313296501907b4baec3161131e151e6f5b982eee9a6f7eb1b022da9c874f216d7fac981dc1351e9001ee56d03d1da8b2e0d4c97320f18d7e9b00ec63f4ba7444d81595cc8edff2b05f13aef4b204dd2710d0fddf0ef9'
|
| 42 |
+
|
| 43 |
+
# Directory layout that mimics the Kaggle runtime.
KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'
KAGGLE_SYMLINK = 'kaggle'

# The notebook export used the IPython shell escape "!umount ...", which is a
# SyntaxError in a plain .py file. Run the same command through subprocess.
import subprocess

try:
    subprocess.run(
        ['umount', '/kaggle/input/'],
        stderr=subprocess.DEVNULL,  # same as the original "2> /dev/null"
        check=False,  # a failing umount is ignored, as the shell escape did
    )
except OSError:
    # The umount executable could not be started; nothing to unmount.
    pass

# Start from an empty input directory, then make sure both directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working; an existing link is kept.
try:
    os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
    pass
try:
    os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
    pass
|
| 60 |
+
|
| 61 |
+
# Each comma-separated entry is "<directory>:<url-encoded download URL>".
# Every archive is downloaded to a temporary file and unpacked under
# KAGGLE_INPUT_PATH/<directory>; a failed download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; without it the bar simply stays empty
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing "tarfile" here
                # would replace the module and break later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')
|
| 94 |
+
|
| 95 |
+
import numpy as np # linear algebra
|
| 96 |
+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
| 97 |
+
|
| 98 |
+
# Input data files are available in the read-only "../input/" directory
|
| 99 |
+
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
|
| 100 |
+
|
| 101 |
+
import os
|
| 102 |
+
# List every file found under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Read both CSV files as latin1 and stack them into a single frame.
d = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
f = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')
df = pd.concat([d, f])

print(df.shape)
# `display` only exists inside IPython/Jupyter and is a NameError in a plain
# script. df.info() prints its own report, and print() works everywhere.
df.info()
print(df)
|
| 113 |
+
|
| 114 |
+
"""## Preprocessing the dataset"""
|
| 115 |
+
|
| 116 |
+
# Remove every row that has a missing value in any column.
df.dropna(inplace=True)

# Class balance with the original string labels.
df['sentiment'].value_counts(normalize=True).plot(kind='bar')

# Replace the label strings by integer category codes and plot again.
# NOTE(review): output_lable() reads 0/1/2 as Negative/Neutral/Positive;
# confirm that matches the codes pandas assigns here.
df['sentiment'] = df['sentiment'].astype('category').cat.codes
df['sentiment'].value_counts(normalize=True).plot(kind='bar')

df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
# Convert Country column to categorical variable
df['Country'] = df['Country'].astype('category').cat.codes
# Convert Age of User from an age bracket to one representative integer
df['Age of User'] = df['Age of User'].replace(
    {'0-20': 18, '21-30': 25, '31-45': 38, '46-60': 53, '60-70': 65, '70-100': 80}
)

df.info()

# DataFrame.drop returns a new frame; the result used to be discarded, so the
# metadata columns were never actually removed. Keep the result.
df = df.drop(columns=['textID', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])
|
| 132 |
+
|
| 133 |
+
def wp(text):
    """Normalize a piece of text before it is vectorized.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop HTML
    tags; turn every non-word character into a space; drop the remaining
    punctuation (underscores); drop newlines; drop every token that contains
    a digit.

    URLs and tags are removed *before* non-word characters are blanked out.
    Doing it the other way round destroys the ``://``, ``.``, ``<`` and ``>``
    those patterns rely on, so they would never match.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
|
| 143 |
+
|
| 144 |
+
# Clean the text column in place with wp().
df['selected_text'] = df["selected_text"].apply(wp)

"""## Training and testing split """

# Features are the cleaned text; the target is the encoded sentiment.
X = df['selected_text']
y = df['sentiment']

# 80/20 split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

# Fit TF-IDF on the training text only, then reuse it for the test text.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)
|
| 160 |
+
|
| 161 |
+
"""## Random forest and boosting methods
|
| 162 |
+
|
| 163 |
+
### Random forest
|
| 164 |
+
"""
|
| 165 |
+
|
| 166 |
+
# Random forest with 100 trees and a fixed random_state.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(XV_train, y_train)
rf_pred = rf_classifier.predict(XV_test)

# Accuracy, per-class report and confusion matrix on the held-out split.
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))
ConfusionMatrixDisplay.from_predictions(y_test, rf_pred)
|
| 177 |
+
|
| 178 |
+
"""### Adaboost boosting method"""
|
| 179 |
+
|
| 180 |
+
# AdaBoost with library-default settings.
ada_classifier = AdaBoostClassifier().fit(XV_train, y_train)
ada_pred = ada_classifier.predict(XV_test)

# Accuracy, per-class report and confusion matrix on the held-out split.
print("AdaBoost Accuracy:", accuracy_score(y_test, ada_pred))
print("\nAdaBoost Classification Report:")
print(classification_report(y_test, ada_pred))
ConfusionMatrixDisplay.from_predictions(y_test, ada_pred)
|
| 191 |
+
|
| 192 |
+
"""### Gradient Boosting"""
|
| 193 |
+
|
| 194 |
+
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting Machine (GBM) with library-default settings.
gbm_classifier = GradientBoostingClassifier().fit(XV_train, y_train)
y_pred_gbm = gbm_classifier.predict(XV_test)

# Compute both metrics first, then print them in the original order.
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
report_gbm = classification_report(y_test, y_pred_gbm)
print("\nGradient Boosting Machine (GBM) Model:")
print("Accuracy:", accuracy_gbm)
print("Gradient Boosting Machine (GBM) Classification Report:")
print(report_gbm)

# Confusion matrix for the GBM predictions.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_gbm)
|
| 207 |
+
|
| 208 |
+
"""### LightGBM"""
|
| 209 |
+
|
| 210 |
+
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

# LightGBM classifier with library-default settings.
lgb_classifier = lgb.LGBMClassifier()
lgb_classifier.fit(XV_train, y_train)
y_pred_lgb = lgb_classifier.predict(XV_test)

# Compute both metrics first, then print them in the original order.
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
report_lgb = classification_report(y_test, y_pred_lgb)
print("\nLightGBM Model:")
print("Accuracy:", accuracy_lgb)
print("LightGBM Classification Report:")
print(report_lgb)

# Confusion matrix for the LightGBM predictions.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_lgb)
|
| 225 |
+
|
| 226 |
+
"""## SVM(Support Vector Machine)
|
| 227 |
+
|
| 228 |
+
### Kernel ---> 'Linear'
|
| 229 |
+
"""
|
| 230 |
+
|
| 231 |
+
# Train and evaluate one SVC per kernel: 'linear', then 'poly', then 'rbf'.
# The second tuple item is the kernel name as spelled in the report heading.
# After the loop, svm_classifier / svm_pred / svm_accuracy hold the values of
# the last kernel ('rbf').
for svm_kernel, heading_label in (('linear', 'linear'), ('poly', 'Poly'), ('rbf', 'RBF')):
    svm_classifier = SVC(kernel=svm_kernel)
    svm_classifier.fit(XV_train, y_train)

    svm_pred = svm_classifier.predict(XV_test)

    svm_accuracy = accuracy_score(y_test, svm_pred)
    print(f"SVM with {svm_kernel} kernel Accuracy:", svm_accuracy)

    print(f"\nSVM ( Kernel='{heading_label}' ) Classification Report:")
    print(classification_report(y_test, svm_pred))

    ConfusionMatrixDisplay.from_predictions(y_test, svm_pred)
|
| 273 |
+
|
| 274 |
+
"""# Decision Tree"""
|
| 275 |
+
|
| 276 |
+
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Decision tree limited to a depth of 20.
decision_tree = DecisionTreeClassifier(max_depth=20).fit(XV_train, y_train)
dt_pred = decision_tree.predict(XV_test)

# Accuracy, per-class report and confusion matrix on the held-out split.
dt_accuracy = accuracy_score(y_test, dt_pred)
print("Decision Tree Accuracy with depth=20:", dt_accuracy)
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, dt_pred))
ConfusionMatrixDisplay.from_predictions(y_test, dt_pred)
|
| 290 |
+
|
| 291 |
+
"""# Logistic Regression"""
|
| 292 |
+
|
| 293 |
+
# Logistic regression capped at 100 solver iterations. This model is the one
# used later by manual_testing() and classify_sentiment().
logistic_model = LogisticRegression(max_iter=100).fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)

# Compute both metrics first, then print them in the original order.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
report_logistic = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")
print("Logistic Regression Classification Report:")
print(report_logistic)

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_logistic)
|
| 308 |
+
|
| 309 |
+
"""# Naive Bayes"""
|
| 310 |
+
|
| 311 |
+
# Multinomial Naive Bayes on the TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(XV_train, y_train)

y_pred = nb_classifier.predict(XV_test)

accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Model:")
print("Accuracy:", accuracy)

report_naive_bayes = classification_report(y_test, y_pred)
print("Naive Bayes Classification Report:")
print(report_naive_bayes)

# Plot the Naive Bayes predictions. The previous code passed dt_pred here,
# which showed the decision tree's confusion matrix under this section.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
|
| 326 |
+
|
| 327 |
+
"""# K Nearest Neighbors (KNN)"""
|
| 328 |
+
|
| 329 |
+
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

# K-Nearest Neighbors (KNN) with library-default settings.
knn_classifier = KNeighborsClassifier().fit(XV_train, y_train)
y_pred_knn = knn_classifier.predict(XV_test)

# Compute both metrics first, then print them in the original order.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
report_knn = classification_report(y_test, y_pred_knn)
print("K-Nearest Neighbors (KNN) Model:")
print("Accuracy:", accuracy_knn)
print("K-Nearest Neighbors (KNN) Classification Report:")
print(report_knn)

# Confusion matrix for the KNN predictions.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_knn)
|
| 344 |
+
|
| 345 |
+
"""# Test"""
|
| 346 |
+
|
| 347 |
+
def output_lable(n):
    """Translate a numeric sentiment code into its display sentence.

    Args:
        n: Sentiment code; 0, 1 and 2 select the Negative, Neutral and
            Positive sentences respectively.

    Returns:
        The matching sentence, or None when ``n`` equals none of 0, 1, 2.
    """
    sentences = (
        "The Text Sentement is Negative",
        "The Text Sentement is Neutral",
        "The Text Sentement is Positive",
    )
    # Compare in the order 0, 1, 2; a tuple index is the code it stands for.
    for code, sentence in enumerate(sentences):
        if n == code:
            return sentence
    return None
|
| 354 |
+
|
| 355 |
+
def manual_testing(news):
    """Classify one piece of text and print the resulting sentence.

    The text is cleaned with ``wp``, vectorized with the fitted
    ``vectorization`` object and classified by ``logistic_model``. The
    sentence returned by ``output_lable`` is printed.

    The earlier version also ran ``svm_classifier`` on the same input and
    threw the result away; that unused prediction has been removed, along
    with the one-row DataFrame that only carried the string.

    Args:
        news: The raw text to classify.

    Returns:
        None. The result is printed, not returned.
    """
    cleaned_text = wp(news)
    features = vectorization.transform([cleaned_text])
    pred_lr = logistic_model.predict(features)
    print(output_lable(pred_lr[0]))
|
| 365 |
+
|
| 366 |
+
text = input("Enter Text to Classify ")
|
| 367 |
+
manual_testing(text)
|
| 368 |
+
|
| 369 |
+
# "pip install gradio" is a notebook shell command and a SyntaxError in a
# plain .py file. Invoke pip for the running interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "gradio"])
|
| 370 |
+
|
| 371 |
+
import gradio as gr
|
| 372 |
+
import matplotlib.pyplot as plt
|
| 373 |
+
import seaborn as sns
|
| 374 |
+
|
| 375 |
+
# Function to classify sentiment
|
| 376 |
+
def classify_sentiment(text):
    """Classify ``text`` and render the class probabilities as a bar chart.

    Args:
        text: The raw text to analyze.

    Returns:
        A ``(sentence, image_path)`` tuple: the sentence produced by
        ``output_lable`` for the predicted class, and the path of the PNG
        chart written to the working directory.
    """
    # Preprocess and vectorize the text
    processed_text = wp(text)
    vectorized_text = vectorization.transform([processed_text])

    # Predict sentiment using the logistic regression model
    prediction = logistic_model.predict(vectorized_text)[0]

    # The helper in this file is spelled `output_lable`; the previous call to
    # `output_label` raised NameError on every request.
    sentiment_label = output_lable(prediction)

    # Probabilities for each sentiment class
    probabilities = logistic_model.predict_proba(vectorized_text)[0]

    # Plot probabilities
    # NOTE(review): the bar labels assume the model's class order is
    # Negative, Neutral, Positive (codes 0, 1, 2) — confirm.
    figure = plt.figure(figsize=(8, 6))
    try:
        sns.barplot(x=["Negative", "Neutral", "Positive"], y=probabilities)
        plt.xlabel("Sentiment")
        plt.ylabel("Probability")
        plt.title("Sentiment Probability Distribution")
        plt.ylim([0, 1])
        plt.tight_layout()
        plt.savefig("sentiment_probabilities.png")
    finally:
        # Close the figure so figures do not pile up across requests.
        plt.close(figure)

    return sentiment_label, "sentiment_probabilities.png"
|
| 399 |
+
|
| 400 |
+
# Input and output components for the interface
|
| 401 |
+
# UI components: one multi-line text box in; a text box and an image out,
# matching the (sentence, image path) pair classify_sentiment returns.
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [
    gr.Textbox(label="Sentiment Prediction"),
    gr.Image(label="Sentiment Probability Distribution"),
]

# Build the Gradio interface around classify_sentiment and start it.
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()
|
| 410 |
+
|