b22ee075 commited on
Commit
3cc8ce5
·
verified ·
1 Parent(s): 0790b43

Upload prml_project (1).py

Browse files
Files changed (1) hide show
  1. prml_project (1).py +410 -0
prml_project (1).py ADDED
@@ -0,0 +1,410 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """PRML_project.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ
8
+
9
+ ## Downloading & preparing the Dataset
10
+ """
11
+
12
+ import pandas as pd
13
+ import matplotlib.pyplot as plt
14
+ import warnings
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
17
+ import re
18
+ import string
19
+ from sklearn.linear_model import LogisticRegression
20
+ from sklearn.naive_bayes import MultinomialNB
21
+ from sklearn.tree import DecisionTreeClassifier
22
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
23
+ from sklearn.feature_extraction.text import TfidfVectorizer
24
+ from xgboost import XGBClassifier
25
+ from lightgbm import LGBMClassifier
26
+ from sklearn.svm import SVC
27
+ # Ignore FutureWarning messages
28
+ warnings.simplefilter(action='ignore', category=FutureWarning)
29
+
30
+ import os
31
+ import sys
32
+ from tempfile import NamedTemporaryFile
33
+ from urllib.request import urlopen
34
+ from urllib.parse import unquote, urlparse
35
+ from urllib.error import HTTPError
36
+ from zipfile import ZipFile
37
+ import tarfile
38
+ import shutil
39
+
40
# --- Kaggle dataset bootstrap -------------------------------------------------
# Recreate the /kaggle/input and /kaggle/working layout so the notebook's
# Kaggle-style paths work when run outside of Kaggle.

CHUNK_SIZE = 40960
# Signed (time-limited) GCS URL for the Kaggle "sentiment-analysis-dataset"
# bundle; format is '<target-dir>:<percent-encoded-url>'.  NOTE(review): the
# signature embeds an expiry date, so this link likely no longer works.
DATA_SOURCE_MAPPING = 'sentiment-analysis-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F989445%2F1808590%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240418%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240418T100202Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D37697dd0d9910676a3f12986b24306fc3726be4de82536c784ffb79deff0ba33d8973d6d612a53bcf9ed39bd7ad8a1d69bb34c42a34c7d6cffee6dd3048a9ef68f047745664f48ea6f3773a1f263129a6f78d48923235cc363b4081daadea014b0958575bf8376d565858404a8b1be7e5f317bdd9f5823ce4777f0b7052445c648bcda039294c804978828087705abe4416a6f9a0e0743388667017128a5ab2ef5ab2dade0d40d1659f4313296501907b4baec3161131e151e6f5b982eee9a6f7eb1b022da9c874f216d7fac981dc1351e9001ee56d03d1da8b2e0d4c97320f18d7e9b00ec63f4ba7444d81595cc8edff2b05f13aef4b204dd2710d0fddf0ef9'

KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'
KAGGLE_SYMLINK = 'kaggle'

# BUG FIX: `!umount /kaggle/input/ 2> /dev/null` is IPython shell magic and a
# SyntaxError in a plain .py file; run the same command via os.system instead.
os.system('umount /kaggle/input/ 2> /dev/null')
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Symlink ../input and ../working at the Kaggle paths; tolerate re-runs.
try:
    os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
    pass
try:
    os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
    pass
60
+
61
# Download and unpack every mapped data source into /kaggle/input/<directory>,
# showing a simple 50-character progress bar while streaming.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the single ':' separator is unambiguous.
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Ensure buffered bytes hit disk before the archive is re-read
            # (tarfile.open below re-opens the file by name).
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # BUG FIX: the original bound this handle to the name `tarfile`,
                # shadowing the tarfile module and breaking every later loop
                # iteration that needed `tarfile.open`.
                with tarfile.open(tfile.name) as tar:
                    tar.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')
94
+
95
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# List every file that was unpacked under /kaggle/input.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# latin1 avoids UnicodeDecodeError on the non-UTF-8 characters in this dataset.
d = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
f = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')
# Train and test CSVs are concatenated; the script re-splits them later with
# train_test_split.
df = pd.concat([d, f])

print(df.shape)
# BUG FIX: `display(...)` is an IPython builtin and a NameError in a plain .py
# file; also, df.info() prints its report itself and returns None, so
# display(df.info()) only rendered "None".
df.info()
print(df)
113
+
114
+ """## Preprocessing the dataset"""
115
+
116
+ df.dropna(inplace=True)
117
+
118
+ df['sentiment'].value_counts(normalize=True).plot(kind='bar');
119
+
120
+ df['sentiment'] = df['sentiment'].astype('category').cat.codes
121
+ df['sentiment'].value_counts(normalize=True).plot(kind='bar');
122
+
123
+ df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
124
+ # Convert Country column to categorical variable
125
+ df['Country'] = df['Country'].astype('category').cat.codes
126
+ # convert Age of User to integer
127
+ df['Age of User']=df['Age of User'].replace({'0-20':18,'21-30':25,'31-45':38,'46-60':53,'60-70':65,'70-100':80})
128
+
129
+ df.info()
130
+
131
+ df.drop(columns=['textID','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])
132
+
133
+ def wp(text):
134
+ text = text.lower()
135
+ text = re.sub('\[.*?\]', '', text)
136
+ text = re.sub("\\W"," ",text)
137
+ text = re.sub('https?://\S+|www\.\S+', '', text)
138
+ text = re.sub('<.*?>+', '', text)
139
+ text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
140
+ text = re.sub('\n', '', text)
141
+ text = re.sub('\w*\d\w*', '', text)
142
+ return text
143
+
144
# Clean every tweet with the `wp` text-normalization helper defined above.
df['selected_text'] = df["selected_text"].apply(wp)

"""## Training and testing split """

# Features are the cleaned tweet text; target is the encoded sentiment class.
X=df['selected_text']
y= df['sentiment']

# 80/20 split; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# TF-IDF is fitted on the training text only; the test text is transformed
# with the same vocabulary so no test data leaks into the features.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)
160
+
161
+ """## Random forest and boosting methods
162
+
163
+ ### Random forest
164
+ """
165
+
166
+ rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
167
+ rf_classifier.fit(XV_train, y_train)
168
+
169
+ rf_pred = rf_classifier.predict(XV_test)
170
+
171
+ print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
172
+
173
+ print("\nRandom Forest Classification Report:")
174
+ print(classification_report(y_test, rf_pred))
175
+
176
+ ConfusionMatrixDisplay.from_predictions(y_test, rf_pred);
177
+
178
+ """### Adaboost boosting method"""
179
+
180
+ ada_classifier = AdaBoostClassifier()
181
+ ada_classifier.fit(XV_train, y_train)
182
+
183
+ ada_pred = ada_classifier.predict(XV_test)
184
+
185
+ print("AdaBoost Accuracy:", accuracy_score(y_test, ada_pred))
186
+
187
+ print("\nAdaBoost Classification Report:")
188
+ print(classification_report(y_test, ada_pred))
189
+
190
+ ConfusionMatrixDisplay.from_predictions(y_test, ada_pred);
191
+
192
+ """### Gradient Boosting"""
193
+
194
+ from sklearn.ensemble import GradientBoostingClassifier
195
+ # Gradient Boosting Machine (GBM)
196
+ gbm_classifier = GradientBoostingClassifier()
197
+ gbm_classifier.fit(XV_train, y_train)
198
+ y_pred_gbm = gbm_classifier.predict(XV_test)
199
+ accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
200
+ print("\nGradient Boosting Machine (GBM) Model:")
201
+ print("Accuracy:", accuracy_gbm)
202
+ report_gbm = classification_report(y_test, y_pred_gbm)
203
+ print("Gradient Boosting Machine (GBM) Classification Report:")
204
+ print(report_gbm)
205
+ # If you want to display confusion matrix for GBM, you can use:
206
+ ConfusionMatrixDisplay.from_predictions(y_test, y_pred_gbm)
207
+
208
+ """### LightGBM"""
209
+
210
+ import lightgbm as lgb
211
+ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
212
+
213
+ # LightGBM
214
+ lgb_classifier = lgb.LGBMClassifier()
215
+ lgb_classifier.fit(XV_train, y_train)
216
+ y_pred_lgb = lgb_classifier.predict(XV_test)
217
+ accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
218
+ print("\nLightGBM Model:")
219
+ print("Accuracy:", accuracy_lgb)
220
+ report_lgb = classification_report(y_test, y_pred_lgb)
221
+ print("LightGBM Classification Report:")
222
+ print(report_lgb)
223
+ # If you want to display confusion matrix for LightGBM, you can use:
224
+ ConfusionMatrixDisplay.from_predictions(y_test, y_pred_lgb)
225
+
226
+ """## SVM(Support Vector Machine)
227
+
228
+ ### Kernel ---> 'Linear'
229
+ """
230
+
231
+ svm_classifier = SVC(kernel='linear')
232
+ svm_classifier.fit(XV_train, y_train)
233
+
234
+ svm_pred = svm_classifier.predict(XV_test)
235
+
236
+ svm_accuracy = accuracy_score(y_test, svm_pred)
237
+ print(f"SVM with linear kernel Accuracy:", svm_accuracy)
238
+
239
+ print("\nSVM ( Kernel='linear' ) Classification Report:")
240
+ print(classification_report(y_test, svm_pred))
241
+
242
+ ConfusionMatrixDisplay.from_predictions(y_test,svm_pred);
243
+
244
+ """### Kernel--->'Poly'"""
245
+
246
+ svm_classifier = SVC(kernel='poly')
247
+ svm_classifier.fit(XV_train, y_train)
248
+
249
+ svm_pred = svm_classifier.predict(XV_test)
250
+
251
+ svm_accuracy = accuracy_score(y_test, svm_pred)
252
+ print(f"SVM with poly kernel Accuracy:", svm_accuracy)
253
+
254
+ print("\nSVM ( Kernel='Poly' ) Classification Report:")
255
+ print(classification_report(y_test, svm_pred))
256
+
257
+ ConfusionMatrixDisplay.from_predictions(y_test,svm_pred);
258
+
259
+ """### Kernel--->'RBF'"""
260
+
261
+ svm_classifier = SVC(kernel='rbf')
262
+ svm_classifier.fit(XV_train, y_train)
263
+
264
+ svm_pred = svm_classifier.predict(XV_test)
265
+
266
+ svm_accuracy = accuracy_score(y_test, svm_pred)
267
+ print(f"SVM with rbf kernel Accuracy:", svm_accuracy)
268
+
269
+ print("\nSVM ( Kernel='RBF' ) Classification Report:")
270
+ print(classification_report(y_test, svm_pred))
271
+
272
+ ConfusionMatrixDisplay.from_predictions(y_test,svm_pred);
273
+
274
+ """# Decision Tree"""
275
+
276
+ from sklearn.tree import DecisionTreeClassifier, plot_tree
277
+ decision_tree=DecisionTreeClassifier(max_depth=20)
278
+
279
+ decision_tree.fit(XV_train,y_train)
280
+
281
+ dt_pred=decision_tree.predict(XV_test)
282
+
283
+ dt_accuracy=accuracy_score(y_test,dt_pred)
284
+ print(f"Decision Tree Accuracy with depth=20:", dt_accuracy)
285
+
286
+ print("\nDecision Tree Classification Report:")
287
+ print(classification_report(y_test, dt_pred))
288
+
289
+ ConfusionMatrixDisplay.from_predictions(y_test,dt_pred);
290
+
291
+ """# Logistic Regression"""
292
+
293
+ logistic_model = LogisticRegression(max_iter=100)
294
+
295
+ logistic_model.fit(XV_train, y_train)
296
+
297
+ y_pred_logistic = logistic_model.predict(XV_test)
298
+
299
+ accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
300
+ print("Logistic Regression Model:")
301
+ print(f"Accuracy: {accuracy_logistic}")
302
+
303
+ report_logistic = classification_report(y_test, y_pred_logistic)
304
+ print("Logistic Regression Classification Report:")
305
+ print(report_logistic)
306
+
307
+ ConfusionMatrixDisplay.from_predictions(y_test,y_pred_logistic);
308
+
309
+ """# Naive Bayes"""
310
+
311
+ nb_classifier = MultinomialNB()
312
+
313
+ nb_classifier.fit(XV_train, y_train)
314
+
315
+ y_pred = nb_classifier.predict(XV_test)
316
+
317
+ accuracy = accuracy_score(y_test, y_pred)
318
+ print("Naive Bayes Model:")
319
+ print("Accuracy:", accuracy)
320
+
321
+ report_naive_bayes = classification_report(y_test, y_pred)
322
+ print("Naive Bayes Classification Report:")
323
+ print(report_naive_bayes)
324
+
325
+ ConfusionMatrixDisplay.from_predictions(y_test,dt_pred);
326
+
327
+ """# K Nearest Neightbors (KNN)"""
328
+
329
+ from sklearn.neighbors import KNeighborsClassifier
330
+ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
331
+
332
+ # K-Nearest Neighbors (KNN)
333
+ knn_classifier = KNeighborsClassifier()
334
+ knn_classifier.fit(XV_train, y_train)
335
+ y_pred_knn = knn_classifier.predict(XV_test)
336
+ accuracy_knn = accuracy_score(y_test, y_pred_knn)
337
+ print("K-Nearest Neighbors (KNN) Model:")
338
+ print("Accuracy:", accuracy_knn)
339
+ report_knn = classification_report(y_test, y_pred_knn)
340
+ print("K-Nearest Neighbors (KNN) Classification Report:")
341
+ print(report_knn)
342
+ # If you want to display confusion matrix for KNN, you can use:
343
+ ConfusionMatrixDisplay.from_predictions(y_test, y_pred_knn)
344
+
345
+ """# Test"""
346
+
347
+ def output_lable(n):
348
+ if n == 0:
349
+ return "The Text Sentement is Negative"
350
+ elif n == 1:
351
+ return "The Text Sentement is Neutral"
352
+ elif n == 2:
353
+ return "The Text Sentement is Positive"
354
+
355
def manual_testing(news):
    """Classify one piece of text with the trained logistic-regression model
    and print the human-readable sentiment label.

    Relies on the module-level `wp`, `vectorization`, `logistic_model` and
    `output_lable` objects. Returns None (output is printed only), exactly
    like the original's `return print(...)`.
    """
    # Clean the text the same way the training data was cleaned.
    frame = pd.DataFrame({"text": [news]})
    frame["text"] = frame["text"].apply(wp)
    # Vectorize with the already-fitted TF-IDF vocabulary.
    new_xv_test = vectorization.transform(frame["text"])
    pred_lr = logistic_model.predict(new_xv_test)
    # Removed: an unused SVM prediction (`pred_svm`) the original computed and
    # discarded, and the `return print(...)` anti-pattern (print returns None).
    print(output_lable(pred_lr[0]))
365
+
366
# Prompt for a single piece of text and classify it with the trained model.
text = input("Enter Text to Classify ")
manual_testing(text)

# BUG FIX: the original had a bare `pip install gradio` line — that is
# notebook shell syntax and a SyntaxError in a plain .py file. Install gradio
# from a shell (`pip install gradio`) or via `%pip install gradio` in a
# notebook before running the Gradio section below.
# pip install gradio
370
+
371
+ import gradio as gr
372
+ import matplotlib.pyplot as plt
373
+ import seaborn as sns
374
+
375
# Function to classify sentiment
def classify_sentiment(text):
    """Gradio handler: classify *text* and plot the class probabilities.

    Returns a (label, image_path) pair matching the interface's two outputs.
    Relies on the module-level `wp`, `vectorization`, `logistic_model` and
    `output_lable` objects, plus seaborn/matplotlib for the plot.
    """
    # Preprocess and vectorize exactly like the training pipeline.
    processed_text = wp(text)
    vectorized_text = vectorization.transform([processed_text])
    # Predict sentiment using the logistic regression model.
    prediction = logistic_model.predict(vectorized_text)[0]
    # BUG FIX: the original called `output_label`, which does not exist — the
    # helper defined earlier is spelled `output_lable`, so every request
    # raised NameError at runtime.
    sentiment_label = output_lable(prediction)
    # Per-class probabilities, in class-code order (0=Neg, 1=Neu, 2=Pos).
    probabilities = logistic_model.predict_proba(vectorized_text)[0]

    # Plot the probabilities and save to a file for the Gradio image output.
    plt.figure(figsize=(8, 6))
    sns.barplot(x=["Negative", "Neutral", "Positive"], y=probabilities)
    plt.xlabel("Sentiment")
    plt.ylabel("Probability")
    plt.title("Sentiment Probability Distribution")
    plt.ylim([0, 1])
    plt.tight_layout()
    plt.savefig("sentiment_probabilities.png")
    # Close the figure so repeated requests don't leak open figures.
    plt.close()

    return sentiment_label, "sentiment_probabilities.png"
399
+
400
# Input and output components for the interface:
# a multi-line textbox in, a predicted-label textbox and the saved
# probability-bar-chart image out (matching classify_sentiment's return pair).
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [
    gr.Textbox(label="Sentiment Prediction"),
    gr.Image(label="Sentiment Probability Distribution")
]

# Create the Gradio interface.
# launch() starts a local blocking web server hosting the classifier UI.
interface = gr.Interface(fn=classify_sentiment, inputs=inputs, outputs=outputs, title="Sentiment Classification", description="Enter a piece of text and analyze its sentiment.")
interface.launch()
410
+