Spaces:

b22ee075
/

Sentiment-classification

Runtime error

App Files Files Community

Sentiment-classification / app.py

b22ee075

Update app.py

4dd58b1 verified about 2 years ago

raw

history blame

8.19 kB

	# -*- coding: utf-8 -*-
	"""PRML_project.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ

	## Downloading & preparing the Dataset
	"""

	import pandas as pd
	import matplotlib.pyplot as plt
	import warnings
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
	import re
	import string
	from sklearn.linear_model import LogisticRegression
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.svm import SVC
	# Ignore FutureWarning messages
	warnings.simplefilter(action='ignore', category=FutureWarning)

	import os
	import sys
	from tempfile import NamedTemporaryFile
	from urllib.request import urlopen
	from urllib.parse import unquote, urlparse
	from urllib.error import HTTPError
	from zipfile import ZipFile
	import tarfile
	import shutil

	# Number of bytes requested per read() call while streaming a download.
	CHUNK_SIZE = 40960
	# Comma-separated list of "<directory>:<url-encoded download URL>" entries
	# consumed by the download loop below.
	# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240418T100202Z
	# and X-Goog-Expires=259200, so it has presumably stopped working; the
	# download loop reports such failures as "likely expired" and moves on.
	DATA_SOURCE_MAPPING = 'sentiment-analysis-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F989445%2F1808590%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240418%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240418T100202Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D37697dd0d9910676a3f12986b24306fc3726be4de82536c784ffb79deff0ba33d8973d6d612a53bcf9ed39bd7ad8a1d69bb34c42a34c7d6cffee6dd3048a9ef68f047745664f48ea6f3773a1f263129a6f78d48923235cc363b4081daadea014b0958575bf8376d565858404a8b1be7e5f317bdd9f5823ce4777f0b7052445c648bcda039294c804978828087705abe4416a6f9a0e0743388667017128a5ab2ef5ab2dade0d40d1659f4313296501907b4baec3161131e151e6f5b982eee9a6f7eb1b022da9c874f216d7fac981dc1351e9001ee56d03d1da8b2e0d4c97320f18d7e9b00ec63f4ba7444d81595cc8edff2b05f13aef4b204dd2710d0fddf0ef9'

	# Directory the downloaded archives are extracted under, and the CSVs read from.
	KAGGLE_INPUT_PATH='/kaggle/input'
	# Scratch/output directory created alongside the input directory.
	KAGGLE_WORKING_PATH='/kaggle/working'
	# Not referenced anywhere else in the visible code.
	KAGGLE_SYMLINK='kaggle'

	import subprocess

	# Rebuild the Kaggle-style directory layout from scratch. Any previous mount
	# on the input path is detached first; umount's stderr is silenced and its
	# exit status is not checked, so "nothing was mounted" passes quietly.
	subprocess.run(["umount", "/kaggle/input/"], stderr=subprocess.DEVNULL)
	shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
	os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
	os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

	# Expose the two directories as ../input and ../working. A link left over
	# from an earlier run is fine, so FileExistsError is deliberately ignored.
	for link_target, link_name in (
	    (KAGGLE_INPUT_PATH, 'input'),
	    (KAGGLE_WORKING_PATH, 'working'),
	):
	    try:
	        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
	    except FileExistsError:
	        pass

	# Download and unpack every data source listed in DATA_SOURCE_MAPPING.
	# Each entry has the form "<directory>:<url-encoded download URL>"; the
	# archive is extracted into KAGGLE_INPUT_PATH/<directory>. A failed download
	# is reported and skipped so the remaining sources are still attempted.
	for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
	    directory, download_url_encoded = data_source_mapping.split(':')
	    download_url = unquote(download_url_encoded)
	    filename = urlparse(download_url).path
	    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
	    try:
	        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
	            # The content-length header may be missing; then there is no
	            # total to draw a progress bar against, only a running count.
	            content_length = fileres.headers['content-length']
	            total_length = int(content_length) if content_length else None
	            size_text = content_length if content_length is not None else 'unknown'
	            print(f'Downloading {directory}, {size_text} bytes compressed')
	            dl = 0
	            data = fileres.read(CHUNK_SIZE)
	            while len(data) > 0:
	                dl += len(data)
	                tfile.write(data)
	                if total_length:
	                    done = int(50 * dl / total_length)
	                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
	                else:
	                    sys.stdout.write(f"\r{dl} bytes downloaded")
	                sys.stdout.flush()
	                data = fileres.read(CHUNK_SIZE)
	            # tarfile reopens the file by name, so buffered bytes must be on
	            # disk before extraction starts.
	            tfile.flush()
	            if filename.endswith('.zip'):
	                with ZipFile(tfile) as zfile:
	                    zfile.extractall(destination_path)
	            else:
	                # Bound to a distinct name: reusing "tarfile" here would
	                # replace the module and break the next tar archive.
	                with tarfile.open(tfile.name) as tar_archive:
	                    tar_archive.extractall(destination_path)
	            print(f'\nDownloaded and uncompressed: {directory}')
	    except HTTPError:
	        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
	        continue
	    except OSError:
	        print(f'Failed to load {download_url} to path {destination_path}')
	        continue

	print('Data source import complete.')

	import numpy as np # linear algebra
	import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


	import os
	# List every file found under the Kaggle input directory.
	for dirname, _, filenames in os.walk('/kaggle/input'):
	    for filename in filenames:
	        print(os.path.join(dirname, filename))

	# Read both CSV files (decoded as latin1) and stack them into one frame.
	d = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
	f = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')
	df = pd.concat([d, f])

	print(df.shape)
	# display() only exists inside IPython/Colab and raises NameError in a plain
	# script. df.info() prints its own summary, and print() shows the frame.
	df.info()
	print(df)

	"""## Preprocessing the dataset"""

	# Discard every row that has at least one missing value.
	df.dropna(inplace=True)

	# Bar chart of the class shares before the labels are encoded.
	df['sentiment'].value_counts(normalize=True).plot(kind='bar')

	# Replace the sentiment labels with their integer category codes, then chart
	# the shares again on the encoded column.
	df['sentiment'] = df['sentiment'].astype('category').cat.codes
	df['sentiment'].value_counts(normalize=True).plot(kind='bar')

	df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
	# Convert Country column to categorical variable
	df['Country'] = df['Country'].astype('category').cat.codes
	# convert Age of User to integer
	df['Age of User'] = df['Age of User'].replace(
	    {'0-20': 18, '21-30': 25, '31-45': 38, '46-60': 53, '60-70': 65, '70-100': 80}
	)

	df.info()

	# Remove the metadata columns. DataFrame.drop returns a new frame unless
	# inplace=True is given, so the original bare call left df unchanged.
	df.drop(
	    columns=[
	        'textID',
	        'Time of Tweet',
	        'Age of User',
	        'Country',
	        'Population -2020',
	        'Land Area (Km²)',
	        'Density (P/Km²)',
	    ],
	    inplace=True,
	)

	def wp(text):
	    """Normalise raw text before it is vectorised.

	    Steps, in order: lower-case the text; remove ``[...]`` spans; remove
	    URLs (``http(s)://...`` or ``www. ...``); remove ``<...>`` tags; turn
	    every remaining non-word character (newlines included) into a space;
	    strip underscores, the only punctuation ``\\W`` leaves behind; remove
	    whole tokens that contain a digit.

	    Args:
	        text: The raw input string.

	    Returns:
	        The cleaned string. Whitespace is not collapsed, so removed items
	        can leave runs of spaces behind.
	    """
	    text = text.lower()
	    text = re.sub(r'\[.*?\]', '', text)
	    # URLs and tags have to go before the \W pass: once ':', '/', '<' and '>'
	    # have been turned into spaces these patterns can no longer match.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text)
	    text = re.sub(r'<.*?>+', '', text)
	    text = re.sub(r'\W', ' ', text)
	    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
	    # Drop the whole digit-bearing token rather than a 3-character slice of it.
	    text = re.sub(r'\w*\d\w*', '', text)
	    return text

	# Run every selected_text entry through wp() and store the cleaned result
	# back in the same column.
	cleaned_text = df["selected_text"].apply(wp)
	df['selected_text'] = cleaned_text

	"""## Training and testing split """

	# Features are the cleaned text; the target is the sentiment column.
	X = df['selected_text']
	y = df['sentiment']

	# 80/20 split with a fixed seed, followed by the shape of each part.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, random_state=42
	)
	for split_part in (X_train, X_test, y_train, y_test):
	    print(split_part.shape)

	# The vectoriser is fitted on the training text only; the test text is
	# transformed with that fitted vocabulary.
	vectorization = TfidfVectorizer()
	XV_train = vectorization.fit_transform(X_train)
	XV_test = vectorization.transform(X_test)

	"""# Logistic Regression"""

	# Fit the classifier on the vectorised training text (fit() returns the
	# estimator itself) and predict the held-out split.
	logistic_model = LogisticRegression(max_iter=100).fit(XV_train, y_train)
	y_pred_logistic = logistic_model.predict(XV_test)

	# Compute both metrics first, then print them in the original order.
	accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
	report_logistic = classification_report(y_test, y_pred_logistic)

	print("Logistic Regression Model:")
	print(f"Accuracy: {accuracy_logistic}")
	print("Logistic Regression Classification Report:")
	print(report_logistic)

	# Confusion matrix of true vs. predicted labels.
	ConfusionMatrixDisplay.from_predictions(y_test, y_pred_logistic)

	# NOTE: the notebook ran "pip install gradio" at this point. That is a shell
	# command, not Python; left in a module it is a SyntaxError that stops the
	# whole file from loading. Install gradio in the environment instead (for
	# example by listing it in requirements.txt) before starting the app.

	import gradio as gr
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Display names for the encoded sentiment classes, indexed by class code.
	# NOTE(review): assumes codes 0/1/2 correspond to negative/neutral/positive,
	# the same ordering the probability chart's x-axis already relies on.
	_SENTIMENT_LABELS = ("Negative", "Neutral", "Positive")


	# Function to classify sentiment
	def classify_sentiment(text):
	    """Predict the sentiment of ``text`` and chart the class probabilities.

	    The text is cleaned with ``wp``, vectorised with the fitted
	    ``vectorization`` object and scored by ``logistic_model``.

	    Args:
	        text: Raw text entered by the user.

	    Returns:
	        A ``(label, image_path)`` tuple: the predicted sentiment name and
	        the path of the PNG holding the probability bar chart.

	    Side effects:
	        Overwrites ``sentiment_probabilities.png`` in the current working
	        directory on every call.
	    """
	    # Preprocess and vectorize the text
	    processed_text = wp(text)
	    vectorized_text = vectorization.transform([processed_text])

	    # Predicted class code and the probability of each class
	    prediction = logistic_model.predict(vectorized_text)[0]
	    probabilities = logistic_model.predict_proba(vectorized_text)[0]

	    # The original called output_label(), which is not defined anywhere in
	    # the visible file; the label is looked up from the class code instead.
	    sentiment_label = _SENTIMENT_LABELS[int(prediction)]

	    # Draw on an explicit figure and always close it: pyplot keeps every
	    # figure alive until closed, so one per request would pile up in memory.
	    fig, ax = plt.subplots(figsize=(8, 6))
	    try:
	        sns.barplot(x=list(_SENTIMENT_LABELS), y=probabilities, ax=ax)
	        ax.set_xlabel("Sentiment")
	        ax.set_ylabel("Probability")
	        ax.set_title("Sentiment Probability Distribution")
	        ax.set_ylim([0, 1])
	        fig.tight_layout()
	        fig.savefig("sentiment_probabilities.png")
	    finally:
	        plt.close(fig)

	    return sentiment_label, "sentiment_probabilities.png"

	# UI components: one multi-line text box for input; a text box and an image
	# for the two values returned by classify_sentiment.
	inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
	prediction_box = gr.Textbox(label="Sentiment Prediction")
	probability_image = gr.Image(label="Sentiment Probability Distribution")
	outputs = [prediction_box, probability_image]

	# Wire the components to the classifier and start the app.
	interface = gr.Interface(
	    fn=classify_sentiment,
	    inputs=inputs,
	    outputs=outputs,
	    title="Sentiment Classification",
	    description="Enter a piece of text and analyze its sentiment.",
	)
	interface.launch()