# Source: Hugging Face Spaces "app.py" (revision 7df8d0b, verified) — the
# following six lines of file-viewer chrome were replaced by this comment
# so the file parses as plain Python.
# -*- coding: utf-8 -*-
"""PRML_project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ
## Downloading & preparing the Dataset
"""
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
# Silence FutureWarning noise from pandas / scikit-learn.
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print every file shipped with the Kaggle input dataset.
# (Harmless no-op when /kaggle/input does not exist, e.g. on Spaces.)
for dirpath, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(dirpath, name))
# Load both CSV splits (latin-1 copes with the non-UTF-8 bytes in the tweets)
# and stack them into one frame for joint preprocessing.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])
print(df.shape)
# BUG FIX: `display` only exists inside IPython/Colab and raises NameError in
# a plain script — use print instead.
print(df.info())
print(df)
"""## Preprocessing the dataset"""
df.dropna(inplace=True)
df['sentiment'].value_counts(normalize=True).plot(kind='bar');
# Encode sentiment as integer category codes (assigned alphabetically by
# pandas: negative=0, neutral=1, positive=2 — matches the labels used in the
# Gradio app below; confirm against the dataset).
df['sentiment'] = df['sentiment'].astype('category').cat.codes
df['sentiment'].value_counts(normalize=True).plot(kind='bar');
df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
# Convert Country column to categorical variable
df['Country'] = df['Country'].astype('category').cat.codes
# Convert 'Age of User' ranges to representative integer midpoints.
df['Age of User']=df['Age of User'].replace({'0-20':18,'21-30':25,'31-45':38,'46-60':53,'60-70':65,'70-100':80})
df.info()
# BUG FIX: drop() is not in-place — the original discarded the result, so the
# unused metadata columns were silently kept. Assign it back.
df = df.drop(columns=['textID','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])
def wp(text):
    """Normalize raw tweet text for TF-IDF vectorization.

    Lower-cases the text, then strips bracketed spans, URLs, HTML tags,
    punctuation, newlines and digit-bearing words, and finally replaces
    any remaining non-word character with a space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text (may contain runs of spaces where content was removed).
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # BUG FIX: URL/tag/newline removal must run BEFORE the \W pass below —
    # in the original order \W had already turned ':', '/', '.', '<', '>'
    # into spaces, so these patterns could never match (dead code).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Collapse any remaining non-word character into a space (kept last).
    text = re.sub(r'\W', ' ', text)
    return text
# Clean every tweet with the same preprocessing used at inference time.
df['selected_text'] = df["selected_text"].apply(wp)
"""## Training and testing split """
X = df['selected_text']
y = df['sentiment']
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
# Learn the TF-IDF vocabulary on the training split only, then project
# both splits into that feature space.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)
"""# Logistic Regression"""
logistic_model = LogisticRegression(max_iter=100)
logistic_model.fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")
report_logistic = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(report_logistic)
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_logistic);
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
# Function to classify sentiment
def classify_sentiment(text):
    """Predict the sentiment of *text* and plot the class probabilities.

    Parameters
    ----------
    text : str
        Raw text entered in the Gradio textbox.

    Returns
    -------
    tuple[str, str]
        The predicted sentiment label and the path of the saved
        probability bar-chart image.
    """
    # Apply the same cleaning used at training time.
    processed_text = wp(text)
    # Project into the fitted TF-IDF feature space.
    vectorized_text = vectorization.transform([processed_text])
    # Predict the encoded class (an integer category code).
    prediction = logistic_model.predict(vectorized_text)[0]
    # BUG FIX: the original called `output_label`, which is defined nowhere
    # in this file and raised NameError on every request. Map the category
    # codes directly (pandas assigns codes alphabetically, so presumably
    # negative=0, neutral=1, positive=2 — verify against the dataset).
    labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    sentiment_label = labels.get(int(prediction), str(prediction))
    # Probability of each class, in category-code order.
    probabilities = logistic_model.predict_proba(vectorized_text)[0]
    # Plot probabilities
    plt.figure(figsize=(8, 6))
    sns.barplot(x=["Negative", "Neutral", "Positive"], y=probabilities)
    plt.xlabel("Sentiment")
    plt.ylabel("Probability")
    plt.title("Sentiment Probability Distribution")
    plt.ylim([0, 1])
    plt.tight_layout()
    plt.savefig("sentiment_probabilities.png")
    plt.close()  # free the figure so repeated requests don't leak memory
    return sentiment_label, "sentiment_probabilities.png"
# Input and output components for the interface
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [
gr.Textbox(label="Sentiment Prediction"),
gr.Image(label="Sentiment Probability Distribution")
]
# Create the Gradio interface
interface = gr.Interface(fn=classify_sentiment, inputs=inputs, outputs=outputs, title="Sentiment Classification", description="Enter a piece of text and analyze its sentiment.")
interface.launch()