Spaces:

b22ee075
/

Sentiment-classification

Sleeping

App Files Files Community

Sentiment-classification / app.py

b22ee075

Update app.py

ca36277 verified almost 2 years ago

raw

history blame contribute delete

3.41 kB

	# -*- coding: utf-8 -*-
	"""PRML_project.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ

	## Downloading & preparing the Dataset
	"""
	import pandas as pd
	import matplotlib.pyplot as plt
	import warnings
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
	import re
	import string
	from sklearn.linear_model import LogisticRegression
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Ignore FutureWarning messages
	warnings.simplefilter(action='ignore', category=FutureWarning)

	# Load data: both CSV files are read as latin1 and stacked into one frame.
	d = pd.read_csv('train.csv', encoding='latin1')
	f = pd.read_csv('test.csv', encoding='latin1')
	df = pd.concat([d, f])

	# Preprocessing the dataset
	# Drop every row that has a missing value in any column. This runs before
	# the metadata columns are removed, so a gap in any of them also discards
	# the row.
	df.dropna(inplace=True)

	# Replace the sentiment labels with integer category codes; these codes are
	# the training target, and classify_sentiment() reads 0 as Negative,
	# 1 as Neutral and anything else as Positive.
	df['sentiment'] = df['sentiment'].astype('category').cat.codes

	# The tweet metadata is not used as a model feature. The columns are dropped
	# directly; they were previously re-encoded first, which was wasted work
	# because the encoded values were discarded on this very line.
	df.drop(
	    columns=[
	        'textID',
	        'Time of Tweet',
	        'Age of User',
	        'Country',
	        'Population -2020',
	        'Land Area (Km²)',
	        'Density (P/Km²)',
	    ],
	    inplace=True,
	)

	def preprocess_text(text):
	    """Normalise a piece of raw text before it is vectorised.

	    Steps, in order:
	      1. lower-case the text;
	      2. remove square-bracketed spans such as ``[link]``;
	      3. remove URLs (``http://...``, ``https://...`` or ``www....``);
	      4. remove HTML/XML tags;
	      5. replace every remaining non-word character with a space;
	      6. remove leftover punctuation (after step 5 only ``_`` can remain);
	      7. remove every token that contains a digit.

	    Whitespace is not collapsed, so removed spans can leave consecutive
	    spaces behind.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned, lower-cased string.
	    """
	    text = text.lower()
	    text = re.sub(r'\[.*?\]', '', text)
	    # URLs and tags must be stripped while their punctuation (':', '/', '.',
	    # '<', '>') is still present, i.e. before the \W replacement below;
	    # otherwise these patterns can never match.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text)
	    text = re.sub(r'<.*?>+', '', text)
	    # Newlines are non-word characters too, so they become spaces here and
	    # no separate newline removal is needed.
	    text = re.sub(r'\W', ' ', text)
	    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
	    # Drop whole tokens containing a digit ("2", "abc123def"), not just a
	    # three-character window around the digit.
	    text = re.sub(r'\w*\d\w*', '', text)
	    return text

	# Clean every text snippet in place so the vectoriser sees normalised input.
	df['selected_text'] = df['selected_text'].apply(preprocess_text)

	# Training and testing split
	# Features are the cleaned snippets; targets are the integer sentiment codes.
	X, y = df['selected_text'], df['sentiment']

	# Hold out 20% of the rows for evaluation, with a fixed random_state.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	)

	# Fit the TF-IDF vectoriser on the training split only, then apply the same
	# fitted vectoriser to the held-out split.
	vectorization = TfidfVectorizer()
	XV_train = vectorization.fit_transform(X_train)
	XV_test = vectorization.transform(X_test)

	# Logistic Regression
	logistic_model = LogisticRegression(max_iter=100)
	logistic_model.fit(XV_train, y_train)

	# Score the held-out split and print the metrics.
	y_pred_logistic = logistic_model.predict(XV_test)
	accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
	print("Logistic Regression Model:")
	print(f"Accuracy: {accuracy_logistic}")

	report_logistic = classification_report(y_test, y_pred_logistic)
	print("Logistic Regression Classification Report:")
	print(report_logistic)

	# Confusion Matrix
	#conf_matrix = confusion_matrix(y_test, y_pred_logistic)

	#ConfusionMatrixDisplay(conf_matrix).plot()
	import gradio as gr
	# Function to classify sentiment
	def classify_sentiment(text):
	    """Predict the sentiment label for a piece of free text.

	    The text is cleaned with preprocess_text(), transformed by the
	    module-level ``vectorization`` object, and classified by the
	    module-level ``logistic_model``.

	    Args:
	        text: The raw text to analyse.

	    Returns:
	        "Negative" when the predicted code is 0, "Neutral" when it is 1,
	        and "Positive" for any other code.
	    """
	    cleaned = preprocess_text(text)
	    # The vectoriser expects an iterable of documents, hence the
	    # single-item list.
	    features = vectorization.transform([cleaned])
	    prediction = logistic_model.predict(features)[0]

	    if prediction == 0:
	        return "Negative"
	    if prediction == 1:
	        return "Neutral"
	    return "Positive"

	# Input and output components for the interface:
	# a ten-line text box for the input and a single text box for the label.
	inputs = gr.Textbox(label="Enter the text you want to analyze:", lines=10)
	outputs = [gr.Textbox(label="Sentiment Prediction")]

	# Create the Gradio interface around classify_sentiment and start it.
	interface = gr.Interface(
	    fn=classify_sentiment,
	    inputs=inputs,
	    outputs=outputs,
	    title="Sentiment Classification",
	    description="Enter a piece of text and analyze its sentiment.",
	)
	interface.launch()