Spaces:

b22ee075
/

Sentiment-classification

Sleeping

File size: 3,412 Bytes

3cc8ce5
 
 
 
 
 
 
 
 
 
 
 
 
 
f65e883
3cc8ce5
 
 
 
f65e883
3cc8ce5
 
 
f65e883
 
 
 
3cc8ce5
f65e883
3cc8ce5
 
 
 
 
f65e883
3cc8ce5
f65e883
3cc8ce5
f65e883
3cc8ce5
 
f65e883
3cc8ce5
 
 
 
 
 
 
f65e883
3cc8ce5
f65e883
 
 
3cc8ce5
f65e883
3cc8ce5
 
 
 
 
f65e883
3cc8ce5
 
 
 
 
 
 
 
 
 
 
 
 
f65e883
 
3cc8ce5
f65e883
3e7df15
3cc8ce5
 
f65e883
3cc8ce5
 
ca36277
 
 
 
 
 
3cc8ce5
 
 
 
eb0b05c
3cc8ce5

# -*- coding: utf-8 -*-
"""PRML_project.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ

## Downloading & preparing the Dataset
"""
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load both CSV exports and stack them into one frame. The files are read as
# latin1 rather than pandas' default UTF-8.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])

# Drop every row that has a missing value in any column. This runs before the
# metadata columns are removed, so a gap in any of them also discards the row.
df.dropna(inplace=True)

# Encode the target as integer category codes; pandas numbers the codes in
# sorted category order.
# NOTE(review): classify_sentiment assumes 0/1/2 == negative/neutral/positive
# -- confirm the label strings in the CSVs sort that way.
df['sentiment'] = df['sentiment'].astype('category').cat.codes

# Only the text and the target are used downstream, so the metadata columns
# are dropped as-is. Earlier code encoded 'Time of Tweet', 'Country' and
# 'Age of User' immediately before dropping them; that work had no effect on
# the resulting frame and has been removed.
df.drop(
    columns=[
        'textID',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    inplace=True,
)

# Patterns are compiled once at import time: preprocess_text runs on every
# training row and again on every prediction request.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_NON_WORD_RE = re.compile(r'\W')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')


def preprocess_text(text):
    """Normalise a piece of text before TF-IDF vectorisation.

    Steps, in order: lower-case; remove ``[...]`` spans; remove URLs; remove
    HTML-style tags; replace every non-word character with a space; remove
    remaining punctuation; remove any token that contains a digit.

    Whitespace is not collapsed, so the result may contain runs of spaces or
    leading/trailing spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    # URLs and tags must be stripped *before* non-word characters are blanked
    # out: once '://', '.', '<' and '>' have become spaces these patterns can
    # no longer match and the URL/tag fragments would leak into the features.
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # Every non-word character (punctuation, whitespace, newlines) becomes a
    # single space, so no separate newline-removal step is needed afterwards.
    text = _NON_WORD_RE.sub(' ', text)
    # '_' counts as a word character, so it is the one punctuation mark that
    # survives the previous step; this pass removes it.
    text = _PUNCTUATION_RE.sub('', text)
    text = _DIGIT_TOKEN_RE.sub('', text)
    return text

# Clean every text fragment with the same normaliser that is used at
# prediction time.
df['selected_text'] = df['selected_text'].map(preprocess_text)

# Features are the cleaned text, the target is the encoded sentiment.
X = df['selected_text']
y = df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary is learned from the training split only and then
# reused to transform the held-out split.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

# Logistic Regression (fit() returns the fitted estimator itself)
logistic_model = LogisticRegression(max_iter=100).fit(XV_train, y_train)

y_pred_logistic = logistic_model.predict(XV_test)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(
    "Logistic Regression Model:",
    f"Accuracy: {accuracy_logistic}",
    sep="\n",
)

report_logistic = classification_report(y_test, y_pred_logistic)
print(
    "Logistic Regression Classification Report:",
    report_logistic,
    sep="\n",
)

# Confusion-matrix plotting is left disabled: ConfusionMatrixDisplay is not
# imported in this file.
#conf_matrix = confusion_matrix(y_test, y_pred_logistic)
#ConfusionMatrixDisplay(conf_matrix).plot()
import gradio as gr
# Function to classify sentiment
def classify_sentiment(text):
    """Predict a sentiment label for *text*.

    The text is cleaned with ``preprocess_text``, transformed by the fitted
    module-level ``vectorization`` object and classified by the module-level
    ``logistic_model``.

    Args:
        text: The raw text to classify.

    Returns:
        ``"Negative"`` when the predicted class is 0, ``"Neutral"`` when it
        is 1, and ``"Positive"`` for any other class.
    """
    # The vectoriser expects an iterable of documents, hence the
    # single-item list.
    features = vectorization.transform([preprocess_text(text)])
    predicted_class = logistic_model.predict(features)[0]

    if predicted_class == 0:
        return "Negative"
    if predicted_class == 1:
        return "Neutral"
    return "Positive"

# UI widgets: a ten-line text box for the input and a single text box that
# shows the predicted label.
inputs = gr.Textbox(label="Enter the text you want to analyze:", lines=10)
outputs = [gr.Textbox(label="Sentiment Prediction")]

# Wire the classifier into a Gradio app and start it.
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()