# -*- coding: utf-8 -*-
"""PRML_project.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ

## Downloading & preparing the Dataset
"""
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Silence FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load both CSV files (read as latin1) and stack them into a single frame.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])

# Drop every row that has a missing value in any column.
# NOTE(review): concat fills columns that exist in only one of the two files
# with NaN, so if train.csv and test.csv do not share identical columns this
# removes all rows coming from the narrower file -- confirm that is intended.
df.dropna(inplace=True)

# Encode the sentiment labels as integer category codes; pandas assigns the
# codes following the sorted order of the distinct labels.
df['sentiment'] = df['sentiment'].astype('category').cat.codes

# The identifier and metadata columns are not used as model inputs, so they
# are dropped directly. (They were previously label-encoded right before being
# dropped, which was dead work with no effect on the resulting frame.)
df.drop(
    columns=[
        'textID',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    inplace=True,
)

def preprocess_text(text):
    """Normalise a raw text string before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as ``[note]``;
      3. remove URLs (``http://``, ``https://`` or ``www.`` prefixed);
      4. remove HTML-like tags (``<...>``);
      5. replace every remaining non-word character with a space;
      6. remove underscores (the only punctuation ``\\W`` does not cover);
      7. remove every token that contains a digit.

    URL and tag removal must run *before* step 5: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been turned into spaces those patterns can no longer
    match, which previously left URL and tag fragments in the output.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans may
        leave consecutive or trailing spaces behind.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Strip URLs and tags while their delimiters are still present.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines and all punctuation except "_" are non-word characters, so
    # this single substitution also covers them.
    text = re.sub(r'\W', ' ', text)
    text = text.replace('_', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Clean the text column that is used as the model input.
df['selected_text'] = df['selected_text'].apply(preprocess_text)

# Features and labels, followed by an 80/20 hold-out split with a fixed seed.
X, y = df['selected_text'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary is fitted on the training texts only; the test texts
# are merely transformed with that fitted vocabulary.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

# Logistic Regression: fit on the training matrix, predict on the test matrix.
logistic_model = LogisticRegression(max_iter=100)
logistic_model.fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)

# Report overall accuracy.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(
    "Logistic Regression Model:",
    f"Accuracy: {accuracy_logistic}",
    sep="\n",
)

# Report per-class precision / recall / F1.
report_logistic = classification_report(y_test, y_pred_logistic)
print(
    "Logistic Regression Classification Report:",
    report_logistic,
    sep="\n",
)

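# Confusion matrix (disabled). Re-enabling the lines below also requires
# importing ConfusionMatrixDisplay from sklearn.metrics.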
#conf_matrix = confusion_matrix(y_test, y_pred_logistic)

#ConfusionMatrixDisplay(conf_matrix).plot()
import gradio as gr
# Function to classify sentiment
def classify_sentiment(text):
    """Predict a sentiment label for a piece of raw text.

    The text is cleaned with ``preprocess_text``, vectorised with the
    module-level ``vectorization`` object and classified by the module-level
    ``logistic_model``.

    Args:
        text: Raw input string to classify.

    Returns:
        ``"Negative"`` when the predicted class is 0, ``"Neutral"`` when it
        is 1, and ``"Positive"`` for any other predicted class.
    """
    cleaned = preprocess_text(text)
    features = vectorization.transform([cleaned])
    # predict() returns one entry per input row; only one row was passed.
    predicted_class = logistic_model.predict(features)[0]

    if predicted_class == 0:
        return "Negative"
    return "Neutral" if predicted_class == 1 else "Positive"

# UI components: a ten-line text box for the input and a single text box
# that displays the predicted label.
inputs = gr.Textbox(
    lines=10,
    label="Enter the text you want to analyze:",
)
outputs = [gr.Textbox(label="Sentiment Prediction")]

# Wrap classify_sentiment in a Gradio interface and launch it.
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()