Spaces:
Sleeping
Sleeping
File size: 3,412 Bytes
3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3cc8ce5 f65e883 3e7df15 3cc8ce5 f65e883 3cc8ce5 ca36277 3cc8ce5 eb0b05c 3cc8ce5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 | # -*- coding: utf-8 -*-
"""PRML_project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ
## Downloading & preparing the Dataset
"""
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Silence FutureWarning messages emitted by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load both CSV files and pool them into a single frame; the pooled data is
# split again into train/test portions later in this script.
# latin1 maps every byte to a character, so decoding cannot fail.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])

# Discard every row that has a missing value in any column.
df.dropna(inplace=True)

# Encode the sentiment label as integer category codes. pandas assigns codes
# in sorted category order, so labels negative/neutral/positive would become
# 0/1/2 -- NOTE(review): assumed label set, confirm against the CSV files.
df['sentiment'] = df['sentiment'].astype('category').cat.codes

# The model trained below reads only 'selected_text' and 'sentiment', so the
# identifier and metadata columns are removed directly; encoding them first
# would be wasted work because the encoded values are thrown away here.
df.drop(columns=['textID', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'], inplace=True)
def preprocess_text(text: str) -> str:
    """Normalise a piece of raw text before TF-IDF vectorisation.

    Steps, in order: lowercase, drop square-bracketed spans, drop URLs,
    drop HTML-like tags, turn every remaining non-word character into a
    space, drop underscores, and drop any token that contains a digit.

    URL and tag removal must happen before the non-word substitution:
    once characters such as ':', '/', '.', '<' and '>' have been replaced
    by spaces, the URL and tag patterns can no longer match anything.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    # Remove [bracketed] annotations.
    text = re.sub(r'\[.*?\]', '', text)
    # Remove URLs and HTML-like tags while their delimiters are still intact.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Replace every non-word character (punctuation, newlines, ...) by a space.
    text = re.sub(r'\W', ' ', text)
    # Underscore counts as a word character, so it is the one punctuation
    # mark that survives the previous step; strip it here.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove tokens that contain at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Clean the text column with the same routine that is applied at inference.
df['selected_text'] = df["selected_text"].apply(preprocess_text)

# Features are the cleaned text, targets are the encoded sentiment labels.
# 20% of the rows are held out for evaluation; the seed keeps the split fixed.
X, y = df['selected_text'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary on the training text only, then reuse it
# unchanged for the held-out text.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

# Logistic Regression: fit on the training matrix, predict the held-out rows.
logistic_model = LogisticRegression(max_iter=100).fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)

# Report overall accuracy first, then the per-class breakdown.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")

report_logistic = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(report_logistic)

# Confusion-matrix plot is disabled; ConfusionMatrixDisplay is not imported
# in this file and would have to be before re-enabling these lines.
#conf_matrix = confusion_matrix(y_test, y_pred_logistic)
#ConfusionMatrixDisplay(conf_matrix).plot()
import gradio as gr
def classify_sentiment(text):
    """Predict the sentiment of *text* and return it as a display label.

    The text is cleaned with ``preprocess_text``, vectorised with the fitted
    ``vectorization`` object and classified by ``logistic_model``.

    Returns:
        "Negative" for class 0, "Neutral" for class 1, otherwise "Positive".
    """
    features = vectorization.transform([preprocess_text(text)])
    label_code = logistic_model.predict(features)[0]
    if label_code == 0:
        return "Negative"
    return "Neutral" if label_code == 1 else "Positive"
# UI components: a ten-line text box for input and a single text box that
# shows the predicted label.
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [gr.Textbox(label="Sentiment Prediction")]

# Wire the components to classify_sentiment and start the app.
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()
|