# Source: Hugging Face Spaces "app.py" (revision 7df8d0b, verified) — the
# following six lines of file-viewer chrome were replaced by this comment
# so the file parses as plain Python.
# -*- coding: utf-8 -*-
"""PRML_project.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ
## Downloading & preparing the Dataset
"""
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
import re
import string
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
# Silence FutureWarning noise from pandas / scikit-learn.
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print every file shipped with the Kaggle input dataset.
# (Harmless no-op when /kaggle/input does not exist, e.g. on Spaces.)
for dirpath, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(dirpath, name))
# Load both CSV splits (latin-1 copes with the non-UTF-8 bytes in the tweets)
# and stack them into one frame for joint preprocessing.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])
print(df.shape)
# BUG FIX: `display` only exists inside IPython/Colab and raises NameError in
# a plain script — use print instead.
print(df.info())
print(df)
"""## Preprocessing the dataset"""
df.dropna(inplace=True)
df['sentiment'].value_counts(normalize=True).plot(kind='bar');
# Encode sentiment as integer category codes (assigned alphabetically by
# pandas: negative=0, neutral=1, positive=2 — matches the labels used in the
# Gradio app below; confirm against the dataset).
df['sentiment'] = df['sentiment'].astype('category').cat.codes
df['sentiment'].value_counts(normalize=True).plot(kind='bar');
df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
# Convert Country column to categorical variable
df['Country'] = df['Country'].astype('category').cat.codes
# Convert 'Age of User' ranges to representative integer midpoints.
df['Age of User']=df['Age of User'].replace({'0-20':18,'21-30':25,'31-45':38,'46-60':53,'60-70':65,'70-100':80})
df.info()
# BUG FIX: drop() is not in-place — the original discarded the result, so the
# unused metadata columns were silently kept. Assign it back.
df = df.drop(columns=['textID','Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])
def wp(text):
    """Normalize raw tweet text for TF-IDF vectorization.

    Lower-cases the text, then strips bracketed spans, URLs, HTML tags,
    punctuation, newlines and digit-bearing words, and finally replaces
    any remaining non-word character with a space.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text (may contain runs of spaces where content was removed).
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # BUG FIX: URL/tag/newline removal must run BEFORE the \W pass below —
    # in the original order \W had already turned ':', '/', '.', '<', '>'
    # into spaces, so these patterns could never match (dead code).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Collapse any remaining non-word character into a space (kept last).
    text = re.sub(r'\W', ' ', text)
    return text
# Clean every tweet with the same preprocessing used at inference time.
df['selected_text'] = df["selected_text"].apply(wp)
"""## Training and testing split """
X = df['selected_text']
y = df['sentiment']
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
# Learn the TF-IDF vocabulary on the training split only, then project
# both splits into that feature space.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)
"""# Logistic Regression"""
logistic_model = LogisticRegression(max_iter=100)
logistic_model.fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")
report_logistic = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(report_logistic)
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_logistic);
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
# Function to classify sentiment
def classify_sentiment(text):
    """Predict the sentiment of *text* and plot the class probabilities.

    Parameters
    ----------
    text : str
        Raw text entered in the Gradio textbox.

    Returns
    -------
    tuple[str, str]
        The predicted sentiment label and the path of the saved
        probability bar-chart image.
    """
    # Apply the same cleaning used at training time.
    processed_text = wp(text)
    # Project into the fitted TF-IDF feature space.
    vectorized_text = vectorization.transform([processed_text])
    # Predict the encoded class (an integer category code).
    prediction = logistic_model.predict(vectorized_text)[0]
    # BUG FIX: the original called `output_label`, which is defined nowhere
    # in this file and raised NameError on every request. Map the category
    # codes directly (pandas assigns codes alphabetically, so presumably
    # negative=0, neutral=1, positive=2 — verify against the dataset).
    labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    sentiment_label = labels.get(int(prediction), str(prediction))
    # Probability of each class, in category-code order.
    probabilities = logistic_model.predict_proba(vectorized_text)[0]
    # Plot probabilities
    plt.figure(figsize=(8, 6))
    sns.barplot(x=["Negative", "Neutral", "Positive"], y=probabilities)
    plt.xlabel("Sentiment")
    plt.ylabel("Probability")
    plt.title("Sentiment Probability Distribution")
    plt.ylim([0, 1])
    plt.tight_layout()
    plt.savefig("sentiment_probabilities.png")
    plt.close()  # free the figure so repeated requests don't leak memory
    return sentiment_label, "sentiment_probabilities.png"
# Input and output components for the interface
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [
gr.Textbox(label="Sentiment Prediction"),
gr.Image(label="Sentiment Probability Distribution")
]
# Create the Gradio interface
interface = gr.Interface(fn=classify_sentiment, inputs=inputs, outputs=outputs, title="Sentiment Classification", description="Enter a piece of text and analyze its sentiment.")
interface.launch()