Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """PRML_project.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ | |
| ## Downloading & preparing the Dataset | |
| """ | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import warnings | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
| import re | |
| import string | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
# Ignore FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load both CSV files and stack them into a single frame.
d = pd.read_csv('train.csv', encoding='latin1')
f = pd.read_csv('test.csv', encoding='latin1')
df = pd.concat([d, f])

# Preprocessing the dataset
# dropna runs before the metadata columns are removed, so a missing value in
# ANY column (including the ones dropped below) discards the whole row.
df.dropna(inplace=True)

# Encode the sentiment labels as integer category codes.
# NOTE(review): classify_sentiment maps 0/1/2 to Negative/Neutral/Positive,
# which presumably relies on the labels sorting as negative < neutral <
# positive -- confirm against the CSV contents.
df['sentiment'] = df['sentiment'].astype('category').cat.codes

# Only the text and sentiment columns are used by the model.  The metadata
# columns are dropped without being encoded first: any encoding applied to
# them would be thrown away by this drop.
df.drop(
    columns=[
        'textID',
        'Time of Tweet',
        'Age of User',
        'Country',
        'Population -2020',
        'Land Area (Km²)',
        'Density (P/Km²)',
    ],
    inplace=True,
)
# Compiled once: preprocess_text runs on every training row and on every
# prediction request.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_NON_WORD_RE = re.compile(r'\W')
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')


def preprocess_text(text):
    """Normalize a piece of text before vectorization.

    Steps, in order:
      1. lower-case the text;
      2. remove ``[...]`` bracketed spans;
      3. remove URLs (``http://``, ``https://`` or ``www.`` prefixes);
      4. remove ``<...>`` spans (HTML-style tags);
      5. replace every remaining non-word character with a space;
      6. remove underscores;
      7. remove every token that contains a digit.

    URL and tag removal must run before step 5: once punctuation has been
    turned into spaces, the ``://``, ``.``, ``<`` and ``>`` characters those
    patterns rely on no longer exist and the patterns can never match.

    Whitespace is not collapsed, so removed spans may leave repeated spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    # After the previous step the only ASCII punctuation character that can
    # still be present is "_" (it counts as a word character), and newlines
    # have already become spaces.
    text = text.replace('_', '')
    return _DIGIT_TOKEN_RE.sub('', text)
# Clean the text column in place with preprocess_text.
df['selected_text'] = df['selected_text'].apply(preprocess_text)

# Training and testing split: 80/20 hold-out with a fixed seed.
X = df['selected_text']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary is fitted on the training split only; the test split
# is transformed with that fitted vocabulary.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

# Logistic Regression
logistic_model = LogisticRegression(max_iter=100).fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)

# Report hold-out accuracy followed by the per-class report.
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")

report_logistic = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Classification Report:")
print(report_logistic)

# Confusion Matrix (disabled).  ConfusionMatrixDisplay is not imported by
# this file and would have to be imported before re-enabling these lines.
#conf_matrix = confusion_matrix(y_test, y_pred_logistic)
#ConfusionMatrixDisplay(conf_matrix).plot()
| import gradio as gr | |
| # Function to classify sentiment | |
def classify_sentiment(text):
    """Return the predicted sentiment label for ``text``.

    The text is cleaned with ``preprocess_text``, vectorized with the
    module-level ``vectorization`` object and classified by the module-level
    ``logistic_model``.

    Args:
        text: The raw text to classify.

    Returns:
        "Negative" when the predicted class is 0, "Neutral" when it is 1,
        and "Positive" for any other predicted class.
    """
    cleaned = preprocess_text(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorization.transform([cleaned])
    predicted_class = logistic_model.predict(features)[0]

    if predicted_class == 0:
        return "Negative"
    if predicted_class == 1:
        return "Neutral"
    return "Positive"
# Input and output components for the interface: a multi-line text box for
# the text to analyze and a single text box showing the predicted label.
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [gr.Textbox(label="Sentiment Prediction")]

# Create the Gradio interface around classify_sentiment and start it.
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()