# NOTE(review): removed stray notebook-output artifacts ("Spaces:", "Runtime error")
# that were pasted above the code — they are not Python and broke parsing.
| # -*- coding: utf-8 -*- | |
| """PRML_project.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1_9mr_G1Wt8bteyyMEFJYBImPcIteTcSQ | |
| ## Downloading & preparing the Dataset | |
| """ | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import warnings | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay | |
| import re | |
| import string | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.svm import SVC | |
| # Ignore FutureWarning messages | |
| warnings.simplefilter(action='ignore', category=FutureWarning) | |
| import os | |
| import sys | |
| from tempfile import NamedTemporaryFile | |
| from urllib.request import urlopen | |
| from urllib.parse import unquote, urlparse | |
| from urllib.error import HTTPError | |
| from zipfile import ZipFile | |
| import tarfile | |
| import shutil | |
# Bytes read per chunk from the download stream.
CHUNK_SIZE = 40960
# Comma-separated "<target dir>:<percent-encoded signed URL>" pairs.
DATA_SOURCE_MAPPING = 'sentiment-analysis-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F989445%2F1808590%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240418%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240418T100202Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D37697dd0d9910676a3f12986b24306fc3726be4de82536c784ffb79deff0ba33d8973d6d612a53bcf9ed39bd7ad8a1d69bb34c42a34c7d6cffee6dd3048a9ef68f047745664f48ea6f3773a1f263129a6f78d48923235cc363b4081daadea014b0958575bf8376d565858404a8b1be7e5f317bdd9f5823ce4777f0b7052445c648bcda039294c804978828087705abe4416a6f9a0e0743388667017128a5ab2ef5ab2dade0d40d1659f4313296501907b4baec3161131e151e6f5b982eee9a6f7eb1b022da9c874f216d7fac981dc1351e9001ee56d03d1da8b2e0d4c97320f18d7e9b00ec63f4ba7444d81595cc8edff2b05f13aef4b204dd2710d0fddf0ef9'

KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'
KAGGLE_SYMLINK = 'kaggle'

import subprocess

# Detach any stale mount, then start from an empty input tree.
subprocess.run(["umount", "/kaggle/input/"], stderr=subprocess.DEVNULL)
shutil.rmtree('/kaggle/input', ignore_errors=True)

for target_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(target_dir, 0o777, exist_ok=True)

# Mirror the Kaggle directory layout with ../input and ../working symlinks;
# an existing link is fine (re-runs of the same cell).
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'),
                               (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass
# Download each mapped data source into /kaggle/input with a progress bar,
# then extract it (zip or tar) into its target directory.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the FIRST ':' only — the URL half may contain further colons.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Ensure buffered bytes are on disk before re-reading the temp file.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # BUG FIX: the original bound the archive to the name 'tarfile',
                # shadowing the tarfile MODULE and breaking tarfile.open on any
                # later loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue
print('Data source import complete.')
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Print every file that landed under the Kaggle input tree.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for fname in filenames:
        print(os.path.join(dirname, fname))
# Load train and test splits (latin-1: the files are not valid UTF-8) and
# stack them into one frame — the notebook treats the union as the corpus.
d = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', encoding='latin1')
f = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', encoding='latin1')
df = pd.concat([d, f])
print(df.shape)
# BUG FIX: 'display' exists only inside IPython/Colab; as a plain .py script
# it raised NameError. Fall back to print outside a notebook.
try:
    display(df.info())
    display(df)
except NameError:
    print(df.info())
    print(df)
"""## Preprocessing the dataset"""
# Drop rows with any missing values before encoding.
df.dropna(inplace=True)
# Class balance before encoding the labels.
df['sentiment'].value_counts(normalize=True).plot(kind='bar')
# Encode sentiment as integer category codes (pandas orders categories
# lexicographically; the UI below assumes 0=Negative, 1=Neutral, 2=Positive).
df['sentiment'] = df['sentiment'].astype('category').cat.codes
df['sentiment'].value_counts(normalize=True).plot(kind='bar')
df['Time of Tweet'] = df['Time of Tweet'].astype('category').cat.codes
# Convert Country column to categorical variable
df['Country'] = df['Country'].astype('category').cat.codes
# convert Age of User to integer (bucket midpoints)
df['Age of User'] = df['Age of User'].replace({'0-20': 18, '21-30': 25, '31-45': 38, '46-60': 53, '60-70': 65, '70-100': 80})
df.info()
# BUG FIX: the original called df.drop(...) without assigning the result (and
# without inplace=True), so the metadata columns were never actually removed.
df = df.drop(columns=['textID', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)'])
def wp(text):
    """Normalize *text* for vectorization: lower-case it, then strip
    [bracketed] spans, non-word characters (replaced by spaces), URLs,
    HTML tags, punctuation, newlines, and any word containing a digit.

    BUG FIX: the original patterns were non-raw strings ('\\[.*?\\]' etc.),
    which are invalid escape sequences — a SyntaxWarning in modern Python.
    Raw strings produce the identical regexes without the warning.

    NOTE(review): the non-word-character pass runs BEFORE the URL pass, so
    URL punctuation is already blanked and the URL regex rarely matches;
    the original order is kept to preserve behavior.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                 # [bracketed] spans
    text = re.sub(r'\W', ' ', text)                     # non-word chars -> space
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'<.*?>+', '', text)                  # HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)                # digit-bearing words
    return text
# Clean the text column with the preprocessor defined above.
df['selected_text'] = df['selected_text'].apply(wp)

"""## Training and testing split """
X = df['selected_text']
y = df['sentiment']
# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

# Fit TF-IDF on training text only; reuse the same vocabulary for the test set.
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)
"""# Logistic Regression"""
# Fit a logistic-regression classifier on the TF-IDF features, then report
# accuracy, a per-class report, and a confusion-matrix plot.
logistic_model = LogisticRegression(max_iter=100)
logistic_model.fit(XV_train, y_train)
y_pred_logistic = logistic_model.predict(XV_test)

accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
report_logistic = classification_report(y_test, y_pred_logistic)

print("Logistic Regression Model:")
print(f"Accuracy: {accuracy_logistic}")
print("Logistic Regression Classification Report:")
print(report_logistic)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_logistic)
# BUG FIX: 'pip install gradio' is notebook shell syntax and is a
# SyntaxError in a .py file. Install the dependency before running:
#   pip install gradio
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
# Human-readable names for the integer codes produced by
# df['sentiment'].astype('category').cat.codes (lexicographic category order).
_SENTIMENT_LABELS = {0: "Negative", 1: "Neutral", 2: "Positive"}

def classify_sentiment(text):
    """Classify *text* and return (sentiment label, path to probability chart).

    BUG FIX: the original called output_label(prediction), a function that is
    not defined anywhere in this file, so every request raised NameError.
    The label is now looked up directly from the predicted class code.
    """
    # Preprocess and vectorize with the fitted TF-IDF vocabulary.
    processed_text = wp(text)
    vectorized_text = vectorization.transform([processed_text])
    # Predict sentiment using the logistic regression model.
    prediction = logistic_model.predict(vectorized_text)[0]
    sentiment_label = _SENTIMENT_LABELS.get(prediction, str(prediction))
    # Per-class probabilities for the chart.
    probabilities = logistic_model.predict_proba(vectorized_text)[0]
    plt.figure(figsize=(8, 6))
    sns.barplot(x=["Negative", "Neutral", "Positive"], y=probabilities)
    plt.xlabel("Sentiment")
    plt.ylabel("Probability")
    plt.title("Sentiment Probability Distribution")
    plt.ylim([0, 1])
    plt.tight_layout()
    plt.savefig("sentiment_probabilities.png")
    plt.close()  # BUG FIX: release the figure — the original leaked one per call
    return sentiment_label, "sentiment_probabilities.png"
# Wire the classifier into a minimal Gradio UI: one text box in,
# a predicted label plus a probability chart out.
inputs = gr.Textbox(lines=10, label="Enter the text you want to analyze:")
outputs = [
    gr.Textbox(label="Sentiment Prediction"),
    gr.Image(label="Sentiment Probability Distribution"),
]
interface = gr.Interface(
    fn=classify_sentiment,
    inputs=inputs,
    outputs=outputs,
    title="Sentiment Classification",
    description="Enter a piece of text and analyze its sentiment.",
)
interface.launch()