Spaces:
Runtime error
Runtime error
| # utilities | |
| import re | |
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| # plotting | |
| import seaborn as sns | |
| from wordcloud import WordCloud | |
| import matplotlib.pyplot as plt | |
| # nltk | |
| from nltk.stem import WordNetLemmatizer | |
| # sklearn | |
| from sklearn.svm import LinearSVC | |
| from sklearn.naive_bayes import BernoulliNB | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics import confusion_matrix, classification_report | |
# NOTE: `pip install datasets` is a shell command, not Python, so it cannot be
# executed from this script. Install the package ahead of time instead
# (for example by listing `datasets` in requirements.txt).
from datasets import load_dataset

# The earlier `load_dataset("training.1600000.processed.noemoticon.csv")` call
# was removed: a bare CSV file name is not a dataset identifier (local CSVs are
# loaded with `load_dataset("csv", data_files=...)`), and its result was
# immediately overwritten by the `pd.read_csv` call that follows.
# Text encoding of the raw CSV and the column names applied when reading it.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]

dataset = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

# Keep only the label and the tweet body; the other columns are not used.
dataset = dataset[['sentiment', 'text']]

# Relabel sentiment value 4 as 1.
dataset['sentiment'] = dataset['sentiment'].replace(4, 1)

# Plain Python lists of tweets and their labels.
text = list(dataset['text'])
sentiment = list(dataset['sentiment'])
# Emoticon -> meaning lookup table. Insertion order is significant because
# callers iterate over the dictionary in order.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}
# English stopwords, kept as a list (apostrophes already stripped, e.g.
# "youre", "shouldve"). This list was previously defined twice in a row with
# identical contents; a single definition is kept. Note that `preprocess`
# does not currently filter on it.
stopwordlist = [
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
    'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'on', 'once',
    'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'own', 're',
    's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
    't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
    "youve", 'your', 'yours', 'yourself', 'yourselves',
]
def preprocess(textdata):
    """Clean and lemmatize a collection of tweets.

    Each tweet is lower-cased, then URLs become ``URL``, known emoticons
    become ``EMOJI<meaning>``, ``@mentions`` become ``USER``, every character
    that is not a letter or digit becomes a space, and runs of three or more
    identical characters are shortened to two. Tokens longer than one
    character are lemmatized with WordNet. Stopwords are NOT removed.

    Args:
        textdata: Iterable of tweet strings.

    Returns:
        A list with one processed string per input tweet. Tokens are joined
        with single spaces and each non-empty result ends with a trailing
        space (the historical output format of this function).
    """
    lemmatizer = WordNetLemmatizer()

    # Compile the patterns once instead of re-resolving them for every tweet.
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r"@[^\s]+")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    # Tweets are lower-cased before matching, so the emoticon keys must be
    # lower-cased too; otherwise keys such as ':P' or ':-D' can never match.
    # Longest keys go first so that 'O:-)' is not pre-empted by its substring
    # ':-)'. The sort is stable, so equal-length keys keep dictionary order.
    emoji_tokens = sorted(
        ((emoji.lower(), "EMOJI" + meaning) for emoji, meaning in emojis.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    processedText = []
    for tweet in textdata:
        tweet = tweet.lower()
        # Replace all URLs with 'URL'.
        tweet = url_re.sub(' URL', tweet)
        # Replace emoticons with 'EMOJI<meaning>'.
        for emoji, token in emoji_tokens:
            tweet = tweet.replace(emoji, token)
        # Replace @USERNAME with 'USER'.
        tweet = user_re.sub(' USER', tweet)
        # Replace everything that is not a letter or digit with a space.
        tweet = non_alnum_re.sub(" ", tweet)
        # Collapse 3 or more consecutive identical characters down to 2.
        tweet = repeat_re.sub(r"\1\1", tweet)

        # Drop single-character tokens and lemmatize the rest.
        lemmas = (
            lemmatizer.lemmatize(word)
            for word in tweet.split()
            if len(word) > 1
        )
        # Each token keeps its trailing space to preserve the output format.
        processedText.append("".join(lemma + ' ' for lemma in lemmas))

    return processedText
import nltk

# Fetch only the corpora WordNetLemmatizer needs. A bare `nltk.download()`
# opens the interactive downloader, which blocks a non-interactive run.
nltk.download('wordnet')
# Some NLTK releases also need the Open Multilingual Wordnet data for WordNet.
nltk.download('omw-1.4')
import time

# Run the preprocessing over every tweet and report the wall-clock duration.
t = time.time()
processedtext = preprocess(text)
print('Text Preprocessing complete.')
print('Time Taken: {} seconds'.format(round(time.time() - t)))
# Hold out 5% of the processed tweets for testing; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    processedtext,
    sentiment,
    test_size=0.05,
    random_state=0,
)
print('Data Split done.')