import re

import nltk  # used by the commented-out tokenization example at the bottom
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("./labeled_data.csv")
print("Finished loading data from labeled_data.csv")
# Data cleansing: strip mentions, URLs, non-letter characters, and the
# retweet marker from each tweet before vectorizing.
tweets = df.iloc[:, 6]  # column 6 of labeled_data.csv holds the raw tweet text
texts = []
for _, text in tweets.items():
    text = re.sub(r'\@.*?\:', "", text)  # "@user:" prefixes (non-greedy, so it stops at the first colon)
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', "", text, flags=re.MULTILINE)  # URLs
    text = re.sub(r'[^A-Za-z ]+', "", text)  # keep letters and spaces only
    text = re.sub(r'\bRT\b', "", text)  # retweet marker ("\b" avoids clipping words that contain "RT")
    texts.append(text)
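
# A quick, self-contained illustration of the cleaning chain on a made-up
# raw tweet (the sample string is hypothetical, not from the dataset):
sample = "RT @someuser: check this http://t.co/xyz #cool!!"
for pattern in [r'\@.*?\:', r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', r'[^A-Za-z ]+', r'\bRT\b']:
    sample = re.sub(pattern, "", sample)
print(sample)  # leftover spaces aside, only "check this cool" survives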
df_1 = df.iloc[:, :6]       # original label columns
df_2 = pd.DataFrame(texts)  # cleaned tweet text, single column
print(df_2)
# Bag-of-words features: unigrams up to 5-grams, English stop words removed.
count = CountVectorizer(stop_words='english', ngram_range=(1, 5))
count.fit(df_2[0])
X_train_vectorizer = count.transform(df_2[0])
# Note: .toarray() densifies the matrix, which can be memory-hungry for a
# vocabulary this large.
df_bow = pd.DataFrame(X_train_vectorizer.toarray())
# Keep the cleaned text as column 6 so the dictionary-building step below can
# read it back from the split frames; the bag-of-words columns follow it.
df_cleaned = pd.concat([df_1, df_2, df_bow], axis=1)
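
# A minimal sketch (illustrative, not part of the original pipeline) of how
# the fitted vectorizer maps previously unseen text into the same feature
# space; the sample string is made up:
unseen = count.transform(["some unseen example text"])
print(unseen.shape, len(count.vocabulary_))  # 1 row, one column per learned n-gram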
# Data splitting
def train_validate_test_split(df_local, train_percent=.6, validate_percent=.2, seed=None):
    """Shuffle the rows and split them 60/20/20 into train/validate/test."""
    np.random.seed(seed)
    perm = np.random.permutation(df_local.index)
    m = len(df_local.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df_local.iloc[perm[:train_end]]
    validate = df_local.iloc[perm[train_end:validate_end]]
    test = df_local.iloc[perm[validate_end:]]
    return train, validate, test

train, validate, test = train_validate_test_split(df_cleaned)
train = train.dropna(axis=0).reset_index(drop=True)
validate = validate.dropna(axis=0).reset_index(drop=True)
test = test.dropna(axis=0).reset_index(drop=True)
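
# Sanity check (illustrative): row counts should roughly follow the
# 60/20/20 proportions requested above, minus any rows dropped as NaN.
print(len(train), len(validate), len(test))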
# Construct a dictionary
# 1. Traverse each word in the training tweets and store it in a dictionary;
#    the dictionary will be used for one-hot encoding.
# 2. Calculate the maximum number of words that a sentence contains.
train_tweets = train.iloc[:, 6]  # column 6 is the cleaned tweet text (see concat above)
word_set = set()
max_len = 0
for _, line in train_tweets.items():
    words = line.split()
    word_set.update(words)
    # Compare per line so the last tweet is counted too (the original check,
    # done at the top of the next iteration, skipped it).
    max_len = max(max_len, len(words))
dictionary = list(word_set)
# max_len: 33
# len(dictionary):
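
# A minimal sketch, assuming the dictionary is meant for index-based one-hot
# encoding: map each word to its position in `dictionary` and pad every tweet
# to max_len entries. The helper name and the -1 padding value are
# illustrative choices, not from the original code.
word_to_idx = {w: i for i, w in enumerate(dictionary)}

def encode_tweet(text, pad_to=max_len):
    # Words outside the training vocabulary are skipped.
    idxs = [word_to_idx[w] for w in text.split() if w in word_to_idx]
    return idxs[:pad_to] + [-1] * max(0, pad_to - len(idxs))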
# # Load the word2vec model
# model = Word2Vec.load("word2vec.model")
#
# # Convert the text to a list of words
# words = nltk.word_tokenize(text)
#
# # Convert the words to word vectors using the word2vec model
# vectors = [model.wv[word] for word in words]
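
# The commented block above assumes a pre-trained "word2vec.model" file that
# this script never creates. A minimal sketch of how such a model could be
# trained on the cleaned training tweets and saved, using the gensim 4.x API
# (all parameter values here are illustrative):
sentences = [t.split() for t in train_tweets]
w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1)
w2v.save("word2vec.model")
print(len(w2v.wv), "words in the trained vocabulary")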