import os

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder


class UNSUP_Loader:

    def __init__(self, args, base_attrs):
        # Make sure the NLTK 'punkt' tokenizer models are available before tokenizing.
        if not os.path.exists(nltk.downloader.Downloader()._download_dir):
            nltk.download('punkt')

        self.all_data, self.train_data, self.test_data = self.get_examples(base_attrs)

        # Tokenize, encode the string labels as integers, and rebuild the text
        # from the token list so downstream vectorizers see a normalized form.
        self.all_data['words'] = self.all_data['text'].apply(word_tokenize)
        le = LabelEncoder()
        self.all_data['y_true'] = le.fit_transform(self.all_data['label'])
        self.all_data['text'] = self.all_data['words'].apply(lambda l: ' '.join(l))

        self.train_data = self.all_data.iloc[self.train_data.index]
        self.test_data = self.all_data.iloc[self.test_data.index]
        self.train_true_labels = self.all_data.y_true.values[self.train_data.index]
        self.test_true_labels = self.all_data.y_true.values[self.test_data.index]

        if args.backbone == 'glove':
            # Padded id sequences plus a pretrained embedding matrix.
            self.embedding_matrix, self.index_word, self.train_data, self.test_data = \
                get_glove_data(args, self.all_data, self.train_data, self.test_data)
        elif args.backbone == 'sae':
            # Dense TF-IDF features for the stacked-autoencoder backbone.
            self.tfidf_train, self.tfidf_test = get_tfidf_data(args, self.train_data, self.test_data)

    def get_examples(self, base_attrs):
        train_csv = pd.read_csv(os.path.join(base_attrs['data_dir'], 'train.tsv'), sep='\t')
        dev_csv = pd.read_csv(os.path.join(base_attrs['data_dir'], 'dev.tsv'), sep='\t')
        test_csv = pd.read_csv(os.path.join(base_attrs['data_dir'], 'test.tsv'), sep='\t')

        train_data_list = [[x, y] for x, y in zip(train_csv['text'], train_csv['label'])]
        dev_data_list = [[x, y] for x, y in zip(dev_csv['text'], dev_csv['label'])]
        test_data_list = [[x, y] for x, y in zip(test_csv['text'], test_csv['label'])]

        # 'all' covers every split; the dev split is folded into training.
        all_data_frame = pd.DataFrame(train_data_list + dev_data_list + test_data_list,
                                      columns=['text', 'label'])
        train_data_frame = pd.DataFrame(train_data_list + dev_data_list,
                                        columns=['text', 'label'])
        test_data_frame = pd.DataFrame(test_data_list, columns=['text', 'label'])

        return all_data_frame, train_data_frame, test_data_frame


def get_tfidf_data(args, train_data, test_data):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Fit the vectorizer on the training split only, then reuse it on the test split.
    vec_tfidf = TfidfVectorizer(max_features=args.feat_dim)
    tfidf_train = vec_tfidf.fit_transform(train_data['text'].tolist()).todense()
    tfidf_test = vec_tfidf.transform(test_data['text'].tolist()).todense()

    return tfidf_train, tfidf_test


def get_glove_data(args, all_data, train_data, test_data):
    texts = all_data['words'].tolist()

    tokenizer = Tokenizer(num_words=args.max_num_words, oov_token="",
                          filters='!"#$%&()*+-/:;<=>@[\\]^_`{|}~')
    tokenizer.fit_on_texts(texts)
    # Keep only the top max_num_words entries (<= because the tokenizer is 1-indexed)
    # and map the OOV token to the slot just past the vocabulary.
    tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items() if i <= args.max_num_words}
    tokenizer.word_index[tokenizer.oov_token] = args.max_num_words + 1

    word_index = tokenizer.word_index
    index_word = {v: k for k, v in word_index.items()}
    max_features = min(args.max_num_words + 1, len(word_index)) + 1

    sequences = tokenizer.texts_to_sequences(texts)
    sequences_pad = pad_sequences(sequences, maxlen=args.max_seq_length,
                                  padding='post', truncating='post')
    train_x = sequences_pad[train_data.index]
    test_x = sequences_pad[test_data.index]

    embedding_matrix, embeddings_index = get_glove_embedding(args, max_features, word_index)

    return embedding_matrix, index_word, train_x, test_x


def get_glove_embedding(args, MAX_FEATURES, word_index):
    EMBEDDING_DIM = 300
    EMBEDDING_FILE = os.path.join(args.glove_model, 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt')

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Read the pretrained vectors into a dict (one word -> one vector).
    with open(EMBEDDING_FILE, encoding='utf-8') as f:
        embeddings_index = dict(get_coefs(*o.strip().split()) for o in f)

    # Initialize the whole matrix from a Gaussian matched to the pretrained
    # vectors' mean and std, so rows without a pretrained vector (e.g. the OOV
    # token) get a plausible random embedding instead of zeros.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embedding_matrix = np.random.normal(emb_mean, emb_std, (MAX_FEATURES + 1, EMBEDDING_DIM))

    # Overwrite in-vocabulary rows with their pretrained GloVe vectors.
    for word, i in word_index.items():
        if i >= MAX_FEATURES:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix, embeddings_index
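

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the loader itself). The
# field names on `args` and the shape of `base_attrs` are assumptions
# inferred from how they are accessed above; the concrete values and the
# './data' / './glove' paths are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from argparse import Namespace

    args = Namespace(
        backbone='glove',       # 'glove' -> id sequences + embedding matrix; 'sae' -> TF-IDF
        feat_dim=2000,          # TF-IDF feature dimension (only used by the 'sae' branch)
        max_num_words=10000,    # vocabulary cap for the Keras Tokenizer ('glove' branch)
        max_seq_length=45,      # pad/truncate length for id sequences ('glove' branch)
        glove_model='./glove',  # directory containing glove.6B.300d.txt
    )
    base_attrs = {'data_dir': './data'}  # must hold train.tsv / dev.tsv / test.tsv

    loader = UNSUP_Loader(args, base_attrs)
    print('train/test examples:', len(loader.train_true_labels), len(loader.test_true_labels))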