# Spaces:
# Build error
# Build error
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 12 10:02:20 2020
@author: luol2
"""
| import time | |
| import os, sys | |
| import numpy as np | |
| from keras.preprocessing.sequence import pad_sequences | |
| from keras_bert import Tokenizer | |
class CNN_RepresentationLayer(object):
    """Turn tokenised sentences into integer index arrays.

    The layer owns a word-embedding table read from a text file plus
    optional character / label / POS vocabularies, and offers two methods
    that convert sentences into padded index arrays.
    """

    def __init__(self, wordvec_file, vocab_file=None,
                 vec_size=50, word_size=10000, frequency=10000):
        '''
        wordvec_file : path of the word-embedding text file; its first line
                       holds "<number of words> <vector dimension>"
        vocab_file   : optional dict of vocabulary file paths; the keys
                       'char', 'label' and 'pos' are recognised, any of
                       them may be missing
        vec_size     : kept for interface compatibility; the value actually
                       used is read from the header of wordvec_file
        word_size    : kept for interface compatibility; the value actually
                       used is read from the header of wordvec_file
        frequency    : upper bound on the number of word vectors loaded;
                       for example, when frequency is 10000, at most the
                       first 10000 vectors of the file are kept
        '''
        # The old default was a shared list, on which ``.keys()`` fails;
        # any empty / missing value now simply means "no extra vocabularies".
        vocab_file = vocab_file or {}

        # Read the header with the same encoding load_wordvecs uses.
        with open(wordvec_file, 'r', encoding='utf-8') as fin:
            header = fin.readline().split()
        self.word_size = int(header[0])
        self.vec_size = int(header[1])
        self.frequency = frequency

        # Two extra rows: index 0 for padding, the last one for unknown words.
        kept_words = min(self.word_size, self.frequency)
        self.vec_table = np.zeros((kept_words + 2, self.vec_size))
        self.word_2_index = {}
        self.load_wordvecs(wordvec_file)

        # character vocabulary
        self.char_2_index = {}
        self.char_table_size = 0
        if 'char' in vocab_file:
            self.load_fea_vocab(vocab_file['char'], self.char_2_index)
            self.char_table_size = len(self.char_2_index)

        # label vocabulary
        self.label_2_index = {}
        self.label_table_size = 0
        if 'label' in vocab_file:
            self.load_label_vocab(vocab_file['label'], self.label_2_index)
            self.label_table_size = len(self.label_2_index)

        # part-of-speech vocabulary
        self.pos_2_index = {}
        self.pos_table_size = 0
        if 'pos' in vocab_file:
            self.load_fea_vocab(vocab_file['pos'], self.pos_2_index)
            self.pos_table_size = len(self.pos_2_index)

    def load_wordvecs(self, wordvec_file):
        """Fill ``word_2_index`` and ``vec_table`` from the embedding file.

        Row 0 is registered as 'padding_0'. Words take rows 1..k, where k is
        bounded by both ``word_size`` and ``frequency``. The row after the
        last loaded word is registered as 'sparse_vectors' and is used for
        out-of-vocabulary words. Both special rows keep their zero vectors.
        """
        limit = min(self.word_size, self.frequency)
        self.word_2_index['padding_0'] = 0
        row = 1
        with open(wordvec_file, 'r', encoding='utf-8') as fin:
            fin.readline()  # skip the "<words> <dims>" header line
            for line in fin:
                if row > limit:
                    break
                fields = line.strip().split(' ')
                self.word_2_index[fields[0]] = row
                # Index explicitly so a short line raises IndexError
                # instead of being silently broadcast by numpy.
                self.vec_table[row] = [float(fields[col + 1])
                                       for col in range(self.vec_size)]
                row += 1
        self.word_2_index['sparse_vectors'] = row

    def load_fea_vocab(self, fea_file, fea_index):
        """Read one feature value per line of ``fea_file`` into ``fea_index``.

        Index 0 is reserved for 'padding_0' and index 1 for 'oov_padding';
        values from the file are numbered from 2 in file order.
        """
        fea_index['padding_0'] = 0
        fea_index['oov_padding'] = 1
        with open(fea_file, 'r', encoding='utf-8') as fin:
            for i, line in enumerate(fin, start=2):
                fea_index[line.strip()] = i

    def load_label_vocab(self, fea_file, fea_index):
        """Read one label per line of ``fea_file``, numbered from 0."""
        with open(fea_file, 'r', encoding='utf-8') as fin:
            for i, line in enumerate(fin):
                fea_index[line.strip()] = i

    def generate_label_list(self, labels):
        """Return a one-hot list (length ``label_table_size``) per label.

        Raises KeyError for a label that is not in ``label_2_index``.
        """
        label_list = []
        for label in labels:
            one_hot = [0] * self.label_table_size
            one_hot[self.label_2_index[label]] = 1
            label_list.append(one_hot)
        return label_list

    @staticmethod
    def _lookup(table, key, fallback_key):
        """Return ``table[key]``, or ``table[fallback_key]`` if key is absent."""
        try:
            return table[key]
        except KeyError:
            return table[fallback_key]

    def _char_indices(self, token, char_max_len):
        """Return a list of ``char_max_len`` character indices for ``token``.

        Characters beyond ``char_max_len`` are dropped, unused positions stay
        0 and unknown characters map to the 'oov_padding' index.
        """
        indices = [0] * char_max_len
        for pos, ch in enumerate(token[:char_max_len]):
            indices[pos] = self._lookup(self.char_2_index, ch, 'oov_padding')
        return indices

    def represent_instances_all_feas(self, instances, labels, word_max_len=100, char_max_len=50):
        """Build word and character index arrays, padded with ``pad_sequences``.

        instances    : iterable of sentences; each sentence is a sequence of
                       tokens and ``token[0]`` is the surface form
        labels       : accepted for interface compatibility, not used here
        word_max_len : sentences are right-padded / right-truncated to this
                       many tokens
        char_max_len : each token is padded / truncated to this many chars

        Returns ``([x_word_np, x_char_np, [], [], x_text_list], [])``. The
        lemma, POS and label slots are returned as empty lists because this
        variant does not compute them.
        """
        x_text_list = []
        x_word_list = []
        x_char_list = []
        for sentence in instances:
            sentence_text = []
            sentence_words = []
            sentence_chars = []
            for word in sentence:
                token = word[0]
                # char fea (uses the original casing)
                sentence_chars.append(self._char_indices(token, char_max_len))
                # word fea (lower-cased lookup)
                lowered = token.lower()
                sentence_text.append(lowered)
                sentence_words.append(
                    self._lookup(self.word_2_index, lowered, 'sparse_vectors'))
            x_text_list.append(sentence_text)
            x_word_list.append(sentence_words)
            x_char_list.append(sentence_chars)

        # right padding / right truncation to word_max_len tokens
        x_word_np = pad_sequences(x_word_list, word_max_len, value=0,
                                  padding='post', truncating='post')
        x_char_np = pad_sequences(x_char_list, word_max_len, value=0,
                                  padding='post', truncating='post')
        return [x_word_np, x_char_np, [], [], x_text_list], []

    def represent_instances_all_feas_myself(self, instances, labels, word_max_len=100, char_max_len=50):
        """Build word, char, lemma, POS and label arrays with manual padding.

        instances    : iterable of sentences; each sentence is a sequence of
                       tokens where ``token[0]`` is the surface form,
                       ``token[1]`` the lemma and ``token[3]`` the POS tag
        labels       : one label per instance, converted to one-hot rows
        word_max_len : number of token slots per sentence; extra tokens are
                       dropped from the index arrays (but kept in the text)
        char_max_len : number of character slots per token

        Returns ``([x_word_np, x_char_np, x_lemma_np, x_pos_np, x_text_list],
        y_np)``. Two timing lines are printed to stdout.
        """
        x_text_list = []
        x_word_list = []
        x_char_list = []
        x_lemma_list = []
        x_pos_list = []
        startTime = time.time()
        for sentence in instances:
            # Every row is pre-sized, so no padding step is needed afterwards.
            word_row = [0] * word_max_len
            char_rows = [[0] * char_max_len for _ in range(word_max_len)]
            lemma_row = [0] * word_max_len
            pos_row = [0] * word_max_len
            sentence_text = []
            for j, word in enumerate(sentence):
                lowered = word[0].lower()
                sentence_text.append(lowered)
                if j >= word_max_len:
                    continue  # beyond the window: only the text is recorded
                # char fea
                char_rows[j] = self._char_indices(word[0], char_max_len)
                # word fea
                word_row[j] = self._lookup(
                    self.word_2_index, lowered, 'sparse_vectors')
                # lemma fea
                lemma_row[j] = self._lookup(
                    self.word_2_index, word[1].lower(), 'sparse_vectors')
                # pos fea
                pos_row[j] = self._lookup(
                    self.pos_2_index, word[3], 'oov_padding')
            x_text_list.append(sentence_text)
            x_word_list.append(word_row)
            x_char_list.append(char_rows)
            x_lemma_list.append(lemma_row)
            x_pos_list.append(pos_row)
        print('ml-model-represent-list:', time.time() - startTime)

        startTime = time.time()
        y_list = self.generate_label_list(labels)
        x_word_np = np.array(x_word_list)
        # The char rows already have a fixed shape, so a plain array
        # conversion is enough (the former pad_sequences2 name was undefined).
        x_char_np = np.array(x_char_list)
        x_lemma_np = np.array(x_lemma_list)
        x_pos_np = np.array(x_pos_list)
        y_np = np.array(y_list)
        print('ml-model-represent-pad:', time.time() - startTime)
        return [x_word_np, x_char_np, x_lemma_np, x_pos_np, x_text_list], y_np
class BERT_RepresentationLayer(object):
    """Turn tokenised sentences into padded BERT input arrays.

    Holds a ``keras_bert`` ``Tokenizer`` built from a vocabulary file and a
    label-to-index mapping read from a label file.
    """

    def __init__(self, vocab_path, label_file):
        """Load the BERT vocabulary and the label vocabulary.

        vocab_path : text file with one vocabulary entry per line
        label_file : text file with one label per line
        """
        # load vocab
        self.bert_vocab_dict = {}
        self.load_bert_vocab(vocab_path, self.bert_vocab_dict)
        self.tokenizer = Tokenizer(self.bert_vocab_dict)

        # load label
        self.label_2_index = {}
        self.label_table_size = 0
        self.load_label_vocab(label_file, self.label_2_index)
        self.label_table_size = len(self.label_2_index)

    def load_label_vocab(self, fea_file, fea_index):
        """Read one label per line of ``fea_file`` into ``fea_index``, numbered from 0."""
        with open(fea_file, 'r', encoding='utf-8') as fin:
            for i, line in enumerate(fin):
                fea_index[line.strip()] = i

    def load_bert_vocab(self, vocab_file, vocab_dict):
        """Read one entry per line of ``vocab_file`` into ``vocab_dict``, numbered from 0."""
        with open(vocab_file, 'r', encoding='utf-8') as fin:
            for i, line in enumerate(fin):
                vocab_dict[line.strip()] = i

    def generate_label_list(self, labels):
        """Return a one-hot list (length ``label_table_size``) per label.

        Raises KeyError for a label that is not in ``label_2_index``.
        """
        label_list = []
        for label in labels:
            one_hot = [0] * self.label_table_size
            one_hot[self.label_2_index[label]] = 1
            label_list.append(one_hot)
        return label_list

    def load_data(self, instances, labels, word_max_len=100):
        """Encode sentences with the tokenizer and pad them to a fixed length.

        instances    : iterable of sentences; each sentence is a sequence of
                       tokens and ``token[0]`` is the surface form
        labels       : accepted for interface compatibility, not used here
        word_max_len : both output arrays are right-padded / right-truncated
                       to this length

        Returns ``([x1_np, x2_np], [])`` where the two arrays hold the padded
        first and second outputs of ``tokenizer.encode``. The label slot is
        an empty list because labels are not encoded by this method.
        """
        x_index = []
        x_seg = []
        for sentence in instances:
            # Tokens are re-joined with single spaces before encoding.
            sentence_text = ' '.join(word[0] for word in sentence)
            x1, x2 = self.tokenizer.encode(first=sentence_text)
            x_index.append(x1)
            x_seg.append(x2)

        # right padding / right truncation
        x1_np = pad_sequences(x_index, word_max_len, value=0,
                              padding='post', truncating='post')
        x2_np = pad_sequences(x_seg, word_max_len, value=0,
                              padding='post', truncating='post')
        return [x1_np, x2_np], []
if __name__ == '__main__':
    # Running the module directly does nothing.
    pass