import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers
import tensorflow_hub as hub
import tensorflow_text as text  # registers custom ops required by the TF Hub BERT preprocessing model
import string
import warnings
warnings.filterwarnings('ignore')
import sys
import os
import pickle
import numpy as np
parent_root = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
sys.path.append(parent_root)
from src.utils import *
from src.config.configs import *
params = Params()
class Embeddings(object):
    def __init__(self):
        # Word-level
        self.vocab_size = params.VOCAB_SIZE
        self.seq_length = params.SEQ_LENGTH
        # Char-level
        self.char_vocab = params.CHAR_VOCAB
        self.output_char_length = params.CHAR_LENGTH
        self.max_tokens = len(string.ascii_lowercase + string.digits + string.punctuation) + 2  # + space, OOV
        # Positional
        self.line_ids_depth = params.LINE_IDS_DEPTH
        self.length_lines_depth = params.LENGTH_LINES_DEPTH
        self.total_lines_depth = params.TOTAL_LINES_DEPTH
    def _get_word_embeddings(self, list_sentences):
        """
        Get the word-level embedding layer.
        args:
            - list_sentences: List of all sentences in the train/val set
        return:
            - word_vectorizer: TextVectorization layer adapted to list_sentences
            - word_embed: Word embedding layer
        """
        # Vectorization
        # Check whether a word_vectorizer object is already saved on disk
        if os.path.exists(params.WORD_VECTORIZATION):
            from_disk = pickle.load(open(params.WORD_VECTORIZATION, "rb"))
            print("Load pre-saved word_vectorizer object from disk at: {}".format(params.WORD_VECTORIZATION))
            word_vectorizer = TextVectorization.from_config(from_disk['config'])
            word_vectorizer.set_weights(from_disk['weights'])
        else:
            # Create folder to save the vectorizer object
            if not os.path.exists(params.VECTORIZATION):
                os.makedirs(params.VECTORIZATION)
            print("Create new word_vectorizer object ...")
            # Create a new word_vectorizer object and adapt it to the training sentences
            word_vectorizer = TextVectorization(max_tokens=self.vocab_size, output_sequence_length=self.seq_length)
            word_vectorizer.adapt(list_sentences)
            # Save the newly created object
            pickle.dump({'config': word_vectorizer.get_config(),
                         'weights': word_vectorizer.get_weights()},
                        open(params.WORD_VECTORIZATION, "wb"))
            print("Saved new word_vectorizer object to disk at: {}".format(params.WORD_VECTORIZATION))
        word_vocab = word_vectorizer.get_vocabulary()
        print("Word vectorization on training set with vocab size: ", len(word_vocab))
        # Embedding layer
        word_embed = tf.keras.layers.Embedding(input_dim=len(word_vocab), output_dim=params.WORD_OUTPUT_DIM,
                                               mask_zero=True,
                                               name="word-level_embedding")
        return word_vectorizer, word_embed
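    # Usage sketch (illustrative names, not from the original call sites):
    #   word_vectorizer, word_embed = Embeddings()._get_word_embeddings(train_sentences)
    #   ids = word_vectorizer(["some input text"])   # -> (1, SEQ_LENGTH) token ids
    #   vecs = word_embed(ids)                       # -> (1, SEQ_LENGTH, WORD_OUTPUT_DIM)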
    def _get_char_embeddings(self, list_char):
        """
        Get the char-level embedding layer.
        args:
            - list_char: List of character strings split from each sentence in list_sentences
        return:
            - char_vectorizer: TextVectorization layer adapted to list_char
            - char_embed: Character embedding layer
        """
        # Check whether a char_vectorizer object is already saved on disk
        if os.path.exists(params.CHAR_VECTORIZATION):
            from_disk = pickle.load(open(params.CHAR_VECTORIZATION, "rb"))
            print("Load pre-saved char_vectorizer object from disk at: {}".format(params.CHAR_VECTORIZATION))
            char_vectorizer = TextVectorization.from_config(from_disk['config'])
            char_vectorizer.set_weights(from_disk['weights'])
        else:
            # Create folder to save the vectorizer object
            if not os.path.exists(params.VECTORIZATION):
                os.makedirs(params.VECTORIZATION)
            print("Create new char_vectorizer object ...")
            # Create a new char_vectorizer object and adapt it to the training characters
            char_vectorizer = TextVectorization(max_tokens=self.max_tokens,
                                                output_sequence_length=self.output_char_length)
            char_vectorizer.adapt(list_char)
            # Save the newly created object
            pickle.dump({'config': char_vectorizer.get_config(),
                         'weights': char_vectorizer.get_weights()},
                        open(params.CHAR_VECTORIZATION, "wb"))
            print("Saved new char_vectorizer object to disk at: {}".format(params.CHAR_VECTORIZATION))
        char_vocab = char_vectorizer.get_vocabulary()
        print("Char vectorization on training set with vocab size: ", len(char_vocab))
        # Embedding layer
        char_embed = tf.keras.layers.Embedding(input_dim=len(char_vocab), output_dim=params.CHAR_OUTPUT_DIM,
                                               mask_zero=False,
                                               name="character-level_embedding")
        return char_vectorizer, char_embed
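    # Input-format sketch for `list_char` (an assumption based on the docstring):
    #   train_chars = [" ".join(list(s)) for s in train_sentences]
    #   char_vectorizer, char_embed = Embeddings()._get_char_embeddings(train_chars)
    #   char_ids = char_vectorizer(["h e l l o"])    # -> (1, CHAR_LENGTH) char ids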
    @staticmethod
    def create_glove_vocab(glove_txt):
        """
        Create a {word: vector} dict from a GloVe .txt file.
        """
        glove_embed_dict = {}
        with open(glove_txt) as glove_file:
            for line in glove_file:
                # Each line is: word v1 v2 ... vN
                records = line.split()
                word = records[0]
                vector_dimensions = np.asarray(records[1:], dtype='float32')
                glove_embed_dict[word] = vector_dimensions
        return glove_embed_dict
    @staticmethod
    def create_glove_embed_matrix(vectorizer, glove_embed_dict, embed_dim=200):
        """
        Create the GloVe embedding matrix for the adapted vocabulary.
        args:
            - vectorizer: word_vectorizer adapted to the train set
            - glove_embed_dict: GloVe vocabulary dict from create_glove_vocab
            - embed_dim: Dimensionality of the GloVe vectors
        """
        corpus_vocab = vectorizer.get_vocabulary()
        vocab_size = len(corpus_vocab)
        glove_embed_matrix = np.zeros((vocab_size, embed_dim))
        for i, word in enumerate(corpus_vocab):
            # Row i holds the GloVe vector for vocab word i; words missing from GloVe keep an all-zero row
            word_vector = glove_embed_dict.get(word)
            if word_vector is not None:
                glove_embed_matrix[i] = word_vector
        return glove_embed_matrix
    def _get_glove_embeddings(self, vectorizer, glove_txt):
        """
        Get the pretrained, frozen GloVe embedding layer.
        """
        glove_embed_dict = self.create_glove_vocab(glove_txt)
        glove_embed_matrix = self.create_glove_embed_matrix(vectorizer, glove_embed_dict)
        # trainable=False keeps the pretrained GloVe weights frozen during training
        glove_embed = layers.Embedding(input_dim=glove_embed_matrix.shape[0], output_dim=glove_embed_matrix.shape[1],
                                       input_length=params.SEQ_LENGTH, trainable=False,
                                       weights=[glove_embed_matrix], name="glove_embedding")
        return glove_embed
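    # GloVe usage sketch ("glove.6B.200d.txt" is a hypothetical local path):
    #   glove_embed = Embeddings()._get_glove_embeddings(word_vectorizer, "glove.6B.200d.txt")
    #   glove_vecs = glove_embed(word_vectorizer(["some input text"]))  # frozen 200-d vectors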
    def _get_bert_embeddings(self):
        """
        Get the pretrained BERT preprocessing and embedding layers from TF Hub.
        """
        preprocess = hub.load(params.BERT_PROCESS_DIR)
        bert = hub.load(params.BERT_EMBED_DIR)
        # Wrap the loaded SavedModels as Keras layers so they can be used in a functional model
        preprocess_layer = hub.KerasLayer(preprocess, name='bert_input_preprocess')
        bert_layer = hub.KerasLayer(bert, name='bert_layer')
        return preprocess_layer, bert_layer
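
# Minimal usage sketch (not part of the original module): `train_sentences` is
# placeholder data, and the Params paths/constants are assumed to be valid.
if __name__ == "__main__":
    embeddings = Embeddings()
    train_sentences = ["first example sentence", "another example sentence"]  # placeholder data
    # Word-level: raw strings -> token ids -> dense vectors
    word_vectorizer, word_embed = embeddings._get_word_embeddings(train_sentences)
    word_ids = word_vectorizer(tf.constant(train_sentences))  # (batch, SEQ_LENGTH)
    print(word_embed(word_ids).shape)                         # (batch, SEQ_LENGTH, WORD_OUTPUT_DIM)
    # Char-level: whitespace-separated characters -> char ids -> dense vectors
    train_chars = [" ".join(list(s)) for s in train_sentences]
    char_vectorizer, char_embed = embeddings._get_char_embeddings(train_chars)
    print(char_embed(char_vectorizer(tf.constant(train_chars))).shape)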