| | |
| | |
| | """ A bunch of util functions to build Seq2Seq models with Caffe2.""" |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | import collections |
| | from future.utils import viewitems |
| |
|
| | import caffe2.proto.caffe2_pb2 as caffe2_pb2 |
| | from caffe2.python import attention, core, rnn_cell, brew |
| |
|
| |
|
# Special vocabulary tokens. gen_vocab() reserves ids 0-3 for these, in this
# exact order, before any corpus token is assigned an id.
PAD_ID = 0
PAD = '<PAD>'   # sequence padding token
GO_ID = 1
GO = '<GO>'     # decoder start-of-sequence marker
EOS_ID = 2
EOS = '<EOS>'   # end-of-sequence marker
UNK_ID = 3
UNK = '<UNK>'   # placeholder for out-of-vocabulary / rare tokens
| |
|
| |
|
def gen_vocab(corpus, unk_threshold):
    """Build a token -> id vocabulary from a whitespace-tokenized corpus file.

    Ids 0-3 are reserved for the special tokens PAD, GO, EOS and UNK (in
    that order). A corpus token is added only if it occurs strictly more
    than `unk_threshold` times; rarer tokens are expected to be mapped to
    UNK later (see get_numberized_sentence).

    Args:
        corpus: path to a text file with one sentence per line.
        unk_threshold: minimum frequency (exclusive) for a token to be
            included in the vocabulary.

    Returns:
        A collections.defaultdict mapping token -> int id. NOTE: looking up
        an unseen key mutates the dict by assigning it the next free id.
    """
    # The factory hands out consecutive ids on first lookup of a new key.
    vocab = collections.defaultdict(lambda: len(vocab))
    freqs = collections.defaultdict(lambda: 0)
    # Touch the special tokens first so they receive ids 0..3.
    vocab[PAD]
    vocab[GO]
    vocab[EOS]
    vocab[UNK]

    with open(corpus) as f:
        for sentence in f:
            for token in sentence.strip().split():
                freqs[token] += 1
    # Plain dict.items() behaves identically on Python 2 and 3 here; the
    # future.utils.viewitems compatibility shim is unnecessary.
    for token, freq in freqs.items():
        if freq > unk_threshold:
            vocab[token]  # defaultdict lookup assigns the next id

    return vocab
| |
|
| |
|
def get_numberized_sentence(sentence, vocab):
    """Map each whitespace-separated token of *sentence* to its vocab id,
    falling back to the UNK id for tokens absent from *vocab*."""
    return [
        vocab[token] if token in vocab else vocab[UNK]
        for token in sentence.strip().split()
    ]
| |
|
| |
|
def rnn_unidirectional_layer(
    model,
    inputs,
    input_lengths,
    input_size,
    num_units,
    dropout_keep_prob,
    forward_only,
    return_sequence_output,
    return_final_state,
    scope=None,
):
    """Build one unidirectional LSTM layer over a packed input sequence.

    Returns (sequence_outputs, final_hidden_state, final_cell_state); the
    requested outputs are registered for gradient computation via
    outputs_with_grads.
    """
    name_prefix = scope + '/' if scope else ''

    # Zero initial states, created as init-net blobs under the layer scope.
    with core.NameScope(scope):
        initial_cell_state = model.param_init_net.ConstantFill(
            [],
            'initial_cell_state',
            shape=[num_units],
            value=0.0,
        )
        initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'initial_hidden_state',
            shape=[num_units],
            value=0.0,
        )

    cell = rnn_cell.LSTMCell(
        input_size=input_size,
        hidden_size=num_units,
        forget_bias=0.0,
        memory_optimization=False,
        name=name_prefix + 'lstm',
        forward_only=forward_only,
    )

    # Wrap the LSTM in a dropout cell unless dropout is disabled.
    if dropout_keep_prob is not None:
        cell = rnn_cell.DropoutCell(
            internal_cell=cell,
            dropout_ratio=(1.0 - dropout_keep_prob),
            name=name_prefix + 'dropout',
            forward_only=forward_only,
            is_test=False,
        )

    # Output index 0 is the per-step sequence output; indices 1 and 3 are
    # the final hidden and cell states respectively.
    outputs_with_grads = [0] if return_sequence_output else []
    if return_final_state:
        outputs_with_grads += [1, 3]

    outputs, states = cell.apply_over_sequence(
        model=model,
        inputs=inputs,
        seq_lengths=input_lengths,
        initial_states=(initial_hidden_state, initial_cell_state),
        outputs_with_grads=outputs_with_grads,
    )
    _, final_hidden_state, _, final_cell_state = states
    return outputs, final_hidden_state, final_cell_state
| |
|
| |
|
def rnn_bidirectional_layer(
    model,
    inputs,
    input_lengths,
    input_size,
    num_units,
    dropout_keep_prob,
    forward_only,
    return_sequence_output,
    return_final_state,
    scope=None,
):
    """Build a bidirectional LSTM layer.

    Runs a forward pass over the inputs and a backward pass over the
    time-reversed inputs, then concatenates the per-step outputs and the
    final states of both directions along the feature axis.
    """
    name_prefix = scope + '/' if scope else ''

    # Forward direction over the inputs as given.
    outputs_fw, final_hidden_fw, final_cell_fw = rnn_unidirectional_layer(
        model,
        inputs,
        input_lengths,
        input_size,
        num_units,
        dropout_keep_prob,
        forward_only,
        return_sequence_output,
        return_final_state,
        scope=name_prefix + 'fw',
    )

    # Backward direction: reverse each (padded) sequence, run the LSTM,
    # then reverse its outputs back into the original time order.
    with core.NameScope(scope):
        reversed_inputs = model.net.ReversePackedSegs(
            [inputs, input_lengths],
            ['reversed_inputs'],
        )
    outputs_bw, final_hidden_bw, final_cell_bw = rnn_unidirectional_layer(
        model,
        reversed_inputs,
        input_lengths,
        input_size,
        num_units,
        dropout_keep_prob,
        forward_only,
        return_sequence_output,
        return_final_state,
        scope=name_prefix + 'bw',
    )
    with core.NameScope(scope):
        outputs_bw = model.net.ReversePackedSegs(
            [outputs_bw, input_lengths],
            ['outputs_bw'],
        )

    outputs = None
    if return_sequence_output:
        with core.NameScope(scope):
            outputs, _ = model.net.Concat(
                [outputs_fw, outputs_bw],
                ['outputs', 'outputs_dim'],
                axis=2,
            )

    final_hidden_state = None
    final_cell_state = None
    if return_final_state:
        with core.NameScope(scope):
            final_hidden_state, _ = model.net.Concat(
                [final_hidden_fw, final_hidden_bw],
                ['final_hidden_state', 'final_hidden_state_dim'],
                axis=2,
            )
            final_cell_state, _ = model.net.Concat(
                [final_cell_fw, final_cell_bw],
                ['final_cell_state', 'final_cell_state_dim'],
                axis=2,
            )

    return outputs, final_hidden_state, final_cell_state
| |
|
| |
|
def build_embeddings(
    model,
    vocab_size,
    embedding_size,
    name,
    freeze_embeddings,
):
    """Create a [vocab_size, embedding_size] embedding table.

    Weights are initialized from N(0, 0.1). Unless freeze_embeddings is
    set, the blob is registered in model.params so it is trained.
    """
    table = model.param_init_net.GaussianFill(
        [],
        name,
        shape=[vocab_size, embedding_size],
        std=0.1,
    )
    # Frozen embeddings stay out of model.params, so no gradient updates.
    if not freeze_embeddings:
        model.params.append(table)
    return table
| |
|
| |
|
def get_layer_scope(scope, layer_type, i):
    """Return the canonical name scope for layer *i* of the given type,
    e.g. 'encoder/layer0' or 'model/encoder/layer0' with an outer scope."""
    if scope:
        return '{}/{}/layer{}'.format(scope, layer_type, i)
    return '{}/layer{}'.format(layer_type, i)
| |
|
| |
|
def build_embedding_encoder(
    model,
    encoder_params,
    num_decoder_layers,
    inputs,
    input_lengths,
    vocab_size,
    embeddings,
    embedding_size,
    use_attention,
    num_gpus=0,
    forward_only=False,
    scope=None,
):
    """Build the embedding lookup plus a stack of (bi)LSTM encoder layers.

    Args:
        model: model helper the ops are added to.
        encoder_params: dict with 'encoder_layer_configs' (list of per-layer
            dicts with 'num_units' and optional 'dropout_keep_prob') and an
            optional 'use_bidirectional_encoder' flag.
        num_decoder_layers: number of decoder layers; only the top
            num_decoder_layers encoder layers return their final states.
        inputs, input_lengths: token-id sequence blob and its lengths blob.
        use_attention: when True the final layer also returns its full
            per-step outputs for the attention mechanism.

    Returns:
        (encoder_outputs, weighted_encoder_outputs,
         final_encoder_hidden_states, final_encoder_cell_states,
         encoder_units_per_layer)
    """
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            # Gather must run where the embedding table lives (CPU); the
            # result is then copied onto the GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

    layer_inputs = embedded_encoder_inputs
    layer_input_size = embedding_size
    encoder_units_per_layer = []
    final_encoder_hidden_states = []
    final_encoder_cell_states = []

    num_encoder_layers = len(encoder_params['encoder_layer_configs'])
    use_bidirectional_encoder = encoder_params.get(
        'use_bidirectional_encoder',
        False,
    )

    for i, layer_config in enumerate(encoder_params['encoder_layer_configs']):
        # Only the first layer may be bidirectional; its output width is
        # doubled by the fw/bw concatenation.
        if use_bidirectional_encoder and i == 0:
            layer_func = rnn_bidirectional_layer
            output_dims = 2 * layer_config['num_units']
        else:
            layer_func = rnn_unidirectional_layer
            output_dims = layer_config['num_units']
        encoder_units_per_layer.append(output_dims)

        is_final_layer = (i == num_encoder_layers - 1)

        dropout_keep_prob = layer_config.get(
            'dropout_keep_prob',
            None,
        )

        # Only the top num_decoder_layers encoder layers feed decoder
        # initial states; lower layers skip the extra outputs.
        return_final_state = i >= (num_encoder_layers - num_decoder_layers)
        (
            layer_outputs,
            final_layer_hidden_state,
            final_layer_cell_state,
        ) = layer_func(
            model=model,
            inputs=layer_inputs,
            input_lengths=input_lengths,
            input_size=layer_input_size,
            num_units=layer_config['num_units'],
            dropout_keep_prob=dropout_keep_prob,
            forward_only=forward_only,
            # The final layer's sequence outputs are only needed when an
            # attention decoder will consume them.
            return_sequence_output=(not is_final_layer) or use_attention,
            return_final_state=return_final_state,
            scope=get_layer_scope(scope, 'encoder', i),
        )

        if not is_final_layer:
            layer_inputs = layer_outputs
            layer_input_size = output_dims
        # Appended unconditionally: entries may be None when the layer did
        # not return final states; consumers must handle that.
        final_encoder_hidden_states.append(final_layer_hidden_state)
        final_encoder_cell_states.append(final_layer_cell_state)

    encoder_outputs = layer_outputs
    # No precomputed attention-weighted encoder outputs are built here.
    weighted_encoder_outputs = None

    return (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_states,
        final_encoder_cell_states,
        encoder_units_per_layer,
    )
| |
|
| |
|
class LSTMWithAttentionDecoder(object):
    """Multi-layer LSTM decoder, optionally wrapped in an attention cell.

    Thin facade over rnn_cell.MultiRNNCell / rnn_cell.AttentionCell that
    tracks which cell outputs need gradients and what the decoder's output
    dimensionality is.
    """

    def scope(self, name):
        # Prefix *name* with this decoder's name scope, if it has one.
        return self.name + '/' + name if self.name is not None else name

    def _get_attention_type(self, attention_type_as_string):
        # Translate the config string into the attention.AttentionType enum.
        if attention_type_as_string == 'regular':
            return attention.AttentionType.Regular
        elif attention_type_as_string == 'recurrent':
            return attention.AttentionType.Recurrent
        else:
            assert False, 'Unknown type ' + attention_type_as_string

    def __init__(
        self,
        encoder_outputs,
        encoder_output_dim,
        encoder_lengths,
        vocab_size,
        attention_type,
        embedding_size,
        decoder_num_units,
        decoder_cells,
        residual_output_layers=None,
        name=None,
        weighted_encoder_outputs=None,
    ):
        self.name = name
        self.num_layers = len(decoder_cells)
        if attention_type == 'none':
            # Plain stacked-LSTM decoder; output is the top layer's state.
            self.cell = rnn_cell.MultiRNNCell(
                decoder_cells,
                name=self.scope('decoder'),
                residual_output_layers=residual_output_layers,
            )
            self.use_attention = False
            self.decoder_output_dim = decoder_num_units
            self.output_indices = self.cell.output_indices
        else:
            # Attention decoder: the MultiRNNCell is wrapped so each step
            # also produces an attention context over the encoder outputs.
            decoder_cell = rnn_cell.MultiRNNCell(
                decoder_cells,
                name=self.scope('decoder'),
                residual_output_layers=residual_output_layers,
            )
            self.cell = rnn_cell.AttentionCell(
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                encoder_lengths=encoder_lengths,
                decoder_cell=decoder_cell,
                decoder_state_dim=decoder_num_units,
                name=self.scope('attention_decoder'),
                attention_type=self._get_attention_type(attention_type),
                weighted_encoder_outputs=weighted_encoder_outputs,
                attention_memory_optimization=True,
            )
            self.use_attention = True
            # Per-step output is the LSTM state concatenated with the
            # attention context vector.
            self.decoder_output_dim = decoder_num_units + encoder_output_dim

            self.output_indices = decoder_cell.output_indices
            # Index 2 * num_layers is the attention context state appended
            # by the AttentionCell after the hidden/cell state pairs.
            self.output_indices.append(2 * self.num_layers)

    def get_state_names(self):
        return self.cell.get_state_names()

    def get_outputs_with_grads(self):
        # Each state's full-sequence output appears to sit at twice its
        # state index in the cell's flat output list — NOTE(review):
        # convention inherited from rnn_cell; confirm against rnn_cell.py.
        return [2 * i for i in self.output_indices]

    def get_output_dim(self):
        return self.decoder_output_dim

    def get_attention_weights(self):
        assert self.use_attention
        # Only meaningful when an AttentionCell was constructed.
        return self.cell.get_attention_weights()

    def apply(
        self,
        model,
        input_t,
        seq_lengths,
        states,
        timestep,
    ):
        """Apply the decoder cell for a single timestep."""
        return self.cell.apply(
            model=model,
            input_t=input_t,
            seq_lengths=seq_lengths,
            states=states,
            timestep=timestep,
        )

    def apply_over_sequence(
        self,
        model,
        inputs,
        seq_lengths,
        initial_states,
    ):
        """Unroll the decoder cell over a whole input sequence."""
        return self.cell.apply_over_sequence(
            model=model,
            inputs=inputs,
            seq_lengths=seq_lengths,
            initial_states=initial_states,
            outputs_with_grads=self.get_outputs_with_grads(),
        )
| |
|
| |
|
def _build_initial_state(
    model,
    final_encoder_state,
    state_name,
    encoder_units_per_layer,
    encoder_layer_index,
    decoder_num_units,
):
    """Produce one initial decoder state blob named *state_name*.

    - No encoder state available (None): a zero-filled trainable parameter.
    - Width mismatch with the encoder layer: a learned FC projection.
    - Otherwise: the encoder's final state is reused directly.
    """
    if final_encoder_state is None:
        state = model.param_init_net.ConstantFill(
            [],
            state_name,
            shape=[decoder_num_units],
            value=0.0,
        )
        model.params.append(state)
        return state
    # Only index encoder_units_per_layer when an encoder state exists, so a
    # decoder deeper than the encoder never indexes out of range.
    encoder_num_units = encoder_units_per_layer[encoder_layer_index]
    if decoder_num_units != encoder_num_units:
        return brew.fc(
            model,
            final_encoder_state,
            state_name,
            encoder_num_units,
            decoder_num_units,
            axis=2,
        )
    return final_encoder_state


def build_initial_rnn_decoder_states(
    model,
    encoder_units_per_layer,
    decoder_units_per_layer,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    use_attention,
):
    """Build the flat list of initial states for a multi-layer decoder.

    Decoder layer i is seeded from encoder layer (i + offset), where offset
    aligns the top decoder layers with the top encoder layers when the
    encoder is deeper. Missing encoder states (shallower encoder, or layers
    whose final state was not returned and is None) fall back to zero-filled
    learned parameters; mismatched widths are bridged with an FC layer.

    Returns:
        [hidden_0, cell_0, hidden_1, cell_1, ...] and, when use_attention,
        a trailing zero-initialized attention context parameter.
    """
    num_encoder_layers = len(encoder_units_per_layer)
    num_decoder_layers = len(decoder_units_per_layer)
    offset = max(num_encoder_layers - num_decoder_layers, 0)

    initial_states = []
    for i, decoder_num_units in enumerate(decoder_units_per_layer):
        encoder_index = i + offset

        final_encoder_hidden_state = None
        if (
            final_encoder_hidden_states and
            len(final_encoder_hidden_states) > encoder_index
        ):
            final_encoder_hidden_state = (
                final_encoder_hidden_states[encoder_index]
            )
        initial_states.append(_build_initial_state(
            model,
            final_encoder_hidden_state,
            'decoder_initial_hidden_state_{}'.format(i),
            encoder_units_per_layer,
            encoder_index,
            decoder_num_units,
        ))

        final_encoder_cell_state = None
        if (
            final_encoder_cell_states and
            len(final_encoder_cell_states) > encoder_index
        ):
            final_encoder_cell_state = (
                final_encoder_cell_states[encoder_index]
            )
        initial_states.append(_build_initial_state(
            model,
            final_encoder_cell_state,
            'decoder_initial_cell_state_{}'.format(i),
            encoder_units_per_layer,
            encoder_index,
            decoder_num_units,
        ))

    if use_attention:
        # Attention decoders carry the weighted encoder context as an extra
        # recurrent state; start it at zero with the encoder output width.
        initial_attention_weighted_encoder_context = (
            model.param_init_net.ConstantFill(
                [],
                'initial_attention_weighted_encoder_context',
                shape=[encoder_units_per_layer[-1]],
                value=0.0,
            )
        )
        model.params.append(initial_attention_weighted_encoder_context)
        initial_states.append(initial_attention_weighted_encoder_context)

    return initial_states
| |
|
| |
|
def build_embedding_decoder(
    model,
    decoder_layer_configs,
    inputs,
    input_lengths,
    encoder_lengths,
    encoder_outputs,
    weighted_encoder_outputs,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    encoder_units_per_layer,
    vocab_size,
    embeddings,
    embedding_size,
    attention_type,
    forward_only,
    num_gpus=0,
    scope=None,
):
    """Build the decoder: embedding lookup, stacked LSTM cells (optionally
    with attention), initial-state bridging from the encoder, and the
    unrolled decoder pass.

    Args:
        decoder_layer_configs: list of per-layer dicts with 'num_units' and
            optional 'dropout_keep_prob'.
        attention_type: 'none', 'regular' or 'recurrent'.

    Returns:
        (decoder_outputs, decoder_output_dim) where decoder_outputs is a
        2-D blob flattened to [total_timesteps, decoder_output_dim].
    """
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            # Gather runs on CPU where the embedding table lives, then the
            # result is copied to the GPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

    decoder_cells = []
    decoder_units_per_layer = []
    for i, layer_config in enumerate(decoder_layer_configs):
        num_units = layer_config['num_units']
        decoder_units_per_layer.append(num_units)

        # First layer consumes embeddings; deeper layers consume the
        # previous cell's output.
        if i == 0:
            input_size = embedding_size
        else:
            input_size = decoder_cells[-1].get_output_dim()

        cell = rnn_cell.LSTMCell(
            forward_only=forward_only,
            input_size=input_size,
            hidden_size=num_units,
            forget_bias=0.0,
            memory_optimization=False,
        )

        dropout_keep_prob = layer_config.get('dropout_keep_prob', None)
        if dropout_keep_prob is not None:
            # BUG FIX: was `layer_config.dropout_keep_prob` — attribute
            # access on a dict, which raised AttributeError whenever
            # dropout was configured. Use the value fetched above.
            dropout_ratio = 1.0 - dropout_keep_prob
            cell = rnn_cell.DropoutCell(
                internal_cell=cell,
                dropout_ratio=dropout_ratio,
                forward_only=forward_only,
                is_test=False,
                name=get_layer_scope(scope, 'decoder_dropout', i),
            )

        decoder_cells.append(cell)

    # Bridge encoder final states into decoder initial states.
    states = build_initial_rnn_decoder_states(
        model=model,
        encoder_units_per_layer=encoder_units_per_layer,
        decoder_units_per_layer=decoder_units_per_layer,
        final_encoder_hidden_states=final_encoder_hidden_states,
        final_encoder_cell_states=final_encoder_cell_states,
        use_attention=(attention_type != 'none'),
    )
    attention_decoder = LSTMWithAttentionDecoder(
        encoder_outputs=encoder_outputs,
        encoder_output_dim=encoder_units_per_layer[-1],
        encoder_lengths=encoder_lengths,
        vocab_size=vocab_size,
        attention_type=attention_type,
        embedding_size=embedding_size,
        decoder_num_units=decoder_units_per_layer[-1],
        decoder_cells=decoder_cells,
        weighted_encoder_outputs=weighted_encoder_outputs,
        name=scope,
    )
    decoder_outputs, _ = attention_decoder.apply_over_sequence(
        model=model,
        inputs=embedded_decoder_inputs,
        seq_lengths=input_lengths,
        initial_states=states,
    )

    # Flatten [seq_len, batch, dim] outputs into a 2-D matrix so a single
    # FC/softmax can be applied over all timesteps at once.
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, attention_decoder.get_output_dim()],
    )

    decoder_outputs = decoder_outputs_flattened
    decoder_output_dim = attention_decoder.get_output_dim()

    return (decoder_outputs, decoder_output_dim)
| |
|
| |
|
def output_projection(
    model,
    decoder_outputs,
    decoder_output_size,
    target_vocab_size,
    decoder_softmax_size,
):
    """Project flattened decoder outputs onto vocabulary-sized logits.

    When decoder_softmax_size is given, the outputs are first squeezed
    through an FC bottleneck of that width before the final projection.
    """
    if decoder_softmax_size is not None:
        decoder_outputs = brew.fc(
            model,
            decoder_outputs,
            'decoder_outputs_scaled',
            dim_in=decoder_output_size,
            dim_out=decoder_softmax_size,
        )
        decoder_output_size = decoder_softmax_size

    output_projection_w = model.param_init_net.XavierFill(
        [],
        'output_projection_w',
        shape=[target_vocab_size, decoder_output_size],
    )
    output_projection_b = model.param_init_net.XavierFill(
        [],
        'output_projection_b',
        shape=[target_vocab_size],
    )
    model.params.append(output_projection_w)
    model.params.append(output_projection_b)

    # logits = decoder_outputs x W^T + b
    output_logits = model.net.FC(
        [decoder_outputs, output_projection_w, output_projection_b],
        ['output_logits'],
    )
    return output_logits
| |
|