ALMOST WORKING

Browse files

Files changed (7) hide show

ag_news_clm.sh +18 -0
model/encoders.py +4 -2
model/outputs.py +52 -0
model/t5_vae.py +8 -6
model/vae.py +3 -11
run_clm_flax.py +3 -1
train.py +32 -26

ag_news_clm.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+# Smoke test: check that plain causal-LM training (run_clm_flax.py) works on ag_news.
+# Outputs are written to output/${RUN_NAME}; any existing directory there is overwritten.
+export RUN_NAME=test_clm
+./venv/bin/python run_clm_flax.py \
+--model_name_or_path="gpt2" \
+--output_dir="output/${RUN_NAME}" \
+--overwrite_output_dir \
+--dataset_name="ag_news" \
+--do_train --do_eval \
+--save_steps="2500" \
+--eval_steps="2500" \
+--block_size="128" \
+--per_device_train_batch_size="1" \
+--per_device_eval_batch_size="1" \
+--learning_rate="5e-3" --warmup_steps="1000" \
+--adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
+--num_train_epochs="20"

model/encoders.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
 import flax.linen as nn
 logger = logging.getLogger(__name__)
@@ -14,8 +15,9 @@ class Encoder(nn.Module):
     @nn.compact
     def __call__(self, encoding):
         latent_tokens = nn.Dense(self.latent_size)(encoding)
-        raw_latent_code = latent_tokens[:, : self.n_tokens, :]
-        latent_code = nn.Tanh()(raw_latent_code)
         return latent_code  # (batch, latent_tokens_per_sequence, latent_token_dim)

 import logging
+import jax.numpy as jnp
 import flax.linen as nn
 logger = logging.getLogger(__name__)
     @nn.compact
     def __call__(self, encoding):
+        """Project `encoding` to latent tokens and squash them with tanh.
+
+        A Dense layer maps the last axis to `latent_size`, the first
+        `n_latent_tokens` positions along axis 1 are kept, and tanh is applied.
+        """
         latent_tokens = nn.Dense(self.latent_size)(encoding)
+        raw_latent_code = latent_tokens[:, : self.n_latent_tokens, :]
+        # jnp.tanh is element-wise: every value of every latent token is squashed
+        # independently; nothing is mixed across tokens or across the batch.
+        latent_code = jnp.tanh(raw_latent_code)
         return latent_code  # (batch, latent_tokens_per_sequence, latent_token_dim)

model/outputs.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import flax
 import jaxlib.xla_extension as jax_xla
@@ -14,6 +16,56 @@ class TransformerVAE_Output(ModelOutput):
             Latent codes representing encoded sequences.
         remade_encoder_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, n_tokens, model_dim)`):
             Reconstructed encoder hidden states representing sequences.
     """
     latent_codes: jax_xla.DeviceArray = None
     remade_encoder_hidden_state: jax_xla.DeviceArray = None

+from typing import Optional, Tuple
 import flax
 import jaxlib.xla_extension as jax_xla
             Latent codes representing encoded sequences.
         remade_encoder_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_tokens, model_dim)`):
             Reconstructed encoder hidden states representing sequences.
+    (std Seq2Seq) Args:
+        logits (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (:obj:`tuple(tuple(jax_xla.DeviceArray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            Tuple of :obj:`tuple(jax_xla.DeviceArray)` of length :obj:`config.n_layers`, with each tuple having 2
+            tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
+            tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
+            layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        cross_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
+            layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
     """
     latent_codes: jax_xla.DeviceArray = None
     remade_encoder_hidden_state: jax_xla.DeviceArray = None
+    # seq2seq
+    logits: jax_xla.DeviceArray = None
+    past_key_values: Optional[Tuple[Tuple[jax_xla.DeviceArray]]] = None
+    decoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
+    decoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
+    cross_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
+    encoder_last_hidden_state: Optional[jax_xla.DeviceArray] = None
+    encoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
+    encoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None

model/t5_vae.py CHANGED Viewed

@@ -35,12 +35,14 @@ class FlaxT5_VAE_ForAutoencodingModule(nn.Module):
         self,
         input_ids=None,
         attention_mask=None,
         latent_codes=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
         deterministic: bool = True,
     ):
         """
             Adapted from `FlaxT5ForConditionalGenerationModule`
         """
@@ -75,16 +77,16 @@ class FlaxT5_VAE_ForAutoencodingModule(nn.Module):
         sequence_output = decoder_outputs[0]
-        if self.config.tie_word_embeddings:
             # Rescale output before projecting on vocab
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
-            sequence_output = sequence_output * (self.config.t5.d_model ** -0.5)
-        if self.config.tie_word_embeddings:
-            shared_embedding = self.shared.variables["params"]["embedding"]
-            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
         else:
-            lm_logits = self.lm_head(sequence_output)
         if not return_dict:
             return (lm_logits,) + decoder_outputs[1:] + encoder_outputs

         self,
         input_ids=None,
         attention_mask=None,
+        encoder_outputs=None,
         latent_codes=None,
         output_attentions=None,
         output_hidden_states=None,
         return_dict=None,
         deterministic: bool = True,
     ):
+        # TODO should I use None args when everything has to be computed anyway?
         """
             Adapted from `FlaxT5ForConditionalGenerationModule`
         """
         sequence_output = decoder_outputs[0]
+        if self.t5.config.tie_word_embeddings:
             # Rescale output before projecting on vocab
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+            sequence_output = sequence_output * (self.t5.config.d_model ** -0.5)
+        if self.t5.config.tie_word_embeddings:
+            shared_embedding = self.t5.shared.variables["params"]["embedding"]
+            lm_logits = self.t5.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
         else:
+            lm_logits = self.t5.lm_head(sequence_output)
         if not return_dict:
             return (lm_logits,) + decoder_outputs[1:] + encoder_outputs

model/vae.py CHANGED Viewed

@@ -3,7 +3,6 @@ import flax.linen as nn
 from model.encoders import VAE_ENCODER_MODELS
 from model.decoders import VAE_DECODER_MODELS
-from model.outputs import TransformerVAE_Output
 from model.config import T5_VAE_Config
@@ -18,21 +17,14 @@ class VAE(nn.Module):
     def setup(self):
         self.encoder = VAE_ENCODER_MODELS[self.config.vae_encoder_model](self.config.latent_size, self.config.n_latent_tokens)
-        self.decoder = VAE_DECODER_MODELS[self.config.vae_decoder_model](self.config.t5.d_model, self.config.n_latent_tokens)
     def __call__(self, encoding=None, latent_codes=None):
-        if latent_codes is None:
-            latent_codes = self.encode(encoding)
-        # return latent_codes for regulariser loss
-        return TransformerVAE_Output(
-            latent_codes,
-            self.decoder(latent_codes),
-        )
     def encode(self, encoding):
-        assert encoding.shape[1:] == self.input_shape
         return self.encoder(encoding)
     def decode(self, latent):
-        assert latent.shape[1:] == self.input_shape
         return self.decoder(latent)

 from model.encoders import VAE_ENCODER_MODELS
 from model.decoders import VAE_DECODER_MODELS
 from model.config import T5_VAE_Config
     def setup(self):
         self.encoder = VAE_ENCODER_MODELS[self.config.vae_encoder_model](self.config.latent_size, self.config.n_latent_tokens)
+        self.decoder = VAE_DECODER_MODELS[self.config.vae_decoder_model](self.config.t5.d_model,  self.config.n_latent_tokens)
     def __call__(self, encoding=None, latent_codes=None):
+        """Return ``(decoded, latent_codes)``.
+
+        When ``latent_codes`` is supplied it is decoded directly; otherwise it is
+        produced by encoding ``encoding``. The latent codes are returned as well
+        so the caller can use them for the regulariser loss.
+        """
+        # Honour caller-supplied latents instead of silently overwriting them.
+        if latent_codes is None:
+            latent_codes = self.encode(encoding)
+        return self.decode(latent_codes), latent_codes
     def encode(self, encoding):
         return self.encoder(encoding)
     def decode(self, latent):
         return self.decoder(latent)

run_clm_flax.py CHANGED Viewed

@@ -405,7 +405,7 @@ def main():
             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
             for k, t in concatenated_examples.items()
         }
-        result["labels"] = result["input_ids"].copy()
         return result
     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
@@ -421,6 +421,8 @@ def main():
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
     )
     if training_args.do_train:
         if "train" not in tokenized_datasets:

             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
             for k, t in concatenated_examples.items()
         }
+        result["label"] = result["input_ids"].copy()
         return result
     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
     )
+    import pdb
+    pdb.set_trace()
     if training_args.do_train:
         if "train" not in tokenized_datasets:

train.py CHANGED Viewed

@@ -2,6 +2,7 @@
     Pre-training/Fine-tuning seq2seq models on autoencoding a dataset.
     TODO:
     - [x] Don't make decoder input ids.
     - [ ] Add reg loss
         - [x] calculate MMD loss
@@ -15,7 +16,7 @@
                 use_extra_logs (:obj:`bool`, `optional`, defaults to False):
                     Store extra logs during each training inference.
-            - [ ] Send the scedule time to the compute_loss method and calculate a coefficient based on that.
 '''
 import logging
 import math
@@ -379,6 +380,10 @@ def main():
             )
         return output
     tokenized_datasets = dataset.map(
         tokenize_function,
         batched=True,
@@ -394,22 +399,23 @@ def main():
         )
     block_size = min(data_args.block_size, tokenizer.model_max_length)
-    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
-    def group_texts(examples):
-        # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-        total_length = len(concatenated_examples[list(examples.keys())[0]])
-        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
-        # customize this part to your needs.
-        if total_length >= block_size:
-            total_length = (total_length // block_size) * block_size
-        # Split by chunks of max_len.
-        result = {
-            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-            for k, t in concatenated_examples.items()
-        }
-        result["labels"] = result["input_ids"].copy()
-        return result
     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
     # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
@@ -419,7 +425,7 @@ def main():
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
     lm_datasets = tokenized_datasets.map(
-        group_texts,
         batched=True,
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
@@ -516,8 +522,8 @@ def main():
         x_size = x.shape[0]
         y_size = y.shape[0]
         dim = x.shape[1]
-        tiled_x = jnp.repeat(jnp.reshape(x, (x_size, 1, dim)), y_size, axis = 1)
-        tiled_y = jnp.repeat(jnp.reshape(y, (1, y_size, dim)), x_size, axis = 0)
         return jnp.exp(-jnp.mean((tiled_x - tiled_y) ** 2, axis=2) / dim * 1.0)
     def compute_mmd(x, y):
@@ -526,16 +532,16 @@ def main():
         xy_kernel = compute_kernel(x, y)
         return jnp.mean(x_kernel) + jnp.mean(y_kernel) - 2 * jnp.mean(xy_kernel)
-    def regulariser_loss(latent_codes):
-        true_samples = jnp.random.randn(latent_codes.shape())
         return compute_mmd(true_samples, latent_codes)
-    def loss_fn(logits, labels, latent_codes):
         shift_logits = logits[..., :-1, :]
         shift_labels = labels[..., 1:]
         loss = optax.softmax_cross_entropy(shift_logits, onehot(shift_labels, shift_logits.shape[-1]))
-        reg_loss = regulariser_loss(latent_codes)
         return loss.mean() + reg_loss.mean()
     # Define gradient update step fn
@@ -544,8 +550,8 @@ def main():
         def compute_loss(params):
             labels = batch.pop("labels")
-            logits, latent_codes = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[:2]
-            loss = loss_fn(logits, labels, latent_codes)
             return loss
         grad_fn = jax.value_and_grad(compute_loss)

     Pre-training/Fine-tuning seq2seq models on autoencoding a dataset.
     TODO:
+    - [ ] Get this running.
     - [x] Don't make decoder input ids.
     - [ ] Add reg loss
         - [x] calculate MMD loss
                 use_extra_logs (:obj:`bool`, `optional`, defaults to False):
                     Store extra logs during each training inference.
+            - [ ] Send the schedule time to the compute_loss method and calculate a coefficient based on that.
 '''
 import logging
 import math
             )
         return output
+    # remove dataset tasks
+    for k in dataset.keys():
+        dataset[k].info.task_templates = []
     tokenized_datasets = dataset.map(
         tokenize_function,
         batched=True,
         )
     block_size = min(data_args.block_size, tokenizer.model_max_length)
+    # Limits each input sequence to size block_size.
+    pad_token_id = tokenizer.pad_token_id
+    def limit_length(examples):
+        """Force every example in the batch to exactly ``block_size`` tokens.
+
+        ``labels`` starts out as a copy of ``input_ids``. Rows longer than
+        ``block_size`` are cut to ``block_size`` in every column; shorter rows are
+        right-padded with ``pad_token_id`` (input_ids), 0 (attention_mask) and
+        -100 (labels).
+        """
+        examples["labels"] = examples["input_ids"].copy()
+        for row, tokens in enumerate(examples["input_ids"]):
+            shortfall = block_size - len(tokens)
+            if shortfall < 0:
+                for column in examples.keys():
+                    examples[column][row] = examples[column][row][:block_size]
+            elif shortfall > 0:
+                # Build new lists with `+`: the copy above is shallow, so labels and
+                # input_ids still share each inner list and an in-place extend
+                # would pad both columns at once.
+                padding = {'input_ids': pad_token_id, 'attention_mask': 0, 'labels': -100}
+                for column, fill in padding.items():
+                    examples[column][row] = examples[column][row] + [fill] * shortfall
+        return examples
     # Note that with `batched=True`, this map processes 1,000 texts together; limit_length pads or truncates each
     # text on its own, so no text is dropped. You can adjust that batch_size here but a higher value might be slower
     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
     lm_datasets = tokenized_datasets.map(
+        limit_length,
         batched=True,
         num_proc=data_args.preprocessing_num_workers,
         load_from_cache_file=not data_args.overwrite_cache,
         x_size = x.shape[0]
         y_size = y.shape[0]
         dim = x.shape[1]
+        tiled_x = jnp.repeat(jnp.reshape(x, (x_size, 1, dim)), y_size, axis=1)
+        tiled_y = jnp.repeat(jnp.reshape(y, (1, y_size, dim)), x_size, axis=0)
         return jnp.exp(-jnp.mean((tiled_x - tiled_y) ** 2, axis=2) / dim * 1.0)
     def compute_mmd(x, y):
         xy_kernel = compute_kernel(x, y)
         return jnp.mean(x_kernel) + jnp.mean(y_kernel) - 2 * jnp.mean(xy_kernel)
+    def regulariser_loss(latent_codes, rng: jax.random.PRNGKey):
+        """MMD between `latent_codes` and standard-normal samples of the same shape.
+
+        `rng` is the JAX PRNG key used to draw the reference samples.
+        """
+        # `.shape` is a tuple attribute, not a method; calling it raised TypeError.
+        # NOTE(review): compute_kernel reads only shape[0]/shape[1] and reshapes to
+        # (x_size, 1, dim), so it appears to expect 2-D input -- confirm whether
+        # 3-D latent codes need flattening before this call.
+        true_samples = jax.random.normal(rng, latent_codes.shape)
         return compute_mmd(true_samples, latent_codes)
+    def loss_fn(logits, labels, latent_codes, rng: jax.random.PRNGKey):
+        """Next-token cross-entropy plus the latent regulariser.
+
+        Logits at position t are scored against the label at position t + 1, so
+        the last logit and the first label are dropped. The result is the mean
+        cross-entropy plus the mean of `regulariser_loss(latent_codes, rng)`.
+        """
         shift_logits = logits[..., :-1, :]
         shift_labels = labels[..., 1:]
+        # NOTE(review): limit_length pads labels with -100; presumably `onehot`
+        # turns those into all-zero rows, which add no loss but still count in
+        # the mean below -- confirm.
         loss = optax.softmax_cross_entropy(shift_logits, onehot(shift_labels, shift_logits.shape[-1]))
+        reg_loss = regulariser_loss(latent_codes, rng)
         return loss.mean() + reg_loss.mean()
     # Define gradient update step fn
         def compute_loss(params):
+            """Run the model on the captured `batch` with `params` and return the loss."""
+            # Labels are removed from the batch so they are not forwarded to the
+            # model as a keyword argument.
             labels = batch.pop("labels")
+            outputs = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)
+            # NOTE(review): the model call uses `dropout_rng` while the regulariser
+            # gets `state.dropout_rng`; if `dropout_rng` was split from
+            # `state.dropout_rng` (not visible here) this reuses the parent key -- confirm.
+            loss = loss_fn(outputs.logits, labels, outputs.latent_codes, state.dropout_rng)
             return loss
         grad_fn = jax.value_and_grad(compute_loss)