working saving & loading

Changed files:
- ag_news.sh +1 -0
- ag_news_load.sh +20 -0
- model/config.py +4 -5
- tests/test_t5_vae.py +15 -42
- train.py +9 -8
ag_news.sh
CHANGED

@@ -17,3 +17,4 @@ export RUN_NAME=test
     --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="20" \
+    --push_to_hub \
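The only change here is the new --push_to_hub flag. Assuming train.py's training arguments mirror transformers' TrainingArguments (as the HF Flax example scripts do), the flag makes the trainer mirror everything saved to output_dir to a Hugging Face Hub repository. A minimal sketch in API form:

from transformers import TrainingArguments

# Sketch only: train.py may define its own TrainingArguments dataclass,
# but the transformers version exposes the flag under this exact name.
args = TrainingArguments(output_dir="output/test", push_to_hub=True)
print(args.push_to_hub)  # True: checkpoints written to output_dir get pushed to the Hub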
ag_news_load.sh
ADDED

@@ -0,0 +1,20 @@
+export RUN_NAME=test
+
+./venv/bin/python train.py \
+    --model_name_or_path="output/test" \
+    --t5_model_name_or_path="t5-base" \
+    --output_dir="output/from_save/${RUN_NAME}" \
+    --overwrite_output_dir \
+    --dataset_name="ag_news" \
+    --do_train --do_eval \
+    --n_latent_tokens 6 \
+    --latent_token_size 32 \
+    --save_steps="2500" \
+    --eval_steps="2500" \
+    --block_size="32" \
+    --per_device_train_batch_size="1" \
+    --per_device_eval_batch_size="1" \
+    --learning_rate="5e-3" --warmup_steps="1000" \
+    --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
+    --overwrite_output_dir \
+    --num_train_epochs="20" \
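This new script exercises the load path: --model_name_or_path="output/test" points at a previously saved checkpoint, so training resumes from the serialized T5-VAE weights instead of initializing fresh from t5-base. In API form this is roughly the following sketch (the import path is hypothetical; the class name matches the one used in tests/test_t5_vae.py below):

# Hypothetical module path; the tests use FlaxT5VaeForAutoencoding, which is
# presumably what train.py resolves for --model_name_or_path checkpoints.
from model.t5_vae import FlaxT5VaeForAutoencoding

model = FlaxT5VaeForAutoencoding.from_pretrained("output/test")  # weights from a prior run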
model/config.py
CHANGED

@@ -46,6 +46,7 @@ class T5VaeConfig(PretrainedConfig):
         cache_dir=None,
         tie_word_embeddings=True,
         # T5 config
+        t5=dict(),
         vocab_size=32128,
         d_model=512,
         d_kv=64,

@@ -86,12 +87,10 @@
         if t5_model_name_or_path:
             self.t5 = AutoConfig.from_pretrained(t5_model_name_or_path, cache_dir=cache_dir)
             assertEqual(self.t5.model_type, "t5", "Need t5 model type for transformer_decoder.")
-            if num_layers:
-                self.t5.num_layers = num_layers
-            if num_heads:
-                self.t5.num_heads = num_heads
             self.t5.decoder_start_token_id = decoder_start_token_id
-
+        elif t5:
+            # use for loading a config
+            self.t5 = T5Config(**t5)
         else:
             self.t5 = T5Config(
                 vocab_size=vocab_size,
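The new t5=dict() parameter is what makes reloading work: save_pretrained serializes the nested T5Config as a plain dict inside config.json, so when from_pretrained re-invokes __init__, the new elif t5: branch rebuilds a T5Config from that dict instead of falling through to the default constructor. A minimal round-trip sketch, assuming T5VaeConfig is importable from model/config.py:

from model.config import T5VaeConfig  # assumed import path for this repo

config = T5VaeConfig(t5_model_name_or_path="t5-base")
config.save_pretrained("output/test")                  # config.json now contains "t5": {...}
reloaded = T5VaeConfig.from_pretrained("output/test")  # re-enters __init__ with t5={...}
assert reloaded.t5.d_model == config.t5.d_model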
tests/test_t5_vae.py
CHANGED

@@ -294,51 +294,21 @@ class FlaxT5VaeModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
         for jitted_output, output in zip(jitted_outputs, outputs):
             self.assertEqual(jitted_output.shape, output.shape)
 
-
-    def test_save_load_from_base(self):
+    def test_save_and_load(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
 
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = base_class(config)
-            base_params = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                head_model = model_class.from_pretrained(tmpdirname)
-
-                base_param_from_head = flatten_dict(unfreeze(head_model.params))
+        model = FlaxT5VaeForAutoencoding(config)
+        model_params = flatten_dict(unfreeze(model.params))
 
-                for key in base_param_from_head.keys():
-                    max_diff = (base_params[key] - base_param_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+        # check that all base model weights are loaded correctly
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname)
+            head_model = FlaxT5VaeForAutoencoding.from_pretrained(tmpdirname)
+            new_params = flatten_dict(unfreeze(head_model.params))
 
-
-    def test_save_load_to_base(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-        base_class = FLAX_MODEL_MAPPING[config.__class__]
-
-        for model_class in self.all_model_classes:
-            if model_class == base_class:
-                continue
-
-            model = model_class(config)
-            base_params_from_head = flatten_dict(unfreeze(model.params))
-
-            # check that all base model weights are loaded correctly
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                model.save_pretrained(tmpdirname)
-                base_model = base_class.from_pretrained(tmpdirname)
-
-                base_params = flatten_dict(unfreeze(base_model.params))
-
-                for key in base_params_from_head.keys():
-                    max_diff = (base_params[key] - base_params_from_head[key]).sum().item()
-                    self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
+            for key in new_params.keys():
+                max_diff = (model_params[key] - new_params[key]).sum().item()
+                self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
 
 ## Copied training methods
@@ -354,8 +324,8 @@ def compute_mmd(x, y):
     x_kernel = compute_kernel(x, x)
     y_kernel = compute_kernel(y, y)
     xy_kernel = compute_kernel(x, y)
-
     return jnp.mean(x_kernel) + jnp.mean(y_kernel) - 2 * jnp.mean(xy_kernel)
+
 def regulariser_loss(latent_codes, rng):
     true_samples = jax.random.normal(rng, latent_codes.shape)
     return compute_mmd(true_samples, latent_codes)
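For context, compute_kernel and compute_mmd in the hunk above implement a maximum-mean-discrepancy penalty between latent codes and samples from a standard normal. A self-contained sketch, assuming the common InfoVAE-style RBF kernel (the actual compute_kernel body is not shown in this diff):

import jax
import jax.numpy as jnp

def compute_kernel(x, y):
    # Pairwise RBF kernel between rows of x [n, d] and y [m, d].
    dim = x.shape[1]
    diff = x[:, None, :] - y[None, :, :]                 # [n, m, d]
    return jnp.exp(-jnp.mean(diff ** 2, axis=-1) / dim)  # [n, m]

def compute_mmd(x, y):
    # MMD estimate: E[k(x,x)] + E[k(y,y)] - 2 E[k(x,y)], reduced to a scalar.
    return jnp.mean(compute_kernel(x, x)) + jnp.mean(compute_kernel(y, y)) - 2 * jnp.mean(compute_kernel(x, y))

def regulariser_loss(latent_codes, rng):
    # Pull latent codes toward N(0, I) by penalizing their MMD to true samples.
    true_samples = jax.random.normal(rng, latent_codes.shape)
    return compute_mmd(true_samples, latent_codes)

latents = jax.random.normal(jax.random.PRNGKey(0), (8, 32))  # [batch, latent_size]
print(regulariser_loss(latents, jax.random.PRNGKey(1)))      # scalar

Because compute_mmd already averages over the batch dimension, it is applied to the whole [batch, latent] matrix at once; that is also why train.py's regulariser_loss (further down) drops its jax.vmap wrapper.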
@@ -403,3 +373,6 @@ class FlaxT5VaeModelIntegrationTests(unittest.TestCase):
         outputs = model(input_ids, decoder_input_ids=decoder_input_ids)
         logits, latent_codes = outputs[0], outputs[1]
         loss = loss_fn(logits, labels, latent_codes, jax.random.PRNGKey(42))
+        import pdb
+        pdb.set_trace()
+        pass
train.py
CHANGED

@@ -156,6 +156,9 @@ class DataTrainingArguments:
             "Default to the model max input length for single sentence inputs (take into account special tokens)."
         },
     )
+    streaming: bool = field(
+        default=False, metadata={"help": "Stream the dataset."}
+    )
     overwrite_cache: bool = field(
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )

@@ -293,7 +296,7 @@ def main():
     if data_args.dataset_name is not None:
         # Downloading and loading a dataset from the hub.
         dataset = load_dataset(
-            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, streaming=data_args.streaming, keep_in_memory=False
         )
 
     if "validation" not in dataset.keys():
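The flag is forwarded directly to datasets.load_dataset. With streaming=True the call returns an iterable dataset that yields examples lazily over the network instead of downloading and caching the whole corpus first:

from datasets import load_dataset

# Streaming returns an IterableDataset; examples arrive on demand.
stream = load_dataset("ag_news", split="train", streaming=True)
print(next(iter(stream)))  # first example, without a full download

Note that map-style conveniences such as len(dataset) are unavailable on streamed datasets, so later code paths may still require the default non-streaming mode.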
@@ -344,10 +347,6 @@ def main():
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
         )
-    elif model_args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
-        )
     elif model_args.t5_model_name_or_path:
         tokenizer = AutoTokenizer.from_pretrained(
             model_args.t5_model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer

@@ -363,6 +362,7 @@ def main():
             model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
         )
         # TODO assert token embedding size == len(tokenizer)
+        assert model.params['t5']['shared'].shape[0] == len(tokenizer), "T5 Tokenizer doesn't match T5Vae embedding size."
     else:
         vocab_size = len(tokenizer)
         config.t5.vocab_size = vocab_size
@@ -563,7 +563,8 @@ def main():
 
     def regulariser_loss(latent_codes, rng):
         true_samples = jax.random.normal(rng, latent_codes.shape)
-        return jax.vmap(compute_mmd)(true_samples, latent_codes)
+        # return jax.vmap(compute_mmd)(true_samples, latent_codes)
+        return compute_mmd(true_samples, latent_codes)
 
     def loss_fn(logits, labels, latent_codes, regulariser_rng):
         shift_logits = logits[..., :-1, :]
@@ -594,7 +595,7 @@ def main():
         return new_state, metrics
 
     # Define eval fn
-    def eval_step(params, batch):
+    def eval_step(params, rng, batch):
         labels = batch.pop("labels")
         logits, latent_codes = model(**batch, params=params, train=False)[:2]
         loss = loss_fn(logits, labels, latent_codes, rng)

@@ -660,7 +661,7 @@ def main():
     for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
         # Model forward
         batch = next(eval_loader)
-        metrics = p_eval_step(state.params, batch)
+        metrics = p_eval_step(state.params, state.dropout_rng, batch)
         eval_metrics.append(metrics)
 
     # normalize eval metrics
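Threading rng through eval_step, rather than closing over a host-side value, matters once the step is pmapped: loss_fn draws the MMD reference samples from that key, so each device needs its own. A sketch of the likely pairing, assuming state.dropout_rng holds per-device keys the way the HF Flax examples set them up:

import jax

# eval_step now takes (params, rng, batch); the rng rides along the device axis.
p_eval_step = jax.pmap(eval_step, axis_name="batch")

# Assumed setup elsewhere in train.py, mirroring the HF Flax examples:
#   dropout_rngs = jax.random.split(rng, jax.local_device_count())
# so state.dropout_rng has shape [num_devices, 2] and pmap hands one key per device.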
|