Spaces:

tom-doerr
/

logo_generator

Runtime error

App Files Files Community

boris committed on Apr 8, 2022

Commit

bcd360f

2 Parent(s): 7f2f8ed 728a3c3

Merge branch 'main' of https://github.com/borisdayma/dalle-mini into main

Browse files

Files changed (13) hide show

.github/workflows/sync_to_hub.yml +1 -1
.github/workflows/sync_to_hub_debug.yml +1 -1
README.md +2 -1
app/streamlit/app.py +1 -1
setup.cfg +1 -0
src/dalle_mini/data.py +12 -3
src/dalle_mini/model/text.py +3 -0
tools/train/config/mega/config.json +27 -8
tools/train/config/mini/config.json +1 -1
tools/train/scalable_shampoo/README.md +1 -1
tools/train/scalable_shampoo/distributed_shampoo.py +153 -30
tools/train/scalable_shampoo/symmetric_matrices/symmetric_matrices.py +170 -8
tools/train/train.py +50 -9

.github/workflows/sync_to_hub.yml CHANGED Viewed

@@ -17,4 +17,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push https://boris:$HF_TOKEN@huggingface.co/spaces/flax-community/dalle-mini main

       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push https://boris:$HF_TOKEN@huggingface.co/spaces/dalle-mini/dalle-mini main

.github/workflows/sync_to_hub_debug.yml CHANGED Viewed

@@ -14,4 +14,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push --force https://boris:$HF_TOKEN@huggingface.co/spaces/flax-community/dalle-mini-debug +HEAD:main

       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push --force https://boris:$HF_TOKEN@huggingface.co/spaces/dalle-mini/dalle-mini-debug +HEAD:main

README.md CHANGED Viewed

@@ -6,6 +6,7 @@ colorTo: green
 sdk: streamlit
 app_file: app/streamlit/app.py
 pinned: True
 ---
 # DALL·E Mini
@@ -18,7 +19,7 @@ _Generate images from a text prompt_
 Our logo was generated with DALL·E mini using the prompt "logo of an armchair in the shape of an avocado".
-You can create your own pictures with [the demo](https://huggingface.co/spaces/flax-community/dalle-mini).
 ## How does it work?

 sdk: streamlit
 app_file: app/streamlit/app.py
 pinned: True
+license: apache-2.0
 ---
 # DALL·E Mini
 Our logo was generated with DALL·E mini using the prompt "logo of an armchair in the shape of an avocado".
+You can create your own pictures with [the demo](https://huggingface.co/spaces/dalle-mini/dalle-mini).
 ## How does it work?

app/streamlit/app.py CHANGED Viewed

@@ -78,7 +78,7 @@ if prompt != "":
         </div>
         </div>
         </div>
-        <small><i>Predictions may take up to 40s under high load. Please stand by.</i></small>
     """,
         unsafe_allow_html=True,
     )

         </div>
         </div>
         </div>
+        <small><i>Predictions may take up to 5mn under high load. Please stand by.</i></small>
     """,
         unsafe_allow_html=True,
     )

setup.cfg CHANGED Viewed

@@ -27,6 +27,7 @@ install_requires =
     einops
     unidecode
     ftfy
     pillow
     jax
     flax

     einops
     unidecode
     ftfy
+    emoji
     pillow
     jax
     flax

src/dalle_mini/data.py CHANGED Viewed

@@ -43,6 +43,8 @@ class Dataset:
         if self.seed_dataset is None:
             # create a random seed
             self.seed_dataset = random.randint(0, 2**32 - 1)
         self.multi_hosts = jax.process_count() > 1
         # feed blank captions only in streaming mode for now
         # otherwise dataset could be cached with same blanked captions
@@ -173,6 +175,7 @@ class Dataset:
                 blank_caption_function,
                 text_column=self.text_column,
                 blank_caption_prob=self.blank_caption_prob,
             )
             if hasattr(self, "train_dataset"):
                 self.train_dataset = (
@@ -180,7 +183,9 @@ class Dataset:
                     if self.streaming
                     else self.train_dataset.map(
                         partial_blank_caption_function,
-                        num_proc=self.preprocessing_num_workers,
                         load_from_cache_file=False,
                         desc="Blanking some captions",
                     )
@@ -316,8 +321,12 @@ def shift_tokens_right(input_ids: np.array, decoder_start_token_id: int):
     return shifted_input_ids
-def blank_caption_function(example, text_column, blank_caption_prob):
-    if blank_caption_prob and np.random.rand() < blank_caption_prob:
         example[text_column] = ""
     return example

         if self.seed_dataset is None:
             # create a random seed
             self.seed_dataset = random.randint(0, 2**32 - 1)
+        # set numpy rng
+        self.np_rng = np.random.default_rng(self.seed_dataset)
         self.multi_hosts = jax.process_count() > 1
         # feed blank captions only in streaming mode for now
         # otherwise dataset could be cached with same blanked captions
                 blank_caption_function,
                 text_column=self.text_column,
                 blank_caption_prob=self.blank_caption_prob,
+                rng=self.np_rng,
             )
             if hasattr(self, "train_dataset"):
                 self.train_dataset = (
                     if self.streaming
                     else self.train_dataset.map(
                         partial_blank_caption_function,
+                        num_proc=None
+                        if self.seed_dataset
+                        else self.preprocessing_num_workers,
                         load_from_cache_file=False,
                         desc="Blanking some captions",
                     )
     return shifted_input_ids
+def blank_caption_function(example, text_column, blank_caption_prob, rng=None):
+    if (
+        blank_caption_prob
+        and (rng.random() if rng is not None else np.random.random())
+        < blank_caption_prob
+    ):
         example[text_column] = ""
     return example

src/dalle_mini/model/text.py CHANGED Viewed

@@ -8,6 +8,7 @@ import random
 import re
 from pathlib import Path
 import ftfy
 from huggingface_hub import hf_hub_download
 from unidecode import unidecode
@@ -213,6 +214,8 @@ class TextNormalizer:
         t = ftfy.fix_text(t)
         # fix html
         t = fix_html(t)
         # decode and simplify text: see unidecode library
         t = unidecode(t)
         # lower case

 import re
 from pathlib import Path
+import emoji
 import ftfy
 from huggingface_hub import hf_hub_download
 from unidecode import unidecode
         t = ftfy.fix_text(t)
         # fix html
         t = fix_html(t)
+        # decode emojis (would be removed by unidecode)
+        t = emoji.demojize(t)
         # decode and simplify text: see unidecode library
         t = unidecode(t)
         # lower case

tools/train/config/mega/config.json CHANGED Viewed

@@ -1,30 +1,49 @@
 {
   "activation_dropout": 0.0,
-  "activation_function": "gelu",
   "attention_dropout": 0.0,
   "bos_token_id": 16385,
   "d_model": 2048,
   "decoder_attention_heads": 32,
-  "decoder_ffn_dim": 8192,
   "decoder_layerdrop": 0.0,
-  "decoder_layers": 24,
   "decoder_start_token_id": 16384,
   "dropout": 0.0,
   "encoder_attention_heads": 32,
-  "encoder_ffn_dim": 8192,
   "encoder_layerdrop": 0.0,
-  "encoder_layers": 24,
-  "encoder_vocab_size": 50264,
   "eos_token_id": 16385,
   "image_length": 256,
-  "image_vocab_size": 16391,
   "init_std": 0.01,
   "is_encoder_decoder": true,
   "max_text_length": 64,
   "model_type": "dallebart",
   "normalize_text": true,
   "pad_token_id": 16385,
   "scale_embedding": false,
   "tie_word_embeddings": false,
-  "use_cache": true
 }

 {
   "activation_dropout": 0.0,
+  "activation_function": "swish",
   "attention_dropout": 0.0,
   "bos_token_id": 16385,
   "d_model": 2048,
   "decoder_attention_heads": 32,
+  "decoder_ffn_dim": 4096,
   "decoder_layerdrop": 0.0,
+  "decoder_layers": 25,
   "decoder_start_token_id": 16384,
+  "do_sample": true,
   "dropout": 0.0,
   "encoder_attention_heads": 32,
+  "encoder_ffn_dim": 4096,
   "encoder_layerdrop": 0.0,
+  "encoder_layers": 25,
+  "encoder_vocab_size": 50272,
   "eos_token_id": 16385,
+  "force_ln_scale": false,
+  "gradient_checkpointing": false,
   "image_length": 256,
+  "image_vocab_size": 16415,
   "init_std": 0.01,
   "is_encoder_decoder": true,
+  "ln_positions": "normformer",
+  "ln_type": "layernorm",
+  "max_length": 257,
   "max_text_length": 64,
+  "min_length": 257,
   "model_type": "dallebart",
   "normalize_text": true,
   "pad_token_id": 16385,
   "scale_embedding": false,
+  "sinkhorn_iters": 1,
+  "tau_init": 0.05,
   "tie_word_embeddings": false,
+  "use_absolute_position_embeddings": true,
+  "use_alibi": false,
+  "use_bias": false,
+  "use_cache": true,
+  "use_cosine_attention": false,
+  "use_deepnet_scaling": false,
+  "use_final_ln_decoder": true,
+  "use_final_ln_encoder": true,
+  "use_glu": true,
+  "use_head_scale": false,
+  "use_swin_position_embeddings": false
 }

tools/train/config/mini/config.json CHANGED Viewed

@@ -16,7 +16,7 @@
   "eos_token_id": 16385,
   "gradient_checkpointing": false,
   "image_length": 256,
-  "image_vocab_size": 16384,
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "max_text_length": 64,

   "eos_token_id": 16385,
   "gradient_checkpointing": false,
   "image_length": 256,
+  "image_vocab_size": 16391,
   "init_std": 0.02,
   "is_encoder_decoder": true,
   "max_text_length": 64,

tools/train/scalable_shampoo/README.md CHANGED Viewed

@@ -4,4 +4,4 @@ Files copied from [google-research/scalable_shampoo/optax](https://github.com/go
 Imports have been modified to be relative.
-This will be replaced with `optax-shampoo` package eventually.


4
5	Imports have been modified to be relative.
6
7	+ This will eventually be replaced with `optax-shampoo` package.

tools/train/scalable_shampoo/distributed_shampoo.py CHANGED Viewed

@@ -25,13 +25,12 @@
 # Authors: Rohan Anil (rohananil at google dot com)
 #    &     Vineet Gupta (vineet at google dot com)
 #
 """Distributed Shampoo Implementation."""
 import enum
 import functools
 import itertools
-from typing import Any, List, NamedTuple
 import chex
 import jax
@@ -43,6 +42,7 @@ from flax import struct
 from jax import lax
 from .quantization_utils import QuantizedValue
 # Dtype for inverse-pth root routine
 # Switch to f64 if you have hardware that supports it. Enable the jax flag
@@ -141,7 +141,10 @@ class GraftingType(enum.IntEnum):
 def power_iteration(
-    matrix, num_iters=100, error_tolerance=1e-6, precision=lax.Precision.HIGHEST
 ):
     r"""Power iteration algorithm.
@@ -156,10 +159,10 @@ def power_iteration(
       matrix: the symmetric PSD matrix.
       num_iters: Number of iterations.
       error_tolerance: Iterative exit condition.
-      precision: precision XLA related flag, the available options are:
-        a) lax.Precision.DEFAULT (better step time, but not precise)
-        b) lax.Precision.HIGH (increased precision, slower)
-        c) lax.Precision.HIGHEST (best possible precision, slowest)
     Returns:
       eigen vector, eigen value
@@ -196,7 +199,11 @@ def power_iteration(
     return v_out, s_out
-def mat_power(mat_m, p, precision=lax.Precision.HIGHEST):
     """A simple matrix power method. M^p where p can be TracedValue."""
     power = jnp.eye(mat_m.shape[0], dtype=_MAT_INV_PTH_ROOT_DTYPE)
@@ -245,15 +252,19 @@ def matrix_inverse_pth_root(
       num_iters: Maximum number of iterations.
       ridge_epsilon: Ridge epsilon added to make the matrix positive definite.
       error_tolerance: Error indicator, useful for early termination.
-      precision: precision XLA related flag, the available options are:
-        a) lax.Precision.DEFAULT (better step time, but not precise)
-        b) lax.Precision.HIGH (increased precision, slower)
-        c) lax.Precision.HIGHEST (best possible precision, slowest)
     Returns:
       matrix^(-1/p)
     """
     assert matrix.shape[0] == matrix.shape[1]
     # We use _MAT_INV_PTH_ROOT_DTYPE for the matrix inverse pth root.
@@ -336,8 +347,8 @@ def merge_small_dims(shape_to_merge, max_dim):
     return resulting_shape
-def pad_matrix(mat, max_size):
-    """Pad a matrix to a max_size.
     Args:
       mat: a matrix to pad.
@@ -346,19 +357,132 @@ def pad_matrix(mat, max_size):
     Returns:
       Given M returns [[M, 0], [0, I]]
     """
-    size = mat.shape[0]
-    assert size <= max_size
-    if size == max_size:
         return mat
-    pad_size = max_size - size
-    zs1 = jnp.zeros([size, pad_size], dtype=mat.dtype)
-    zs2 = jnp.zeros([pad_size, size], dtype=mat.dtype)
     eye = jnp.eye(pad_size, dtype=mat.dtype)
     mat = jnp.concatenate([mat, zs1], 1)
     mat = jnp.concatenate([mat, jnp.concatenate([zs2, eye], 1)], 0)
     return mat
 def pad_vector(vec, max_size):
     """Pad a vector to a max_size.
@@ -694,18 +818,17 @@ def distributed_shampoo(
       num_devices_for_pjit: Number of devices to parallelize over when using pjit.
       shard_optimizer_states: Shard optimizer states to save memory in model
         parallel training.
-      best_effort_memory_usage_reduction: Best effort memory usage reduction.
-        diagonal_statistics -> jnp.bfloat16
-        momentum buffers (2x) -> jnp.int8
         statistics, preconditioners -> jnp.int16 + diagonals
       inverse_failure_threshold: numerics are hard and inverses fail sometimes; we
         determine that using this threshold.
       moving_average_for_momentum: Whether to use moving average for momentum
         instead of exponential moving average.
       skip_preconditioning_dim_size_gt: Skip if preconditioning dim size is
-          greater than this value.
-      clip_by_scaled_gradient_norm: Clip by scaled gradient norm (only useful
-        when using RMSProp Grafting).
       precision: precision XLA related flag, the available options are: a)
         lax.Precision.DEFAULT (better step time, but not precise) b)
         lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
@@ -1167,7 +1290,7 @@ def distributed_shampoo(
         new_padded_statistics = []
         for stat in new_stats_flat:
             new_padded_statistics.extend(
-                [pad_matrix(stat, max_size) for stat in stat.statistics]
             )
         # Create global stats
@@ -1388,7 +1511,7 @@ def distributed_shampoo(
         num_devices = lax.psum(1, batch_axis_name)
         num_statistics = len(statistics)
         # Pad statistics and exponents to next multiple of num_devices.
-        packed_statistics = [pad_matrix(stat, max_size) for stat in statistics]
         to_pad = -num_statistics % num_devices
         packed_statistics.extend(
             [jnp.eye(max_size, dtype=packed_statistics[0].dtype) for _ in range(to_pad)]
@@ -1540,7 +1663,7 @@ def distributed_shampoo(
         # diagonals [d] f32
         # bucket_sizes [d] f32
         packed_quantized_statistics = [
-            pad_matrix(stat.quantized, max_size) for stat in statistics
         ]
         packed_quantized_diagonals = [
             pad_vector(stat.diagonal, max_size) for stat in statistics
@@ -1772,7 +1895,7 @@ def distributed_shampoo(
         """
         num_statistics = len(statistics)
         to_pad = -num_statistics % num_devices_for_pjit
-        padded_statistics = [pad_matrix(stat, max_size) for stat in statistics]
         padded_statistics.extend(
             [jnp.eye(max_size, dtype=padded_statistics[0].dtype) for _ in range(to_pad)]
         )

 # Authors: Rohan Anil (rohananil at google dot com)
 #    &     Vineet Gupta (vineet at google dot com)
 #
 """Distributed Shampoo Implementation."""
 import enum
 import functools
 import itertools
+from typing import Any, List, NamedTuple, Tuple
 import chex
 import jax
 from jax import lax
 from .quantization_utils import QuantizedValue
+from .symmetric_matrices import symmetric_matrices
 # Dtype for inverse-pth root routine
 # Switch to f64 if you have hardware that supports it. Enable the jax flag
 def power_iteration(
+    matrix,
+    num_iters=100,
+    error_tolerance=1e-6,
+    precision=lax.Precision.HIGHEST,
 ):
     r"""Power iteration algorithm.
       matrix: the symmetric PSD matrix.
       num_iters: Number of iterations.
       error_tolerance: Iterative exit condition.
+      precision: precision XLA related flag, the available options are: a)
+        lax.Precision.DEFAULT (better step time, but not precise) b)
+        lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
+        (best possible precision, slowest)
     Returns:
       eigen vector, eigen value
     return v_out, s_out
+def mat_power(
+    mat_m,
+    p,
+    precision=lax.Precision.HIGHEST,
+):
     """A simple matrix power method. M^p where p can be TracedValue."""
     power = jnp.eye(mat_m.shape[0], dtype=_MAT_INV_PTH_ROOT_DTYPE)
       num_iters: Maximum number of iterations.
       ridge_epsilon: Ridge epsilon added to make the matrix positive definite.
       error_tolerance: Error indicator, useful for early termination.
+      precision: precision XLA related flag, the available options are: a)
+        lax.Precision.DEFAULT (better step time, but not precise) b)
+        lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
+        (best possible precision, slowest)
     Returns:
       matrix^(-1/p)
     """
+    # If the input is not square, materialize it from the concatenated form.
+    if matrix.shape[0] != matrix.shape[1]:
+        matrix = symmetric_matrices.materialize_matrix_from_concat(matrix)
     assert matrix.shape[0] == matrix.shape[1]
     # We use _MAT_INV_PTH_ROOT_DTYPE for the matrix inverse pth root.
     return resulting_shape
+def pad_square_matrix(mat, max_size):
+    """Pad a square matrix up to max_size.
     Args:
       mat: a matrix to pad.
     Returns:
       Given M returns [[M, 0], [0, I]]
     """
+    rows, cols = mat.shape
+    if rows != cols:
+        raise ValueError(
+            "Must have rows == cols, instead got " f"rows={rows}, cols={cols}"
+        )
+    if cols > max_size:
+        raise ValueError(
+            "Must have cols <= max_size. Instead got "
+            f"cols={cols}, max_size={max_size}."
+        )
+    if rows == max_size:
         return mat
+    pad_size = max_size - rows
+    zs1 = jnp.zeros([rows, pad_size], dtype=mat.dtype)
+    zs2 = jnp.zeros([pad_size, rows], dtype=mat.dtype)
     eye = jnp.eye(pad_size, dtype=mat.dtype)
     mat = jnp.concatenate([mat, zs1], 1)
     mat = jnp.concatenate([mat, jnp.concatenate([zs2, eye], 1)], 0)
     return mat
+def make_sliced_padding(
+    symmetric_block_size,
+    num_blocks,
+    starting_block,
+    dtype,
+):
+    """Returns padding for symmetric block matrix.
+    Specifically, the padding is given concatenated rectangular matrices
+    representing the lower-triangular rows below the starting block. For example,
+    if we want to pad the symmetric matrix
+    M = [[A, B^T]
+         [B, C]],
+    the desired output (in terms of the full matrix) with num_blocks = 4 is
+    M_padded = [[A, B^T, 0, 0]
+                [B, C,   0, 0]
+                [0, 0,   I, 0]
+                [0, 0,   0, I]].
+    We would represent M as the block matrix mat = [A, B, C]. In this form, the
+    additional padding to provide has form [0, 0, I, 0, 0, 0, I] (only the lower
+    triangular parts in the third and fourth rows).
+    Args:
+      symmetric_block_size: The size of each block.
+      num_blocks: The total number of blocks.
+      starting_block: The block where to start the padding.
+      dtype: The type to use for the blocks.
+    """
+    if starting_block == num_blocks:
+        return jnp.zeros(shape=(symmetric_block_size, 0), dtype=dtype)
+    blocks = []
+    for i in range(starting_block, num_blocks):
+        blocks.append(
+            jnp.zeros(
+                shape=(symmetric_block_size, symmetric_block_size * i), dtype=dtype
+            )
+        )
+        blocks.append(jnp.eye(symmetric_block_size, dtype=dtype))
+    return jnp.concatenate(blocks, axis=-1)
+def pad_block_symmetric_matrix(
+    mat,
+    symmetric_block_size,
+    max_num_blocks,
+):
+    """Returns the padded blocked symmetric matrix.
+    The size of the padded matrix will be:
+      [symmetric_block_size, symmetric_block_size * max_num_blocks]
+    The input matrix can either:
+      - Be square with size less or equal to symmetric_block_size. In this case,
+        mat will first be padded to a square matrix of size symmetric_block_size,
+        and then be padded again up to the full size of the blocked matrix.
+      - Be a rectangle with number of rows equal to block size.
+        In this case, number of columns must be a multiple of number of rows, and
+        the ratio must correspond to a block representation of a symmetric matrix.
+        That is, the ratio must have form x * (x + 1) / 2. Here, x represents the
+        number of block rows represented by the matrix.
+    Args:
+      mat: The input block matrix.
+      symmetric_block_size: The size of blocks.
+      max_num_blocks: The largest number of blocks to pad to.
+    """
+    rows, cols = mat.shape
+    if rows > symmetric_block_size:
+        raise ValueError(
+            "Must have rows <= symmetric_block_size. Instead got "
+            f"rows={rows}, symmetric_block_size={symmetric_block_size}."
+        )
+    if rows > cols:
+        raise ValueError(
+            "Must have rows <= cols, instead got " f"rows={rows}, cols={cols}."
+        )
+    if cols > symmetric_block_size * max_num_blocks:
+        raise ValueError(
+            "Must have cols <= symmetric_block_size * max_num_blocks "
+            f"Instead got cols={cols}, "
+            f"symmetric_block_size={symmetric_block_size}, "
+            f"max_num_blocks={max_num_blocks}."
+        )
+    if rows < symmetric_block_size:
+        mat = pad_square_matrix(mat, max_size=symmetric_block_size)
+    # Update rows and cols after possibly padding in pad_square_matrix.
+    rows, cols = mat.shape
+    assert rows == symmetric_block_size
+    assert cols % rows == 0
+    filled_blocks = cols // rows
+    padding_blocks = make_sliced_padding(
+        symmetric_block_size=symmetric_block_size,
+        num_blocks=symmetric_matrices.num_blocks_from_total_blocks(max_num_blocks),
+        starting_block=symmetric_matrices.num_blocks_from_total_blocks(filled_blocks),
+        dtype=mat.dtype,
+    )
+    return jnp.concatenate([mat, padding_blocks], axis=-1)
 def pad_vector(vec, max_size):
     """Pad a vector to a max_size.
       num_devices_for_pjit: Number of devices to parallelize over when using pjit.
       shard_optimizer_states: Shard optimizer states to save memory in model
         parallel training.
+      best_effort_memory_usage_reduction: Best effort memory usage reduction.
+        - diagonal_statistics -> jnp.bfloat16
+        - momentum buffers (2x) -> jnp.int8
+        - statistics, preconditioners -> jnp.int16 + diagonals
       inverse_failure_threshold: numerics are hard and inverses fail sometimes; we
         determine that using this threshold.
       moving_average_for_momentum: Whether to use moving average for momentum
         instead of exponential moving average.
       skip_preconditioning_dim_size_gt: Skip if preconditioning dim size is
+        greater than this value.
+      clip_by_scaled_gradient_norm: Clip by scaled gradient norm (only useful when
+        using RMSProp Grafting).
       precision: precision XLA related flag, the available options are: a)
         lax.Precision.DEFAULT (better step time, but not precise) b)
         lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
         new_padded_statistics = []
         for stat in new_stats_flat:
             new_padded_statistics.extend(
+                [pad_square_matrix(stat, max_size) for stat in stat.statistics]
             )
         # Create global stats
         num_devices = lax.psum(1, batch_axis_name)
         num_statistics = len(statistics)
         # Pad statistics and exponents to next multiple of num_devices.
+        packed_statistics = [pad_square_matrix(stat, max_size) for stat in statistics]
         to_pad = -num_statistics % num_devices
         packed_statistics.extend(
             [jnp.eye(max_size, dtype=packed_statistics[0].dtype) for _ in range(to_pad)]
         # diagonals [d] f32
         # bucket_sizes [d] f32
         packed_quantized_statistics = [
+            pad_square_matrix(stat.quantized, max_size) for stat in statistics
         ]
         packed_quantized_diagonals = [
             pad_vector(stat.diagonal, max_size) for stat in statistics
         """
         num_statistics = len(statistics)
         to_pad = -num_statistics % num_devices_for_pjit
+        padded_statistics = [pad_square_matrix(stat, max_size) for stat in statistics]
         padded_statistics.extend(
             [jnp.eye(max_size, dtype=padded_statistics[0].dtype) for _ in range(to_pad)]
         )

tools/train/scalable_shampoo/symmetric_matrices/symmetric_matrices.py CHANGED Viewed

@@ -16,7 +16,7 @@
 """JAX Ops for symmetric matrices used by the Shampoo optimizer."""
 import functools
-from typing import Any, List, Sequence, Union
 import jax
 import jax.numpy as jnp
@@ -192,7 +192,7 @@ def materialize_matrix(symmetric_matrix):
 @functools.partial(jax.jit, static_argnames=("num_blocks"))
 def materialize_matrix_from_concat(
     block_rows_concat,
-    num_blocks,
 ):
     """Returns a materialized symmetric matrix from concatenated slices.
@@ -200,7 +200,11 @@ def materialize_matrix_from_concat(
       block_rows_concat: The matrix represented as the concatenated
         lower-triangular blocks.
       num_blocks: The number of block-rows used to represent the symmetric matrix.
     """
     block_size = block_rows_concat.shape[-2]
     block_rows = [
@@ -251,6 +255,28 @@ def update_sliced_rows(
     )
 def find_num_blocks(block_rows_concat):
     """Returns the number of (row) blocks representing the concatenated matrix.
@@ -270,11 +296,147 @@ def find_num_blocks(block_rows_concat):
     # Compute the number of square blocks used to represent the matrix.
     total_blocks = block_rows_concat.shape[-1] / block_rows_concat.shape[-2]
     # Determine the number of block rows by inverting y = x*(x+1)/2.
-    num_blocks = np.round((np.sqrt(8 * total_blocks + 1) - 1) / 2).astype(np.int32)
-    if num_blocks * (num_blocks + 1) / 2 != total_blocks:
         raise ValueError(
-            "Could not determine an appropriate number of blocks for "
-            "the concatenated matrix."
         )
-    else:
-        return num_blocks

 """JAX Ops for symmetric matrices used by the Shampoo optimizer."""
 import functools
+from typing import Any, List, Optional, Sequence, Union
 import jax
 import jax.numpy as jnp
 @functools.partial(jax.jit, static_argnames=("num_blocks"))
 def materialize_matrix_from_concat(
     block_rows_concat,
+    num_blocks=None,
 ):
     """Returns a materialized symmetric matrix from concatenated slices.
       block_rows_concat: The matrix represented as the concatenated
         lower-triangular blocks.
       num_blocks: The number of block-rows used to represent the symmetric matrix.
+        If not specified, it is inferred from the shape of block_rows_concat.
     """
+    if num_blocks is None:
+        num_blocks = find_num_blocks(block_rows_concat)
     block_size = block_rows_concat.shape[-2]
     block_rows = [
     )
+def num_blocks_from_total_blocks(total_blocks):
+    """Returns the number of blocks (i.e. block rows) from the total blocks.
+    This is the inverse of the function x -> x*(x+1)/2.
+    For example, the matrix M = [[A, B^T], [B, C]] may be represented using a
+    total of 3 blocks ([A, B, C]). The number of corresponding block rows is 2.
+    Args:
+      total_blocks: The total blocks used to represent the matrix.
+    """
+    num_blocks = np.round((np.sqrt(8 * total_blocks + 1) - 1) / 2).astype(np.int32)
+    if (num_blocks * (num_blocks + 1)) / 2 != total_blocks:
+        raise ValueError(
+            f"total_blocks={total_blocks} does not correspond to "
+            "a symmetric matrix. It must have the form total_blocks = x*(x+1)/2."
+        )
+    return num_blocks
 def find_num_blocks(block_rows_concat):
     """Returns the number of (row) blocks representing the concatenated matrix.
     # Compute the number of square blocks used to represent the matrix.
     total_blocks = block_rows_concat.shape[-1] / block_rows_concat.shape[-2]
     # Determine the number of block rows by inverting y = x*(x+1)/2.
+    return num_blocks_from_total_blocks(total_blocks)
+@functools.partial(jax.jit, static_argnames=("block_size"))
+def slice_symmetric_matrix(
+    mat,
+    block_size,
+):
+    """Returns sliced row blocks.
+    Args:
+      mat: A symmetric matrix.
+      block_size: The size of the row slices.
+    """
+    num_rows = mat.shape[-2]
+    num_cols = mat.shape[-1]
+    if num_rows != num_cols:
+        raise ValueError("mat is not square.")
+    if num_rows % block_size != 0:
         raise ValueError(
+            "block size does not evenly divide rows. "
+            f"num_rows={num_rows}, block_size={block_size}"
         )
+    return SlicedSymmetricMatrix(
+        block_rows=[
+            mat[
+                Ellipsis,
+                i * block_size : (i + 1) * block_size,
+                0 : (i + 1) * block_size,
+            ]
+            for i in range(num_rows // block_size)
+        ]
+    )
+@functools.partial(jax.jit, static_argnames=("block_size"))
+def slice_symmetric_matrix_concat(
+    mat,
+    block_size,
+):
+    """Returns the concatenated sliced row blocks.
+    Args:
+      mat: A symmetric matrix.
+      block_size: The size of the row slices.
+    """
+    sliced_symmetric_matrix = slice_symmetric_matrix(mat=mat, block_size=block_size)
+    return jnp.concatenate(sliced_symmetric_matrix.block_rows, axis=-1)
+def sliced_matrix_diag(mat):
+    """Returns the diagonal of the symmetric matrix.
+    Args:
+      mat: The symmetric matrix represented in concatenated block form.
+    """
+    rows, cols = mat.shape
+    total_blocks = cols // rows
+    num_blocks = num_blocks_from_total_blocks(total_blocks)
+    diags = []
+    for i in range(num_blocks):
+        last_index = rows * ((i + 2) * (i + 1)) // 2
+        first_index = last_index - rows
+        diags.append(jnp.diag(mat[Ellipsis, first_index:last_index]))
+    return jnp.concatenate(diags, axis=-1)
+def diag_as_concat(diag, block_size):
+    """Returns the representation of a diagonal matrix in symmetric block form.
+    Args:
+      diag: The 1D array for the diagonals.
+      block_size: The size of blocks to use. Must divide the length of diag.
+    """
+    assert len(diag.shape) == 1  # diag must be 1D.
+    assert len(diag) % block_size == 0
+    num_diag_blocks = len(diag) // block_size
+    blocks = []
+    for i in range(num_diag_blocks):
+        blocks.append(jnp.zeros(shape=(block_size, block_size * i), dtype=diag.dtype))
+        blocks.append(jnp.diag(diag[i * block_size : (i + 1) * block_size]))
+    return jnp.concatenate(blocks, axis=-1)
+def row_abs_maxes(mat):
+    """Returns the max of the absolute values of the rows of the full matrix.
+    For example the symmetric matrix M = [[1, 6], [6, 2]] is represented using
+    mat = [1, 6, 2] with block_size = 1. In this case the function returns the
+    absolute row maxes of the original symmetric matrix, [6, 6].
+    Args:
+      mat: The symmetric matrix represented as the concatenated blocks.
+    """
+    rows, cols = mat.shape
+    # Find col and row max for each block.
+    col_maxes = []
+    row_maxes = []
+    for i in range(cols // rows):
+        block = jnp.abs(mat[Ellipsis, i * rows : (i + 1) * rows])
+        col_maxes.append(jnp.max(block, axis=1))
+        row_maxes.append(jnp.max(block, axis=0))
+    # global row max from block maxes.
+    num_blocks = num_blocks_from_total_blocks(cols // rows)
+    maxes = []
+    for i in range(num_blocks):
+        maxes.append(
+            jnp.concatenate(
+                row_maxes[(i * (i + 1) // 2) : ((i + 2) * (i + 1) // 2)]
+                + [
+                    col_maxes[((j + 1) * (j + 2)) // 2 - (j - i + 1)]
+                    for j in range(i + 1, num_blocks)
+                ],
+                axis=-1,
+            )
+        )
+    return jnp.max(jnp.stack(maxes), axis=0)
+def times_vector(mat, vec):
+    """Returns the symmetric block-concatenated matrix multiplied by a vector.
+    Specifically, each value in the vector is multiplied by a row of the full
+    matrix. That is, the vector is broadcast and multiplied element-wise. Note
+    this would be the transpose of full_mat * vec if full_mat represented the full
+    symmetric matrix.
+    Args:
+      mat: The symmetric matrix represented as the concatenated blocks.
+      vec: The vector, having the same dimension as the materialized matrix.
+    """
+    rows, cols = mat.shape
+    num_blocks = num_blocks_from_total_blocks(cols // rows)
+    multiplied = []
+    for i in range(num_blocks):
+        mat_block = mat[
+            Ellipsis, rows * ((i + 1) * i) // 2 : rows * ((i + 1) * (i + 2)) // 2
+        ]
+        vec_block = vec[Ellipsis, rows * i : rows * (i + 1)]
+        multiplied.append(jnp.einsum("...ij,...i->ij", mat_block, vec_block))
+    return jnp.concatenate(multiplied, axis=-1)

tools/train/train.py CHANGED Viewed

@@ -368,6 +368,12 @@ class TrainingArguments:
             "help": "Whether to quantize optimizer (only supported with Distributed Shampoo)."
         },
     )
     num_train_epochs: int = field(
         default=3, metadata={"help": "Total number of training epochs to perform."}
@@ -450,6 +456,11 @@ class TrainingArguments:
         metadata={"help": "Verify that TPU is not in use."},
     )
     mp_devices: Optional[int] = field(
         default=1,
         metadata={
@@ -500,6 +511,11 @@ class TrainingArguments:
                 f"Output directory ({self.output_dir}) already exists and is not empty."
                 "Use --overwrite_output_dir to overcome."
             )
         assert (
             self.mp_devices > 0
         ), f"Number of devices for model parallelism must be > 0"
@@ -530,6 +546,12 @@ def main():
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -748,8 +770,20 @@ def main():
             graft_type=graft_type,
             nesterov=False,
             exponent_override=0,
-            statistics_partition_spec=PartitionSpec(None, "dp", None),
-            preconditioner_partition_spec=PartitionSpec("dp", None, None),
             num_devices_for_pjit=training_args.dp_devices,
             shard_optimizer_states=True,
             inverse_failure_threshold=0.1,
@@ -917,7 +951,7 @@ def main():
     # "vmap trick" avoids a crash when mp_devices > 1 (not sure why it happens)
    # it also leads to better perf: see https://wandb.ai/dalle-mini/dalle-mini/reports/JAX-pmap-vs-pjit--VmlldzoxNDg1ODA2
-    use_vmap_trick = True
     # make grad_param_spec for vmap
     if use_vmap_trick:
@@ -1145,7 +1179,8 @@ def main():
                 self.log_time("train_per_log", delta_time, offset=False)
         def log_time(self, key, duration, offset=True):
-            wandb.log({f"time/{key}": duration, **self.state_dict})
             if offset:
                 self.offset_time += duration
@@ -1191,7 +1226,11 @@ def main():
         # ======================== Evaluating ==============================
         if training_args.do_eval:
             start_eval_time = time.perf_counter()
-            eval_loader = dataset.dataloader("eval", eval_batch_size_per_step)
             eval_steps = (
                 len_eval_dataset // eval_batch_size_per_step
                 if len_eval_dataset is not None
@@ -1353,10 +1392,12 @@ def main():
             metrics_logger.update_state_metrics(local_state)
             metrics_logger.log({})
-            # Generate an epoch by shuffling sampling indices from the train dataset
             train_loader = dataset.dataloader(
                 "train",
-                batch_size_per_node,
                 epoch,
             )
             # train
@@ -1373,12 +1414,12 @@ def main():
                 # set correct shape to batch
                 # - add grad_step dim if gradient_accumulation_steps > 1
-                # - split per dp device if not multi-host for vmap trick (does not work in multi-host)
                 bs_shape = (
-                    (batch_size_per_node_per_grad_step,)
                     if not use_vmap_trick
                     else (
                         jax.local_device_count()
                         // training_args.mp_devices,  # local dp devices
                         training_args.per_device_train_batch_size,
                     )

             "help": "Whether to quantize optimizer (only supported with Distributed Shampoo)."
         },
     )
+    shard_shampoo_across: str = field(
+        default="dp",
+        metadata={
+            "help": "Whether to shard the optimizer across data devices (dp), model devices (mp) or both (2d)."
+        },
+    )
     num_train_epochs: int = field(
         default=3, metadata={"help": "Total number of training epochs to perform."}
         metadata={"help": "Verify that TPU is not in use."},
     )
+    use_vmap_trick: bool = field(
+        default=True,
+        metadata={"help": "Verify that TPU is not in use."},
+    )
     mp_devices: Optional[int] = field(
         default=1,
         metadata={
                 f"Output directory ({self.output_dir}) already exists and is not empty."
                 "Use --overwrite_output_dir to overcome."
             )
+        assert self.shard_shampoo_across in [
+            "dp",
+            "mp",
+            "2d",
+        ], f"Shard shampoo across {self.shard_shampoo_across} not supported."
         assert (
             self.mp_devices > 0
         ), f"Number of devices for model parallelism must be > 0"
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # check arguments
+    if training_args.mp_devices > jax.local_device_count():
+        assert (
+            data_args.seed_dataset is not None
+        ), "Seed dataset must be provided when model is split over multiple hosts"
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
             graft_type=graft_type,
             nesterov=False,
             exponent_override=0,
+            statistics_partition_spec=PartitionSpec(
+                None, training_args.shard_shampoo_across, None
+            )
+            if training_args.shard_shampoo_across != "2d"
+            else PartitionSpec(None, "dp", "mp"),
+            preconditioner_partition_spec=PartitionSpec(
+                training_args.shard_shampoo_across, None, None
+            )
+            if training_args.shard_shampoo_across != "2d"
+            else PartitionSpec(
+                "mp" if training_args.mp_devices > training_args.dp_devices else "dp",
+                None,
+                None,
+            ),
             num_devices_for_pjit=training_args.dp_devices,
             shard_optimizer_states=True,
             inverse_failure_threshold=0.1,
     # "vmap trick" avoids a crash when mp_devices > 1 (not sure why it happens)
    # it also leads to better perf: see https://wandb.ai/dalle-mini/dalle-mini/reports/JAX-pmap-vs-pjit--VmlldzoxNDg1ODA2
+    use_vmap_trick = training_args.use_vmap_trick
     # make grad_param_spec for vmap
     if use_vmap_trick:
                 self.log_time("train_per_log", delta_time, offset=False)
         def log_time(self, key, duration, offset=True):
+            if jax.process_index() == 0:
+                wandb.log({f"time/{key}": duration, **self.state_dict})
             if offset:
                 self.offset_time += duration
         # ======================== Evaluating ==============================
         if training_args.do_eval:
             start_eval_time = time.perf_counter()
+            eval_loader = dataset.dataloader(
+                "eval",
+                eval_batch_size_per_step
+                * max(1, training_args.mp_devices // jax.local_device_count()),
+            )
             eval_steps = (
                 len_eval_dataset // eval_batch_size_per_step
                 if len_eval_dataset is not None
             metrics_logger.update_state_metrics(local_state)
             metrics_logger.log({})
+            # load data - may be replicated on multiple nodes
+            node_groups = max(1, training_args.mp_devices // jax.local_device_count())
+            loader_bs = batch_size_per_node * node_groups
             train_loader = dataset.dataloader(
                 "train",
+                loader_bs,
                 epoch,
             )
             # train
                 # set correct shape to batch
                 # - add grad_step dim if gradient_accumulation_steps > 1
                 bs_shape = (
+                    (batch_size_per_node_per_grad_step * node_groups,)
                     if not use_vmap_trick
                     else (
                         jax.local_device_count()
+                        * node_groups
                         // training_args.mp_devices,  # local dp devices
                         training_args.per_device_train_batch_size,
                     )