KublaiKhan1 committed on
Commit 464344f · verified · 1 Parent(s): ea03941

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -77,3 +77,5 @@ heun3_dt01/810001.tmp filter=lfs diff=lfs merge=lfs -text
  1e-6_kl_naive_globalscale_channelmean_sampling/810000.tmp filter=lfs diff=lfs merge=lfs -text
  heun3_dt01/810001/810001.tmp filter=lfs diff=lfs merge=lfs -text
  meanflow/810001.tmp filter=lfs diff=lfs merge=lfs -text
+ sharpness/final.tmp filter=lfs diff=lfs merge=lfs -text
+ sharpness/final_810001.tmp filter=lfs diff=lfs merge=lfs -text
sharpness/final.tmp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43b76a05291b4f9715b131d73ce450f6e59a18bcb2b84f4f6c916140b71e5e74
+ size 2110113717
sharpness/final_810001.tmp ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3c4c0a239f06a5dbdab8892ab1cbc2cef0dc7ada7ade9d94261064aa188cbf0
+ size 2110113717
sharpness/gen_images.py ADDED
@@ -0,0 +1,374 @@
1
+ from typing import Any
2
+ import jax.numpy as jnp
3
+ from absl import app, flags
4
+ from functools import partial
5
+ import numpy as np
6
+ import tqdm
7
+ import jax
9
+ import flax
10
+ import optax
11
+ import wandb
12
+ from ml_collections import config_flags
13
+ import ml_collections
14
+
15
+ from utils.wandb import setup_wandb, default_wandb_config
16
+ from utils.train_state import TrainStateEma
17
+ from utils.checkpoint import Checkpoint
18
+ from utils.stable_vae import StableVAE
19
+ from utils.sharding import create_sharding, all_gather
20
+ from utils.datasets import get_dataset
21
+ from model import DiT
22
+ from helper_eval import eval_model
23
+ from helper_inference import do_inference
24
+
25
+ FLAGS = flags.FLAGS
26
+ flags.DEFINE_string('dataset_name', 'imagenet256', 'Dataset name.')
+ flags.DEFINE_string('load_dir', './sharpness/final.tmp', 'Checkpoint to load (if not None, restore params).')
+ flags.DEFINE_string('save_dir', './checkpoints/', 'Directory to save checkpoints (if not None).')
29
+ flags.DEFINE_string('fid_stats', None, 'FID stats file.')
30
+ flags.DEFINE_integer('seed', 10, 'Random seed.') # Must be the same across all processes.
31
+ flags.DEFINE_integer('log_interval', 1000, 'Logging interval.')
32
+ flags.DEFINE_integer('eval_interval', 1000000, 'Eval interval.')
33
+ flags.DEFINE_integer('save_interval', 10000, 'Save interval.')
34
+ flags.DEFINE_integer('batch_size', 256, 'Mini batch size.')
35
+ flags.DEFINE_integer('max_steps', int(500_000), 'Number of training steps.')
36
+ flags.DEFINE_integer('debug_overfit', 0, 'Debug overfitting.')
37
+ flags.DEFINE_string('mode', 'train', 'train or inference.')
38
+
39
+ model_config = ml_collections.ConfigDict({
40
+ 'lr': 0.0001,
41
+ 'beta1': 0.9,
42
+ 'beta2': 0.999,
43
+ 'weight_decay': 0.1,
44
+ 'use_cosine': 0,
45
+ 'warmup': 0,
46
+ 'dropout': 0.0,
47
+ 'hidden_size': 64, # change this!
48
+ 'patch_size': 8, # change this!
49
+ 'depth': 2, # change this!
50
+ 'num_heads': 2, # change this!
51
+ 'mlp_ratio': 1, # change this!
52
+ 'class_dropout_prob': 0.1,
53
+ 'num_classes': 1000,
54
+ 'denoise_timesteps': 128,
55
+ 'cfg_scale': 4.0,
56
+ 'target_update_rate': 0.999,
57
+ 'use_ema': 0,
58
+ 'use_stable_vae': 1,
59
+ 'sharding': 'dp', # dp or fsdp.
60
+ 't_sampling': 'discrete-dt',
61
+ 'dt_sampling': 'uniform',
62
+ 'bootstrap_cfg': 0,
63
+ 'bootstrap_every': 8, # Make sure it's a divisor of the batch size.
64
+ 'bootstrap_ema': 1,
65
+ 'bootstrap_dt_bias': 0,
66
+ 'train_type': 'shortcut' # or naive.
67
+ })
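+ # Note: 'shortcut' dispatches to targets_shortcut.get_targets; the other train_type values
+ # ('naive', 'progressive', 'consistency-distillation', 'consistency', 'livereflow') map to baselines/ (see update() below).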
68
+
69
+
70
+ wandb_config = default_wandb_config() # Assumes default_wandb_config() returns a ConfigDict; FLAGS.wandb is referenced below (setup_wandb, run_id), so this flag must be defined.
+ config_flags.DEFINE_config_dict('wandb', wandb_config, lock_config=False)
71
+ config_flags.DEFINE_config_dict('model', model_config, lock_config=False)
72
+
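+ # Example invocation (illustrative only; paths and the FID stats file are placeholders):
+ #   python sharpness/gen_images.py --mode=inference --load_dir=./sharpness/final.tmp \
+ #     --batch_size=64 --fid_stats=<path/to/imagenet256_fid_stats.npz> --model.denoise_timesteps=128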
73
+ ##############################################
74
+ ## Training Code.
75
+ ##############################################
76
+ def main(_):
77
+
78
+ np.random.seed(FLAGS.seed)
79
+ print("Using devices", jax.local_devices())
80
+ device_count = len(jax.local_devices())
81
+ global_device_count = jax.device_count()
82
+ print("Device count", device_count)
83
+ print("Global device count", global_device_count)
84
+ local_batch_size = FLAGS.batch_size // (global_device_count // device_count)
85
+ print("Global Batch: ", FLAGS.batch_size)
86
+ print("Node Batch: ", local_batch_size)
87
+ print("Device Batch:", local_batch_size // device_count)
88
+
89
+ # Create wandb logger
90
+ if jax.process_index() == 0 and FLAGS.mode == 'train':
91
+ setup_wandb(FLAGS.model.to_dict(), **FLAGS.wandb)
92
+
93
+ dataset = get_dataset(FLAGS.dataset_name, local_batch_size, True, FLAGS.debug_overfit)
94
+ dataset_valid = get_dataset(FLAGS.dataset_name, local_batch_size, False, FLAGS.debug_overfit)
95
+ example_obs, example_labels = next(dataset)
96
+ example_obs = example_obs[:1]
97
+ example_obs_shape = example_obs.shape
98
+
99
+ if FLAGS.model.use_stable_vae:
100
+ vae = StableVAE.create()
101
+ if 'latent' in FLAGS.dataset_name:
102
+ example_obs = example_obs[:, :, :, example_obs.shape[-1] // 2:]
103
+ example_obs_shape = example_obs.shape
104
+ else:
105
+ example_obs = vae.encode(jax.random.PRNGKey(0), example_obs)
106
+ example_obs_shape = example_obs.shape
107
+ vae_rng = jax.random.PRNGKey(42)
108
+ vae_encode = jax.jit(vae.encode)
109
+ vae_decode = jax.jit(vae.decode)
110
+
111
+ if FLAGS.fid_stats is not None:
112
+ from utils.fid import get_fid_network, fid_from_stats
113
+ get_fid_activations = get_fid_network()
114
+ truth_fid_stats = np.load(FLAGS.fid_stats)
115
+ else:
116
+ get_fid_activations = None
117
+ truth_fid_stats = None
118
+
119
+ ###################################
120
+ # Creating Model and put on devices.
121
+ ###################################
122
+ FLAGS.model.image_channels = example_obs_shape[-1]
123
+ FLAGS.model.image_size = example_obs_shape[1]
124
+ dit_args = {
125
+ 'patch_size': FLAGS.model['patch_size'],
126
+ 'hidden_size': FLAGS.model['hidden_size'],
127
+ 'depth': FLAGS.model['depth'],
128
+ 'num_heads': FLAGS.model['num_heads'],
129
+ 'mlp_ratio': FLAGS.model['mlp_ratio'],
130
+ 'out_channels': example_obs_shape[-1],
131
+ 'class_dropout_prob': FLAGS.model['class_dropout_prob'],
132
+ 'num_classes': FLAGS.model['num_classes'],
133
+ 'dropout': FLAGS.model['dropout'],
134
+ 'ignore_dt': FLAGS.model['train_type'] not in ('shortcut', 'livereflow'),
135
+ }
136
+ model_def = DiT(**dit_args)
137
+ # tabulate_fn = flax.linen.tabulate(model_def, jax.random.PRNGKey(0))
138
+ tabulate_fn = flax.linen.tabulate(model_def, rngs={"params": jax.random.PRNGKey(0), "label":jax.random.PRNGKey(0)})
139
+ print(tabulate_fn(example_obs, jnp.zeros((1,)), jnp.zeros((1,)), jnp.zeros((1,), dtype=jnp.int32)))
140
+
141
+ if FLAGS.model.use_cosine:
142
+ lr_schedule = optax.warmup_cosine_decay_schedule(0.0, FLAGS.model['lr'], FLAGS.model['warmup'], FLAGS.max_steps)
143
+ elif FLAGS.model.warmup > 0:
144
+ lr_schedule = optax.linear_schedule(0.0, FLAGS.model['lr'], FLAGS.model['warmup'])
145
+ else:
146
+ lr_schedule = lambda x: FLAGS.model['lr']
147
+ adam = optax.adamw(learning_rate=lr_schedule, b1=FLAGS.model['beta1'], b2=FLAGS.model['beta2'], weight_decay=FLAGS.model['weight_decay'])
148
+ tx = optax.chain(adam)
149
+
150
+ def log_param_shapes(params, label=""):
151
+ flat = flax.traverse_util.flatten_dict(params)
152
+
153
+ # Squeeze a leading axis of size 1 (added by process_allgather at save time); keep other entries unchanged.
+ squeezed_flat = {k: (jnp.squeeze(v, axis=0) if v.shape[0] == 1 else v) for k, v in flat.items()}
154
+ print(f"\n{label} parameter shapes:")
155
+ for k, v in flat.items():
156
+ print(f"{k}: {v.shape}")
157
+ return flax.traverse_util.unflatten_dict(squeezed_flat)
158
+
159
+
160
+ def init(rng):
161
+ param_key, dropout_key, dropout2_key = jax.random.split(rng, 3)
162
+ example_t = jnp.zeros((1,))
163
+ example_dt = jnp.zeros((1,))
164
+ example_label = jnp.zeros((1,), dtype=jnp.int32)
165
+ example_obs = jnp.zeros(example_obs_shape)
166
+ model_rngs = {'params': param_key, 'label_dropout': dropout_key, 'dropout': dropout2_key}
167
+ params = model_def.init(model_rngs, example_obs, example_t, example_dt, example_label)['params']
168
+ opt_state = tx.init(params)
169
+ ts = TrainStateEma.create(model_def, params, rng=rng, tx=tx, opt_state=opt_state)
170
+
171
+ if FLAGS.load_dir is not None:
172
+
173
+ cp = Checkpoint(FLAGS.load_dir)
174
+ train_state_load = cp.load_as_dict()["train_state"]
175
+
176
+ log_param_shapes(ts.params)
177
+ flat = log_param_shapes(train_state_load["params"])
178
+ flat_ema = log_param_shapes(train_state_load["params_ema"])
179
+ flat_mu = log_param_shapes(train_state_load["opt_state"][0][0].mu)
180
+ flat_nu = log_param_shapes(train_state_load["opt_state"][0][0].nu)
181
+
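+ # Rebuild the Adam optimizer state so its mu/nu use the squeezed parameter trees loaded above
+ # (the saved state presumably carries a leading all-gather axis of size 1).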
182
+ from optax import ScaleByAdamState
183
+ opt_state = train_state_load["opt_state"]
184
+ new_state = ScaleByAdamState(
185
+ opt_state[0][0].count,
186
+ mu=flat_mu,
187
+ nu=flat_nu
188
+ )
189
+ opt_state = list(opt_state)
190
+ opt_state[0] = list(opt_state[0])
191
+ opt_state[0][0] = new_state
192
+
193
+ opt_state[0] = tuple(opt_state[0])
194
+ opt_state = tuple(opt_state)
195
+
196
+ train_state_load = TrainStateEma.create(model_def, params = flat, rng = rng, tx = tx, opt_state=opt_state)
197
+
198
+ # Need to replace the EMA params because we track a separate EMA copy.
+ log_param_shapes(train_state_load.params)
+ train_state_load = train_state_load.replace(params_ema=flat_ema) # .replace() returns a new state; assign it.
201
+
202
+ start_step = train_state_load.step
203
+
204
+ ts = train_state_load
205
+
206
+
207
+ return ts
208
+
209
+ rng = jax.random.PRNGKey(FLAGS.seed)
210
+ train_state_shape = jax.eval_shape(init, rng)
211
+
212
+ data_sharding, train_state_sharding, no_shard, shard_data, global_to_local = create_sharding(FLAGS.model.sharding, train_state_shape)
213
+ train_state = jax.jit(init, out_shardings=train_state_sharding)(rng)
214
+ jax.debug.visualize_array_sharding(train_state.params['FinalLayer_0']['Dense_0']['kernel'])
215
+ jax.debug.visualize_array_sharding(train_state.params['TimestepEmbedder_1']['Dense_0']['kernel'])
216
+ jax.experimental.multihost_utils.assert_equal(train_state.params['TimestepEmbedder_1']['Dense_0']['kernel'])
217
+ start_step = 1
218
+
219
+ if False:#FLAGS.load_dir is not None:
220
+ cp = Checkpoint(FLAGS.load_dir)
221
+ replace_dict = cp.load_as_dict()['train_state']
222
+ del replace_dict['opt_state'] # Debug
223
+ train_state = train_state.replace(**replace_dict)
224
+ if FLAGS.wandb.run_id != "None": # If we are continuing a run.
225
+ start_step = train_state.step
226
+ train_state = jax.jit(lambda x : x, out_shardings=train_state_sharding)(train_state)
227
+ print("Loaded model with step", train_state.step)
228
+ train_state = train_state.replace(step=0)
229
+ jax.debug.visualize_array_sharding(train_state.params['FinalLayer_0']['Dense_0']['kernel'])
230
+ del cp
231
+
232
+ if FLAGS.model.train_type == 'progressive' or FLAGS.model.train_type == 'consistency-distillation':
233
+ train_state_teacher = jax.jit(lambda x : x, out_shardings=train_state_sharding)(train_state)
234
+ else:
235
+ train_state_teacher = None
236
+
237
+ visualize_labels = example_labels
238
+ visualize_labels = shard_data(visualize_labels)
239
+ visualize_labels = jax.experimental.multihost_utils.process_allgather(visualize_labels)
240
+ imagenet_labels = open('data/imagenet_labels.txt').read().splitlines()
241
+
242
+ ###################################
243
+ # Update Function
244
+ ###################################
245
+
246
+ @partial(jax.jit, out_shardings=(train_state_sharding, no_shard))
247
+ def update(train_state, train_state_teacher, images, labels, force_t=-1, force_dt=-1):
248
+ new_rng, targets_key, dropout_key, perm_key = jax.random.split(train_state.rng, 4)
249
+ info = {}
250
+
251
+ id_perm = jax.random.permutation(perm_key, images.shape[0])
252
+ images = images[id_perm]
253
+ labels = labels[id_perm]
254
+ images = jax.lax.with_sharding_constraint(images, data_sharding)
255
+ labels = jax.lax.with_sharding_constraint(labels, data_sharding)
256
+
257
+ if FLAGS.model['cfg_scale'] == 0: # For unconditional generation.
258
+ labels = jnp.ones(labels.shape[0], dtype=jnp.int32) * FLAGS.model['num_classes']
259
+
260
+ if FLAGS.model['train_type'] == 'naive':
261
+ from baselines.targets_naive import get_targets
262
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
263
+ elif FLAGS.model['train_type'] == 'shortcut':
264
+ from targets_shortcut import get_targets
265
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
266
+ elif FLAGS.model['train_type'] == 'progressive':
267
+ from baselines.targets_progressive import get_targets
268
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, train_state_teacher, images, labels, force_t, force_dt)
269
+ elif FLAGS.model['train_type'] == 'consistency-distillation':
270
+ from baselines.targets_consistency_distillation import get_targets
271
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, train_state_teacher, images, labels, force_t, force_dt)
272
+ elif FLAGS.model['train_type'] == 'consistency':
273
+ from baselines.targets_consistency_training import get_targets
274
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
275
+ elif FLAGS.model['train_type'] == 'livereflow':
276
+ from baselines.targets_livereflow import get_targets
277
+ x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
278
+
279
+ def loss_fn(grad_params):
280
+ v_prime, logvars, activations = train_state.call_model(x_t, t, dt_base, labels, train=True, rngs={'dropout': dropout_key}, params=grad_params, return_activations=True)
281
+ mse_v = jnp.mean((v_prime - v_t) ** 2, axis=(1, 2, 3))
282
+ loss = jnp.mean(mse_v)
283
+
284
+ info = {
285
+ 'loss': loss,
286
+ 'v_magnitude_prime': jnp.sqrt(jnp.mean(jnp.square(v_prime))),
287
+ **{'activations/' + k : jnp.sqrt(jnp.mean(jnp.square(v))) for k, v in activations.items()},
288
+ }
289
+
290
+ if FLAGS.model['train_type'] == 'shortcut' or FLAGS.model['train_type'] == 'livereflow':
291
+ bootstrap_size = FLAGS.batch_size // FLAGS.model['bootstrap_every']
292
+ info['loss_flow'] = jnp.mean(mse_v[bootstrap_size:])
293
+ info['loss_bootstrap'] = jnp.mean(mse_v[:bootstrap_size])
294
+
295
+ return loss, info
296
+
297
+ grads, new_info = jax.grad(loss_fn, has_aux=True)(train_state.params)
298
+ info = {**info, **new_info}
299
+ updates, new_opt_state = train_state.tx.update(grads, train_state.opt_state, train_state.params)
300
+ new_params = optax.apply_updates(train_state.params, updates)
301
+
302
+ info['grad_norm'] = optax.global_norm(grads)
303
+ info['update_norm'] = optax.global_norm(updates)
304
+ info['param_norm'] = optax.global_norm(new_params)
305
+ info['lr'] = lr_schedule(train_state.step)
306
+
307
+ train_state = train_state.replace(rng=new_rng, step=train_state.step + 1, params=new_params, opt_state=new_opt_state)
308
+ train_state = train_state.update_ema(FLAGS.model['target_update_rate'])
309
+ return train_state, info
310
+
311
+ if FLAGS.mode != 'train':
312
+ do_inference(FLAGS, train_state, None, dataset, dataset_valid, shard_data, vae_encode, vae_decode, update,
313
+ get_fid_activations, imagenet_labels, visualize_labels,
314
+ fid_from_stats, truth_fid_stats)
315
+ return
316
+
317
+ ###################################
318
+ # Train Loop
319
+ ###################################
320
+
321
+ for i in tqdm.tqdm(range(1 + start_step, FLAGS.max_steps + 1 + start_step),
322
+ smoothing=0.1,
323
+ dynamic_ncols=True):
324
+
325
+ # Sample data.
326
+ if not FLAGS.debug_overfit or i == 1:
327
+ batch_images, batch_labels = shard_data(*next(dataset))
328
+ if FLAGS.model.use_stable_vae and 'latent' not in FLAGS.dataset_name:
329
+ vae_rng, vae_key = jax.random.split(vae_rng)
330
+ batch_images = vae_encode(vae_key, batch_images)
331
+
332
+ # Train update.
333
+ train_state, update_info = update(train_state, train_state_teacher, batch_images, batch_labels)
334
+
335
+ if i % FLAGS.log_interval == 0 or i == 1:
336
+ update_info = jax.device_get(update_info)
337
+ update_info = jax.tree_map(lambda x: np.array(x), update_info)
338
+ update_info = jax.tree_map(lambda x: x.mean(), update_info)
339
+ train_metrics = {f'training/{k}': v for k, v in update_info.items()}
340
+
341
+ valid_images, valid_labels = shard_data(*next(dataset_valid))
342
+ if FLAGS.model.use_stable_vae and 'latent' not in FLAGS.dataset_name:
343
+ valid_images = vae_encode(vae_rng, valid_images)
344
+ _, valid_update_info = update(train_state, train_state_teacher, valid_images, valid_labels)
345
+ valid_update_info = jax.device_get(valid_update_info)
346
+ valid_update_info = jax.tree_map(lambda x: x.mean(), valid_update_info)
347
+ train_metrics['training/loss_valid'] = valid_update_info['loss']
348
+
349
+ if jax.process_index() == 0:
350
+ wandb.log(train_metrics, step=i)
351
+
352
+ if FLAGS.model['train_type'] == 'progressive':
353
+ num_sections = np.log2(FLAGS.model['denoise_timesteps']).astype(jnp.int32)
354
+ if i % (FLAGS.max_steps // num_sections) == 0:
355
+ train_state_teacher = jax.jit(lambda x : x, out_shardings=train_state_sharding)(train_state)
356
+
357
+ if i % FLAGS.eval_interval == 0:
358
+ eval_model(FLAGS, train_state, train_state_teacher, i, dataset, dataset_valid, shard_data, vae_encode, vae_decode, update,
359
+ get_fid_activations, imagenet_labels, visualize_labels,
360
+ fid_from_stats, truth_fid_stats)
361
+
362
+ if i % FLAGS.save_interval == 0 and FLAGS.save_dir is not None:
363
+ train_state_gather = jax.experimental.multihost_utils.process_allgather(train_state)
364
+ # This all-gather might be part of the reason the shape is odd.
365
+ if jax.process_index() == 0:
366
+ cp = Checkpoint(FLAGS.save_dir+str(train_state_gather.step+1), parallel=False)
367
+ cp.train_state = train_state_gather
368
+ cp.save()
369
+ del cp
370
+ del train_state_gather
371
+
372
+ if __name__ == '__main__':
373
+ app.run(main)
374
+
sharpness/helper_inference.py ADDED
@@ -0,0 +1,210 @@
1
+ import jax
2
+ import jax.experimental
3
+ import wandb
4
+ import jax.numpy as jnp
5
+ import numpy as np
6
+ import tqdm
7
+ import matplotlib.pyplot as plt
8
+ import os
9
+ from functools import partial
10
+ from absl import app, flags
11
+
12
+ flags.DEFINE_integer('inference_timesteps', 128, 'Number of timesteps for inference.')
13
+ flags.DEFINE_integer('inference_generations', 50000, 'Number of generations for inference.')
14
+ flags.DEFINE_float('inference_cfg_scale', 1.0, 'CFG scale for inference.')
15
+ # Note: we do a CFG sanity check below, but the model isn't really trained with CFG, so guidance isn't expected to work properly.
16
+
17
+ if False:
18
+ classes = np.load("classes.npz")
19
+ global_mean = jnp.load("global_mean.npy")
20
+ # classes loads as an NpzFile; convert it to a regular dict below.
21
+ classes = {key: classes[key] for key in classes.files}
22
+ classes["1000"] = global_mean
23
+ classes_array = jnp.array([classes[str(i)] for i in range(len(classes))])
24
+
25
+
26
+
27
+ def do_inference(
28
+ FLAGS,
29
+ train_state,
30
+ step,
31
+ dataset,
32
+ dataset_valid,
33
+ shard_data,
34
+ vae_encode,
35
+ vae_decode,
36
+ update,
37
+ get_fid_activations,
38
+ imagenet_labels,
39
+ visualize_labels,
40
+ fid_from_stats,
41
+ truth_fid_stats,
42
+ ):
43
+ with jax.spmd_mode('allow_all'):
44
+ global_device_count = jax.device_count()
45
+ key = jax.random.PRNGKey(42 + jax.process_index())
46
+ batch_images, batch_labels = next(dataset)
47
+ valid_images, valid_labels = next(dataset_valid)
48
+ if FLAGS.model.use_stable_vae:
49
+ batch_images = vae_encode(key, batch_images)
50
+ valid_images = vae_encode(key, valid_images)
51
+ batch_labels_sharded, valid_labels_sharded = shard_data(batch_labels, valid_labels)
52
+ labels_uncond = shard_data(jnp.ones(batch_labels.shape, dtype=jnp.int32) * FLAGS.model['num_classes']) # Null token
53
+ eps = jax.random.normal(key, batch_images.shape)
54
+
55
+ def process_img(img):
56
+ if FLAGS.model.use_stable_vae:
57
+ img = vae_decode(img[None])[0]
58
+ img = img * 0.5 + 0.5
59
+ img = jnp.clip(img, 0, 1)
60
+ img = np.array(img)
61
+ return img
62
+
63
+ # @partial(jax.jit, static_argnums=(5,))
64
+ def call_model(train_state, images, t, dt, labels, use_ema=True, perturbe = False):
65
+ if use_ema and FLAGS.model.use_ema:
66
+ call_fn = train_state.call_model_ema
67
+ else:
68
+ call_fn = train_state.call_model
69
+
70
+ key2 = jax.random.PRNGKey(0)
71
+ output = call_fn(images, t, dt, labels, train=False, rngs={"label": key2}, perturbe = perturbe)
72
+
73
+ return output
74
+
75
+ if FLAGS.mode == 'interpolate':
76
+ seed = 5
77
+ eps0 = jax.random.normal(jax.random.PRNGKey(seed), batch_images[0].shape)
78
+ eps1 = jax.random.normal(jax.random.PRNGKey(seed+1), batch_images[0].shape)
79
+ labels = jnp.ones(FLAGS.batch_size,).astype(jnp.int32) * 555
80
+ i = jnp.linspace(0, 1, FLAGS.batch_size)
81
+ i_neg = np.sqrt(1-i**2)
82
+ x = eps0[None] * i_neg[:, None, None, None] + eps1[None] * i[:, None, None, None]
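+ # The weights (i_neg, i) lie on the unit circle, so this interpolates between the two noise draws
+ # while keeping the combined sample at roughly unit variance (slerp-style).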
83
+ t_vector = jnp.full((FLAGS.batch_size, ), 0)
84
+ dt_vector = jnp.zeros_like(t_vector)
85
+ cfg_scale = FLAGS.inference_cfg_scale
86
+ v = call_model(train_state, x, t_vector, dt_vector, labels)
87
+ x = x + v * 1.0
88
+ x = vae_decode(x) # Image is in [-1, 1] space.
89
+ x_render = np.array(jax.experimental.multihost_utils.process_allgather(x))
90
+ os.makedirs(FLAGS.save_dir, exist_ok=True)
91
+ np.save(FLAGS.save_dir + f'/x_render.npy', x_render)
92
+ breakpoint()
93
+
94
+ denoise_timesteps = FLAGS.inference_timesteps
95
+ num_generations = FLAGS.inference_generations
96
+ cfg_scale = FLAGS.inference_cfg_scale
97
+ x0 = []
98
+ x1 = []
99
+ lab = []
100
+ x_render = []
101
+ activations = []
102
+ images_shape = batch_images.shape
103
+ print(f"Calc FID for CFG {cfg_scale} and denoise_timesteps {denoise_timesteps}")
104
+ for fid_it in tqdm.tqdm(range(num_generations // FLAGS.batch_size)):
105
+ key = jax.random.PRNGKey(42)
106
+ key = jax.random.fold_in(key, fid_it)
107
+ key = jax.random.fold_in(key, jax.process_index())
108
+ eps_key, label_key = jax.random.split(key)
109
+ x = jax.random.normal(eps_key, images_shape)
110
+
111
+ e = 0.30
112
+
113
+ labels = jax.random.randint(label_key, (images_shape[0],), 0, FLAGS.model.num_classes)
114
+
115
+ #from baselines.targets_naive import map_labels_to_classes
116
+ #x_cond = map_labels_to_classes(classes_array, labels) * (1-e) + e * x
117
+ #x_uncond = map_labels_to_classes(classes_array, labels_uncond) * (1-e) + e * x
118
+
119
+
120
+ x, labels = shard_data(x, labels)
121
+ x0.append(np.array(jax.experimental.multihost_utils.process_allgather(x)))
122
+ delta_t = 1.0 / denoise_timesteps
123
+ sigmas = []
124
+ for ti in range(denoise_timesteps + 1):
125
+ t = ti / denoise_timesteps # From x_0 (noise) to x_1 (data)
126
+ sigmas.append(t)
127
+ # This gives n + 1 sigma values, because both endpoints (t = 0 and t = 1) are included.
128
+ i = 0
129
+ for ti in range(denoise_timesteps):
130
+ t = ti / denoise_timesteps # From x_0 (noise) to x_1 (data)
131
+ t_vector = jnp.full((images_shape[0], ), t)
132
+ if FLAGS.model.train_type == 'naive':
133
+ dt_flow = np.log2(FLAGS.model['denoise_timesteps']).astype(jnp.int32)
134
+ dt_base = jnp.ones(images_shape[0], dtype=jnp.int32) * dt_flow # Smallest dt.
135
+ else: # shortcut
136
+ dt_flow = np.log2(denoise_timesteps).astype(jnp.int32)
137
+ dt_base = jnp.ones(images_shape[0], dtype=jnp.int32) * dt_flow
138
+ # print(dt_base)
139
+ t_vector, dt_base = shard_data(t_vector, dt_base)
140
+ if cfg_scale == 1:
141
+ v = call_model(train_state, x, t_vector, dt_base, labels, perturbe = True) # perturbe=True here really just means 'fully conditional'.
142
+ elif cfg_scale == 0:
143
+ v = call_model(train_state, x, t_vector, dt_base, labels_uncond)
144
+ else:
145
+ v_pred_uncond = call_model(train_state, x, t_vector, dt_base, labels_uncond)
146
+ v_pred_label = call_model(train_state, x, t_vector, dt_base, labels)
147
+ v = v_pred_uncond + cfg_scale * (v_pred_label - v_pred_uncond)
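+ # Classifier-free guidance: extrapolate from the unconditional velocity toward the conditional one by cfg_scale.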
148
+
149
+ if FLAGS.model.train_type == 'consistency':
150
+ eps = shard_data(jax.random.normal(jax.random.fold_in(eps_key, ti), images_shape))
151
+ x1pred = x + v * (1-t)
152
+ x = x1pred * (t+delta_t) + eps * (1-t-delta_t)
153
+ elif True:
154
+ x = x + v * delta_t # Euler sampling.
155
+ elif False:
156
+
157
+ def get_ancestral_step(t0, t1):
158
+ sigma_up = None
159
+ return 1 / (1 + ((t0 ** 2 * (t1 - 1) ** 4) / ((t0 - 1) ** 2 * t1 ** 4)) ** 0.5), sigma_up
160
+ # def flow_sample_sde_3(model, x, ts):
161
+ #for s, t in tqdm(zip(ts[:-1], ts[1:]), total=len(ts) - 1):
162
+ # dx = model(x, s)
163
+ # denoised = x + dx * (1 - s)
164
+ # noise = torch.randn_like(x)
165
+ # fac_1 = (s * (1 - t) ** 2) / ((1 - s) ** 2 * t)
166
+ # fac_2 = (t ** 2 - 2 * s * t ** 2 + s ** 2 * (2 * t - 1)) / ((1 - s) ** 2 * t)
167
+ # fac_3 = (1 - t) * (fac_2 / t) ** 0.5
168
+ # x = fac_1 * x + fac_2 * denoised + fac_3 * noise
169
+ #return x
170
+ #So our timesteps looks like 0, 1/128..
171
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
172
+ # Euler method
173
+ dt = sigma_down - sigmas[i]
174
+ #Naive up
175
+ sigma_up = sigmas[i+1] - dt
176
+
177
+ x = x + v * dt
178
+ if sigmas[i + 1] != 1.0:
179
+ x = x + jax.random.normal(eps_key, images_shape) * sigma_up * v
180
+
181
+ i += 1
182
+ x1.append(np.array(jax.experimental.multihost_utils.process_allgather(x)))
183
+ lab.append(np.array(jax.experimental.multihost_utils.process_allgather(labels)))
184
+ if FLAGS.model.use_stable_vae:
185
+ x = vae_decode(x) # Image is in [-1, 1] space.
186
+ if num_generations < 10000:
187
+ x_render.append(np.array(jax.experimental.multihost_utils.process_allgather(x)))
188
+ # Keep the rendered images (saved below) only when the generation count is small.
190
+ x = jax.image.resize(x, (x.shape[0], 299, 299, 3), method='bilinear', antialias=False)
191
+ x = jnp.clip(x, -1, 1)
192
+ acts = get_fid_activations(x)[..., 0, 0, :] # [devices, batch//devices, 2048]
193
+ acts = jax.experimental.multihost_utils.process_allgather(acts)
194
+ acts = np.array(acts)
195
+ activations.append(acts)
196
+
197
+ if jax.process_index() == 0:
198
+ activations = np.concatenate(activations, axis=0)
199
+ activations = activations.reshape((-1, activations.shape[-1]))
200
+ mu1 = np.mean(activations, axis=0)
201
+ sigma1 = np.cov(activations, rowvar=False)
202
+ fid = fid_from_stats(mu1, sigma1, truth_fid_stats['mu'], truth_fid_stats['sigma'])
203
+ print(f"FID is {fid}")
204
+ return
205
+
206
+ if FLAGS.save_dir is not None:
207
+ os.makedirs(FLAGS.save_dir, exist_ok=True)
208
+ x_render = np.concatenate(x_render, axis=0)
209
+ np.save(FLAGS.save_dir + f'/x_render.npy', x_render)
210
+
sharpness/model.py ADDED
@@ -0,0 +1,427 @@
1
+ import math
2
+ from typing import Any, Callable, Optional, Tuple, Type, Sequence, Union
3
+ import flax.linen as nn
4
+ import jax
5
+ import jax.numpy as jnp
6
+ from einops import rearrange
7
+
8
+ Array = Any
9
+ PRNGKey = Any
10
+ Shape = Tuple[int]
11
+ Dtype = Any
12
+
13
+ from math_utils import get_2d_sincos_pos_embed, modulate
14
+ from jax._src import core
15
+ from jax._src import dtypes
16
+ from jax._src.nn.initializers import _compute_fans
17
+
18
+ def xavier_uniform_pytorchlike():
19
+ def init(key, shape, dtype):
20
+ dtype = dtypes.canonicalize_dtype(dtype)
21
+ #named_shape = core.as_named_shape(shape)
22
+ if len(shape) == 2: # Dense, [in, out]
23
+ fan_in = shape[0]
24
+ fan_out = shape[1]
25
+ elif len(shape) == 4: # Conv, [k, k, in, out]. Assumes patch-embed style conv.
26
+ fan_in = shape[0] * shape[1] * shape[2]
27
+ fan_out = shape[3]
28
+ else:
29
+ raise ValueError(f"Invalid shape {shape}")
30
+
31
+ variance = 2 / (fan_in + fan_out)
32
+ scale = jnp.sqrt(3 * variance)
33
+ param = jax.random.uniform(key, shape, dtype, -1) * scale
34
+
35
+ return param
36
+ return init
37
+
38
+
39
+ class TrainConfig:
40
+ def __init__(self, dtype):
41
+ self.dtype = dtype
42
+ def kern_init(self, name='default', zero=False):
43
+ if zero or 'bias' in name:
44
+ return nn.initializers.constant(0)
45
+ return xavier_uniform_pytorchlike()
46
+ def default_config(self):
47
+ return {
48
+ 'kernel_init': self.kern_init(),
49
+ 'bias_init': self.kern_init('bias', zero=True),
50
+ 'dtype': self.dtype,
51
+ }
52
+
53
+ class TimestepEmbedder(nn.Module):
54
+ """
55
+ Embeds scalar timesteps into vector representations.
56
+ """
57
+ hidden_size: int
58
+ tc: TrainConfig
59
+ frequency_embedding_size: int = 256
60
+
61
+ @nn.compact
62
+ def __call__(self, t):
63
+ x = self.timestep_embedding(t)
64
+ x = nn.Dense(self.hidden_size, kernel_init=nn.initializers.normal(0.02),
65
+ bias_init=self.tc.kern_init('time_bias'), dtype=self.tc.dtype)(x)
66
+ x = nn.silu(x)
67
+ x = nn.Dense(self.hidden_size, kernel_init=nn.initializers.normal(0.02),
68
+ bias_init=self.tc.kern_init('time_bias'))(x)
69
+ return x
70
+
71
+ # t is between [0, 1].
72
+ def timestep_embedding(self, t, max_period=10000):
73
+ """
74
+ Create sinusoidal timestep embeddings.
75
+ :param t: a 1-D Tensor of N indices, one per batch element.
76
+ These may be fractional.
77
+ :param dim: the dimension of the output.
78
+ :param max_period: controls the minimum frequency of the embeddings.
79
+ :return: an (N, D) Tensor of positional embeddings.
80
+ """
81
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
82
+ t = jax.lax.convert_element_type(t, jnp.float32)
83
+ # t = t * max_period
84
+ dim = self.frequency_embedding_size
85
+ half = dim // 2
86
+ freqs = jnp.exp( -math.log(max_period) * jnp.arange(start=0, stop=half, dtype=jnp.float32) / half)
87
+ args = t[:, None] * freqs[None]
88
+ embedding = jnp.concatenate([jnp.cos(args), jnp.sin(args)], axis=-1)
89
+ embedding = embedding.astype(self.tc.dtype)
90
+ return embedding
91
+
92
+ class LabelEmbedder(nn.Module):
93
+ """
94
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
95
+ """
96
+ num_classes: int
97
+ hidden_size: int
98
+ tc: TrainConfig
99
+
100
+ @nn.compact
101
+ def __call__(self, labels):
102
+ embedding_table = nn.Embed(self.num_classes + 1, self.hidden_size,
103
+ embedding_init=nn.initializers.normal(0.02), dtype=self.tc.dtype)
104
+ embeddings = embedding_table(labels)
105
+ return embeddings
106
+
107
+ class PatchEmbed(nn.Module):
108
+ """ 2D Image to Patch Embedding """
109
+ patch_size: int
110
+ hidden_size: int
111
+ tc: TrainConfig
112
+ bias: bool = True
113
+
114
+ @nn.compact
115
+ def __call__(self, x):
116
+ B, H, W, C = x.shape
117
+ patch_tuple = (self.patch_size, self.patch_size)
118
+ num_patches = (H // self.patch_size)
119
+ x = nn.Conv(self.hidden_size, patch_tuple, patch_tuple, use_bias=self.bias, padding="VALID",
120
+ kernel_init=self.tc.kern_init('patch'), bias_init=self.tc.kern_init('patch_bias', zero=True),
121
+ dtype=self.tc.dtype)(x) # (B, P, P, hidden_size)
122
+ x = rearrange(x, 'b h w c -> b (h w) c', h=num_patches, w=num_patches)
123
+ return x
124
+
125
+ class MlpBlock(nn.Module):
126
+ """Transformer MLP / feed-forward block."""
127
+ mlp_dim: int
128
+ tc: TrainConfig
129
+ out_dim: Optional[int] = None
130
+ dropout_rate: float = None
131
+ train: bool = False
132
+
133
+ @nn.compact
134
+ def __call__(self, inputs):
135
+ """It's just an MLP, so the input shape is (batch, len, emb)."""
136
+ actual_out_dim = inputs.shape[-1] if self.out_dim is None else self.out_dim
137
+ x = nn.Dense(features=self.mlp_dim, **self.tc.default_config())(inputs)
138
+ x = nn.gelu(x)
139
+ x = nn.Dropout(rate=self.dropout_rate, deterministic=(not self.train))(x)
140
+ output = nn.Dense(features=actual_out_dim, **self.tc.default_config())(x)
141
+ output = nn.Dropout(rate=self.dropout_rate, deterministic=(not self.train))(output)
142
+ return output
143
+
144
+ def modulate(x, shift, scale):
145
+ # scale = jnp.clip(scale, -1, 1)
146
+ return x * (1 + scale[:, None]) + shift[:, None]
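+ # adaLN modulation: per-sample scale/shift broadcast over the token axis, e.g. x of shape
+ # (B, N, D) with shift/scale of shape (B, D) yields another (B, N, D) tensor.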
147
+
148
+ ################################################################################
149
+ # Core DiT Model #
150
+ #################################################################################
151
+
152
+ class DiTBlock(nn.Module):
153
+ """
154
+ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
155
+ """
156
+ hidden_size: int
157
+ num_heads: int
158
+ tc: TrainConfig
159
+ mlp_ratio: float = 4.0
160
+ dropout: float = 0.0
161
+ train: bool = False
162
+
163
+ # @functools.partial(jax.checkpoint, policy=jax.checkpoint_policies.nothing_saveable)
164
+ @nn.compact
165
+ def __call__(self, x, c):
166
+ # Calculate adaLn modulation parameters.
167
+ c = nn.silu(c)
168
+ c = nn.Dense(6 * self.hidden_size, **self.tc.default_config())(c)
169
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = jnp.split(c, 6, axis=-1)
170
+
171
+ # Attention Residual.
172
+ x_norm = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
173
+ x_modulated = modulate(x_norm, shift_msa, scale_msa)
174
+ channels_per_head = self.hidden_size // self.num_heads
175
+ k = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
176
+ q = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
177
+ v = nn.Dense(self.hidden_size, **self.tc.default_config())(x_modulated)
178
+ k = jnp.reshape(k, (k.shape[0], k.shape[1], self.num_heads, channels_per_head))
179
+ q = jnp.reshape(q, (q.shape[0], q.shape[1], self.num_heads, channels_per_head))
180
+ v = jnp.reshape(v, (v.shape[0], v.shape[1], self.num_heads, channels_per_head))
181
+ q = q / q.shape[3] # (1/d) scaling.
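+ # Note: this scales queries by 1/d (the per-head channel dim) rather than the more common 1/sqrt(d).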
182
+ w = jnp.einsum('bqhc,bkhc->bhqk', q, k) # [B, HW, HW, num_heads]
183
+ w = w.astype(jnp.float32)
184
+ w = nn.softmax(w, axis=-1)
185
+ y = jnp.einsum('bhqk,bkhc->bqhc', w, v) # [B, HW, num_heads, channels_per_head]
186
+ y = jnp.reshape(y, x.shape) # [B, H, W, C] (C = heads * channels_per_head)
187
+ attn_x = nn.Dense(self.hidden_size, **self.tc.default_config())(y)
188
+ x = x + (gate_msa[:, None] * attn_x)
189
+
190
+ # MLP Residual.
191
+ x_norm2 = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
192
+ x_modulated2 = modulate(x_norm2, shift_mlp, scale_mlp)
193
+ mlp_x = MlpBlock(mlp_dim=int(self.hidden_size * self.mlp_ratio), tc=self.tc,
194
+ dropout_rate=self.dropout, train=self.train)(x_modulated2)
195
+ x = x + (gate_mlp[:, None] * mlp_x)
196
+ return x
197
+
198
+ class FinalLayer(nn.Module):
199
+ """
200
+ The final layer of DiT.
201
+ """
202
+ patch_size: int
203
+ out_channels: int
204
+ hidden_size: int
205
+ tc: TrainConfig
206
+
207
+ @nn.compact
208
+ def __call__(self, x, c):
209
+ c = nn.silu(c)
210
+ c = nn.Dense(2 * self.hidden_size, kernel_init=self.tc.kern_init(zero=True),
211
+ bias_init=self.tc.kern_init('bias', zero=True), dtype=self.tc.dtype)(c)
212
+ shift, scale = jnp.split(c, 2, axis=-1)
213
+ x = nn.LayerNorm(use_bias=False, use_scale=False, dtype=self.tc.dtype)(x)
214
+ x = modulate(x, shift, scale)
215
+ x = nn.Dense(self.patch_size * self.patch_size * self.out_channels,
216
+ kernel_init=self.tc.kern_init('final', zero=True),
217
+ bias_init=self.tc.kern_init('final_bias', zero=True), dtype=self.tc.dtype)(x)
218
+ return x
219
+
220
+
221
+ import jax
222
+ import jax.numpy as jnp
223
+
224
+ def apply_label_embedding_noise(key, label_embeddings):
225
+ """
226
+ Applies Gaussian noise to label embeddings based on specified probabilities.
227
+
228
+ Args:
229
+ key: A JAX random key.
230
+ label_embeddings: A JAX array of shape (batch_size, embedding_dim),
231
+ representing the label embeddings.
232
+
233
+ Returns:
234
+ A tuple containing:
235
+ - noisy_label_embeddings: The label embeddings with noise applied.
236
+ - noise_levels: A JAX array of shape (batch_size,), indicating
+ the alpha value used for each sample (1.0 for no noise,
+ 0.0 for 100% noise, or a uniform sample for partial noise).
+ - key: the updated JAX random key (the input key after splitting).
239
+ """
240
+ batch_size, embedding_dim = label_embeddings.shape
241
+
242
+ # Split key for different random operations
243
+ key, noise_type_key, alpha_key, normal_key = jax.random.split(key, 4)
244
+
245
+ # Determine noise application type for each sample
246
+ # 0: 100% noise (alpha = 0)
247
+ # 1: Partial noise (alpha uniformly 0-1)
248
+ # 2: No noise (do nothing)
249
+ noise_type_choices = jax.random.choice(
250
+ noise_type_key,
251
+ a=jnp.array([0, 1, 2]),
252
+ shape=(batch_size,),
253
+ p=jnp.array([0.00, 0.10, 0.90])
254
+ )
255
+
256
+ # Initialize noise_levels to 1.0 (no noise)
257
+ noise_levels = jnp.ones(batch_size, dtype=label_embeddings.dtype)
258
+
259
+ # Generate alpha values for partial noise
260
+ sampled_alphas = jax.random.uniform(alpha_key, shape=(batch_size,), minval=0.0, maxval=1.0)
261
+
262
+ # Generate Gaussian noise for the entire batch
263
+ # We assume a standard deviation of 1 for the noise, you might want to adjust this.
264
+ gaussian_noise = jax.random.normal(normal_key, shape=label_embeddings.shape)
265
+
266
+ # Initialize noisy_label_embeddings
267
+ noisy_label_embeddings = label_embeddings
268
+
269
+ # Apply 100% noise
270
+ cond_100_percent_noise = (noise_type_choices == 0)
271
+ noisy_label_embeddings = jnp.where(
272
+ cond_100_percent_noise[:, None], # Expand dim for broadcasting
273
+ gaussian_noise,
274
+ noisy_label_embeddings
275
+ )
276
+ noise_levels = jnp.where(cond_100_percent_noise, 0.0, noise_levels)
277
+
278
+ # Apply partial noise
279
+ cond_partial_noise = (noise_type_choices == 1)
280
+ # Reshape sampled_alphas for broadcasting
281
+ alpha_reshaped = sampled_alphas[:, None]
282
+ noisy_label_embeddings = jnp.where(
283
+ cond_partial_noise[:, None],
284
+ label_embeddings * alpha_reshaped + gaussian_noise * (1.0 - alpha_reshaped),
285
+ noisy_label_embeddings
286
+ )
287
+ noise_levels = jnp.where(cond_partial_noise, sampled_alphas, noise_levels)
288
+
289
+ # For cond_no_noise (noise_type_choices == 2), noisy_label_embeddings remains
290
+ # label_embeddings and noise_levels remains 1.0, so no specific action needed.
291
+ return noisy_label_embeddings, noise_levels, key
292
+
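+ # Minimal usage sketch (illustrative): for label embeddings ye of shape (B, D),
+ #   noisy_ye, levels, key = apply_label_embedding_noise(jax.random.PRNGKey(0), ye)
+ # levels is (B,) with 1.0 = clean embedding, 0.0 = pure noise, values in between = partial mixing.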
293
+ class DiT(nn.Module):
294
+ """
295
+ Diffusion model with a Transformer backbone.
296
+ """
297
+ patch_size: int
298
+ hidden_size: int
299
+ depth: int
300
+ num_heads: int
301
+ mlp_ratio: float
302
+ out_channels: int
303
+ class_dropout_prob: float
304
+ num_classes: int
305
+ ignore_dt: bool = False
306
+ dropout: float = 0.0
307
+ dtype: Dtype = jnp.bfloat16
308
+
309
+ @nn.compact
310
+ def __call__(self, x, t, dt, y, train=False, return_activations=False, perturbe = True):
311
+ # (x = (B, H, W, C) image, t = (B,) timesteps, y = (B,) class labels)
312
+ print("DiT: Input of shape", x.shape, "dtype", x.dtype)
313
+ activations = {}
314
+
315
+ key = self.make_rng("label")
316
+
317
+ batch_size = x.shape[0]
318
+ input_size = x.shape[1]
319
+ in_channels = x.shape[-1]
320
+ num_patches = (input_size // self.patch_size) ** 2
321
+ num_patches_side = input_size // self.patch_size
322
+ tc = TrainConfig(dtype=self.dtype)
323
+
324
+ if self.ignore_dt:
325
+ dt = jnp.zeros_like(t)
326
+
327
+ # pos_embed = self.param("pos_embed", get_2d_sincos_pos_embed, self.hidden_size, num_patches)
328
+ # pos_embed = jax.lax.stop_gradient(pos_embed)
329
+ pos_embed = get_2d_sincos_pos_embed(None, self.hidden_size, num_patches)
330
+ x = PatchEmbed(self.patch_size, self.hidden_size, tc=tc)(x) # (B, num_patches, hidden_size)
331
+ print("DiT: After patch embed, shape is", x.shape, "dtype", x.dtype)
332
+ activations['patch_embed'] = x
333
+
334
+ x = x + pos_embed
335
+ x = x.astype(self.dtype)
336
+ te = TimestepEmbedder(self.hidden_size, tc=tc)(t) # (B, hidden_size)
337
+ dte = TimestepEmbedder(self.hidden_size, tc=tc)(dt) # (B, hidden_size)
338
+ ye = LabelEmbedder(self.num_classes, self.hidden_size, tc=tc)(y) # (B, hidden_size)
339
+
340
+
341
+
342
+ # ye_g = TimestepEmbedder(self.hidden_size,tc=tc)
343
+ # CFG-free conditioning: the CFG label-dropout probability is effectively 0 during training.
+ # Instead, we apply Gaussian noise to the label embeddings and condition the model on the noise level.
+
+ # The "perturbed" sampling variant then runs CFG between two conditional passes that differ only in
+ # condition_amount: zeros (fully noised labels) for one pass and ones (clean labels) for the other.
+ # (Open question: how to indicate training mode here. Maybe -1?)
+
+ # Below: condition the forward pass on that noise level.
354
+
355
+ def adjust_condition_amount(train, peturbe, condition_amount):
356
+ def true_fn(_):
357
+ return jnp.ones_like(condition_amount) # peturbe is True → ones
358
+
359
+ def false_fn(_):
360
+ return jnp.zeros_like(condition_amount) # peturbe is False → zeros
361
+
362
+ def train_false_branch(_):
363
+ return jax.lax.cond(peturbe, true_fn, false_fn, operand=None)
364
+
365
+ def train_true_branch(_):
366
+ return condition_amount # leave it unchanged during training
367
+
368
+ return jax.lax.cond(train, train_true_branch, train_false_branch, operand=None)
369
+
370
+ #When perturbe is true, we return ones = no noise
371
+ #When false, return zeros = full noise.
372
+ #For NON training, we don't want to actually modify the labels, only the conditioning.
373
+ #So default during training is apply
374
+ def apply_fn(key, ye, train):
375
+ def true_branch(args):
376
+ key, ye = args
377
+ ye_new, condition_amount, key_new = apply_label_embedding_noise(key, ye)
378
+ return ye_new.astype(jnp.float32), condition_amount, key_new
379
+
380
+ def false_branch(args):
381
+ key, ye = args
382
+ ye_new, condition_amount, key_new = apply_label_embedding_noise(key, ye)
383
+ return ye.astype(jnp.float32), condition_amount, key_new
384
+
385
+ return jax.lax.cond(train, true_branch, false_branch, (key, ye))
386
+
387
+ print("train is", train)#False
388
+ print("perturbe is", perturbe)#False right now (it's getting passed properly)
389
+ print("initial ye", ye[0][0:10])
390
+ ye, condition_amount, key = apply_fn(key, ye, train)
391
+ print("new ye", ye[0][0:10])
392
+ print("condition amount", condition_amount)
393
+ condition_amount = adjust_condition_amount(train, perturbe, condition_amount)
394
+ print("adjusted", condition_amount)
395
+
396
+
397
+ ye_g = TimestepEmbedder(self.hidden_size, tc=tc)(condition_amount)
398
+
399
+ c = te + ye + dte + ye_g
400
+
401
+
402
+ activations['pos_embed'] = pos_embed
403
+ activations['time_embed'] = te
404
+ activations['dt_embed'] = dte
405
+ activations['label_embed'] = ye
406
+ activations['conditioning'] = c
407
+
408
+ print("DiT: Patch Embed of shape", x.shape, "dtype", x.dtype)
409
+ print("DiT: Conditioning of shape", c.shape, "dtype", c.dtype)
410
+ for i in range(self.depth):
411
+ x = DiTBlock(self.hidden_size, self.num_heads, tc, self.mlp_ratio, self.dropout, train)(x, c)
412
+ activations[f'dit_block_{i}'] = x
413
+ x = FinalLayer(self.patch_size, self.out_channels, self.hidden_size, tc)(x, c) # (B, num_patches, p*p*c)
414
+ activations['final_layer'] = x
415
+ # print("DiT: FinalLayer of shape", x.shape, "dtype", x.dtype)
416
+ x = jnp.reshape(x, (batch_size, num_patches_side, num_patches_side,
417
+ self.patch_size, self.patch_size, self.out_channels))
418
+ x = jnp.einsum('bhwpqc->bhpwqc', x)
419
+ x = rearrange(x, 'B H P W Q C -> B (H P) (W Q) C', H=int(num_patches_side), W=int(num_patches_side))
420
+ assert x.shape == (batch_size, input_size, input_size, self.out_channels)
421
+
422
+ t_discrete = jnp.floor(t * 256).astype(jnp.int32)
423
+ logvars = nn.Embed(256, 1, embedding_init=nn.initializers.constant(0))(t_discrete) * 100
424
+
425
+ if return_activations:
426
+ return x, logvars, activations
427
+ return x#, dte, te