KublaiKhan1 committed on
Commit
7ad2969
·
verified ·
1 Parent(s): 1703a82

Upload folder using huggingface_hub

1e-6_kl_naive_globalscale_channelmean/log.txt ADDED
The diff for this file is too large to render. See raw diff
 
1e-6_kl_naive_globalscale_channelmean/train.py ADDED
@@ -0,0 +1,504 @@
+ from typing import Any
+ import jax
+ import jax.numpy as jnp
+ import jax.experimental.multihost_utils  # Used below for process_allgather / assert_equal.
+ from absl import app, flags
+ from functools import partial
+ import numpy as np
+ import tqdm
+ import flax
+ import optax
+ import wandb
+ from ml_collections import config_flags
+ import ml_collections
+
+ from PIL import Image
+
+ from utils.wandb import setup_wandb, default_wandb_config
+ from utils.train_state import TrainStateEma
+ from utils.checkpoint import Checkpoint
+ from utils.stable_vae import StableVAE
+ from utils.my_vae import MyVAE
+ from utils.sharding import create_sharding, all_gather
+ from utils.datasets import get_dataset
+ from model import DiT
+ from helper_eval import eval_model
+ from helper_inference import do_inference
+
+ FLAGS = flags.FLAGS
+ flags.DEFINE_string('dataset_name', 'imagenet256', 'Dataset name.')
+ flags.DEFINE_string('load_dir', None, 'Checkpoint dir to load params from (if not None).')
+ flags.DEFINE_string('save_dir', './checkpoints/', 'Checkpoint dir to save params to (if not None).')
+ flags.DEFINE_string('fid_stats', None, 'FID stats file.')
+ flags.DEFINE_integer('seed', 10, 'Random seed.')  # Must be the same across all processes.
+ flags.DEFINE_integer('log_interval', 100, 'Logging interval.')
+ flags.DEFINE_integer('eval_interval', 1000000, 'Eval interval.')
+ flags.DEFINE_integer('save_interval', 10000, 'Save interval.')
+ flags.DEFINE_integer('batch_size', 512, 'Mini batch size.')
+ flags.DEFINE_integer('max_steps', int(500_000), 'Number of training steps.')
+ flags.DEFINE_integer('debug_overfit', 0, 'Debug overfitting.')
+ flags.DEFINE_string('mode', 'train', 'train or inference.')
+
+ model_config = ml_collections.ConfigDict({
+     'lr': 0.0001,
+     'beta1': 0.9,
+     'beta2': 0.999,
+     'weight_decay': 0.1,
+     'use_cosine': 0,
+     'warmup': 0,
+     'dropout': 0.0,
+     'hidden_size': 64,  # change this!
+     'patch_size': 8,  # change this!
+     'depth': 2,  # change this!
+     'num_heads': 2,  # change this!
+     'mlp_ratio': 1,  # change this!
+     'class_dropout_prob': 0.1,
+     'num_classes': 1000,
+     'denoise_timesteps': 128,
+     'cfg_scale': 4.0,
+     'target_update_rate': 0.999,
+     'use_ema': 0,
+     'use_stable_vae': 1,
+     'sharding': 'dp',  # dp or fsdp.
+     't_sampling': 'discrete-dt',
+     'dt_sampling': 'uniform',
+     'bootstrap_cfg': 0,
+     'bootstrap_every': 8,  # Make sure it's a divisor of the batch size.
+     'bootstrap_ema': 1,
+     'bootstrap_dt_bias': 0,
+     'train_type': 'shortcut',  # shortcut, naive, progressive, consistency-distillation, consistency, or livereflow.
+ })
+
+
+ wandb_config = default_wandb_config()
+ wandb_config.update({
+     'project': 'shortcut',
+     'name': 'shortcut_{dataset_name}',
+ })
+
+ config_flags.DEFINE_config_dict('wandb', wandb_config, lock_config=False)
+ config_flags.DEFINE_config_dict('model', model_config, lock_config=False)
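+
+ # Both config dicts are registered as ml_collections config flags, so individual
+ # entries can be overridden at launch, e.g. (hypothetical run):
+ #   python train.py --model.train_type=naive --model.lr=3e-4 --wandb.name=my_run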
+
+ ##############################################
+ ## Training Code.
+ ##############################################
+ def main(_):
+     np.random.seed(FLAGS.seed)
+     print("Using devices", jax.local_devices())
+     device_count = len(jax.local_devices())
+     global_device_count = jax.device_count()
+     print("Device count", device_count)
+     print("Global device count", global_device_count)
+     local_batch_size = FLAGS.batch_size // (global_device_count // device_count)
+     print("Global Batch: ", FLAGS.batch_size)
+     print("Node Batch: ", local_batch_size)
+     print("Device Batch:", local_batch_size // device_count)
+
+     # Create wandb logger.
+     if jax.process_index() == 0 and FLAGS.mode == 'train':
+         setup_wandb(FLAGS.model.to_dict(), **FLAGS.wandb)
+
+     dataset = get_dataset(FLAGS.dataset_name, local_batch_size, True, FLAGS.debug_overfit)
+     dataset_valid = get_dataset(FLAGS.dataset_name, local_batch_size, False, FLAGS.debug_overfit)
+     example_obs, example_labels = next(dataset)
+     test_data = example_obs[:4]
+     example_obs = example_obs[:1]
+
+     example_obs_shape = example_obs.shape
+
+     if FLAGS.model.use_stable_vae:
+         # vae = StableVAE.create()
+         print("creating model")
+         # Create the VAE from an example image batch.
+         vae = MyVAE.create(example_obs)
+         print("model done")
+         if 'latent' in FLAGS.dataset_name:
+             example_obs = example_obs[:, :, :, example_obs.shape[-1] // 2:]
+             example_obs_shape = example_obs.shape
+         else:
+             # Need to expand the obs shape and repeat, since the jitted VAE
+             # expects a leading device axis.
+             pass
+         x = jnp.expand_dims(example_obs, axis=0)
+         # Repeat along the new leading axis to build a small device batch.
+         # x = jnp.repeat(x, repeats=4, axis=1)
+         x = jnp.repeat(x, repeats=4, axis=0)
+         print("Input to vae", x.shape)
+         example_obs, res = vae.encode(x)
+         print("output example shape", example_obs.shape)
+
+         vae_rng = jax.random.PRNGKey(42)
+
+         # How do we do this?
+         # vae_encode = jax.jit(vae.encode)
+         # vae_decode = jax.jit(vae.decode)
+         vae_encode = vae.encode
+         vae_decode = vae.decode
+
+         print("Test data shape", test_data.shape)  # (4, 256, 256, 3)?
+         # Save the first image as a sanity check.
+         first = test_data[0]
+         image = (first * 255).astype(np.uint8)
+         image = np.array(image)
+         img = Image.fromarray(image)
+         img.save("testimg.png")
+
+         # Needs expansion to a leading axis.
+         x = jnp.expand_dims(test_data, axis=0)
+         # So now we are (1, 4, 256, 256, 3); swap to (4, 1, 256, 256, 3).
+         x = jnp.swapaxes(x, 0, 1)
+         print("x shape", x.shape)
+         encoded, res = vae_encode(x)
+         print("encoded shape", encoded.shape)
+         # It's possible we want to compress this.
+         decoded = vae_decode(encoded)
+         print("image shape", decoded.shape)
+         # Encode, decode, and log a round-trip reconstruction.
+         decoded_img = decoded[0][0]
+         print("decoded img shape", decoded_img.shape)
+
+         image = (decoded_img * 255).astype(np.uint8)
+         image = np.array(image)
+         img = Image.fromarray(image)
+         img.save("decodedimg.png")
+
+         # Recompute the example shape from the encoded latents.
+         example_obs = example_obs.squeeze()
+         print("obs shape", example_obs.shape)
+         example_obs_shape = example_obs.shape
+     if FLAGS.fid_stats is not None:
+         from utils.fid import get_fid_network, fid_from_stats
+         get_fid_activations = get_fid_network()
+         truth_fid_stats = np.load(FLAGS.fid_stats)
+     else:
+         get_fid_activations = None
+         truth_fid_stats = None
+         fid_from_stats = None  # Otherwise undefined when passed to eval/inference below.
+
+     ###################################
+     # Create the model and put it on devices.
+     ###################################
+     FLAGS.model.image_channels = example_obs_shape[-1]
+     FLAGS.model.image_size = example_obs_shape[1]
+     dit_args = {
+         'patch_size': FLAGS.model['patch_size'],
+         'hidden_size': FLAGS.model['hidden_size'],
+         'depth': FLAGS.model['depth'],
+         'num_heads': FLAGS.model['num_heads'],
+         'mlp_ratio': FLAGS.model['mlp_ratio'],
+         'out_channels': example_obs_shape[-1],
+         'class_dropout_prob': FLAGS.model['class_dropout_prob'],
+         'num_classes': FLAGS.model['num_classes'],
+         'dropout': FLAGS.model['dropout'],
+         'ignore_dt': FLAGS.model['train_type'] not in ('shortcut', 'livereflow'),
+     }
+     model_def = DiT(**dit_args)
+     tabulate_fn = flax.linen.tabulate(model_def, jax.random.PRNGKey(0))
+     print(tabulate_fn(example_obs, jnp.zeros((1,)), jnp.zeros((1,)), jnp.zeros((1,), dtype=jnp.int32)))
+
+     if FLAGS.model.use_cosine:
+         lr_schedule = optax.warmup_cosine_decay_schedule(0.0, FLAGS.model['lr'], FLAGS.model['warmup'], FLAGS.max_steps)
+     elif FLAGS.model.warmup > 0:
+         lr_schedule = optax.linear_schedule(0.0, FLAGS.model['lr'], FLAGS.model['warmup'])
+     else:
+         lr_schedule = lambda x: FLAGS.model['lr']
+     adam = optax.adamw(learning_rate=lr_schedule, b1=FLAGS.model['beta1'], b2=FLAGS.model['beta2'], weight_decay=FLAGS.model['weight_decay'])
+     tx = optax.chain(adam)
+     start_step = 1
+
+     def log_param_shapes(params, label=""):
+         # Print every parameter shape and return a copy with leading size-1
+         # axes squeezed out.
+         flat = flax.traverse_util.flatten_dict(params)
+         squeezed_flat = {k: jnp.squeeze(v, axis=0) for k, v in flat.items() if v.shape[0] == 1}
+         print(f"\n{label} parameter shapes:")
+         for k, v in flat.items():
+             print(f"{k}: {v.shape}")
+         return flax.traverse_util.unflatten_dict(squeezed_flat)
+
+     def init(rng):
+         param_key, dropout_key, dropout2_key = jax.random.split(rng, 3)
+         example_t = jnp.zeros((1,))
+         example_dt = jnp.zeros((1,))
+         example_label = jnp.zeros((1,), dtype=jnp.int32)
+         example_obs = jnp.zeros(example_obs_shape)
+         model_rngs = {'params': param_key, 'label_dropout': dropout_key, 'dropout': dropout2_key}
+         params = model_def.init(model_rngs, example_obs, example_t, example_dt, example_label)['params']
+         opt_state = tx.init(params)
+         ts = TrainStateEma.create(model_def, params, rng=rng, tx=tx, opt_state=opt_state)
+
+         if FLAGS.load_dir is not None:
+             cp = Checkpoint(FLAGS.load_dir)
+             train_state_load = cp.load_as_dict()["train_state"]
+             start_step = train_state_load["step"]  # Local to init; the outer start_step is not updated.
+
+             log_param_shapes(ts.params)
+             flat = log_param_shapes(train_state_load["params"])
+             flat_ema = log_param_shapes(train_state_load["params_ema"])
+             flat_mu = log_param_shapes(train_state_load["opt_state"][0][0].mu)
+             flat_nu = log_param_shapes(train_state_load["opt_state"][0][0].nu)
+
+             # Rebuild the Adam state around the squeezed mu/nu pytrees.
+             from optax import ScaleByAdamState
+             opt_state = train_state_load["opt_state"]
+             new_state = ScaleByAdamState(
+                 opt_state[0][0].count,
+                 mu=flat_mu,
+                 nu=flat_nu,
+             )
+             opt_state = list(opt_state)
+             opt_state[0] = list(opt_state[0])
+             opt_state[0][0] = new_state
+             opt_state[0] = tuple(opt_state[0])
+             opt_state = tuple(opt_state)
+
+             train_state_load = TrainStateEma.create(model_def, params=flat, rng=rng, tx=tx, opt_state=opt_state)
+             train_state_load = train_state_load.replace(step=start_step)
+             # Need to replace the EMA params because we keep a separate EMA copy.
+             log_param_shapes(train_state_load.params)
+             train_state_load = train_state_load.replace(params_ema=flat_ema)
+             ts = train_state_load
+
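+         # Note on the squeezes above (inferred from the save path at the bottom
+         # of this file, not guaranteed): checkpoints are gathered with
+         # process_allgather before saving, which can stack a leading axis onto
+         # every leaf, e.g. (1, 64, 256) instead of (64, 256). log_param_shapes
+         # drops that axis so the loaded leaves match the freshly initialized model.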
+         return ts
+
+     rng = jax.random.PRNGKey(FLAGS.seed)
+     train_state_shape = jax.eval_shape(init, rng)
+
+     data_sharding, train_state_sharding, no_shard, shard_data, global_to_local = create_sharding(FLAGS.model.sharding, train_state_shape)
+     train_state = jax.jit(init, out_shardings=train_state_sharding)(rng)
+
+     # We can only visualize here if the params are squeezed... which might cause errors later?
+     jax.debug.visualize_array_sharding(train_state.params['FinalLayer_0']['Dense_0']['kernel'])
+     jax.debug.visualize_array_sharding(train_state.params['TimestepEmbedder_1']['Dense_0']['kernel'])
+     jax.experimental.multihost_utils.assert_equal(train_state.params['TimestepEmbedder_1']['Dense_0']['kernel'])
+
+
+     if False:  # FLAGS.load_dir is not None:  (disabled alternative load path)
+         cp = Checkpoint(FLAGS.load_dir)
+         replace_dict = cp.load_as_dict()['train_state']
+
+         log_param_shapes(train_state.params, "Before load")
+         train_state = train_state.replace(**replace_dict)
+
+         flat = log_param_shapes(train_state.params, "Before squeeze")
+         train_state = train_state.replace(params=flat)
+         log_param_shapes(flat, "after squeeze")
+
+         flat_ema = log_param_shapes(train_state.params_ema, "before ema")
+         train_state = train_state.replace(params_ema=flat_ema)
+         log_param_shapes(flat_ema, "after squeeze")
+         print(train_state.step)
+         exit()
+
+         if FLAGS.wandb.run_id != "None":  # If we are continuing a run.
+             start_step = train_state.step
+             train_state = jax.jit(lambda x: x, out_shardings=train_state_sharding)(train_state)
+             print("Loaded model with step", train_state.step)
+
+         train_state = train_state.replace(step=0)
+         jax.debug.visualize_array_sharding(train_state.params['FinalLayer_0']['Dense_0']['kernel'])
+         del cp
+
+     if FLAGS.model.train_type in ('progressive', 'consistency-distillation'):
+         train_state_teacher = jax.jit(lambda x: x, out_shardings=train_state_sharding)(train_state)
+     else:
+         train_state_teacher = None
+
+     visualize_labels = example_labels
+     visualize_labels = shard_data(visualize_labels)
+     visualize_labels = jax.experimental.multihost_utils.process_allgather(visualize_labels)
+     imagenet_labels = open('data/imagenet_labels.txt').read().splitlines()
+
+     ###################################
+     # Update Function
+     ###################################
+
+     @partial(jax.jit, out_shardings=(train_state_sharding, no_shard))
+     def update(train_state, train_state_teacher, images, labels, force_t=-1, force_dt=-1):
+         new_rng, targets_key, dropout_key, perm_key = jax.random.split(train_state.rng, 4)
+         info = {}
+
+         # Randomly permute the batch.
+         id_perm = jax.random.permutation(perm_key, images.shape[0])
+         images = images[id_perm]
+         labels = labels[id_perm]
+         images = jax.lax.with_sharding_constraint(images, data_sharding)
+         labels = jax.lax.with_sharding_constraint(labels, data_sharding)
+
+         if FLAGS.model['cfg_scale'] == 0:  # For unconditional generation.
+             labels = jnp.ones(labels.shape[0], dtype=jnp.int32) * FLAGS.model['num_classes']
+
+         if FLAGS.model['train_type'] == 'naive':
+             from baselines.targets_naive import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
+         elif FLAGS.model['train_type'] == 'shortcut':
+             from targets_shortcut import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
+         elif FLAGS.model['train_type'] == 'progressive':
+             from baselines.targets_progressive import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, train_state_teacher, images, labels, force_t, force_dt)
+         elif FLAGS.model['train_type'] == 'consistency-distillation':
+             from baselines.targets_consistency_distillation import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, train_state_teacher, images, labels, force_t, force_dt)
+         elif FLAGS.model['train_type'] == 'consistency':
+             from baselines.targets_consistency_training import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
+         elif FLAGS.model['train_type'] == 'livereflow':
+             from baselines.targets_livereflow import get_targets
+             x_t, v_t, t, dt_base, labels, info = get_targets(FLAGS, targets_key, train_state, images, labels, force_t, force_dt)
+
+         def loss_fn(grad_params):
+             v_prime, logvars, activations = train_state.call_model(x_t, t, dt_base, labels, train=True, rngs={'dropout': dropout_key}, params=grad_params, return_activations=True)
+             mse_v = jnp.mean((v_prime - v_t) ** 2, axis=(1, 2, 3))
+             loss = jnp.mean(mse_v)
+
+             info = {
+                 'loss': loss,
+                 'v_magnitude_prime': jnp.sqrt(jnp.mean(jnp.square(v_prime))),
+                 **{'activations/' + k: jnp.sqrt(jnp.mean(jnp.square(v))) for k, v in activations.items()},
+             }
+
+             if FLAGS.model['train_type'] in ('shortcut', 'livereflow'):
+                 # The first bootstrap_size examples carry bootstrap targets; the rest are flow targets.
+                 bootstrap_size = FLAGS.batch_size // FLAGS.model['bootstrap_every']
+                 info['loss_flow'] = jnp.mean(mse_v[bootstrap_size:])
+                 info['loss_bootstrap'] = jnp.mean(mse_v[:bootstrap_size])
+
+             return loss, info
+
+         grads, new_info = jax.grad(loss_fn, has_aux=True)(train_state.params)
+         info = {**info, **new_info}
+         updates, new_opt_state = train_state.tx.update(grads, train_state.opt_state, train_state.params)
+         new_params = optax.apply_updates(train_state.params, updates)
+
+         info['grad_norm'] = optax.global_norm(grads)
+         info['update_norm'] = optax.global_norm(updates)
+         info['param_norm'] = optax.global_norm(new_params)
+         info['lr'] = lr_schedule(train_state.step)
+
+         train_state = train_state.replace(rng=new_rng, step=train_state.step + 1, params=new_params, opt_state=new_opt_state)
+         train_state = train_state.update_ema(FLAGS.model['target_update_rate'])
+         return train_state, info
+
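+     # The get_targets helpers above live in separate modules and are not part of
+     # this diff. For orientation, a standard linear-interpolation flow-matching
+     # target (the construction the 'naive' path is named after) can be sketched
+     # as below. Illustrative and unused; not the repository's actual targets_naive.
+     def _naive_targets_sketch(key, images):
+         noise_key, t_key = jax.random.split(key)
+         x1 = images                                  # data sample
+         x0 = jax.random.normal(noise_key, x1.shape)  # Gaussian noise
+         t = jax.random.uniform(t_key, (x1.shape[0],))
+         tb = t[:, None, None, None]                  # broadcast t over H, W, C
+         x_t = (1 - tb) * x0 + tb * x1                # interpolant at time t
+         v_t = x1 - x0                                # velocity target
+         return x_t, v_t, t
+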
+     if FLAGS.mode != 'train':
+         print("doing the else")
+         # Sweep cfg scales and step counts for inference; 1.5 is already done.
+         cfgs = [1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]
+         steps = [128, 64, 32, 16, 8, 4, 2, 1]
+         for cfg in cfgs:
+             for step in steps:
+                 FLAGS.inference_timesteps = step
+                 FLAGS.inference_cfg_scale = cfg
+                 do_inference(FLAGS, train_state, None, dataset, dataset_valid, shard_data, vae_encode, vae_decode, update,
+                              get_fid_activations, imagenet_labels, visualize_labels,
+                              fid_from_stats, truth_fid_stats)
+         exit()
+
+         return
+
+     ###################################
+     # Train Loop
+     ###################################
+
+     for i in tqdm.tqdm(range(1 + start_step, FLAGS.max_steps + 1 + start_step),
+                        smoothing=0.1,
+                        dynamic_ncols=True):
+
+         # Sample data.
+         if not FLAGS.debug_overfit or i == 1:
+             batch_images, batch_labels = shard_data(*next(dataset))
+             if FLAGS.model.use_stable_vae and 'latent' not in FLAGS.dataset_name:
+                 vae_rng, vae_key = jax.random.split(vae_rng)
+                 # Split the batch over local devices for the jitted encoder.
+                 batch_images_reshaped = batch_images.reshape((len(jax.local_devices()), -1, *batch_images.shape[1:]))
+                 batch_images_reshaped, result_dict = vae_encode(batch_images_reshaped)  # e.g. (4, 128, 32, 32, 4)
+                 batch_images = batch_images_reshaped.reshape(-1, *batch_images_reshaped.shape[2:])
+                 # We don't sample from the VAE right now, so the key goes unused.
+
+                 # Normalize globally.
+                 mean = jnp.array([0.04621413, 0.00622245, -0.03867066, -0.12760854])
+                 std = jnp.array([1.1124766, 1.1514145, 1.1221403, 1.0895475])
+                 batch_images = (batch_images - mean) / std.mean()
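+                 # The normalization above is the "globalscale_channelmean" scheme
+                 # in the run folder name: each latent channel is centered by its
+                 # own mean, but all channels are divided by one shared scale,
+                 # std.mean(), preserving relative channel magnitudes. Presumably
+                 # sampling applies the inverse, x * std.mean() + mean, before
+                 # decoding; that code is outside this file.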
+
+         # Train update.
+         train_state, update_info = update(train_state, train_state_teacher, batch_images, batch_labels)
+
+         if i % FLAGS.log_interval == 0 or i == 1:
+             update_info = jax.device_get(update_info)
+             update_info = jax.tree_util.tree_map(lambda x: np.array(x), update_info)
+             update_info = jax.tree_util.tree_map(lambda x: x.mean(), update_info)
+             train_metrics = {f'training/{k}': v for k, v in update_info.items()}
+
+             # Also measure the loss on a validation batch.
+             valid_images, valid_labels = shard_data(*next(dataset_valid))
+             if FLAGS.model.use_stable_vae and 'latent' not in FLAGS.dataset_name:
+                 valid_images_reshaped = valid_images.reshape((len(jax.local_devices()), -1, *valid_images.shape[1:]))
+                 valid_images_reshaped, result_dict = vae_encode(valid_images_reshaped)
+                 valid_images = valid_images_reshaped.reshape(-1, *valid_images_reshaped.shape[2:])
+
+                 # Same global normalization as the training batch.
+                 mean = jnp.array([0.04621413, 0.00622245, -0.03867066, -0.12760854])
+                 std = jnp.array([1.1124766, 1.1514145, 1.1221403, 1.0895475])
+                 valid_images = (valid_images - mean) / std.mean()
+
+             _, valid_update_info = update(train_state, train_state_teacher, valid_images, valid_labels)
+             valid_update_info = jax.device_get(valid_update_info)
+             valid_update_info = jax.tree_util.tree_map(lambda x: x.mean(), valid_update_info)
+             train_metrics['training/loss_valid'] = valid_update_info['loss']
+
+             if jax.process_index() == 0:
+                 print(train_metrics)
+                 wandb.log(train_metrics, step=i)
+
+         if FLAGS.model['train_type'] == 'progressive':
+             # Periodically refresh the teacher with the current student.
+             num_sections = np.log2(FLAGS.model['denoise_timesteps']).astype(jnp.int32)
+             if i % (FLAGS.max_steps // num_sections) == 0:
+                 train_state_teacher = jax.jit(lambda x: x, out_shardings=train_state_sharding)(train_state)
+
+         if i % FLAGS.eval_interval == 0:
+             eval_model(FLAGS, train_state, train_state_teacher, i, dataset, dataset_valid, shard_data, vae_encode, vae_decode, update,
+                        get_fid_activations, imagenet_labels, visualize_labels,
+                        fid_from_stats, truth_fid_stats)
+
+         if i % FLAGS.save_interval == 0 and FLAGS.save_dir is not None:
+             train_state_gather = jax.experimental.multihost_utils.process_allgather(train_state)
+             if jax.process_index() == 0:
+                 cp = Checkpoint(FLAGS.save_dir + str(train_state_gather.step + 1), parallel=False)
+                 cp.train_state = train_state_gather
+                 cp.save()
+                 del cp
+             del train_state_gather
+
+
+ if __name__ == '__main__':
+     app.run(main)