| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import os |
|
|
| import nemo_run as run |
| from huggingface_hub import snapshot_download |
| from nemo.collections import llm |
| from nemo.collections.diffusion.models.model import DiT7BConfig, DiT14BConfig |
| from nemo.collections.diffusion.train import pretrain, videofolder_datamodule |
| from nemo.lightning.pytorch.strategies.utils import RestoreConfig |
|
|
|
|
@run.cli.factory(target=llm.train)
def cosmos_diffusion_7b_text2world_finetune() -> run.Partial:
    """Build a finetuning recipe for the Cosmos-1.0 Diffusion 7B Text2World model.

    Starts from the generic diffusion ``pretrain()`` recipe, swaps in the 7B DiT
    config, and configures parallelism, FSDP sharding, the video-folder data
    module, and checkpoint restoration from the Hugging Face Hub snapshot.

    Returns:
        run.Partial: A partially-applied ``llm.train`` invocation ready for
        ``nemo_run`` to execute.
    """
    # Base recipe with the 7B model configuration.
    recipe = pretrain()
    recipe.model.config = run.Config(DiT7BConfig)

    # Trainer setup: short finetuning run with a conservative learning rate.
    recipe.trainer.max_steps = 1000
    recipe.optim.config.lr = 1e-6

    # Tensor parallelism across 8 GPUs with sequence parallelism enabled;
    # async checkpoint saving disabled for deterministic save behavior.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.sequence_parallel = True
    recipe.trainer.strategy.ckpt_async_save = False

    # Megatron FSDP: shard model and optimizer states across data-parallel
    # ranks, overlapping communication with compute. CPU initialization avoids
    # GPU OOM while materializing the full model before sharding.
    recipe.trainer.strategy.ddp.with_megatron_fsdp_code_path = True
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "MODEL_AND_OPTIMIZER_STATES"
    recipe.trainer.strategy.ddp.overlap_param_gather = True
    recipe.trainer.strategy.ddp.overlap_grad_reduce = True
    recipe.model.config.use_cpu_initialization = True

    # Data module: path intentionally left empty — the user must supply the
    # dataset directory (e.g. via the CLI) before launching.
    recipe.data = videofolder_datamodule()
    recipe.data.path = ""

    # Restore pretrained weights from the Hub snapshot (NeMo-format subfolder);
    # do not auto-resume from prior local runs.
    recipe.resume.restore_config = run.Config(RestoreConfig, load_artifacts=False)
    recipe.resume.restore_config.path = os.path.join(
        snapshot_download("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", allow_patterns=["nemo/*"]), "nemo"
    )
    recipe.resume.resume_if_exists = False

    # Experiment output directory.
    # BUG FIX: original set `recipe.log_log_dir`, which creates an unused
    # attribute on the recipe object; the logger's directory is `recipe.log.log_dir`.
    recipe.log.log_dir = "nemo_experiments/cosmos_diffusion_7b_text2world_finetune"

    return recipe
|
|
|
|
@run.cli.factory(target=llm.train)
def cosmos_diffusion_14b_text2world_finetune() -> run.Partial:
    """Build a finetuning recipe for the Cosmos-1.0 Diffusion 14B Text2World model.

    Mirrors the 7B recipe but uses the 14B DiT config and additionally enables
    full activation recomputation to fit the larger model in memory.

    Returns:
        run.Partial: A partially-applied ``llm.train`` invocation ready for
        ``nemo_run`` to execute.
    """
    # Base recipe with the 14B model configuration.
    recipe = pretrain()
    recipe.model.config = run.Config(DiT14BConfig)

    # Trainer setup: short finetuning run with a conservative learning rate.
    recipe.trainer.max_steps = 1000
    recipe.optim.config.lr = 1e-6

    # Tensor parallelism across 8 GPUs with sequence parallelism enabled;
    # async checkpoint saving disabled for deterministic save behavior.
    recipe.trainer.strategy.tensor_model_parallel_size = 8
    recipe.trainer.strategy.sequence_parallel = True
    recipe.trainer.strategy.ckpt_async_save = False

    # Megatron FSDP: shard model and optimizer states across data-parallel
    # ranks, overlapping communication with compute. CPU initialization avoids
    # GPU OOM while materializing the full model before sharding.
    recipe.trainer.strategy.ddp.with_megatron_fsdp_code_path = True
    recipe.trainer.strategy.ddp.data_parallel_sharding_strategy = "MODEL_AND_OPTIMIZER_STATES"
    recipe.trainer.strategy.ddp.overlap_param_gather = True
    recipe.trainer.strategy.ddp.overlap_grad_reduce = True
    recipe.model.config.use_cpu_initialization = True

    # Full activation recomputation (uniform, every layer) to trade compute
    # for memory — needed for the 14B model, unlike the 7B recipe.
    recipe.model.config.recompute_granularity = "full"
    recipe.model.config.recompute_method = "uniform"
    recipe.model.config.recompute_num_layers = 1

    # Data module: path intentionally left empty — the user must supply the
    # dataset directory (e.g. via the CLI) before launching.
    recipe.data = videofolder_datamodule()
    recipe.data.path = ""

    # Restore pretrained weights from the Hub snapshot (NeMo-format subfolder);
    # do not auto-resume from prior local runs.
    recipe.resume.restore_config = run.Config(RestoreConfig, load_artifacts=False)
    recipe.resume.restore_config.path = os.path.join(
        snapshot_download("nvidia/Cosmos-1.0-Diffusion-14B-Text2World", allow_patterns=["nemo/*"]), "nemo"
    )
    recipe.resume.resume_if_exists = False

    # Experiment output directory.
    # BUG FIX: original set `recipe.log_log_dir`, which creates an unused
    # attribute on the recipe object; the logger's directory is `recipe.log.log_dir`.
    recipe.log.log_dir = "nemo_experiments/cosmos_diffusion_14b_text2world_finetune"

    return recipe
|
|
|
|
if __name__ == "__main__":
    # Launch the nemo_run CLI for `llm.train`; the 7B finetune recipe is the
    # default factory when no factory is selected on the command line.
    run.cli.main(llm.train, default_factory=cosmos_diffusion_7b_text2world_finetune)
|
|