taohu commited on
Commit
0839907
·
verified ·
1 Parent(s): f4f2bac

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +108 -0
  2. FastGen/.claude/settings.local.json +19 -0
  3. FastGen/.gitignore +180 -0
  4. FastGen/CLAUDE.md +7 -0
  5. FastGen/CONTRIBUTING.md +93 -0
  6. FastGen/Dockerfile +77 -0
  7. FastGen/LICENSE +202 -0
  8. FastGen/Makefile +29 -0
  9. FastGen/README.md +182 -0
  10. FastGen/SF.md +146 -0
  11. FastGen/assets/teaser.png +3 -0
  12. FastGen/fastgen/__init__.py +2 -0
  13. FastGen/fastgen/callbacks/README.md +59 -0
  14. FastGen/fastgen/callbacks/__init__.py +2 -0
  15. FastGen/fastgen/callbacks/callback.py +183 -0
  16. FastGen/fastgen/callbacks/ct_schedule.py +83 -0
  17. FastGen/fastgen/callbacks/ema.py +155 -0
  18. FastGen/fastgen/callbacks/forced_weight_norm.py +28 -0
  19. FastGen/fastgen/callbacks/gpu_mem_profiler.py +134 -0
  20. FastGen/fastgen/callbacks/gpu_stats.py +92 -0
  21. FastGen/fastgen/callbacks/grad_clip.py +222 -0
  22. FastGen/fastgen/callbacks/param_count.py +116 -0
  23. FastGen/fastgen/callbacks/train_profiler.py +138 -0
  24. FastGen/fastgen/callbacks/wandb.py +404 -0
  25. FastGen/fastgen/configs/README.md +108 -0
  26. FastGen/fastgen/configs/__init__.py +2 -0
  27. FastGen/fastgen/configs/callbacks.py +63 -0
  28. FastGen/fastgen/configs/config.py +254 -0
  29. FastGen/fastgen/configs/config_utils.py +317 -0
  30. FastGen/fastgen/configs/data.py +123 -0
  31. FastGen/fastgen/configs/data_dummy.py +67 -0
  32. FastGen/fastgen/configs/discriminator.py +106 -0
  33. FastGen/fastgen/configs/experiments/CogVideoX/config_dmd2.py +48 -0
  34. FastGen/fastgen/configs/experiments/CogVideoX/config_kd.py +30 -0
  35. FastGen/fastgen/configs/experiments/CogVideoX/config_sft.py +35 -0
  36. FastGen/fastgen/configs/experiments/CogVideoX/config_sft_5b.py +40 -0
  37. FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2.py +69 -0
  38. FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2_14b.py +37 -0
  39. FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2_v2w.py +18 -0
  40. FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft.py +48 -0
  41. FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft_14b.py +19 -0
  42. FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft_v2w.py +18 -0
  43. FastGen/fastgen/configs/experiments/DiT/config_mf_b.py +70 -0
  44. FastGen/fastgen/configs/experiments/DiT/config_mf_xl.py +26 -0
  45. FastGen/fastgen/configs/experiments/DiT/config_sft_dit_xl.py +68 -0
  46. FastGen/fastgen/configs/experiments/DiT/config_sft_sit_xl.py +75 -0
  47. FastGen/fastgen/configs/experiments/EDM/config_cm_cifar10.py +28 -0
  48. FastGen/fastgen/configs/experiments/EDM/config_cm_cifar10_fast.py +25 -0
  49. FastGen/fastgen/configs/experiments/EDM/config_cm_in64.py +65 -0
  50. FastGen/fastgen/configs/experiments/EDM/config_dmd2_cifar10.py +24 -0
.gitattributes CHANGED
@@ -33,3 +33,111 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ FastGen/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
37
+ FastGen/scripts/inference/examples/00_child_swings_rusty_swing_set.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ FastGen/scripts/inference/examples/00_child_swings_rusty_swing_set.png filter=lfs diff=lfs merge=lfs -text
39
+ FastGen/scripts/inference/examples/01_impressionist_rubber_duck_sunset.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ FastGen/scripts/inference/examples/01_impressionist_rubber_duck_sunset.png filter=lfs diff=lfs merge=lfs -text
41
+ FastGen/scripts/inference/examples/02_two_cyclists_stop_sign.mp4 filter=lfs diff=lfs merge=lfs -text
42
+ FastGen/scripts/inference/examples/02_two_cyclists_stop_sign.png filter=lfs diff=lfs merge=lfs -text
43
+ FastGen/scripts/inference/examples/03_astronaut_cow_beach_van_gogh.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ FastGen/scripts/inference/examples/03_astronaut_cow_beach_van_gogh.png filter=lfs diff=lfs merge=lfs -text
45
+ FastGen/scripts/inference/examples/04_bird_flying_church_tower_clock.mp4 filter=lfs diff=lfs merge=lfs -text
46
+ FastGen/scripts/inference/examples/04_bird_flying_church_tower_clock.png filter=lfs diff=lfs merge=lfs -text
47
+ checkpoints/Self-Forcing/vidprom_filtered_extended.txt filter=lfs diff=lfs merge=lfs -text
48
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/comp_effic.png filter=lfs diff=lfs merge=lfs -text
49
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/data_for_diff_stage.jpg filter=lfs diff=lfs merge=lfs -text
50
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/i2v_res.png filter=lfs diff=lfs merge=lfs -text
51
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/t2v_res.jpg filter=lfs diff=lfs merge=lfs -text
52
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/vben_1.3b_vs_sota.png filter=lfs diff=lfs merge=lfs -text
53
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/vben_vs_sota.png filter=lfs diff=lfs merge=lfs -text
54
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/video_dit_arch.jpg filter=lfs diff=lfs merge=lfs -text
55
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/assets/video_vae_res.jpg filter=lfs diff=lfs merge=lfs -text
56
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/examples/i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
57
+ hf_models/Wan2.1-T2V-1.3B-Diffusers/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ pip_wheels/accelerate-1.12.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
59
+ pip_wheels/anyio-4.12.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
60
+ pip_wheels/av-14.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
61
+ pip_wheels/babel-2.18.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
62
+ pip_wheels/beautifulsoup4-4.14.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
63
+ pip_wheels/bleach-6.3.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
64
+ pip_wheels/bokeh-3.8.2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
65
+ pip_wheels/boto3-1.42.39-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
66
+ pip_wheels/botocore-1.42.39-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
67
+ pip_wheels/certifi-2026.1.4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
68
+ pip_wheels/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl filter=lfs diff=lfs merge=lfs -text
69
+ pip_wheels/charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
70
+ pip_wheels/click-8.3.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
71
+ pip_wheels/contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
72
+ pip_wheels/debugpy-1.8.20-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
73
+ pip_wheels/diffusers-0.35.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
74
+ pip_wheels/fsspec-2026.1.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
75
+ pip_wheels/gitpython-3.1.46-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
76
+ pip_wheels/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
77
+ pip_wheels/huggingface_hub-0.36.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
78
+ pip_wheels/imageio-2.37.2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
79
+ pip_wheels/imageio_ffmpeg-0.6.0-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
80
+ pip_wheels/ipykernel-7.1.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
81
+ pip_wheels/ipython-9.10.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
82
+ pip_wheels/ipywidgets-8.1.8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
83
+ pip_wheels/jedi-0.19.2-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
84
+ pip_wheels/jinja2-3.1.6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
85
+ pip_wheels/jupyter_client-8.8.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
86
+ pip_wheels/jupyter_server-2.17.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
87
+ pip_wheels/jupyterlab-4.5.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
88
+ pip_wheels/jupyterlab_widgets-3.0.16-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
89
+ pip_wheels/lark-1.3.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
90
+ pip_wheels/moviepy-2.2.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
91
+ pip_wheels/mpmath-1.3.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
92
+ pip_wheels/narwhals-2.16.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
93
+ pip_wheels/nbconvert-7.17.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
94
+ pip_wheels/networkx-3.6.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
95
+ pip_wheels/notebook-7.5.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
96
+ pip_wheels/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
97
+ pip_wheels/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
98
+ pip_wheels/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
99
+ pip_wheels/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
100
+ pip_wheels/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
101
+ pip_wheels/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
102
+ pip_wheels/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
103
+ pip_wheels/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
104
+ pip_wheels/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
105
+ pip_wheels/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
106
+ pip_wheels/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
107
+ pip_wheels/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
108
+ pip_wheels/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
109
+ pip_wheels/opencv_python_headless-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
110
+ pip_wheels/pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
111
+ pip_wheels/parso-0.8.5-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
112
+ pip_wheels/pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl filter=lfs diff=lfs merge=lfs -text
113
+ pip_wheels/plotly-6.5.2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
114
+ pip_wheels/prompt_toolkit-3.0.52-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
115
+ pip_wheels/protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
116
+ pip_wheels/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
117
+ pip_wheels/pydantic-2.12.5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
118
+ pip_wheels/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
119
+ pip_wheels/pygments-2.19.2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
120
+ pip_wheels/python_dateutil-2.9.0.post0-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
121
+ pip_wheels/pytz-2025.2-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
122
+ pip_wheels/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
123
+ pip_wheels/pyzmq-26.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
124
+ pip_wheels/rdkit-2024.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
125
+ pip_wheels/regex-2026.1.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl filter=lfs diff=lfs merge=lfs -text
126
+ pip_wheels/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
127
+ pip_wheels/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
128
+ pip_wheels/scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl filter=lfs diff=lfs merge=lfs -text
129
+ pip_wheels/sentencepiece-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
130
+ pip_wheels/sentry_sdk-2.51.0-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
131
+ pip_wheels/setuptools-80.10.2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
132
+ pip_wheels/sympy-1.13.1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
133
+ pip_wheels/timm-1.0.24-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
134
+ pip_wheels/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
135
+ pip_wheels/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl filter=lfs diff=lfs merge=lfs -text
136
+ pip_wheels/torchvision-0.21.0-cp312-cp312-manylinux1_x86_64.whl filter=lfs diff=lfs merge=lfs -text
137
+ pip_wheels/tornado-6.5.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
138
+ pip_wheels/transformers-4.49.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
139
+ pip_wheels/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
140
+ pip_wheels/tzdata-2025.3-py2.py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
141
+ pip_wheels/urllib3-2.6.3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
142
+ pip_wheels/wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl filter=lfs diff=lfs merge=lfs -text
143
+ pip_wheels/widgetsnbextension-4.0.15-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
FastGen/.claude/settings.local.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(conda activate:*)",
5
+ "Bash(pip install:*)",
6
+ "Bash(source:*)",
7
+ "Bash(conda info:*)",
8
+ "Bash(pip cache:*)",
9
+ "Bash(python:*)",
10
+ "Bash(curl:*)",
11
+ "Bash(echo:*)",
12
+ "Bash(huggingface-cli download:*)",
13
+ "Bash(chmod:*)",
14
+ "Bash(bash:*)",
15
+ "Bash(ls:*)",
16
+ "Bash(huggingface-cli:*)"
17
+ ]
18
+ }
19
+ }
FastGen/.gitignore ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # macOS
2
+ .DS_Store
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/#use-with-ide
113
+ .pdm.toml
114
+
115
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
+ __pypackages__/
117
+
118
+ # Celery stuff
119
+ celerybeat-schedule
120
+ celerybeat.pid
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
164
+
165
+ # large files
166
+ *.zip
167
+
168
+ # temporal folder
169
+ tmp/
170
+
171
+ # credentials
172
+ credentials
173
+
174
+ # logs
175
+ FASTGEN_OUTPUT/
176
+ wandb/
177
+ *.err
178
+
179
+ # vscode
180
+ *.vscode/
FastGen/CLAUDE.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # FastGen Project Notes
2
+
3
+ ## Environment
4
+
5
+ - Conda environment: `sf`
6
+ - Activate: `conda activate sf`
7
+ - Install: `pip install -e .`
FastGen/CONTRIBUTING.md ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to FastGen
2
+
3
+ Thank you for your interest in contributing to FastGen!
4
+
5
+ ## Issue Tracking
6
+
7
+ * All enhancement, bugfix, or change requests should begin with the creation of an issue.
8
+ * The issue should be reviewed and approved prior to code review.
9
+
10
+ ## Development Setup
11
+
12
+ All development should be done using the provided Docker image to ensure a consistent environment.
13
+
14
+ ```bash
15
+ # Build the Docker image
16
+ docker build -t fastgen:dev .
17
+
18
+ # Run interactively with GPU support (as current user, not root)
19
+ docker run --gpus all -it --rm \
20
+ --user $(id -u):$(id -g) \
21
+ -e HOME=/workspace \
22
+ -v /etc/passwd:/etc/passwd:ro \
23
+ -v /etc/group:/etc/group:ro \
24
+ -v $(pwd):/workspace \
25
+ fastgen:dev bash
26
+
27
+ # Inside the container, install linters
28
+ make install
29
+ ```
30
+
31
+ ## Code Style and Formatting
32
+
33
+ FastGen uses **Ruff** for linting and code formatting. Follow existing conventions when adding new code.
34
+
35
+ ```bash
36
+ make format # Auto-format code
37
+ make lint # Check compliance
38
+ ```
39
+
40
+ **Note:** The `fastgen/third_party/` directory is excluded from checks.
41
+
42
+ ## Continuous Integration
43
+
44
+ The GitLab CI pipeline runs automatically on every push and merge request:
45
+
46
+ | Stage | Commands |
47
+ |-------|----------|
48
+ | Lint | `make lint`, `make mypy` |
49
+ | Test | `make pytest` |
50
+ | Install | `make install-fastgen` |
51
+
52
+ **Note:** Before submitting a pull request, ensure all checks pass locally.
53
+ ## Pull Request Process
54
+
55
+ 1. Fork the repository (for external contributors) or create a feature branch.
56
+ 2. Make your changes following the code style guidelines.
57
+ 3. Run all checks locally (see [CI section](#continuous-integration)).
58
+ 4. Commit your changes with sign-off (PRs with unsigned commits will not be accepted):
59
+ ```bash
60
+ git commit -s -m "Add feature X"
61
+ ```
62
+ 5. Push and create a Pull Request.
63
+
64
+ **Note:** Keep PRs focused on a single concern and reference related issues (e.g., "Fixes #123").
65
+
66
+
67
+ ## Developer Certificate of Origin
68
+
69
+ ```
70
+ Developer Certificate of Origin
71
+ Version 1.1
72
+
73
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
74
+ 1 Letterman Drive
75
+ Suite D4700
76
+ San Francisco, CA, 94129
77
+
78
+ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
79
+ ```
80
+
81
+ ```
82
+ Developer's Certificate of Origin 1.1
83
+
84
+ By making a contribution to this project, I certify that:
85
+
86
+ (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
87
+
88
+ (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
89
+
90
+ (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
91
+
92
+ (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
93
+ ```
FastGen/Dockerfile ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvcr.io/nvidia/pytorch:25.06-py3
2
+
3
+ # Defines the architecture (amd64 or arm64) automatically during build
4
+ ARG TARGETARCH
5
+
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+ ENV PYTHONUNBUFFERED=1
8
+ ENV DEBIAN_FRONTEND=noninteractive
9
+
10
+ # Set the timezone
11
+ ENV TZ=America/Los_Angeles
12
+ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
13
+
14
+ # Install system dependencies
15
+ RUN apt-get update --fix-missing && \
16
+ apt-get install -y --no-install-recommends \
17
+ build-essential \
18
+ ninja-build \
19
+ cmake \
20
+ wget \
21
+ ca-certificates \
22
+ git \
23
+ curl \
24
+ unzip \
25
+ python3-tk \
26
+ && rm -rf /var/lib/apt/lists/* \
27
+ && cmake --version \
28
+ && echo "✅ All system dependencies installed successfully"
29
+
30
+ # Install s5cmd with architecture logic
31
+ RUN S5CMD_VERSION="2.1.0-beta.1" && \
32
+ if [ "$TARGETARCH" = "arm64" ]; then \
33
+ S5CMD_ARCH="Linux-arm64"; \
34
+ else \
35
+ S5CMD_ARCH="Linux-64bit"; \
36
+ fi && \
37
+ wget "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_${S5CMD_ARCH}.tar.gz" && \
38
+ tar -xf "s5cmd_${S5CMD_VERSION}_${S5CMD_ARCH}.tar.gz" && \
39
+ install s5cmd /usr/local/bin/ && \
40
+ rm s5cmd "s5cmd_${S5CMD_VERSION}_${S5CMD_ARCH}.tar.gz"
41
+
42
+ # Set build optimization flags
43
+ ENV MAX_JOBS=4
44
+ ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0;10.0"
45
+
46
+ # Upgrade pip
47
+ RUN pip install --upgrade pip setuptools wheel
48
+ # numpy<2.0.0 is required for the preinstalled numba and thinc
49
+ RUN pip install wandb[media] \
50
+ diffusers==0.35.1 \
51
+ transformers==4.49.0 \
52
+ accelerate \
53
+ safetensors \
54
+ scipy \
55
+ einops \
56
+ jupyter \
57
+ hydra-core \
58
+ imageio \
59
+ tqdm \
60
+ webdataset \
61
+ av \
62
+ loguru \
63
+ boto3 \
64
+ timm \
65
+ ftfy \
66
+ opencv-python-headless \
67
+ sentencepiece \
68
+ "numpy<2.0.0"
69
+ RUN pip uninstall -y apex
70
+ RUN pip install --index-url=https://sc-hw-artf.nvidia.com/artifactory/api/pypi/hwinf-mlwfo-pypi/simple --upgrade one-logger-utils
71
+
72
+ WORKDIR /workspace
73
+
74
+ # Entry point
75
+ RUN (printf '#!/bin/bash\nexec \"$@\"\n' >> /entry.sh) && chmod a+x /entry.sh
76
+ ENTRYPOINT ["/entry.sh"]
77
+
FastGen/LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2026 NVIDIA CORPORATION & AFFILIATES
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
FastGen/Makefile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Optional local overrides (API keys, paths); `sinclude` ignores a missing file.
sinclude .env

# Default goal: print the help text (targets annotated with `##`).
all: help

install-linters: ## install the linters
	pip install black==23.10.0 ruff==0.6.9 mypy==1.9.0 types-psutil

install: install-linters

mypy:
	python3 -m mypy --check-untyped-defs --follow-imports=silent --exclude third_party/ train.py

lint:
	python3 -m ruff format --check --exclude fastgen/third_party/
	python3 -m ruff check --exclude fastgen/third_party/

format:
	python3 -m ruff format --exclude fastgen/third_party/
	python3 -m ruff check --fix --exclude fastgen/third_party/

install-fastgen:
	python3 -m pip install -e .

pytest:
	ulimit -n 4096 && python3 -m pytest --ignore=FASTGEN_OUTPUT --ignore=runs --ignore=tmp --ignore third_party

# Every target here is a command shortcut, not a produced file: declare them all
# phony so a stray file named e.g. `lint` or `format` cannot shadow the target.
.PHONY: all install-linters install mypy lint format install-fastgen pytest help
help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
FastGen/README.md ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">NVIDIA FastGen: Fast Generation from Diffusion Models</h1>
2
+
3
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
4
+ [![PyTorch](https://img.shields.io/badge/PyTorch-2.0%2B-ee4c2c?logo=pytorch&logoColor=white)](https://pytorch.org/)
5
+ [![NVIDIA](https://img.shields.io/badge/NVIDIA-76B900?logo=nvidia&logoColor=white)](https://www.nvidia.com/)
6
+
7
+ <p align="center">
8
+ <b></b> <a href="https://weilinie.github.io/">Weili Nie</a> • <a href="https://jberner.info/">Julius Berner</a> • <a href="https://research.nvidia.com/labs/genair/author/chao-liu/">Chao Liu</a> • <a href="http://latentspace.cc/">Arash Vahdat</a>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <img src="assets/teaser.png" alt="FastGen header" width=100%/>
13
+ </p>
14
+
15
+ FastGen is a PyTorch-based framework for building fast generative models using various distillation and acceleration techniques. It supports:
16
+ - large-scale training with ≥10B parameters.
17
+ - different tasks and modalities, including T2I, I2V, and V2V.
18
+ - various distillation methods, including consistency models, distribution matching distillation, self-forcing, and more.
19
+
20
+ ## Repository Structure
21
+
22
+ ```
23
+ fastgen/
24
+ ├── fastgen/
25
+ │ ├── callbacks/ # Training callbacks (EMA, profiling, etc.)
26
+ │ ├── configs/ # Configuration system
27
+ │ │ ├── experiments/ # Experiment configs
28
+ │ │ └── methods/ # Method-specific configs
29
+ │ ├── datasets/ # Dataset loaders
30
+ │ ├── methods/ # Training methods (CM, DMD2, SFT, KD etc.)
31
+ │ ├── networks/ # Neural network architectures
32
+ │ ├── third_party/ # Third-party dependencies
33
+ │ ├── trainer.py # Main training loop
34
+ │ └── utils/ # Utilities (distributed, checkpointing)
35
+ ├── scripts/ # Inference and evaluation scripts
36
+ ├── tests/ # Unit tests
37
+ ├── Makefile # Development commands (lint, format, test)
38
+ └── train.py # Main training entry point
39
+ ```
40
+
41
+ ## Setup
42
+
43
+ **Recommended:** Use the provided Docker container for a consistent environment. See [CONTRIBUTING.md](CONTRIBUTING.md) for Docker setup instructions. Otherwise, create a new [conda](https://www.anaconda.com/docs/getting-started/miniconda/install) environment with `conda create -y -n fastgen python=3.12.3 pip; conda activate fastgen`.
44
+
45
+ ### Installation
46
+
47
+ ```bash
48
+ git clone https://github.com/NVlabs/FastGen.git
49
+ cd FastGen
50
+ pip install -e .
51
+ ```
52
+
53
+ ### Credentials (Optional)
54
+
55
+ For W&B logging, [get your API key](https://wandb.ai/settings) and save it to `credentials/wandb_api.txt` or set the `WANDB_API_KEY` environment variable.
56
+ Without either of these, W&B will prompt for your API key interactively.
57
+ For more details, including S3 storage and other environment variables, see [fastgen/configs/README.md](fastgen/configs/README.md#environment-variables).
58
+
59
+ ## Quick Start
60
+
61
+ Before running the following commands, download the CIFAR-10 dataset and pretrained EDM models:
62
+
63
+ ```bash
64
+ python scripts/download_data.py --dataset cifar10
65
+ ```
66
+
67
+ For other datasets and models, see [fastgen/networks/README.md](fastgen/networks/README.md) and [fastgen/datasets/README.md](fastgen/datasets/README.md).
68
+
69
+ ### Basic Training
70
+
71
+ ```bash
72
+ python train.py --config=fastgen/configs/experiments/EDM/config_dmd2_test.py
73
+ ```
74
+
75
+ If you run out of memory, try a smaller batch-size, e.g., `dataloader_train.batch_size=32`, which automatically uses gradient accumulation to match the global batch-size.
76
+
77
+ **Expected Output:** See the training log for a link to the run on [wandb.ai](https://wandb.ai). Training outputs go to `$FASTGEN_OUTPUT_ROOT/{project}/{group}/{name}/`. With default settings, outputs are organized as follows:
78
+ ```
79
+ FASTGEN_OUTPUT/fastgen/cifar10/debug/
80
+ ├── checkpoints/ # Model checkpoints in the format {iteration:07d}.pth
81
+ │ ├── 0001000.pth
82
+ │ └── ...
83
+ ├── config.yaml # Resolved configuration for reproducibility
84
+ ├── wandb_id.txt # W&B run ID for resuming
85
+ └── ...
86
+ ```
87
+
88
+ ### DDP/FSDP2 Training
89
+
90
+ For multi-GPU training, use DDP:
91
+
92
+ ```bash
93
+ torchrun --nproc_per_node=8 train.py \
94
+ --config=fastgen/configs/experiments/EDM/config_dmd2_test.py \
95
+ - trainer.ddp=True log_config.name=test_ddp
96
+ ```
97
+
98
+ For large models, use FSDP2 for model sharding by replacing `trainer.ddp=True` with `trainer.fsdp=True`.
99
+
100
+
101
+ ### Inference
102
+
103
+ ```bash
104
+ python scripts/inference/image_model_inference.py --config fastgen/configs/experiments/EDM/config_dmd2_test.py \
105
+ --classes=10 --prompt_file=scripts/inference/prompts/classes.txt --ckpt=FASTGEN_OUTPUT/fastgen/cifar10/debug/checkpoints/0002000.pth - log_config.name=test_inference
106
+ ```
107
+
108
+ For other inference modes and FID evaluations, see [scripts/README.md](scripts/README.md).
109
+
110
+
111
+ ### Command-Line Overrides
112
+
113
+ Override any config parameter using Hydra-style syntax (note the `-` separator):
114
+
115
+ ```bash
116
+ python train.py --config=path/to/config.py - key=value nested.key=value
117
+ ```
118
+
119
+ ## Documentation
120
+
121
+ Detailed documentation is available in each component's README:
122
+
123
+ | Component | Documentation | Description |
124
+ |-----------|---------------|-------------|
125
+ | **Methods** | [fastgen/methods/README.md](fastgen/methods/README.md) | Training methods (sCM, MeanFlow, DMD2, Self-Forcing, etc.) |
126
+ | **Networks** | [fastgen/networks/README.md](fastgen/networks/README.md) | Network architectures (EDM, SD, SDXL, Flux, WAN, CogVideoX, Cosmos) and pretrained models |
127
+ | **Configs** | [fastgen/configs/README.md](fastgen/configs/README.md) | Configuration system, environment variables, and creating custom configs |
128
+ | **Datasets** | [fastgen/datasets/README.md](fastgen/datasets/README.md) | Dataset preparation and WebDataset loaders |
129
+ | **Callbacks** | [fastgen/callbacks/README.md](fastgen/callbacks/README.md) | Training callbacks (EMA, logging, gradient clipping, etc.) |
130
+ | **Inference** | [scripts/README.md](scripts/README.md) | Inference modes (T2I, T2V, I2V, V2V, etc.) and FID evaluation |
131
+ | **Third Party** | [fastgen/third_party/README.md](fastgen/third_party/README.md) | Third-party dependencies (Depth Anything V2, etc.) |
132
+
133
+ ## Supported Methods
134
+
135
+ | Category | Methods |
136
+ |----------|---------|
137
+ | **Consistency Models** | [CM](https://arxiv.org/abs/2303.01469), [sCM](https://arxiv.org/abs/2410.11081), [TCM](https://arxiv.org/abs/2410.14895), [MeanFlow](https://arxiv.org/abs/2505.13447) |
138
+ | **Distribution Matching** | [DMD2](https://arxiv.org/abs/2405.14867), [f-Distill](https://arxiv.org/abs/2502.15681), [LADD](https://arxiv.org/abs/2403.12015), [CausVid](https://arxiv.org/abs/2412.07772), [Self-Forcing](https://arxiv.org/abs/2506.08009) |
139
+ | **Fine-Tuning** | [SFT](https://arxiv.org/abs/2006.11239), [CausalSFT](https://arxiv.org/abs/2407.01392) |
140
+ | **Knowledge Distillation** | [KD](https://arxiv.org/abs/2101.02388), [CausalKD](https://arxiv.org/abs/2412.07772) |
141
+
142
+ See [fastgen/methods/README.md](fastgen/methods/README.md) for details.
143
+
144
+ ## Supported Networks and Data
145
+
146
+ FastGen is designed to be **agnostic to the network and data** and you can add your own architectures and datasets (see [fastgen/networks/README.md](fastgen/networks/README.md) and [fastgen/datasets/README.md](fastgen/datasets/README.md)). For reference, we provide the following implementations:
147
+
148
+ | Data | Networks |
149
+ |------|----------|
150
+ | **Image** | EDM, EDM2, DiT, SD 1.5, SDXL, Flux |
151
+ | **Video** | WAN (T2V, I2V, VACE), CogVideoX, Cosmos Predict2 |
152
+
153
+ See [fastgen/networks/README.md](fastgen/networks/README.md) for details.
154
+ Not all combinations of methods and networks are currently supported. We provide typical use-cases in our predefined configs in [fastgen/configs/experiments](fastgen/configs/experiments).
155
+
156
+ **We plan to provide distilled student checkpoints for CIFAR-10 and ImageNet soon.**
157
+
158
+ ## Contributing
159
+
160
+ We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for details.
161
+
162
+ We thank everyone who has helped design, build, and test FastGen!
163
+
164
+ - **Core contributors:** Weili Nie, Julius Berner, Chao Liu
165
+ - **Other contributors:** James Lucas, David Pankratz, Sihyun Yu, Willis Ma, Yilun Xu, Shengqu Cai, Xinyin Ma, Yanke Song
166
+ - **Collaborators:** Sophia Zalewski, Wei Xiong, Christian Laforte, Sajad Norouzi, Kaiwen Zheng, Miloš Hašan, Saeed Hadadan, Gene Liu, David Dynerman, Grace Lam, Pooya Jannaty, Jan Kautz, and many more.
167
+ - **Project lead:** Arash Vahdat
168
+
169
+ ## License
170
+
171
+ This project is licensed under the Apache License 2.0 - see [LICENSE](LICENSE) for details. Third-party licenses are documented in [licenses/README.md](licenses/README.md).
172
+
173
+ ## Reference
174
+
175
+ ```
176
+ @article{fastgen2026,
177
+ title={NVIDIA FastGen: Fast Generation from Diffusion Models},
178
+ author={Nie, Weili and Berner, Julius and Liu, Chao and Vahdat, Arash},
179
+ url={https://github.com/NVlabs/FastGen},
180
+ year={2026},
181
+ }
182
+ ```
FastGen/SF.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Self-Forcing: Quick Reference Guide
2
+
3
+ Self-Forcing is a distribution matching distillation method for causal/autoregressive video generation. It trains a fast student model to match a teacher model's output distribution while maintaining gradient flow through an autoregressive rollout process.
4
+
5
+ **Reference**: [Huang et al., 2025](https://arxiv.org/abs/2506.08009)
6
+
7
+ ---
8
+
9
+ ## Environment Setup
10
+
11
+ ```bash
12
+ conda activate sf
13
+ pip install -e .
14
+ ```
15
+
16
+ ---
17
+
18
+ ## Quick Start
19
+
20
+ ### 1. Download Checkpoint
21
+
22
+ ```bash
23
+ huggingface-cli download gdhe17/Self-Forcing checkpoints/ode_init.pt \
24
+ --local-dir FASTGEN_OUTPUT/MODEL/Self-Forcing
25
+ ```
26
+
27
+ ### 2. Run Training
28
+
29
+ #### Data-Free Training (Recommended for testing)
30
+
31
+ Uses random tensors instead of real video data. GAN discriminator is disabled.
32
+
33
+ **Single GPU:**
34
+ ```bash
35
+ python train.py --config=fastgen/configs/experiments/WanT2V/config_sf_datafree.py
36
+ ```
37
+
38
+ **Single GPU (specific device):**
39
+ ```bash
40
+ CUDA_VISIBLE_DEVICES=0 python train.py --config=fastgen/configs/experiments/WanT2V/config_sf_datafree.py
41
+ ```
42
+
43
+ **Multi-GPU (8 GPUs):**
44
+ ```bash
45
+ torchrun --nproc_per_node=8 train.py \
46
+ --config=fastgen/configs/experiments/WanT2V/config_sf_datafree.py \
47
+ - trainer.ddp=True
48
+ ```
49
+
50
+ #### Training with Real Video Data
51
+
52
+ For best quality, use real video data with GAN discriminator enabled.
53
+
54
+ Prepare WebDataset shards (see `fastgen/datasets/README.md`):
55
+ ```
56
+ /path/to/videos/
57
+ ├── 00000.tar (sample_001.mp4, sample_001.txt, ...)
58
+ ├── 00001.tar
59
+ └── ...
60
+ ```
61
+
62
+ **Single GPU:**
63
+ ```bash
64
+ python train.py --config=fastgen/configs/experiments/WanT2V/config_sf.py \
65
+ - dataloader_train.datatags='["WDS:/path/to/your/videos"]'
66
+ ```
67
+
68
+ **Multi-GPU (8 GPUs):**
69
+ ```bash
70
+ torchrun --nproc_per_node=8 train.py \
71
+ --config=fastgen/configs/experiments/WanT2V/config_sf.py \
72
+ - trainer.ddp=True dataloader_train.datatags='["WDS:/path/to/your/videos"]'
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Available Configs
78
+
79
+ | Model | Config Path | Notes |
80
+ |-------|-------------|-------|
81
+ | WAN T2V (light) | `fastgen/configs/experiments/WanT2V/config_sf_datafree_light.py` | Data-free, reduced resolution (~8-12GB VRAM) |
82
+ | WAN T2V (data-free) | `fastgen/configs/experiments/WanT2V/config_sf_datafree.py` | Data-free, full 480p (~24GB VRAM) |
83
+ | WAN T2V (with data) | `fastgen/configs/experiments/WanT2V/config_sf.py` | Requires video data, GAN enabled |
84
+ | VACE-WAN V2V | `fastgen/configs/experiments/WanV2V/config_sf.py` | Requires video data |
85
+
86
+ ---
87
+
88
+ ## Key Configuration Parameters
89
+
90
+ ### Self-Forcing Specific
91
+
92
+ | Parameter | Default | Description |
93
+ |-----------|---------|-------------|
94
+ | `enable_gradient_in_rollout` | `True` | Enable gradients at exit step during rollout |
95
+ | `start_gradient_frame` | `0` | Frame index to start gradient tracking |
96
+ | `same_step_across_blocks` | `True` | Use same exit step for all blocks |
97
+ | `last_step_only` | `False` | Exit only at the last denoising step |
98
+ | `context_noise` | `0.0` | Noise level added to cached context (0-1) |
99
+
100
+ ### Training Settings
101
+
102
+ | Parameter | Default | Description |
103
+ |-----------|---------|-------------|
104
+ | `student_sample_steps` | `4` | Number of denoising steps |
105
+ | `student_update_freq` | `5` | Student update frequency |
106
+ | `gan_loss_weight_gen` | `0.001` | GAN loss weight for generator |
107
+
108
+ ---
109
+
110
+ ## Custom Training Example
111
+
112
+ ```bash
113
+ python train.py --config=fastgen/configs/experiments/WanT2V/config_sf.py \
114
+ - model.gan_loss_weight_gen=0.005 \
115
+ model.student_sample_steps=6 \
116
+ model.context_noise=0.1 \
117
+ trainer.max_iter=10000 \
118
+ dataloader_train.batch_size=2
119
+ ```
120
+
121
+ ---
122
+
123
+ ## How It Works
124
+
125
+ 1. **Autoregressive Rollout**: Processes video chunk-by-chunk with KV-caching
126
+ 2. **Gradient Tracking**: Enables gradients at stochastic exit steps during rollout
127
+ 3. **Distribution Matching**: Combines VSD loss with GAN training
128
+ 4. **Alternating Updates**:
129
+ - Student updates (VSD + GAN loss)
130
+ - Discriminator/fake score updates (adversarial training)
131
+
132
+ ---
133
+
134
+ ## Key Files
135
+
136
+ - **Method**: `fastgen/methods/distribution_matching/self_forcing.py`
137
+ - **Config Template**: `fastgen/configs/methods/config_self_forcing.py`
138
+ - **Tests**: `tests/test_sfmodel.py`
139
+
140
+ ---
141
+
142
+ ## Testing
143
+
144
+ ```bash
145
+ pytest tests/test_sfmodel.py -v
146
+ ```
FastGen/assets/teaser.png ADDED

Git LFS Details

  • SHA256: a824dd86d32d840adcf4d4d182930b4544cc11c7ae5237fb27d5ac05e2f42ddf
  • Pointer size: 132 Bytes
  • Size of remote file: 1.46 MB
FastGen/fastgen/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
FastGen/fastgen/callbacks/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Callbacks
2
+
3
+ Training callbacks for FastGen. All callbacks inherit from `Callback` and hook into the training lifecycle.
4
+
5
+ ## Available Callbacks
6
+
7
+ | Callback | Description |
8
+ |----------|-------------|
9
+ | [`EMACallback`](ema.py) | Exponential Moving Average updates (constant, power, or halflife schedules) |
10
+ | [`GradClipCallback`](grad_clip.py) | Gradient norm clipping with NaN/Inf handling |
11
+ | [`WandbCallback`](wandb.py) | Weights & Biases logging (metrics, images, videos) |
12
+ | [`GPUStatsCallback`](gpu_stats.py) | GPU memory and utilization logging |
13
+ | [`MemTrackerCallback`](gpu_mem_profiler.py) | GPU memory profiling with HTML visualizations |
14
+ | [`TrainProfilerCallback`](train_profiler.py) | Training speed and timing breakdown |
15
+ | [`ParamCountCallback`](param_count.py) | Parameter count logging |
16
+ | [`ForcedWeightNormCallback`](forced_weight_norm.py) | Forced weight normalization (EDM2) |
17
+ | [`CTScheduleCallback`](ct_schedule.py) | Consistency training curriculum schedule |
18
+
19
+ ## Usage
20
+
21
+ Configure callbacks as a dictionary in the config:
22
+
23
+ ```python
24
+ from fastgen.configs.callbacks import WANDB_CALLBACK, GradClip_CALLBACK, EMA_CALLBACK
25
+ from omegaconf import DictConfig
26
+
27
+ config.trainer.callbacks = DictConfig({
28
+ **WANDB_CALLBACK,
29
+ **GradClip_CALLBACK,
30
+ **EMA_CALLBACK,
31
+ })
32
+ ```
33
+
34
+ Predefined configs are in `fastgen/configs/callbacks.py`.
35
+
36
+ ## Lifecycle Hooks
37
+
38
+ Callbacks can override these hooks:
39
+
40
+ | Phase | Hooks |
41
+ |-------|-------|
42
+ | Setup | `on_app_begin`, `on_model_init_start/end`, `on_optimizer_init_start/end`, `on_load_checkpoint_start/end`, `on_dataloader_init_start/end` |
43
+ | Training | `on_train_begin/end`, `on_training_step_begin/end`, `on_training_accum_step_begin`, `on_backward_begin`, `on_optimizer_step_begin` |
44
+ | Validation | `on_validation_begin/end`, `on_validation_step_begin/end` |
45
+ | Checkpointing | `on_save_checkpoint_start/success/end`, `state_dict`, `load_state_dict` |
46
+ | Cleanup | `on_app_end` |
47
+
48
+ ## Custom Callbacks
49
+
50
+ ```python
51
+ from fastgen.callbacks.callback import Callback
52
+
53
+ class MyCallback(Callback):
54
+ def on_training_step_end(self, model, data_batch, output_batch, loss_dict, iteration=0):
55
+ if iteration % self.config.trainer.logging_iter == 0:
56
+ print(f"Step {iteration}: loss = {loss_dict['total_loss'].item():.4f}")
57
+ ```
58
+
59
+ Callbacks have access to `self.config` and `self.trainer` after initialization.
FastGen/fastgen/callbacks/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
FastGen/fastgen/callbacks/callback.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+ from typing import Callable, Any, TYPE_CHECKING
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+
9
+ from fastgen.utils import instantiate
10
+ import fastgen.utils.logging_utils as logger
11
+
12
+ if TYPE_CHECKING:
13
+ from fastgen.configs.config import BaseConfig
14
+ from fastgen.trainer import Trainer
15
+ from fastgen.methods import FastGenModel
16
+
17
+
18
class CallbackDict:
    """Container that instantiates configured callbacks and fans calls out to all of them.

    Any attribute access that is not ``state_dict`` / ``load_state_dict`` returns a
    function that invokes the same-named method on every registered callback.
    """

    def __init__(self, config: BaseConfig, trainer: Trainer):
        self._callbacks = {}
        callback_configs = config.trainer.callbacks
        if callback_configs:
            # Legacy list-style configs are converted to a dict with synthetic names.
            if isinstance(callback_configs, list):
                logger.warning(msg="The 'config.trainer.callbacks' parameter should be a dict instead of a list. ")
                callback_configs = {f"callback_{k}": v for k, v in enumerate(callback_configs)}
            for name, cfg in callback_configs.items():
                if "_target_" not in cfg:
                    logger.critical(f"Callback {name} is missing the '_target_' field. \n Skip {cfg}")
                    continue
                logger.critical(f"Instantiating callback {name}: {cfg}")
                cb = instantiate(cfg)
                assert isinstance(cb, Callback), f"{cfg} is not a valid callback."
                # Give every callback access to the full config/trainer before use.
                cb.config = config
                cb.trainer = trainer
                cb.on_app_begin()
                self._callbacks[name] = cb

    def __getattr__(self, method_name: str) -> Callable:
        # Checkpoint (de)serialization gets dedicated handlers keyed by callback name.
        if method_name == "load_state_dict":

            def load_state_dict(state_dict: dict[str, Any]) -> None:
                for name, callback in self._callbacks.items():
                    if name not in state_dict:
                        logger.warning(f"Callback {name} not found in checkpoint.")
                    else:
                        callback.load_state_dict(state_dict[name])

            return load_state_dict

        if method_name == "state_dict":

            def state_dict() -> dict[str, Any]:
                return {name: cb.state_dict() for name, cb in self._callbacks.items()}

            return state_dict

        # Everything else broadcasts to the same-named method on each callback.
        def fan_out(*args, **kwargs):
            for cb in self._callbacks.values():
                assert hasattr(cb, method_name)
                bound = getattr(cb, method_name)
                assert callable(bound), f"{method_name} is not callable."
                bound(*args, **kwargs)

        return fan_out
63
+
64
+
65
class Callback:
    """Base class for training callbacks.

    Every hook is a no-op; subclasses override only the hooks they need.
    ``CallbackDict`` assigns ``config`` and ``trainer`` to each instance and
    calls ``on_app_begin`` right after instantiation, so hooks may rely on
    ``self.config`` / ``self.trainer`` being set.
    """

    # Injected by CallbackDict immediately after instantiation.
    config: "BaseConfig"
    trainer: "Trainer"

    def on_app_begin(self) -> None:
        """Called once, right after the callback is instantiated and wired up."""
        pass

    def on_model_init_start(self, model: FastGenModel) -> None:
        """Hook before model initialization."""
        pass

    def on_model_init_end(self, model: FastGenModel | torch.nn.parallel.DistributedDataParallel) -> None:
        """Hook after model initialization (model may be DDP-wrapped)."""
        pass

    def on_optimizer_init_start(self, model: FastGenModel) -> None:
        """Hook before optimizer initialization."""
        pass

    def on_optimizer_init_end(self, model: FastGenModel) -> None:
        """Hook after optimizer initialization."""
        pass

    def on_load_checkpoint_start(self, model: FastGenModel) -> None:
        """Hook before checkpoint loading."""
        pass

    def on_load_checkpoint_end(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook after checkpoint loading; ``iteration`` is the restored step."""
        pass

    def on_dataloader_init_start(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook before dataloader construction."""
        pass

    def on_dataloader_init_end(
        self, model: FastGenModel, dataloader_train: DataLoader, dataloader_val: DataLoader, iteration: int = 0
    ) -> None:
        """Hook after train/val dataloaders are constructed."""
        pass

    def on_train_begin(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook at the start of the training loop."""
        pass

    def on_training_step_begin(
        self,
        model: FastGenModel,
        iteration: int = 0,
    ) -> None:
        """Hook at the start of each training step."""
        pass

    def on_training_accum_step_begin(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        iteration: int = 0,
        accum_iter: int = 0,
    ) -> None:
        """Hook at the start of each gradient-accumulation micro-step."""
        pass

    def on_backward_begin(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
        accum_iter: int = 0,
    ) -> None:
        """Hook after the forward pass, before backward."""
        pass

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Hook at the end of each training step (e.g. for metric logging)."""
        pass

    def on_optimizer_step_begin(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook before the optimizer step (e.g. for gradient clipping)."""
        pass

    def on_train_end(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook after the training loop finishes."""
        pass

    def on_validation_begin(self, model: FastGenModel, iteration: int = 0, idx: int = 0) -> None:
        """Hook at the start of a validation run; ``idx`` identifies the run."""
        pass

    def on_validation_step_begin(
        self, model: FastGenModel, data_batch: dict[str, torch.Tensor], step: int = 0, iteration: int = 0, idx: int = 0
    ) -> None:
        """Hook before each validation step."""
        pass

    def on_validation_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        step: int = 0,
        iteration: int = 0,
        idx: int = 0,
    ) -> None:
        """Hook after each validation step."""
        pass

    def on_validation_end(self, model: FastGenModel, iteration: int = 0, idx: int = 0) -> None:
        """Hook after a validation run completes."""
        pass

    def on_save_checkpoint_start(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook before a checkpoint save is attempted."""
        pass

    def on_save_checkpoint_success(self, model: FastGenModel, iteration: int = 0, path: str | None = None) -> None:
        """Hook after a checkpoint was saved successfully; ``path`` is the saved file."""
        pass

    def on_save_checkpoint_end(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook after checkpoint saving finishes (success or not)."""
        pass

    def on_app_end(self, model: FastGenModel, iteration: int = 0) -> None:
        """Hook at application shutdown."""
        pass

    def state_dict(self) -> dict[str, Any]:
        """Return callback state to persist in checkpoints (default: none)."""
        return {}

    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        """Restore callback state saved by :meth:`state_dict` (default: no-op)."""
        pass
FastGen/fastgen/callbacks/ct_schedule.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+ from typing import Callable, TYPE_CHECKING
6
+ import wandb
7
+
8
+ import torch
9
+
10
+ from fastgen.callbacks.callback import Callback
11
+ import fastgen.utils.logging_utils as logger
12
+ from fastgen.utils.basic_utils import get_batch_size_total
13
+ from fastgen.utils.distributed import is_rank0
14
+
15
+ if TYPE_CHECKING:
16
+ from fastgen.methods import FastGenModel
17
+ from fastgen.configs.config import BaseConfig
18
+
19
+
20
class CTScheduleCallback(Callback):
    """Staged consistency-training (CT) schedule.

    The total number of images seen so far determines a discrete stage; each
    stage maps to a ratio ``1 - q ** -(stage + 1)``, clipped at ``ratio_limit``,
    which is attached to the model as ``model.ratio``.
    """

    config: "BaseConfig"

    def __init__(
        self,
        q: float = 2.0,
        ratio_limit: float = 0.999,
        kimg_per_stage: int = 12500,
        batch_size: int = 1,
    ):
        """Store schedule hyper-parameters and reset the schedule state.

        Args:
            q: Base of the geometric ratio progression.
            ratio_limit: Upper bound applied to the computed ratio.
            kimg_per_stage: Thousands of images per schedule stage.
            batch_size: Fallback global batch size (overridden by ``self.config``
                when a config is attached).
        """
        self.q = q
        self.ratio_limit = ratio_limit
        self.kimg_per_stage = kimg_per_stage
        self.batch_size = batch_size

        # Mutable schedule state.
        self.stage = 0
        self.ratio = 0.0

    def _get_cur_stage(self, model, iteration):
        """Return ``(stage, images_seen)`` for the given training iteration."""
        # Start from the saved iteration of the first-stage model in TCM.
        if hasattr(model, "resume_iter"):
            assert isinstance(model.resume_iter, int)
            iteration = iteration + model.resume_iter

        effective_batch_size = self.batch_size
        if hasattr(self, "config"):
            # The config-derived global batch size takes precedence.
            effective_batch_size = get_batch_size_total(self.config)

        cur_nimg = iteration * effective_batch_size
        return cur_nimg // (self.kimg_per_stage * 1000), cur_nimg

    def _update_schedule(self, stage):
        """Advance to ``stage`` and recompute the ratio, clipping at the limit."""
        self.stage = stage
        new_ratio = 1 - 1 / self.q ** (stage + 1)
        if new_ratio > self.ratio_limit:
            logger.info(f"Clipping ratio from {new_ratio} -> {self.ratio_limit}")
            new_ratio = self.ratio_limit
        self.ratio = new_ratio

    def on_train_begin(self, model: FastGenModel, iteration: int = 0) -> None:
        """Initialize the schedule and expose ``model.ratio`` before training."""
        stage, _ = self._get_cur_stage(model, iteration)
        self._update_schedule(stage)
        model.ratio = self.ratio

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Advance the schedule when a stage boundary is crossed and log it."""
        del data_batch, output_batch, loss_dict
        new_stage, cur_nimg = self._get_cur_stage(model, iteration)
        if new_stage > self.stage:
            self._update_schedule(new_stage)
            model.ratio = self.ratio

        # Only log to wandb when a config (and hence a logging interval) exists.
        if hasattr(self, "config"):
            if iteration % self.config.trainer.logging_iter == 0 and is_rank0() and wandb.run:
                wandb.log({"ct_schedule/kimg": cur_nimg / 1e3, "ct_schedule/ratio": self.ratio}, step=iteration)
FastGen/fastgen/callbacks/ema.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Callable, TYPE_CHECKING, Optional
7
+
8
+ import torch
9
+ import wandb
10
+
11
+ from fastgen.callbacks.callback import Callback
12
+ from fastgen.utils.basic_utils import get_batch_size_total
13
+ from fastgen.utils.distributed import synchronize, is_rank0
14
+ import fastgen.utils.logging_utils as logger
15
+
16
+ if TYPE_CHECKING:
17
+ from fastgen.methods import FastGenModel
18
+
19
+
20
class EMACallback(Callback):
    """Maintains an exponential moving average (EMA) copy of ``model.net``.

    After every training step, the parameters of the module registered on the
    model under ``ema_name`` are moved toward ``model.net``'s parameters via an
    in-place ``lerp_`` with weight ``1 - beta``; buffers are copied directly.
    Supports three decay schedules: ``constant``, ``power``, and ``halflife``.
    Handles FSDP2 ``DTensor`` parameters (including CPU offloading) by
    gathering the full tensor before the update.
    """

    def __init__(
        self,
        # NOTE: 'type' shadows the builtin, but it is part of the public
        # keyword interface and cannot be renamed without breaking callers.
        type: str = "constant",
        # params for type=constant
        beta: float = 0.9999,
        # params for type=power
        gamma: float = 16.97,
        # params for type=halflife
        ema_halflife_kimg: float = 500,
        ema_rampup_ratio: Optional[float] = 0.05,
        ema_name: str = "ema",
        batch_size: int = 1,  # overwritten by self.config if it exists
        fsdp: bool = False,  # overwritten by self.config if it exists
    ):
        """Store EMA schedule hyper-parameters; no model access happens here."""
        self.type = type
        self.beta = beta
        self.gamma = gamma
        self.ema_halflife_kimg = ema_halflife_kimg
        self.ema_rampup_ratio = ema_rampup_ratio
        self.ema_name = ema_name
        self.batch_size = batch_size
        self._is_fsdp = fsdp
        self._enabled = True

    def on_app_begin(self) -> None:
        """Sync ``batch_size`` and FSDP flag from the attached config, if any."""
        if hasattr(self, "config"):
            # override using config
            self._is_fsdp = self.config.trainer.fsdp
            self.batch_size = get_batch_size_total(self.config)

    def on_model_init_end(
        self, model: FastGenModel | torch.nn.parallel.DistributedDataParallel, iteration: int = 0
    ) -> None:
        """Validate the EMA module; disable this callback if it is absent."""
        # Unwrap DDP if needed to access the original model's attributes
        if hasattr(model, "module"):
            model = model.module

        # check ema initialization
        ema = getattr(model, self.ema_name, None)
        if ema is None:
            self._enabled = False
            logger.info(f"EMA {self.ema_name} is not enabled, skipping callback.")
            return

        assert ema.training is False, f"EMA {self.ema_name} should be in eval mode"
        for name, p_net in ema.named_parameters():
            assert not p_net.requires_grad, f"EMA parameter {name} should not require gradients"

    def _total_iteration(self, model: FastGenModel, iteration: int) -> int:
        """Return the iteration offset by the model's resume point, if present."""
        if hasattr(model, "resume_iter"):
            assert isinstance(model.resume_iter, int)
            iteration = iteration + model.resume_iter
        return iteration

    def _power_function_beta(self, iteration):
        """Power-function decay schedule (Karras-style).

        NOTE(review): divides by ``iteration`` — assumes it is called with
        iteration >= 1; iteration == 0 would raise ZeroDivisionError. Confirm
        against the trainer's iteration numbering.
        """
        beta = (1 - 1 / iteration) ** (self.gamma + 1)
        return beta

    def _get_cur_nimg(self, iteration):
        """Return ``(batch_size, images_seen)`` for the given iteration."""
        cur_nimg = iteration * self.batch_size
        return self.batch_size, cur_nimg

    def _halflife_beta(self, iteration):
        """Half-life decay schedule with optional warm-up ramp."""
        ema_halflife_nimg = self.ema_halflife_kimg * 1000
        batch_size, cur_nimg = self._get_cur_nimg(iteration)
        if self.ema_rampup_ratio is not None:
            # During ramp-up the effective half-life grows with images seen.
            ema_halflife_nimg = min(ema_halflife_nimg, cur_nimg * self.ema_rampup_ratio)
        ema_beta = 0.5 ** (batch_size / max(ema_halflife_nimg, 1e-8))
        return ema_beta

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Blend ``model.net`` into the EMA module with the scheduled beta."""
        del data_batch, output_batch, loss_dict

        # Check if EMA is enabled
        if not self._enabled:
            return

        # Select the decay factor according to the configured schedule.
        if self.type == "constant":
            beta = self.beta
        elif self.type == "power":
            beta = self._power_function_beta(self._total_iteration(model, iteration))
        elif self.type == "halflife":
            beta = self._halflife_beta(self._total_iteration(model, iteration))
        else:
            raise ValueError(f"Invalid {self.ema_name} type: {self.type}")

        with torch.no_grad():
            ema = getattr(model, self.ema_name)
            # state_dict tensors alias the EMA module's storage, so the
            # in-place lerp_/copy_ below update the EMA module directly.
            ema_state_dict = ema.state_dict()

            for name, p_net in model.net.named_parameters():
                if self._is_fsdp and hasattr(p_net, "full_tensor"):
                    # Gather the full tensor from all ranks if using FSDP with DTensor
                    # When CPU offloading is enabled, we need to move to CUDA first because
                    # full_tensor() performs an all_gather which requires a CUDA backend
                    if p_net.device.type == "cpu":
                        # Move local shard to CUDA, gather, then the result stays on CUDA
                        # which is fine since we'll copy to EMA (which handles device placement)
                        full_tensor = p_net.to("cuda").full_tensor()
                    else:
                        full_tensor = p_net.full_tensor()
                else:
                    full_tensor = p_net
                # Strip checkpoint wrapper prefix if present (EMA doesn't have checkpointing)
                ema_name = name.replace("_checkpoint_wrapped_module.", "")
                # Cast to EMA dtype and device (typically float32 on CPU) for lerp_ compatibility
                if ema_name in ema_state_dict:
                    ema_param = ema_state_dict[ema_name]
                    ema_param.lerp_(full_tensor.to(device=ema_param.device, dtype=ema_param.dtype), 1.0 - beta)
                elif iteration == 1:
                    # only warn on first iteration if parameter is not found
                    logger.warning(f"EMA parameter {ema_name} not found in EMA state dict, skipping update.")

            # FSDP2 doesn't shard buffers, so we can just copy them
            for name, p_net in model.net.named_buffers():
                if name in ema_state_dict:
                    ema_param = ema_state_dict[name]
                    ema_param.copy_(p_net.to(device=ema_param.device, dtype=ema_param.dtype))
                elif iteration == 1:
                    # only warn on first iteration if buffer is not found
                    logger.warning(f"EMA buffer {name} not found in EMA state dict, skipping update.")

        if hasattr(self, "config"):
            # only wandb log when config exists
            if iteration % self.config.trainer.logging_iter == 0 and is_rank0():
                if wandb.run:
                    wandb.log({f"ema/{self.ema_name}_beta": beta}, step=iteration)
        synchronize()
FastGen/fastgen/callbacks/forced_weight_norm.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import TYPE_CHECKING
7
+
8
+
9
+ from fastgen.callbacks.callback import Callback
10
+ import fastgen.utils.logging_utils as logger
11
+
12
+ if TYPE_CHECKING:
13
+ from fastgen.methods import FastGenModel
14
+
15
+
16
class ForcedWeightNormCallback(Callback):
    """Re-normalizes network weights before each gradient-accumulation step."""

    def on_training_accum_step_begin(
        self,
        model: FastGenModel,
        *args,
        **kwargs,
    ) -> None:
        """Call ``model.net.forced_weight_normalization()`` when available.

        Logs a warning (instead of raising) when the network does not expose
        the method, so a misconfiguration is visible but non-fatal.
        """
        net = model.net
        if not hasattr(net, "forced_weight_normalization"):
            logger.warning(
                "Enabled ForcedWeightNormCallback but model.net does not have the forced_weight_normalization method."
            )
            return
        net.forced_weight_normalization()
FastGen/fastgen/callbacks/gpu_mem_profiler.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import torch
7
+ import os
8
+ from fastgen.utils import logging_utils as logger
9
+ from fastgen.callbacks.callback import Callback
10
+ import atexit
11
+ import pickle
12
+ from typing import Callable, Optional, TYPE_CHECKING
13
+ import base64
14
+ import json
15
+
16
+ if TYPE_CHECKING:
17
+ from fastgen.methods import FastGenModel
18
+
19
+
20
def create_dump(dump_path):
    """Dump the current CUDA allocator history as a self-contained HTML timeline.

    Takes a snapshot via ``torch.cuda.memory._dump_snapshot`` (also written to
    ``dump_path + ".pickle"``), keeps only never-freed allocations above 1 KB,
    merges adjacent allocations that share a common fastgen callstack ancestor,
    and embeds the reduced trace into PyTorch's MemoryViz HTML template.

    Args:
        dump_path: Output path for the HTML file (should end with ``.html``).
    """
    logger.critical(f"Creating {dump_path}")
    if not dump_path.endswith("html"):
        print(f"[{__file__}] create_dump produces an HTML file but was called with {dump_path=}")
    torch.cuda.memory._dump_snapshot(dump_path + ".pickle")
    with open(dump_path + ".pickle", "rb") as f:
        data = pickle.load(f)
    _memory_viz_template = r"""
    <!DOCTYPE html>
    <html>
    <head>
    </head>
    <body>
    <script type="module">
    import {add_local_files} from "https://cdn.jsdelivr.net/gh/pytorch/pytorch@main/torch/utils/viz/MemoryViz.js"
    const local_files = $SNAPSHOT
    add_local_files(local_files, $VIZ_KIND)
    </script>
    </body>
    """

    # Find which GPU was active. Iterate over however many device traces the
    # snapshot contains (the previous hard-coded range(8) raised IndexError on
    # machines exposing fewer than 8 devices). idx_device stays -1 (last
    # device) when no trace has events, preserving the old fallback behavior.
    idx_device = -1
    for i in range(len(data["device_traces"])):
        if data["device_traces"][i]:
            idx_device = i
            break

    traces = data["device_traces"][idx_device]  # create an aliasing variable for convenience
    traces = [
        d for d in traces if d["action"] == "alloc" or d["action"] == "free_completed"
    ]  # only the `alloc` and `free_completed` events matter for our visualization

    for d in traces:
        d["fastgen_frames"] = [
            f for f in d["frames"] if "fastgen" in f["filename"]
        ]  # get the callstack frames from fastgen code (e.g. ignore frames in pytorch/other libraries)
        if not d["fastgen_frames"]:
            d["fastgen_frames"] = d["frames"]

    # run through the trace and find allocations that were allocated but never freed
    set_alloced_addrs: dict = {}
    for d in traces:
        if d["action"] == "alloc":
            set_alloced_addrs[d["addr"]] = d
        elif d["action"] == "free_completed":
            if d["addr"] in set_alloced_addrs:
                del set_alloced_addrs[d["addr"]]
        else:
            raise NotImplementedError(f"{d['action']}")

    never_freed_traces = list(set_alloced_addrs.values())
    KB = 1 << 10
    never_freed_traces = [t for t in never_freed_traces if t["size"] > KB]  # get rid of allocations below 1 KB

    # now proceed through the trace (guaranteed to be all `alloc` events as we removed all free events).
    # for each pair of alloc events, merge them iff they share a common fastgen ancestor.
    # Merging events is useful as it both speeds up the visualization rendering and also makes it more understandable.
    i = 0
    while i < len(never_freed_traces) - 1:
        curr_frames = never_freed_traces[i]["fastgen_frames"]
        next_frames = never_freed_traces[i + 1]["fastgen_frames"]
        if (
            curr_frames and next_frames and curr_frames[0] == next_frames[0]
        ):  # TODO: probably should compare the full callstack
            # same ancestor, delete next event and add its size to current event
            never_freed_traces[i]["size"] += never_freed_traces[i + 1]["size"]
            never_freed_traces.pop(i + 1)
        else:
            i += 1  # different ancestor, do not combine and move on

    never_freed_traces = never_freed_traces  # merged-alloc events only
    data["device_traces"][idx_device] = never_freed_traces  # update the trace to only be the merged-alloc events
    data["segments"] = []  # shrink the trace, unused in memory timeline
    data["external_annotations"] = []  # shrink the trace, unused in memory timeline
    buffer = pickle.dumps(data)
    buffer += b"\x00" * (3 - len(buffer) % 3)
    encoded_buffer = base64.b64encode(buffer).decode("utf-8")
    json_format = json.dumps([{"name": "snapshot.pickle", "base64": encoded_buffer}])
    html_src = _memory_viz_template.replace("$VIZ_KIND", repr("Active Memory Timeline")).replace(
        "$SNAPSHOT", json_format
    )
    with open(dump_path, "w") as f:
        f.write(html_src)
103
+
104
+
105
class MemTrackerCallback(Callback):
    """Records CUDA allocator history and dumps HTML memory timelines.

    A dump is also registered via ``atexit`` so a timeline is produced when the
    process exits (e.g. after a crash).
    """

    def __init__(self, save_every_n_iters: Optional[int] = None, deactivate_after_n_iters: int = 100):
        """Configure dump cadence and register the exit-time dump.

        Args:
            save_every_n_iters: If set, write a dump every N training steps.
            deactivate_after_n_iters: Stop recording after this many iterations
                (recording indefinitely leaks host memory).
        """

        def _dump_on_exit():
            out_root = os.environ.get("FASTGEN_OUTPUT_ROOT", "FASTGEN_OUTPUT")
            rank = os.environ.get("RANK", "0")
            create_dump(f"{out_root}/crash_rank{rank}.html")

        self.deactivate_after_n_iters = deactivate_after_n_iters  # Deactivate eventually to prevent leaking host memory
        self.save_every_n_iters = save_every_n_iters
        self.atexit_fn = _dump_on_exit
        atexit.register(self.atexit_fn)

    def on_app_begin(self):
        """Start recording allocator events with Python stack traces."""
        logger.info("[MemTrackerCallback] Tracking peak memory usage")
        torch.cuda.memory._record_memory_history(stacks="python")

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Periodically dump a timeline; disable recording after the cutoff."""
        if iteration > self.deactivate_after_n_iters:
            # Disabling frees PyTorch's internal tracking data structures.
            torch.cuda.memory._record_memory_history(enabled=None)
        if self.save_every_n_iters is not None and (iteration % self.save_every_n_iters) == 0:
            out_root = os.environ.get("FASTGEN_OUTPUT_ROOT", "FASTGEN_OUTPUT")
            rank = os.environ.get("RANK", "0")
            create_dump(f"{out_root}/step{iteration}_rank{rank}.html")
FastGen/fastgen/callbacks/gpu_stats.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import os
7
+ from typing import TYPE_CHECKING, Callable, Any, Dict, List
8
+
9
+ import pandas as pd
10
+ import psutil
11
+ import torch
12
+
13
+ from fastgen.callbacks.callback import Callback
14
+ from fastgen.utils.distributed import world_size, is_rank0, synchronize
15
+ import fastgen.utils.logging_utils as logger
16
+
17
+ if TYPE_CHECKING:
18
+ from fastgen.methods import FastGenModel
19
+
20
+
21
def log_prof_data(data_list: List[Dict[str, Any]]):
    """Aggregate per-rank profiling dicts and log an Avg/Max/Min summary table.

    Args:
        data_list: One metrics dict per rank; all dicts share the same keys.
    """
    # All ranks report the same metric names; take them from the first entry.
    metrics = list(data_list[0].keys())

    # Aggregate each metric across ranks.
    min_values = {key: min(d[key] for d in data_list) for key in metrics}
    max_values = {key: max(d[key] for d in data_list) for key in metrics}
    avg_values = {key: sum(d[key] for d in data_list) / len(data_list) for key in metrics}

    summary_df = pd.DataFrame({"Avg": avg_values, "Max": max_values, "Min": min_values})

    logger.info(f"GPU stats:\n{summary_df.to_string()}")
46
+
47
+
48
class GPUStatsCallback(Callback):
    """Periodically gathers CPU/GPU memory and utilization stats across ranks."""

    def __init__(self, every_n: int = 100):
        """Args:
        every_n: Measure and log stats every N iterations (overridden by
            ``config.trainer.logging_iter`` when a config is attached).
        """
        self.every_n = every_n

    def on_train_begin(self, model: FastGenModel, iteration: int = 0):
        """Reset peak-memory counters and sync the interval with the config."""
        torch.cuda.reset_peak_memory_stats()
        if hasattr(self, "config"):
            # The trainer-wide logging interval takes precedence.
            self.every_n = self.config.trainer.logging_iter
        logger.info(f"every_n to measure gpus stats: {self.every_n}")

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Every ``every_n`` iterations, collect per-rank stats and log them."""
        del data_batch, output_batch, loss_dict
        if iteration % self.every_n != 0:
            return

        # RSS of this process plus all of its (recursive) children.
        cur_process = psutil.Process(os.getpid())
        cpu_memory_usage = sum(p.memory_info().rss for p in [cur_process] + cur_process.children(recursive=True))

        GB = 1024**3
        prof_data = {
            "cpu_mem_gb": float(cpu_memory_usage / GB),
            "peak_gpu_mem_gb": float(torch.cuda.max_memory_allocated() / GB),
            "peak_gpu_mem_reserved_gb": float(torch.cuda.max_memory_reserved() / GB),
            "util": float(torch.cuda.utilization()),
        }

        synchronize()
        data_list = [prof_data] * world_size()
        if world_size() > 1:
            # all_gather_object is blocking by default.
            torch.distributed.all_gather_object(data_list, prof_data)

        if is_rank0():
            log_prof_data(data_list)
        synchronize()
FastGen/fastgen/callbacks/grad_clip.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from contextlib import contextmanager
7
+ from typing import TYPE_CHECKING, Optional
8
+ import wandb
9
+
10
+ import torch
11
+ from torch.distributed.tensor import DTensor
12
+
13
+ from fastgen.callbacks.callback import Callback
14
+ from fastgen.utils.distributed import is_rank0, world_size
15
+ import fastgen.utils.logging_utils as logger
16
+
17
+ if TYPE_CHECKING:
18
+ from fastgen.methods import FastGenModel
19
+
20
+
21
@contextmanager
def cast_gradients_dtype(model, dtype=torch.float32, enabled=True):
    """Temporarily cast all existing gradients of ``model`` to ``dtype``.

    On exit, each gradient is restored to the dtype it had when the context
    was entered. (The previous implementation restored gradients to the
    *parameter's* dtype instead, which silently changed gradients that were
    deliberately kept in a different precision than their parameters.)

    Args:
        model: Module whose parameters' ``.grad`` tensors are cast.
        dtype: Target dtype inside the context.
        enabled: When False, the context is a no-op.
    """
    if not enabled:
        yield
        return

    # Remember the entry dtype of every gradient we actually cast, so we can
    # restore exactly that dtype afterwards (Parameters are hashable by id).
    original_dtypes: dict = {}
    try:
        for param in model.parameters():
            if param.grad is not None and param.grad.dtype != dtype:
                original_dtypes[param] = param.grad.dtype
                param.grad.data = param.grad.data.to(dtype)
        yield
    finally:
        # Restore original gradient dtypes.
        for param, orig_dtype in original_dtypes.items():
            if param.grad is not None and param.grad.dtype != orig_dtype:
                param.grad.data = param.grad.data.to(orig_dtype)
37
+
38
+
39
def clip_grad_norm_fsdp(
    parameters,
    max_norm: float,
    norm_type: float = 2.0,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    """
    Clip gradients for FSDP2 models with CPU offloading.

    The standard torch.nn.utils.clip_grad_norm_ fails with FSDP2 CPU offloading because
    DTensor operations (like division) trigger all_reduce on CPU, which has no backend.

    This implementation:
    1. Extracts local tensors from DTensors
    2. Computes local norms on native device (CPU or GPU)
    3. All-reduces the scalar norm to get global norm
    4. Clips gradients in-place using the global norm

    Args:
        parameters: Iterable of parameters with gradients
        max_norm: Maximum norm value
        norm_type: Type of norm (default: L2)
        device: Device for all-reduce tensor. If None, inferred from gradients or defaults to cuda.

    Returns:
        Total gradient norm (global across all ranks) as a regular tensor

    NOTE(review): the sum-of-p-th-powers aggregation assumes a finite
    ``norm_type``; ``norm_type=inf`` would not be aggregated correctly here.
    NOTE(review): non-DTensor gradients that are replicated across ranks are
    counted once per rank by the SUM all-reduce — confirm callers only pass
    sharded (FSDP) parameters.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(p for p in parameters if p.grad is not None)

    # Nothing to clip: report a zero norm.
    if len(parameters) == 0:
        return torch.tensor(0.0)

    # Compute per-parameter norms on their native device (CPU or GPU)
    # We compute norm^norm_type to allow proper aggregation across ranks
    local_norm_sum = 0.0
    inferred_device = None
    for p in parameters:
        if isinstance(p.grad, DTensor):
            # Operate on the local shard only; DTensor math would trigger
            # collective ops, which is exactly what we are avoiding.
            grad = p.grad._local_tensor
        else:
            grad = p.grad

        # Infer CUDA device from gradients (use first CUDA device found)
        if inferred_device is None and grad.device.type == "cuda":
            inferred_device = grad.device

        # Compute norm on the gradient's native device, accumulate as Python float
        local_norm_sum += torch.norm(grad.detach().float(), norm_type).item() ** norm_type

    # Use provided device, or inferred device, or fall back to current CUDA device
    if device is None:
        device = inferred_device if inferred_device is not None else torch.device("cuda")
    local_norm_sum = torch.tensor(local_norm_sum, device=device)

    # All-reduce to get global norm across all ranks
    if world_size() > 1:
        torch.distributed.all_reduce(local_norm_sum, op=torch.distributed.ReduceOp.SUM)

    # Compute final global norm
    total_norm = local_norm_sum ** (1.0 / norm_type)

    # Compute clip coefficient (regular tensor division, no DTensor ops)
    clip_coef = max_norm / (total_norm + 1e-6)
    clip_coef_clamped = torch.clamp(clip_coef, max=1.0)

    # Apply clipping to gradients
    for p in parameters:
        if isinstance(p.grad, DTensor):
            # For DTensor, scale the local tensor directly
            local_grad = p.grad._local_tensor
            local_grad.mul_(clip_coef_clamped.to(local_grad.device))
        else:
            p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device))

    return total_norm
116
+
117
+
118
class GradClipCallback(Callback):
    """Sanitizes and clips gradients of one sub-network before the optimizer step.

    Non-finite gradient entries are replaced via ``nan_to_num`` and, when
    ``grad_norm`` is set, the global gradient norm is clipped — using a custom
    path for FSDP2 with CPU offloading where the standard utility fails.
    """

    def __init__(
        self,
        grad_norm: float | None = 1.0,
        model_key: str = "net",
        posinf: float | None = None,
        neginf: float | None = None,
        precision_grad_clip: Optional[torch.dtype] = None,
    ) -> None:
        """Args:
        grad_norm: Max gradient norm; ``None`` disables norm clipping.
        model_key: Dotted attribute path of the sub-network to clip
            (also selects the optimizer in ``model.optimizer_dict``).
        posinf: Replacement for +inf gradient entries (``nan_to_num``).
        neginf: Replacement for -inf gradient entries (``nan_to_num``).
        precision_grad_clip: If set, gradients are cast to this dtype during
            clipping for numerical stability.
        """
        self.grad_norm = grad_norm
        self.model_key = model_key
        self.posinf = posinf
        self.neginf = neginf
        self.precision_grad_clip = precision_grad_clip

    def nan_to_num(self, module: torch.nn.Module) -> tuple[int, torch.dtype | None]:
        """Replace NaN/Inf gradient entries in-place across ``module``.

        Returns:
            A tuple ``(count, dtype)`` where ``count`` is the total number of
            non-finite gradient entries found and ``dtype`` is the dtype of the
            last gradient seen (``None`` when no parameter had a gradient).
        """
        grad_dtype = None
        non_finite_grads_count = 0

        for name, param in module.named_parameters():
            if param.grad is not None:
                grad_dtype = param.grad.dtype

                # Extract local tensor for DTensor (avoids triggering distributed ops on CPU)
                if isinstance(param.grad, DTensor):
                    grad = param.grad._local_tensor
                else:
                    grad = param.grad

                non_finite_grads = grad.numel() - grad.isfinite().sum().item()
                if non_finite_grads:
                    non_finite_grads_count += non_finite_grads
                    logger.debug(
                        f"Gradient of {name} (dtype {grad_dtype}) is not finite: "
                        f"Setting {grad.isnan().sum().item()} NaNs to 0 and {grad.isinf().sum().item()} Infs "
                        f"to {self.posinf} or {self.neginf}."
                    )
                # In-place (out=grad) so the original gradient storage is patched.
                torch.nan_to_num(grad, nan=0.0, posinf=self.posinf, neginf=self.neginf, out=grad)

        return non_finite_grads_count, grad_dtype

    def on_optimizer_step_begin(self, model: FastGenModel, iteration: int = 0) -> None:
        """Unscale, sanitize, and clip the selected sub-network's gradients."""
        # unscale the optimizer related to the `model_key`
        assert (
            self.model_key in model.optimizer_dict.keys()
        ), f"Keys in optimizer_dict: {list(model.optimizer_dict.keys())}."
        optimizer = model.optimizer_dict[self.model_key]
        # Only unscale if grad_scaler should be used (checks enabled + float32 grads)
        if model.should_use_grad_scaler(optimizer):
            model.grad_scaler.unscale_(optimizer)

        # Save model device before selecting subnet (subnet may not have .device)
        model_device = model.device

        # select subnet if specified (by default, we only perform gradient clips on model.net)
        # NOTE: `model` is rebound to the selected sub-module from here on.
        subnets = self.model_key.split(".")
        for subnet in subnets:
            model = getattr(model, subnet)

        # set nan to num for each parameter
        non_finite_grads_count, grad_dtype = self.nan_to_num(model)
        logger.debug(f"Gradient dtype of {self.model_key}: {grad_dtype}")
        if non_finite_grads_count > 0:
            logger.info(
                f"Number of parameters with non-finite gradients (of dtype {grad_dtype}): {non_finite_grads_count}"
            )
        log_dict = {f"optimizer/non_finite_grads_count (model_key {self.model_key})": non_finite_grads_count}

        if self.grad_norm is not None:
            # Cast all gradients to precision_grad_clip for numerical stability during clipping
            cast_grads = (
                self.precision_grad_clip is not None
                and grad_dtype is not None
                and grad_dtype != self.precision_grad_clip
            )

            # log value at first iteration
            if iteration == 1 and cast_grads:
                logger.info(f"Casting gradients from {grad_dtype} to {self.precision_grad_clip} before clipping.")

            # Check if CPU offloading is enabled by looking for DTensor grads on CPU
            # CPU offloading = DTensor local tensors are on CPU
            # No CPU offloading = DTensor local tensors are on GPU (or no DTensors)
            use_fsdp_cpu_offload_clip = False
            for p in model.parameters():
                if p.grad is not None and isinstance(p.grad, DTensor):
                    if p.grad._local_tensor.device.type == "cpu":
                        use_fsdp_cpu_offload_clip = True
                        break

            with cast_gradients_dtype(model, dtype=self.precision_grad_clip, enabled=cast_grads):
                if use_fsdp_cpu_offload_clip:
                    # Use custom clipping for FSDP with CPU offloading
                    # Standard clip_grad_norm_ fails because DTensor ops trigger all_reduce on CPU
                    total_norm = clip_grad_norm_fsdp(model.parameters(), self.grad_norm, device=model_device)
                else:
                    # Standard clipping for non-FSDP or FSDP without CPU offloading
                    total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.grad_norm, foreach=True)

            log_dict[f"optimizer/grad_norm (model_key {self.model_key})"] = total_norm.item()

        if hasattr(self, "config"):
            # only wandb log when config exists
            if iteration % self.config.trainer.logging_iter == 0 and is_rank0() and wandb.run:
                wandb.log(log_dict, step=iteration)
FastGen/fastgen/callbacks/param_count.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+ from typing import TYPE_CHECKING
6
+
7
+ from fastgen.callbacks.callback import Callback
8
+ from fastgen.utils.distributed import world_size
9
+ import fastgen.utils.logging_utils as logger
10
+ import torch
11
+ import wandb
12
+
13
+ try:
14
+ from torch.distributed.tensor import DTensor
15
+ except ImportError:
16
+ DTensor = None
17
+
18
+ if TYPE_CHECKING:
19
+ from fastgen.methods import FastGenModel
20
+
21
+
22
+ def _get_local_numel(param: torch.Tensor) -> int:
23
+ """Get the local (sharded) number of elements for a parameter.
24
+
25
+ For DTensor (FSDP2), returns the local shard size.
26
+ For regular tensors, returns the full size.
27
+ """
28
+ if DTensor is not None and isinstance(param, DTensor):
29
+ return param._local_tensor.numel()
30
+ return param.numel()
31
+
32
+
33
class ParamCountCallback(Callback):
    """Logs logical and local (sharded) parameter counts for every sub-model.

    For each entry in ``model.model_dict`` (plus the model itself), counts
    trainable/total parameters both logically (full tensors) and locally
    (per-rank shard sizes via ``_get_local_numel``), cross-checks the counts
    across ranks, logs a summary, and records the numbers in the wandb run
    summary when a run is active.
    """

    def on_train_begin(self, model: FastGenModel, **kwargs) -> None:
        """Count, cross-check, and log parameter statistics at training start."""
        # get modules
        modules = {"model": model, **model.model_dict}

        # iterate over modules
        output = {}
        for name, module in modules.items():
            # Logical (full model) param counts
            trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in module.parameters())

            # Local (sharded) param counts - what's actually in memory on this rank
            local_trainable_params = sum(_get_local_numel(p) for p in module.parameters() if p.requires_grad)
            local_total_params = sum(_get_local_numel(p) for p in module.parameters())

            # check if parameter counts are different across ranks
            if world_size() > 1:
                trainable_params = self.gather_param_counts(trainable_params)
                total_params = self.gather_param_counts(total_params)
                local_trainable_params = self.gather_param_counts(local_trainable_params)
                local_total_params = self.gather_param_counts(local_total_params)
                # Collapse the per-rank lists back to scalars when all ranks agree;
                # a remaining list signals a rank mismatch and is reported below.
                if len(set(total_params)) == 1 and len(set(trainable_params)) == 1:
                    trainable_params = trainable_params[0]
                    total_params = total_params[0]
                if len(set(local_total_params)) == 1 and len(set(local_trainable_params)) == 1:
                    local_trainable_params = local_trainable_params[0]
                    local_total_params = local_total_params[0]

            # logging
            module_name = module.__class__.__name__
            output.update(
                {
                    f"{name}/trainable_params": trainable_params,
                    f"{name}/total_params": total_params,
                    f"{name}/local_trainable_params": local_trainable_params,
                    f"{name}/local_total_params": local_total_params,
                }
            )
            if isinstance(trainable_params, list):
                logger.warning(f"Parameter counts differ across ranks for {module_name}.")
                for rank, (p_train, p) in enumerate(zip(trainable_params, total_params)):
                    logger.info(
                        f"{name} ({module_name}) has {p_train * 1.e-6:.2f} M trainable and {p * 1.e-6:.2f} M total params on rank {rank}."
                    )
            else:
                logger.info(
                    f"{name} ({module_name}) has {trainable_params * 1.e-6:.2f} M trainable and {total_params * 1.e-6:.2f} M total params (logical)."
                )

            # Report local/sharded counts
            if isinstance(local_trainable_params, list):
                for rank, (p_train, p) in enumerate(zip(local_trainable_params, local_total_params)):
                    logger.info(
                        f"{name} ({module_name}) has {p_train * 1.e-6:.2f} M trainable and {p * 1.e-6:.2f} M total params LOCAL on rank {rank}."
                    )
            else:
                is_sharded = local_total_params < total_params if not isinstance(total_params, list) else True
                if is_sharded:
                    logger.info(
                        f"{name} ({module_name}) has {local_trainable_params * 1.e-6:.2f} M trainable and {local_total_params * 1.e-6:.2f} M total params LOCAL per rank (sharding ratio: {world_size()}x)."
                    )
                else:
                    logger.info(f"{name} ({module_name}) is NOT sharded (local == logical params).")

        if wandb.run:
            wandb.run.summary.update(output)

    def gather_param_counts(self, param_count):
        """
        Gather parameter counts across all ranks.

        Args:
            param_count: Parameter count to gather.

        Returns:
            List of parameter counts across all ranks.
        """
        param_count = torch.tensor(
            [param_count], dtype=torch.long, device="cuda" if torch.cuda.is_available() else "cpu"
        )
        param_count_list = [torch.zeros_like(param_count) for _ in range(world_size())]
        torch.distributed.all_gather(param_count_list, param_count)
        return [p.item() for p in param_count_list]
FastGen/fastgen/callbacks/train_profiler.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import time
7
+ from typing import TYPE_CHECKING, Callable
8
+
9
+ import torch
10
+ import wandb
11
+
12
+ from fastgen.callbacks.callback import Callback
13
+ from fastgen.utils.distributed import is_rank0
14
+ import fastgen.utils.logging_utils as logger
15
+
16
+ if TYPE_CHECKING:
17
+ from fastgen.methods import FastGenModel
18
+
19
+
20
class TrainProfilerCallback(Callback):
    """Profile training speed with optional per-phase timing breakdown.

    Reported quantities:
    - avg iteration time: wall-clock seconds per iteration between logs
    - data loading time: gap between step begin and the first accum step
    - avg forward pass time: mean forward time over accumulation steps
    - backward pass time: last backward (backward begin to optimizer begin)
    - optimizer step time: optimizer begin to step end
    """

    def __init__(self, every_n: int = 100, detailed: bool = True):
        """Set up timing state.

        Args:
            every_n: Emit metrics every N iterations (replaced by
                ``config.trainer.logging_iter`` when a config is attached).
            detailed: When True, also record the per-phase breakdown;
                otherwise only the averaged iteration time is reported.
        """
        self.every_n = every_n
        self.detailed = detailed

        # Wall-clock anchor used to average iteration time between logs.
        self.last_log_time = None

        # Timestamps captured within a single training step (detailed mode).
        self.train_step_begin_time = None
        self.accum_begin_times = None
        self.backward_begin_times = None
        self.optimizer_step_begin = None
        self.step_end_time = None

    def on_train_begin(self, model: FastGenModel, iteration: int = 0) -> None:
        """Sync the logging interval with the trainer config, if attached."""
        if hasattr(self, "config"):
            # overwritten by logging_iter if self.config exists
            self.every_n = self.config.trainer.logging_iter
        logger.info(f"every_n to profile trainer: {self.every_n}")

    def on_training_step_begin(
        self,
        model: FastGenModel,
        iteration: int = 0,
    ):
        """Mark the start of a step and reset per-step timing buffers."""
        if not self.detailed:
            return
        self.train_step_begin_time = time.perf_counter()
        self.accum_begin_times = []
        self.backward_begin_times = []

    def on_training_accum_step_begin(
        self, model: FastGenModel, data_batch: dict[str, torch.Tensor], iteration: int = 0, accum_iter: int = 0
    ):
        """Record the start of one gradient-accumulation micro-step."""
        if self.detailed:
            self.accum_begin_times.append(time.perf_counter())

    def on_backward_begin(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
        accum_iter: int = 0,
    ):
        """Record when backward starts (i.e. when the forward pass ended)."""
        if self.detailed:
            self.backward_begin_times.append(time.perf_counter())

    def on_optimizer_step_begin(self, model: FastGenModel, iteration: int = 0):
        """Record when the optimizer step starts (i.e. when backward ended)."""
        if self.detailed:
            self.optimizer_step_begin = time.perf_counter()

    def on_training_step_end(
        self,
        model: FastGenModel,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor | Callable],
        loss_dict: dict[str, torch.Tensor],
        iteration: int = 0,
    ) -> None:
        """Close out the step; every ``every_n`` iterations log timing metrics."""
        del data_batch, output_batch, loss_dict

        if self.detailed:
            self.step_end_time = time.perf_counter()

        # Metrics are only emitted when a config has been attached.
        if not hasattr(self, "config"):
            return
        if iteration % self.every_n != 0 or not is_rank0():
            return

        metrics = {}

        # Averaged wall-clock time per iteration since the previous log.
        cur_time = time.time()
        if self.last_log_time is not None:
            iter_time = (cur_time - self.last_log_time) / self.every_n
            logger.info(f"{iteration} : avg iteration time {iter_time:.2f} seconds")
            metrics["profiler/avg_iteration_time"] = iter_time
        self.last_log_time = cur_time

        # Detailed per-phase breakdown of the most recent step.
        if self.detailed and self.accum_begin_times and self.backward_begin_times:
            data_load_time = self.accum_begin_times[0] - self.train_step_begin_time
            forward_durations = [
                end - begin for (end, begin) in zip(self.backward_begin_times, self.accum_begin_times)
            ]
            forward_time = sum(forward_durations) / len(self.accum_begin_times)
            backward_time = self.optimizer_step_begin - self.backward_begin_times[-1]
            optim_step_time = self.step_end_time - self.optimizer_step_begin

            logger.info(f"{iteration} : data loading time {data_load_time:.2f}")
            logger.info(f"{iteration} : avg forward pass time {forward_time:.2f}")
            logger.info(f"{iteration} : backward pass time {backward_time:.2f}")
            logger.info(f"{iteration} : optimizer step time {optim_step_time:.2f}")

            metrics["profiler/data_loading_time"] = data_load_time
            metrics["profiler/avg_forward_pass_time"] = forward_time
            metrics["profiler/backward_pass_time"] = backward_time
            metrics["profiler/optimizer_step_time"] = optim_step_time

        if wandb.run and metrics:
            wandb.log(metrics, step=iteration)
FastGen/fastgen/callbacks/wandb.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ import time
8
+ from typing import Optional, Dict, Callable, TYPE_CHECKING
9
+ import gc
10
+
11
+
12
+ import torch
13
+ import torchvision
14
+ from torchvision.transforms import functional as tv_F
15
+
16
+ import wandb
17
+ import wandb.util
18
+
19
+ from fastgen.callbacks.callback import Callback
20
+ from fastgen.configs.config_utils import serialize_config
21
+ from fastgen.utils import basic_utils
22
+
23
+ from fastgen.utils.distributed import rank0_only, synchronize, world_size
24
+ from fastgen.utils import logging_utils as logger
25
+
26
+ if TYPE_CHECKING:
27
+ from fastgen.configs.config import BaseConfig
28
+ from fastgen.methods import FastGenModel
29
+
30
+
31
+ def to_wandb(
32
+ tensor: torch.Tensor,
33
+ rgb_range: float = 255.0,
34
+ normalized: bool = False,
35
+ max_plot_img: int = 16,
36
+ max_plot_vid: int = 2,
37
+ fps: int = 16,
38
+ channel_before_time: bool = True,
39
+ caption: str | None = None,
40
+ vid_format: str = "mp4",
41
+ ) -> wandb.Image | wandb.Video:
42
+ """
43
+ Convert a tensor to a wandb.Image or wandb.Video.
44
+
45
+ Args:
46
+ tensor (torch.Tensor): Input tensor of shape [B,C,H,W], [B,T,C,H,W], or [B,T,C,H,W,D].
47
+ rgb_range (float, optional): Output target RGB range (can almost definitely be kept as 255).
48
+ Defaults to 255.0.
49
+ normalized (bool, optional): Whether the tensor is normalized to [0,1]. Defaults to False which assumes [-1,1] range.
50
+ max_plot_img (int, optional): Max number of images to plot. Defaults to 16.
51
+ max_plot_vid (int, optional): Max number of videos to plot. Defaults to 2.
52
+ fps (int, optional): Frames per second. Defaults to 8.
53
+ channel_before_time (bool, optional): Whether the tensor is in the format [B,C,T,..]. Set False if the [B,T,C,..] format is used.
54
+ caption (str, optional): Caption for the image or video. Defaults to None.
55
+ vid_format (str, optional): Format of the video file. Defaults to "mp4".
56
+
57
+ Returns:
58
+ wandb.Image | wandb.Video: Format a tensor for logging to W&B.
59
+ """
60
+
61
+ if tensor.ndim == 5:
62
+ max_plot = max_plot_vid
63
+ if channel_before_time:
64
+ tensor = tensor.permute(0, 2, 1, 3, 4)
65
+ elif tensor.ndim == 4:
66
+ max_plot = max_plot_img
67
+ else:
68
+ raise ValueError(f"Tensor must be 4 or 5 dimensional, but got {tensor.ndim} dimensions")
69
+
70
+ # slice and adjust range
71
+ if normalized:
72
+ factor = rgb_range
73
+ offset = 0.0
74
+ else:
75
+ factor = rgb_range / 2.0
76
+ offset = rgb_range / 2.0
77
+ tensor = tensor[:max_plot].mul(factor).add(offset).clip_(0, rgb_range).to(torch.uint8)
78
+
79
+ # convert to wandb.Image or wandb.Video
80
+ assert tensor.shape[-3] == 3, "Make sure that the data is in ..., C, H, W format"
81
+ if tensor.ndim == 5:
82
+ return wandb.Video(tensor.cpu().numpy(), fps=fps, format=vid_format, caption=caption)
83
+ else:
84
+ image_grid = torchvision.utils.make_grid(tensor, nrow=4, pad_value=1)
85
+ image_grid = tv_F.to_pil_image(image_grid)
86
+ return wandb.Image(image_grid, caption=caption)
87
+
88
+
89
+ @rank0_only
90
+ def init_wandb(config: BaseConfig):
91
+ # wandb login
92
+ wandb_credential = config.log_config.wandb_credential
93
+ if os.path.isfile(wandb_credential):
94
+ os.environ["WANDB_API_KEY"] = open(wandb_credential, encoding="utf-8").read().strip("\n")
95
+ logger.info(f"Loading WANDB_API_KEY from {wandb_credential}")
96
+
97
+ wandb_config = config.log_config
98
+
99
+ # Resume with or generate a wandb id
100
+ logger.info(f"wandb_config.save_path: {wandb_config.save_path}")
101
+ os.makedirs(wandb_config.save_path, exist_ok=True)
102
+ wandb_id_path = f"{wandb_config.save_path}/wandb_id.txt"
103
+ if os.path.isfile(wandb_id_path):
104
+ wandb_id = open(wandb_id_path, encoding="utf-8").read().strip()
105
+ logger.info(f"Resuming with an existing wandb id: {wandb_id}")
106
+ else:
107
+ wandb_id = wandb.util.generate_id()
108
+ with open(wandb_id_path, "w", encoding="utf-8") as f:
109
+ f.write(f"{wandb_id}\n")
110
+ logger.info(f"Generating a wandb id: {wandb_id}")
111
+
112
+ # Get config as plain dict
113
+ config_resolved = serialize_config(config, return_type="dict")
114
+
115
+ # Initialize the wandb library.
116
+ wandb.init(
117
+ id=wandb_id,
118
+ project=wandb_config.project,
119
+ group=wandb_config.group,
120
+ name=wandb_config.name,
121
+ config=config_resolved,
122
+ dir=wandb_config.save_path,
123
+ resume="allow",
124
+ mode=wandb_config.wandb_mode,
125
+ )
126
+
127
+ # Save a copy of code to a wandb Artifact (this can be slow)
128
+ # Make code upload optional to avoid distributed training delays
129
+ upload_code = basic_utils.str2bool(os.getenv("WANDB_UPLOAD_CODE", "false"))
130
+ if upload_code:
131
+ logger.info("Uploading code to wandb (this may take a few minutes)...")
132
+ wandb.run.log_code(".")
133
+ logger.info("Code upload to wandb completed")
134
+ else:
135
+ logger.info("Wandb code upload disabled (set WANDB_UPLOAD_CODE=true to enable)")
136
+
137
+
138
+ @dataclass
139
+ class _LossDictRecord:
140
+ loss_dict: dict = field(default_factory=dict)
141
+ iter_count_dict: dict = field(default_factory=dict)
142
+
143
+ def add(self, loss_dict: Optional[Dict[str, torch.Tensor]]) -> None:
144
+ if loss_dict is not None:
145
+ for loss_name, loss_val in loss_dict.items():
146
+ self.loss_dict[loss_name] = self.loss_dict.get(loss_name, 0.0) + loss_val.float().item()
147
+ self.iter_count_dict[loss_name] = self.iter_count_dict.get(loss_name, 0) + 1
148
+
149
+ def reset(self) -> None:
150
+ self.loss_dict = {}
151
+ self.iter_count_dict = {}
152
+
153
+ def gather_dict(self, dictionary: Dict[str, float | int]) -> Dict[str, float | int]:
154
+ n_ranks = world_size()
155
+ if n_ranks > 1:
156
+ dict_list = [None for _ in range(n_ranks)]
157
+ torch.distributed.all_gather_object(dict_list, dictionary)
158
+ # from list of dicts to dict of summed values
159
+ dictionary = {}
160
+ for d in dict_list:
161
+ for key, value in d.items():
162
+ dictionary[key] = dictionary.get(key, 0.0) + value
163
+ return dictionary
164
+
165
+ def get_stat(self) -> Dict[str, float]:
166
+ # number of ranks that logged this loss
167
+ rank_dict = self.gather_dict({k: 1 for k in self.loss_dict.keys()})
168
+ # number of times this loss was computed
169
+ count_dict = self.gather_dict(self.iter_count_dict)
170
+ # sum of all losses
171
+ loss_dict = self.gather_dict(self.loss_dict)
172
+
173
+ avg_loss_dict = {}
174
+ for loss_name, loss_val in loss_dict.items():
175
+ count = count_dict.get(loss_name, 0)
176
+ ranks = rank_dict.get(loss_name, 1)
177
+ iter_count = count / ranks
178
+ avg_loss = (loss_val / count) * (ranks / world_size()) if count > 0 else 0.0
179
+ logger.info(f"avg_{loss_name}: {avg_loss:.4f}".ljust(30) + f"iter count: {iter_count}")
180
+ avg_loss_dict[loss_name] = avg_loss
181
+ self.reset()
182
+ return avg_loss_dict
183
+
184
+
185
+ class WandbCallback(Callback):
186
+ """
187
+ The callback gets precision for data from model
188
+ """
189
+
190
+ def __init__(
191
+ self,
192
+ *args,
193
+ validation_logging_step: int = 1,
194
+ sample_logging_iter: Optional[int] = None,
195
+ vid_format: str = "mp4",
196
+ **kwargs,
197
+ ):
198
+ super().__init__(*args, **kwargs)
199
+
200
+ self.validation_logging_step = validation_logging_step
201
+ self.sample_logging_iter = sample_logging_iter
202
+ self.val_sample_map = None
203
+ self.vid_format = vid_format
204
+ self.loss_dict_record = _LossDictRecord()
205
+ self.val_loss_dict_record = _LossDictRecord()
206
+
207
+ def on_app_begin(self) -> None:
208
+ assert hasattr(self, "config"), "Missing config in WandbCallback."
209
+ init_wandb(self.config)
210
+ self.offload_module_in_decoding = self.config.trainer.offload_module_in_decoding
211
+ # disable offloading if using FSDP
212
+ if self.config.trainer.fsdp:
213
+ self.offload_module_in_decoding = False
214
+ if self.sample_logging_iter is None:
215
+ self.sample_logging_iter = self.config.trainer.logging_iter
216
+ synchronize()
217
+
218
+ @rank0_only
219
+ def on_optimizer_step_begin(self, model: FastGenModel, iteration: int = 0) -> None:
220
+ assert hasattr(self, "config"), "Missing config in WandbCallback."
221
+ if iteration % self.config.trainer.logging_iter == 0:
222
+ for name, scheduler in model.scheduler_dict.items():
223
+ wandb.log({f"optimizer/lr_{name}": scheduler.get_last_lr()[0]}, step=iteration)
224
+
225
+ def get_sample_map(
226
+ self, model: FastGenModel, data_batch: dict[str, torch.Tensor], output_batch: dict[str, torch.Tensor | Callable]
227
+ ) -> dict[str, wandb.Image | wandb.Video]:
228
+ # Collect generated and real data and create copies to avoid modifying the original dicts
229
+ sample_map = {}
230
+ gen_rand = output_batch["gen_rand"]
231
+ if isinstance(gen_rand, Callable):
232
+ synchronize()
233
+ gen_rand = gen_rand()
234
+ synchronize()
235
+
236
+ # Avoid modifying the original dicts
237
+ data_batch = data_batch.copy()
238
+ output_batch = output_batch.copy()
239
+
240
+ # Decide whether we want to visualize multistep teacher generation
241
+ if self.config.trainer.visualize_teacher:
242
+ assert "input_rand" in output_batch, "We need to know the noise to visualize teacher generation"
243
+ teacher_output = model.sample(
244
+ model.teacher,
245
+ output_batch["input_rand"][0:1],
246
+ data_batch["condition"][0:1], # e.g. text condition encoded by the text encoder
247
+ data_batch["neg_condition"][0:1], # e.g. negative text condition encoded by the text encoder
248
+ )
249
+ output_batch["gen_teacher"] = teacher_output
250
+
251
+ # Decode to pixel if it's in latent space
252
+ if hasattr(model.net, "init_preprocessors"):
253
+ torch.cuda.empty_cache()
254
+ device_nets = model.device
255
+
256
+ has_vae = hasattr(model.net, "vae")
257
+ if not has_vae:
258
+ model.net.init_vae()
259
+ model.net.vae.to(device=device_nets, dtype=model.precision)
260
+
261
+ if self.offload_module_in_decoding:
262
+ # offload the unneeded models to CPU (enable it if hitting OOM here)
263
+ logger.info(
264
+ f"GPU Memory BEFORE moving nets to CPU: {torch.cuda.memory_allocated(device_nets) / 1024 ** 2:.2f} MB"
265
+ )
266
+ if hasattr(model, "fake_score"):
267
+ model.fake_score = model.fake_score.to("cpu")
268
+ if hasattr(model, "teacher"):
269
+ model.teacher = model.teacher.to("cpu")
270
+ logger.info(
271
+ f"GPU Memory AFTER moving nets to CPU: {torch.cuda.memory_allocated(device_nets) / 1024 ** 2:.2f} MB"
272
+ )
273
+ synchronize()
274
+
275
+ with basic_utils.inference_mode(precision_amp=model.precision_amp_enc, device_type=device_nets.type):
276
+ if "real" in data_batch:
277
+ # only generate one sample for video
278
+ limit = 1 if len(data_batch["real"].shape) == 5 else len(data_batch["real"])
279
+ data_batch["real"] = model.net.vae.decode(data_batch["real"][:limit])
280
+ if isinstance(gen_rand, dict):
281
+ for k in gen_rand:
282
+ limit = 1 if len(gen_rand[k].shape) == 5 else len(gen_rand[k])
283
+ gen_rand[k] = model.net.vae.decode(gen_rand[k][:limit])
284
+ else:
285
+ limit = 1 if len(gen_rand.shape) == 5 else len(gen_rand)
286
+ gen_rand = model.net.vae.decode(gen_rand[:limit])
287
+
288
+ if "gen_teacher" in output_batch:
289
+ output_batch["gen_teacher"] = model.net.vae.decode(output_batch["gen_teacher"][:limit])
290
+ if logger.LOG_LEVEL == "DEBUG" and "gen_rand_train" in output_batch:
291
+ output_batch["gen_rand_train"] = model.net.vae.decode(output_batch["gen_rand_train"][:limit])
292
+
293
+ if not has_vae:
294
+ del model.net.vae
295
+
296
+ if self.offload_module_in_decoding:
297
+ # move back fake_score to gpu
298
+ if hasattr(model, "fake_score"):
299
+ model.fake_score = model.fake_score.to(device_nets)
300
+ if hasattr(model, "teacher"):
301
+ model.teacher = model.teacher.to(device_nets)
302
+ logger.info(
303
+ f"GPU Memory AFTER moving nets back to GPU: {torch.cuda.memory_allocated(device_nets) / 1024 ** 2:.2f} MB"
304
+ )
305
+ synchronize()
306
+
307
+ if wandb.run:
308
+ if (
309
+ "condition_raw" in data_batch
310
+ and isinstance(data_batch["condition_raw"], (list, tuple))
311
+ and isinstance(data_batch["condition_raw"][0], str)
312
+ ):
313
+ caption = "\n".join(data_batch["condition_raw"][: len(gen_rand)])
314
+ else:
315
+ caption = None
316
+ if isinstance(gen_rand, dict):
317
+ for k in gen_rand:
318
+ sample_map[f"student/generation/{k}"] = to_wandb(
319
+ gen_rand[k], caption=caption, vid_format=self.vid_format
320
+ )
321
+ else:
322
+ sample_map["student/generation"] = to_wandb(gen_rand, caption=caption, vid_format=self.vid_format)
323
+ if "real" in data_batch:
324
+ sample_map["data/real"] = to_wandb(data_batch["real"], caption=caption, vid_format=self.vid_format)
325
+ if "gen_teacher" in output_batch:
326
+ sample_map["teacher/generation"] = to_wandb(
327
+ output_batch["gen_teacher"], caption=caption, vid_format=self.vid_format
328
+ )
329
+ if logger.LOG_LEVEL == "DEBUG" and "gen_rand_train" in output_batch:
330
+ sample_map["student/generation_train"] = to_wandb(
331
+ output_batch["gen_rand_train"], caption=caption, vid_format=self.vid_format
332
+ )
333
+
334
+ return sample_map
335
+
336
+ def log_sample_map(
337
+ self,
338
+ model: FastGenModel,
339
+ data_batch: dict[str, torch.Tensor],
340
+ output_batch: dict[str, torch.Tensor | Callable],
341
+ suffix: str = "",
342
+ iteration: int = 0,
343
+ group: str = "train",
344
+ ) -> None:
345
+ sample_map = self.get_sample_map(model, data_batch, output_batch)
346
+ sample_map = {f"{group}_media/{k}{suffix}": v for k, v in sample_map.items()}
347
+ if wandb.run:
348
+ wandb.log(sample_map, step=iteration)
349
+ synchronize()
350
+ gc.collect()
351
+ torch.cuda.empty_cache()
352
+
353
+ def log_stats(self, loss_dict_record: _LossDictRecord, iteration: int = 0, group: str = "train") -> None:
354
+ logger.info(f"logging {group} stats at iteration {iteration}" + "-" * 20)
355
+ # Collect distributed statistics
356
+ avg_loss_dict = loss_dict_record.get_stat()
357
+ stats = {f"{group}/{name}": val for name, val in avg_loss_dict.items()}
358
+ base_info = {"optimizer/iteration": iteration}
359
+
360
+ # log stats and base info
361
+ if wandb.run:
362
+ wandb.log(stats, step=iteration)
363
+ wandb.log(base_info, step=iteration)
364
+
365
+ def on_training_step_end(
366
+ self,
367
+ model: FastGenModel,
368
+ data_batch: dict[str, torch.Tensor],
369
+ output_batch: dict[str, torch.Tensor | Callable],
370
+ loss_dict: dict[str, torch.Tensor],
371
+ iteration: int = 0,
372
+ ) -> None:
373
+ self.loss_dict_record.add(loss_dict)
374
+ time_start = time.perf_counter()
375
+ logged = False
376
+ if iteration % self.config.trainer.logging_iter == 0 or iteration == 1:
377
+ self.log_stats(self.loss_dict_record, iteration=iteration, group="train")
378
+ logged = True
379
+ if iteration % self.sample_logging_iter == 0 or iteration == 1:
380
+ self.log_sample_map(model, data_batch, output_batch, iteration=iteration, group="train")
381
+ logged = True
382
+ if logged:
383
+ time_taken = time.perf_counter() - time_start
384
+ logger.info(f"WandB logging complete after {time_taken:.2f} seconds")
385
+
386
+ def on_validation_step_end(
387
+ self,
388
+ model: FastGenModel,
389
+ data_batch: dict[str, torch.Tensor],
390
+ output_batch: dict[str, torch.Tensor | Callable],
391
+ loss_dict: dict[str, torch.Tensor],
392
+ step: int = 0,
393
+ iteration: int = 0,
394
+ idx: int = 0,
395
+ ) -> None:
396
+ self.val_loss_dict_record.add(loss_dict)
397
+
398
+ if step % self.validation_logging_step == 0:
399
+ self.log_sample_map(
400
+ model, data_batch, output_batch, suffix=f"_{step}", iteration=iteration, group=f"val{idx}"
401
+ )
402
+
403
+ def on_validation_end(self, model: FastGenModel, iteration: int = 0, idx: int = 0) -> None:
404
+ self.log_stats(self.val_loss_dict_record, iteration=iteration, group=f"val{idx}")
FastGen/fastgen/configs/README.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration System
2
+
3
+ FastGen uses a hierarchical Python-based configuration system built on [Hydra](https://hydra.cc/), [OmegaConf](https://omegaconf.readthedocs.io/), and [attrs](https://www.attrs.org/).
4
+
5
+ ## Directory Structure
6
+
7
+ ```
8
+ fastgen/configs/
9
+ ├── experiments/ # Experiment configs (cifar10, Wan, sdxl, etc.)
10
+ ├── methods/ # Method-specific configs (DMD2, CM, KD, SFT, etc.)
11
+ ├── callbacks.py # Callback configurations (EMA, WandB, GradClip, etc.)
12
+ ├── config_utils.py # Utilities (import, override, serialize configs)
13
+ ├── config.py # Base config classes (BaseConfig, BaseModelConfig, BaseTrainerConfig)
14
+ ├── data.py # Dataset/dataloader configurations
15
+ ├── discriminator.py # Discriminator configs (for GAN-based methods)
16
+ ├── net.py # Network architecture configurations
17
+ ├── opt.py # Optimizer and scheduler configurations
18
+ ```
19
+
20
+ ## Config Hierarchy
21
+
22
+ 1. **Base Configs** (`config.py`): Core classes defining model, trainer, data, and logging settings
23
+ 2. **Method Configs** (`methods/`): Extend base configs with method-specific parameters
24
+ 3. **Experiment Configs** (`experiments/`): Concrete configs for specific dataset/model combinations
25
+
26
+ ## LazyCall Pattern
27
+
28
+ Deferred instantiation using `LazyCall`:
29
+
30
+ ```python
31
+ from fastgen.utils import LazyCall as L
32
+ from fastgen.methods import DMD2Model
33
+
34
+ model_class = L(DMD2Model)(config=None) # Config dict with _target_
35
+ model = instantiate(model_class) # Instantiate later
36
+ ```
37
+
38
+ ## Command-Line Arguments
39
+
40
+ ```bash
41
+ python train.py --config=path/to/config.py [--log_level LEVEL] [--dryrun] - key=value
42
+ ```
43
+
44
+ | Argument | Description |
45
+ |----------|-------------|
46
+ | `--config` | Path to the config file |
47
+ | `--log_level` | Log level: DEBUG, INFO (default), WARNING, ERROR |
48
+ | `--dryrun` | Print resolved config and exit without training |
49
+ | `-` | Separator before config overrides (required) |
50
+
51
+ Examples:
52
+
53
+ ```bash
54
+ # Override training settings
55
+ python train.py --config=fastgen/configs/experiments/EDM/config_dmd2_test.py - \
56
+ trainer.max_iter=10000 \
57
+ model.gan_loss_weight_gen=0. \
58
+ log_config.name=my_experiment
59
+
60
+ # Debug config without training
61
+ python train.py --config=fastgen/configs/experiments/EDM/config_dmd2_test.py --dryrun
62
+
63
+ # Verbose logging
64
+ python train.py --config=fastgen/configs/experiments/EDM/config_dmd2_test.py --log_level DEBUG
65
+ ```
66
+
67
+ ## Key Config Classes
68
+
69
+ | Class | Purpose |
70
+ |-------|---------|
71
+ | `BaseConfig` | Top-level: model, trainer, dataloader, logging |
72
+ | `BaseModelConfig` | Network, optimizer, precision, EMA, guidance |
73
+ | `BaseTrainerConfig` | Checkpointing, callbacks, DDP/FSDP, iterations |
74
+ | `LogConfig` | Project, group, name, wandb settings |
75
+
76
+ ## Environment Variables
77
+
78
+ ### Core Variables
79
+
80
+ | Variable | Description | Default |
81
+ |----------|-------------|---------|
82
+ | `FASTGEN_OUTPUT_ROOT` | Root directory for checkpoints, logs, and outputs | `FASTGEN_OUTPUT` |
83
+ | `DATA_ROOT_DIR` | Root directory for datasets | `$FASTGEN_OUTPUT_ROOT/DATA` |
84
+ | `CKPT_ROOT_DIR` | Root directory for pretrained checkpoints | `$FASTGEN_OUTPUT_ROOT/MODEL` |
85
+ | `HF_HOME` | HuggingFace cache directory | `$FASTGEN_OUTPUT_ROOT/.cache` |
86
+ | `LOCAL_FILES_ONLY` | Use only local files, skip downloads from HuggingFace | `false` |
87
+ | `WANDB_API_KEY` | W&B API key for persistent logging ([get yours here](https://wandb.ai/settings)) | (none, W&B will prompt) |
88
+ | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, `AWS_ENDPOINT_URL` | S3 credentials for data and checkpoint storage | (none) |
89
+
90
+ ### Loading Credentials from Files
91
+
92
+ As an alternative to setting environment variables directly, credentials can be loaded automatically from files in the `./credentials/` directory:
93
+
94
+ | File | Environment Variables Set |
95
+ |------|---------------------------|
96
+ | `./credentials/wandb_api.txt` | `WANDB_API_KEY` (plain text file containing your API key) |
97
+ | `./credentials/s3.json` | `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, `AWS_ENDPOINT_URL` (JSON format below) |
98
+
99
+ **Format for `s3.json`:**
100
+
101
+ ```json
102
+ {
103
+ "aws_access_key_id": "<your_access_key>",
104
+ "aws_secret_access_key": "<your_secret_key>",
105
+ "region_name": "<region>",
106
+ "endpoint_url": "<s3_endpoint_url>"
107
+ }
108
+ ```
FastGen/fastgen/configs/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
FastGen/fastgen/configs/callbacks.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from fastgen.utils import LazyCall as L
5
+
6
+ from fastgen.callbacks.ct_schedule import CTScheduleCallback
7
+ from fastgen.callbacks.grad_clip import GradClipCallback
8
+ from fastgen.callbacks.param_count import ParamCountCallback
9
+ from fastgen.callbacks.wandb import WandbCallback
10
+ from fastgen.callbacks.ema import EMACallback
11
+ from fastgen.callbacks.train_profiler import TrainProfilerCallback
12
+ from fastgen.callbacks.gpu_stats import GPUStatsCallback
13
+ from fastgen.callbacks.forced_weight_norm import ForcedWeightNormCallback
14
+ from fastgen.callbacks.gpu_mem_profiler import MemTrackerCallback
15
+
16
+
17
+ CTSchedule_CALLBACK = dict(
18
+ ct_schedule=L(CTScheduleCallback)(q=2.0, ratio_limit=0.999, kimg_per_stage=12500),
19
+ )
20
+
21
+ EMA_CALLBACK = dict(
22
+ ema=L(EMACallback)(type="constant", beta=0.9999, gamma=16.97, ema_halflife_kimg=500, ema_rampup_ratio=0.05),
23
+ )
24
+
25
+ EMA_CONST_CALLBACKS = dict(
26
+ ema_9999=L(EMACallback)(type="constant", beta=0.9999, ema_name="ema_9999"),
27
+ ema_99995=L(EMACallback)(type="constant", beta=0.99995, ema_name="ema_99995"),
28
+ ema_9996=L(EMACallback)(type="constant", beta=0.9996, ema_name="ema_9996"),
29
+ )
30
+
31
+ EMA_POWER_CALLBACKS = dict(
32
+ ema_1=L(EMACallback)(type="power", gamma=96.99, ema_name="ema_1"),
33
+ ema_5=L(EMACallback)(type="power", gamma=16.97, ema_name="ema_5"),
34
+ ema_10=L(EMACallback)(type="power", gamma=6.94, ema_name="ema_10"),
35
+ )
36
+
37
+ ForcedWeightNorm_CALLBACK = dict(
38
+ forced_weight_norm=L(ForcedWeightNormCallback)(),
39
+ )
40
+
41
+ GradClip_CALLBACK = dict(
42
+ grad_clip=L(GradClipCallback)(grad_norm=10.0, model_key="net"),
43
+ )
44
+
45
+ GPUStats_CALLBACK = dict(
46
+ gpu_stats=L(GPUStatsCallback)(every_n=100),
47
+ )
48
+
49
+ ParamCount_CALLBACK = dict(
50
+ param_count=L(ParamCountCallback)(),
51
+ )
52
+
53
+ TrainProfiler_CALLBACK = dict(
54
+ train_profiler=L(TrainProfilerCallback)(every_n=100),
55
+ )
56
+
57
+ WANDB_CALLBACK = dict(
58
+ wandb=L(WandbCallback)(sample_logging_iter=None),
59
+ )
60
+
61
+ MemTracker_CALLBACK = dict(
62
+ mem_tracker=L(MemTrackerCallback)(),
63
+ )
FastGen/fastgen/configs/config.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+ from typing import Any, List, Optional, Dict
6
+
7
+ import copy
8
+ import attrs
9
+ from omegaconf import DictConfig
10
+
11
+ from fastgen.utils import LazyCall as L
12
+ from fastgen.configs.callbacks import WANDB_CALLBACK
13
+ from fastgen.configs.data import CIFAR10_Loader_Config
14
+ from fastgen.configs.net import EDM_CIFAR10_Config as EDMConfig
15
+ from fastgen.configs.opt import BaseOptimizerConfig, BaseSchedulerConfig
16
+ from fastgen.methods import FastGenModel
17
+
18
+
19
@attrs.define(slots=False)
class CuDNNConfig:
    """cuDNN behavior flags applied by the trainer."""

    # If set to True, cudnn will use deterministic cudnn functions for better reproducibility.
    deterministic: bool = False
    # If set to True, cudnn will benchmark several algorithms and pick the fastest one.
    benchmark: bool = True
25
+
26
+
27
@attrs.define(slots=False)
class LogConfig:
    """Experiment identification and W&B logging settings."""

    # Project name
    project: str = "fastgen"
    # Experiment name
    group: str = "cifar10"
    # Run/job name
    name: str = "debug"
    # W&B mode, can be "online" or "disabled".
    wandb_mode: str = "online"
    # Wandb credential path
    wandb_credential: str = "./credentials/wandb_api.txt"

    # save path
    @property
    def save_path(self) -> str:
        """Output directory: <FASTGEN_OUTPUT_ROOT>/<project>/<group>/<name>."""
        return os.path.join(
            os.environ.get("FASTGEN_OUTPUT_ROOT", "FASTGEN_OUTPUT"), f"{self.project}/{self.group}/{self.name}"
        )
46
+
47
+
48
@attrs.define(slots=False)
class EvalConfig:
    """Evaluation settings: sample counts, checkpoint range, and output location."""

    # Number of samples to generate
    num_samples: int = 50000
    # Save a small batch of images
    save_images: bool = False
    # Minimum checkpoint to evaluate
    min_ckpt: int = 0
    # Maximum checkpoint to evaluate
    max_ckpt: int = 100000000
    # Directory to save samples
    samples_dir: str = "samples"
60
+
61
+
62
@attrs.define(slots=False)
class BaseCheckpointerConfig:
    """Checkpoint saving/loading locations, locally and optionally on S3."""

    save_dir: str = "checkpoints"
    use_s3: bool = False
    s3_container: str = "s3://checkpoints/fastgen"
    s3_credential: str = "./credentials/s3.json"

    # path to pretrained model (from previous stages),
    # it's used by loading fsdp/ddp trained ckpt to an fsdp/ddp pipeline
    pretrained_ckpt_path: str = ""
    # submodule names of model and keys of a pretrained checkpoint of the form {"model": {"submodule_key": ...}, ...}
    # NOTE: attrs.define converts this mutable dict default into a per-instance factory.
    pretrained_ckpt_key_map: Dict[str, str] = {"net": "net"}
74
+
75
+
76
@attrs.define(slots=False)
class SampleTConfig:
    """Config for sampling t from a time distribution."""

    # time distribution (currently supporting: uniform, lognormal, polynomial, logitnormal, shift, and log_t)
    time_dist_type: str = "uniform"
    # mu in lognormal, logitnormal, and log_t distributions
    train_p_mean: float = -1.1
    # sigma in lognormal, logitnormal, and log_t distributions
    train_p_std: float = 2.0
    # shift value in shifted sampling (t_shifted = t * shift / (t * (shift - 1) + 1))
    shift: float = 5.0
    # lowest value in truncated range
    min_t: float = 0.002
    # highest value in truncated range
    max_t: float = 80.0
    # Explicit discrete schedule for multistep generation.
    # If provided, it is in the form [t_max, ..., 0] where len(t_list) needs to equal student_sample_steps + 1
    t_list: Optional[List[float]] = None
    # degree of freedom in log-transformed student-t distribution
    log_t_df: float = 0.01
96
+
97
+
98
@attrs.define(slots=False)
class BaseModelConfig:
    """Model-side settings shared by FastGen methods: networks, optimizers,
    time-step sampling, checkpoint paths, parallelism, and precision."""

    # Use factory functions to ensure each instance gets its own copy
    net: dict = attrs.field(factory=lambda: copy.deepcopy(EDMConfig))
    teacher: Optional[dict] = None  # Usually not used, only used when teacher is different from net (i.e. Causvid)

    # guidance scale for classifier-free guidance in teacher diffusion model. None means no guidance.
    guidance_scale: Optional[float] = None

    # enable skip layer guidance (currently only wan network has the skip_layers option in cfg)
    skip_layers: List[int] | None = None

    # optimizer and scheduler for the main net (i.e., one-step generator in DMD)
    net_optimizer: dict = attrs.field(factory=lambda: copy.deepcopy(BaseOptimizerConfig))
    net_scheduler: dict = attrs.field(factory=lambda: copy.deepcopy(BaseSchedulerConfig))

    # sampling t from a given distribution
    sample_t_cfg: SampleTConfig = attrs.field(factory=SampleTConfig)

    # shape of the input to the model (defaults to CIFAR-10)
    # NOTE: attrs.define converts this mutable list default into a per-instance factory.
    input_shape: List[int] = [3, 32, 32]
    # device ("cuda" or "cpu")
    device: str = "cuda"

    # enable gradient scaler
    grad_scaler_enabled: bool = False
    grad_scaler_init_scale: float = 65536.0
    grad_scaler_growth_interval: int = 2000

    # path to the pretrained teacher model ckpt
    pretrained_model_path: str = ""
    # path to the pretrained student net ckpt (if different from the teacher)
    pretrained_student_net_path: str = ""
    # initialize student from the above checkpoints (can be turned off to only load weights to the teacher)
    load_student_weights: bool = True

    # enable preprocessors in the model
    enable_preprocessors: bool = True

    # EMA for the main net (requires EMACallback)
    use_ema: Any = False

    # multistep generation if larger than 1 (default: single-step generation)
    student_sample_steps: int = 1
    # sampling type in multistep generation ('sde', 'ode')
    student_sample_type: str = "sde"

    # Enable memory-efficient model loading with meta device:
    # - Rank 0 loads pretrained weights normally
    # - Other ranks use torch.device("meta") for ZERO memory allocation (just metadata)
    # - FSDP materializes meta tensors and broadcasts weights from rank 0
    # This dramatically speeds up initialization for large models (14B+):
    # - Reduces RAM from N*model_size to 1*model_size
    # - Eliminates disk I/O contention (N parallel reads -> 1 read)
    # - Expected speedup: 30+ min -> <1 min for 14B models on 8 GPUs
    fsdp_meta_init: bool = False

    # whether to add the teacher model to the fsdp_dict
    add_teacher_to_fsdp_dict: bool = True

    # whether to find unused parameters in ddp
    # - can be turned off for improved performance
    # - however, it is required if the model has a discriminator or the net initializes unused modules (e.g., for logvar predictions)
    ddp_find_unused_parameters: bool = True

    # precision variables (choose from "float64", "float32", "bfloat16", or "float16")
    # (precision of the time steps is handled in the noise scheduler, defaulting to float64 for numerical stability)

    # precision for model/optimizer states and data - recommended to be float32 if precision_amp is not None
    precision: str = "float32"
    # AMP during training - if None or equal to precision, AMP is disabled during training.
    precision_amp: str | None = None
    # AMP during inference - if None or equal to precision, AMP is disabled during inference.
    precision_amp_infer: str | None = None
    # AMP during en-/decoding (e.g., for VAEs or text encoders) - if None or equal to precision, AMP is disabled during en-/decoding.
    precision_amp_enc: str | None = None
174
+
175
+
176
@attrs.define(slots=False)
class BaseTrainerConfig:
    """Training-loop settings: checkpointing, callbacks, iteration counts,
    parallelism (DDP/FSDP), and batching."""

    cudnn: CuDNNConfig = attrs.field(factory=CuDNNConfig)
    checkpointer: BaseCheckpointerConfig = attrs.field(factory=BaseCheckpointerConfig)

    # Callbacks configs.
    # Built via a factory so every config instance gets its own DictConfig. A bare
    # `DictConfig(WANDB_CALLBACK)` default is a single shared object (attrs.define
    # only auto-converts literal list/dict/set defaults to factories), so mutating
    # the callbacks of one config would leak into all others. This matches the
    # deepcopy-factory pattern already used in BaseModelConfig.
    callbacks: dict = attrs.field(factory=lambda: DictConfig(copy.deepcopy(WANDB_CALLBACK)))

    # save checkpoint frequency
    save_ckpt_iter: int = 5000
    # test on validation set frequency
    validation_iter: int = 1000
    # logging frequency
    logging_iter: int = 1000
    # maximum training iteration
    max_iter: int = 1000000
    # whether to visualize multistep teacher generation
    visualize_teacher: bool = False

    # Set the random seed.
    seed: int = 0
    # Validation seed
    val_seed: int | None = None
    # Resume
    resume: bool = True

    # DDP Parallelism
    ddp: bool = False
    # FSDP Parallelism
    fsdp: bool = False
    # Enable TensorFloat32 (convolution and matmul)
    tf32_enabled: bool = True

    # Number of gradient accumulation rounds
    grad_accum_rounds: int = 1

    # Global batch size (if not None, overrides grad_accum_rounds to match the specified batch size)
    batch_size_global: int | None = None

    # offload other modules to cpu during latent decoding
    offload_module_in_decoding: bool = False

    # apply cpu offloading in fsdp
    fsdp_cpu_offload: bool = False
    # Fallback minimum number of parameters for FSDP wrapping
    # (10M wraps large models into fairly small shards)
    # The FastGenNetwork should provide a fully_shard method that can be used to shard the network.
    # If we need to shard a different module, we fall back to an auto-sharding policy based on this value.
    fsdp_min_num_params: int = 10_000_000
    # Sharding group size for FSDP. If None, fully shard across all ranks.
    # If set, creates a 2D mesh with (replicate, shard) dimensions.
    fsdp_sharding_group_size: Optional[int] = None

    # global variables
    global_vars: Optional[dict] = None
    # NOTE: attrs.define converts this mutable list default into a per-instance factory.
    global_vars_val: List[dict | None] = [None]

    # augment config
    augment_pipe: Optional[DictConfig] = None
235
+
236
+
237
@attrs.define(slots=False)
class BaseConfig:
    """Top-level config aggregating logging, trainer, model, data, and eval sections."""

    # Log config.
    log_config: LogConfig = attrs.field(factory=LogConfig)

    # Trainer configs.
    trainer: BaseTrainerConfig = attrs.field(factory=BaseTrainerConfig)

    # Model configs.
    model: BaseModelConfig = attrs.field(factory=BaseModelConfig)
    # Factory so each instance gets its own LazyCall DictConfig: a class-level
    # `L(FastGenModel)(config=None)` default is one shared object across all
    # instances (attrs.define only auto-factories literal list/dict/set defaults),
    # so in-place edits from one experiment would leak into others.
    model_class: DictConfig = attrs.field(factory=lambda: L(FastGenModel)(config=None))

    # Data configs.
    # deepcopy-factory for the same reason: experiment configs mutate
    # dataloader_train in place (e.g. `config.dataloader_train.batch_size = 2`).
    dataloader_train: dict = attrs.field(factory=lambda: copy.deepcopy(CIFAR10_Loader_Config))
    dataloader_val: Any = None

    # Eval configs.
    eval: EvalConfig = attrs.field(factory=EvalConfig)
FastGen/fastgen/configs/config_utils.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+ from typing import Any, Optional, List, Dict
6
+ import inspect
7
+ from copy import deepcopy
8
+
9
+ import attrs
10
+ import yaml
11
+ from omegaconf import DictConfig, OmegaConf, ListConfig
12
+ from hydra import compose, initialize
13
+ from hydra.core.config_store import ConfigStore
14
+
15
+ import importlib
16
+ from dataclasses import fields as dataclass_fields
17
+ import attr
18
+ from dataclasses import is_dataclass
19
+ import fastgen.utils.logging_utils as logger
20
+
21
+
22
def import_config_from_python_file(config_file: str) -> Any:
    """
    Import a config from a python file.

    The file must define a module-level ``create_config()`` function, whose
    return value is returned here.

    Args:
        config_file (str): The path to the python file.

    Returns:
        Any: The config object produced by the module's ``create_config()``.

    Raises:
        ValueError: If the path does not end in ".py".
        FileNotFoundError: If the file does not exist.
        ImportError: If the module cannot be imported.
    """

    if not config_file.endswith(".py"):
        raise ValueError("Config file must be a Python file with a .py extension. " f"Received: {config_file}")

    if not os.path.isfile(config_file):
        raise FileNotFoundError(f"FastGen config file ({config_file}) not found.")

    # Convert to importable module format. Strip only the trailing ".py"
    # extension: the previous `replace(".py", "")` removed *every* ".py"
    # occurrence in the path, corrupting module paths that contain ".py"
    # elsewhere (e.g. a directory named "my.pylib").
    config_module = config_file.removesuffix(".py").replace("/", ".")

    # Import the module
    try:
        config = importlib.import_module(config_module)
    except ImportError as e:
        logger.error(f"Failed to import config from python file: {e}")
        raise e

    return config.create_config()
50
+
51
+
52
def config_from_dict(ref_instance: Any, kwargs: Any) -> Any:
    """
    Construct an instance of the same type as ref_instance using the provided dictionary or data or unstructured data

    Args:
        ref_instance: The reference instance to determine the type and fields when needed
        kwargs: A dictionary of keyword arguments to use for constructing the new instance or primitive data or unstructured data

    Returns:
        Any: A new instance of the same type as ref_instance constructed using the provided kwargs or the primitive data or unstructured data

    Raises:
        AssertionError: If the fields do not match or if extra keys are found.
        Exception: If there is an error constructing the new instance.
    """
    is_type = is_attrs_or_dataclass(ref_instance)
    if not is_type:
        # Leaf case: primitive/unstructured data is passed through unchanged.
        return kwargs
    else:
        ref_fields = set(get_fields(ref_instance))
        assert isinstance(kwargs, dict) or isinstance(kwargs, DictConfig), "kwargs must be a dictionary or a DictConfig"
        keys = set(kwargs.keys())

        # ref_fields must equal to or include all keys
        extra_keys = keys - ref_fields
        assert (
            ref_fields == keys or keys.issubset(ref_fields)
        ), f"Fields mismatch: {ref_fields} != {keys}. Extra keys found: {extra_keys} \n \t when constructing {type(ref_instance)} with {keys}"

        # Recurse field by field; the reference instance's current attribute value
        # drives the nested reconstruction for each key present in kwargs.
        # Fields absent from kwargs fall back to the class defaults on construction.
        resolved_kwargs: Dict[str, Any] = {}
        for f in keys:
            resolved_kwargs[f] = config_from_dict(getattr(ref_instance, f), kwargs[f])
        try:
            new_instance = type(ref_instance)(**resolved_kwargs)
        except Exception as e:
            logger.error(f"Error when constructing {type(ref_instance)} with {resolved_kwargs}")
            logger.error(e)
            raise e
        return new_instance
91
+
92
+
93
def flatten_dict(nested_dict, parent_key="", sep=".", exclude_key="_target_"):
    """
    Flattens a nested dictionary by joining keys with a separator.

    Entries whose key equals *exclude_key* and entries whose value is None are
    dropped. DictConfig values are converted to plain containers before recursing.

    Args:
        nested_dict (dict): The dictionary to flatten.
        parent_key (str, optional): The base key to prepend to flattened keys.
            Used in recursion. Defaults to ''.
        sep (str, optional): The separator to use between keys. Defaults to '.'.
        exclude_key (str, optional): Key to skip entirely. Defaults to '_target_'.

    Returns:
        dict: A new, flattened dictionary.
    """
    flat = {}
    for key, value in nested_dict.items():
        # Skip the excluded key and None values outright.
        if key == exclude_key or value is None:
            continue

        compound_key = f"{parent_key}{sep}{key}" if parent_key else key

        # Normalize OmegaConf containers to plain dicts before recursing.
        if isinstance(value, DictConfig):
            value = OmegaConf.to_container(value)

        if isinstance(value, dict) and value:
            # Non-empty dict: recurse and merge the flattened children.
            flat.update(flatten_dict(value, compound_key, sep=sep, exclude_key=exclude_key))
        else:
            # Leaf node (including empty dicts).
            flat[compound_key] = value

    return flat
126
+
127
+
128
def override_config_with_opts(config: Any, opts: Optional[List[str]] = None) -> Any:
    """
    Override the config with the opts.

    The opts list must begin with a literal "-" separator, followed by Hydra
    override strings (e.g. ``["-", "trainer.max_iter=100"]``).

    Args:
        config (Any): The config object (attrs instance or DictConfig).
        opts (Optional[List[str]]): Hydra-style override strings.

    Returns:
        Any: The config object with overrides applied (same type as input).

    Raises:
        ValueError: If opts does not start with "-", or composing the overrides fails.
    """

    # Convert Config object to a DictConfig object for Hydra

    if not isinstance(config, DictConfig):
        config_dict = attrs.asdict(config)
        config_dict = DictConfig(content=config_dict, flags={"allow_objects": True})
    else:
        config_dict = config

    # Use Hydra to handle overrides
    # (register the current config as the base node for `compose`).
    cs = ConfigStore.instance()
    cs.store(name="config", node=config_dict)

    if opts is None:
        opts = []

    if len(opts) > 0 and opts[0] != "-":
        raise ValueError(f"opts must start with '-' to separate from other arguments. Got: {opts}")
    opts = opts[1:]
    with initialize(version_base=None):
        try:
            cfg = compose(config_name="config", overrides=opts)
        except Exception as e:
            raise ValueError(f"Failed to compose config with opts: {e}")

    OmegaConf.resolve(cfg)

    # Rebuild a typed config instance from the resolved DictConfig.
    config = config_from_dict(config, cfg)

    return config
169
+
170
+
171
def override_config_with_yaml(config: Any, yaml_path: str) -> Any:
    """
    Override the config with the yaml file. **_target_ field is excluded**.

    Loose overriding: yaml keys with no matching config key are silently ignored.
    """
    with open(yaml_path, "r") as f:
        loaded = yaml.safe_load(f)

    flat_yaml = flatten_dict(loaded)
    flat_config = flatten_dict(attrs.asdict(config))

    # Loose overriding: all mismatched keys are ignored
    overrides = ["-"]
    for key, value in flat_yaml.items():
        if key in flat_config:
            overrides.append(f"{key}={value}")

    return override_config_with_opts(config, opts=overrides)
185
+
186
+
187
def is_attrs_or_dataclass(obj) -> bool:
    """
    Check if the object is an instance of an attrs class or a dataclass.

    Args:
        obj: The object to check.

    Returns:
        bool: True if the object is an instance of an attrs class or a dataclass, False otherwise.
    """
    # Dataclasses are the cheap check; only consult attrs when that fails.
    if is_dataclass(obj):
        return True
    return attr.has(type(obj))
198
+
199
+
200
def get_fields(obj):
    """
    Get the fields of an attrs class or a dataclass.

    Args:
        obj: The object to get fields from. Must be an instance of an attrs class or a dataclass.

    Returns:
        list: A list of field names.

    Raises:
        ValueError: If the object is neither an attrs class nor a dataclass.
    """
    # Guard-clause style: try dataclasses first, then attrs, else fail loudly.
    if is_dataclass(obj):
        return [field.name for field in dataclass_fields(obj)]
    if attr.has(type(obj)):
        return [field.name for field in attr.fields(type(obj))]
    raise ValueError("The object is neither an attrs class nor a dataclass.")
219
+
220
+
221
def serialize_config(
    config: Any,
    return_type: str = "dict",
    path: str | bytes | None = None,
    filename: str = "config.yaml",
    include_defaults: bool = False,
) -> Dict[str, Any] | str:
    """
    Serialize a config (BaseConfig or DictConfig) to various formats.

    Args:
        config: The config to serialize (BaseConfig attrs object or DictConfig).
        return_type: Output format - "dict" (plain dict), "yaml" (YAML string), or "file" (save to file).
        path: Directory path to save the file. Required if return_type is "file".
        filename: Name of the file to save. Only used if return_type is "file".
        include_defaults: If True, add default parameter values from _target_ classes.

    Returns:
        Dict[str, Any] if return_type is "dict"
        str (YAML) if return_type is "yaml" or "file"

    Raises:
        ValueError: If return_type is "file" but path is not provided.
    """
    if return_type == "file" and path is None:
        raise ValueError("path must be provided when return_type is 'file'")

    # Deep copy to avoid modifying original
    config = deepcopy(config)

    # Normalize to DictConfig with object support
    if not isinstance(config, DictConfig):
        config_dict = attrs.asdict(config)
        config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})
    else:
        config_omegaconf = config

    def is_serializable(item) -> bool:
        # A value is "serializable" iff OmegaConf can render it as YAML.
        try:
            OmegaConf.to_yaml(item)
            return True
        except Exception:
            return False

    def get_default_params(cls_or_func):
        # Collect default argument values from a class __init__ or a callable.
        if callable(cls_or_func):
            signature = inspect.signature(cls_or_func)
        else:
            signature = inspect.signature(cls_or_func.__init__)
        params = signature.parameters
        return {name: param.default for name, param in params.items() if param.default is not inspect.Parameter.empty}

    def process_config(conf):
        # Recursively stringify non-serializable leaves and (optionally) inject
        # default argument values for nodes that carry a _target_.
        if isinstance(conf, DictConfig):
            for key, value in conf.items():
                if isinstance(value, (DictConfig, ListConfig)):
                    # Optionally add default params from _target_ classes
                    if include_defaults:
                        try:
                            if "_target_" in value:
                                default_params = get_default_params(value["_target_"])
                                for default_key, default_v in default_params.items():
                                    if default_key not in value:
                                        value[default_key] = default_v
                        except Exception as e:
                            logger.error(f"Failed to add default argument values: {e}")
                    process_config(value)
                else:
                    if not is_serializable(value) and value is not None:
                        conf[key] = str(value)
        elif isinstance(conf, ListConfig):
            for i, item in enumerate(conf):
                if isinstance(item, (DictConfig, ListConfig)):
                    process_config(item)
                else:
                    if not is_serializable(item) and item is not None:
                        conf[i] = str(item)
        else:
            raise NotImplementedError("Input config must be a DictConfig or ListConfig.")
        return conf

    config_omegaconf = process_config(config_omegaconf)
    result_dict: Dict[str, Any] = OmegaConf.to_container(config_omegaconf, resolve=True)  # type: ignore

    if return_type == "dict":
        return result_dict

    # For yaml and file, convert to YAML string
    yaml_str = yaml.dump(result_dict, default_flow_style=False, sort_keys=True)

    if return_type == "file":
        os.makedirs(path, exist_ok=True)  # type: ignore
        # Honor the `filename` argument (it was previously ignored and a literal
        # placeholder path was written instead).
        file_path = os.path.join(path, filename)  # type: ignore[arg-type]
        with open(file_path, "w") as f:
            f.write(yaml_str)
        logger.info(f"Config is saved at {file_path}")

    return yaml_str
FastGen/fastgen/configs/data.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+
6
+ from fastgen.datasets.class_cond_dataloader import ImageLoader
7
+ from fastgen.datasets.wds_dataloaders import (
8
+ WDSLoader,
9
+ ImageWDSLoader,
10
+ VideoWDSLoader,
11
+ )
12
+
13
+ from fastgen.utils import LazyCall as L
14
+
15
# Root directories for outputs and datasets, overridable via environment variables.
OUTPUT_ROOT = os.environ.get("FASTGEN_OUTPUT_ROOT", "FASTGEN_OUTPUT")
DATA_ROOT_DIR = os.getenv("DATA_ROOT_DIR", f"{OUTPUT_ROOT}/DATA")
# FIX: this previously read DATA_ROOT_DIR (copy-paste), so setting a local data
# root silently redirected the S3 root as well. Prefer the dedicated
# S3_DATA_ROOT_DIR variable; fall back to DATA_ROOT_DIR for backward compatibility.
S3_DATA_ROOT_DIR = os.getenv("S3_DATA_ROOT_DIR", os.getenv("DATA_ROOT_DIR", "s3://data"))
18
+
19
+ # ################################################################################
20
+ # Generic Loaders (for config templates - override datatags for actual use)
21
+ # ################################################################################
22
+ # See fastgen/datasets/README.md for more details.
23
+
24
# Template configs: the "/path/to/..." datatags are placeholders that experiment
# configs are expected to override.

# Raw images with text conditions; negative condition is the empty-string preset.
ImageLoaderConfig = L(ImageWDSLoader)(
    datatags=["WDS:/path/to/images"],
    batch_size=32,
    key_map={"real": "jpg", "condition": "txt"},
    presets_map={"neg_condition": "empty_string"},
    input_res=512,
)

# Pre-encoded image latents with pre-computed text embeddings.
ImageLatentLoaderConfig = L(WDSLoader)(
    datatags=["WDS:/path/to/image_latents"],
    batch_size=32,
    key_map={"real": "latent.pth", "condition": "txt_emb.pth"},
    # Negative condition embedding loaded from a shared file (same for all samples)
    files_map={"neg_condition": "/path/to/neg_prompt_emb.npy"},
)

# Raw videos with text conditions (WAN negative-prompt preset).
VideoLoaderConfig = L(VideoWDSLoader)(
    datatags=["WDS:/path/to/videos"],
    batch_size=2,
    key_map={"real": "mp4", "condition": "txt"},
    presets_map={"neg_condition": "neg_prompt_wan"},
    sequence_length=81,
    img_size=(832, 480),
)

# Pre-encoded video latents with pre-computed text embeddings.
VideoLatentLoaderConfig = L(WDSLoader)(
    datatags=["WDS:/path/to/video_latents"],
    batch_size=2,
    key_map={"real": "latent.pth", "condition": "txt_emb.pth"},
    # Negative condition embedding loaded from a shared file (same for all samples)
    files_map={"neg_condition": "/path/to/neg_prompt_emb.npy"},
    # NOTE: For v2v tasks, add condition latent (e.g., depth) to key_map:
    # key_map={"real": "latent.pth", "condition": "txt_emb.pth", "depth_latent": "depth_latent.pth"}
)

# ################################################################################
# Generic KD Loaders (for paired/path data)
# ################################################################################
# See fastgen/methods/knowledge_distillation/README.md for more details.

# For single-step KD: provides (real, noise, condition) pairs
# Data requirements: {"real": clean, "noise": noise, "condition": cond}
PairLoaderConfig = L(WDSLoader)(
    datatags=["WDS:/path/to/pairs"],
    batch_size=2,
    key_map={"real": "latent.pth", "noise": "noise.pth", "condition": "txt_emb.pth"},
)

# For multi-step KD: provides (real, path, condition) with denoising trajectory
# Data requirements: {"real": clean, "path": [B, steps, C, ...], "condition": cond}
# path contains intermediate denoising steps (typically 4 steps)
PathLoaderConfig = L(WDSLoader)(
    datatags=["WDS:/path/to/paths"],
    batch_size=2,
    key_map={"real": "latent.pth", "path": "path.pth", "condition": "txt_emb.pth"},
)
80
+
81
+ # ################################################################################
82
+ # Specific Datasets
83
+ # ################################################################################
84
+
85
# Class-conditional zip datasets. Each loader has a local `dataset_path` plus an
# `s3_path` fallback; `cache=True` keeps the data cached locally.

CIFAR10_Loader_Config = L(ImageLoader)(
    dataset_path=f"{DATA_ROOT_DIR}/cifar10/cifar10-32x32.zip",
    s3_path=f"{S3_DATA_ROOT_DIR}/cifar10/cifar10-32x32.zip",
    use_labels=True,
    cache=True,
    batch_size=128,
    shuffle=True,
    sampler_start_idx=None,
)

ImageNet64_Loader_Config = L(ImageLoader)(
    dataset_path=f"{DATA_ROOT_DIR}/imagenet-64/imagenet-64x64.zip",
    s3_path=f"{S3_DATA_ROOT_DIR}/imagenet-64/imagenet-64x64.zip",
    use_labels=True,
    cache=True,
    batch_size=32,
    shuffle=True,
    sampler_start_idx=None,
)

# ImageNet 256x256 pre-encoded with the SD VAE (per the filename).
ImageNet256_Loader_Config = L(ImageLoader)(
    dataset_path=f"{DATA_ROOT_DIR}/imagenet-256/imagenet_256_sd.zip",
    s3_path=f"{S3_DATA_ROOT_DIR}/imagenet-256/imagenet_256_sd.zip",
    use_labels=True,
    cache=True,
    batch_size=32,
    shuffle=True,
    sampler_start_idx=None,
)

# ImageNet-64 variant preprocessed in the EDM2 format (per the filename).
ImageNet64_EDMV2_Loader_Config = L(ImageLoader)(
    dataset_path=f"{DATA_ROOT_DIR}/imagenet-64/imagenet-64x64-edmv2.zip",
    s3_path=f"{S3_DATA_ROOT_DIR}/imagenet-64/imagenet-64x64-edmv2.zip",
    use_labels=True,
    cache=True,
    batch_size=32,
    shuffle=True,
    sampler_start_idx=None,
)
FastGen/fastgen/configs/data_dummy.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Dummy data loader for data-free distillation training (e.g., Self-Forcing without GAN)."""
5
+
6
+ import torch
7
+ from fastgen.utils import LazyCall as L
8
+
9
+
10
+ class DummyVideoLoader:
11
+ """A dummy dataloader that generates random tensors for data-free training.
12
+
13
+ Use this when running Self-Forcing or other distillation methods without real data.
14
+ Requires setting gan_loss_weight_gen=0 since there's no real data for discriminator.
15
+ """
16
+
17
+ def __init__(
18
+ self,
19
+ batch_size: int = 1,
20
+ input_shape: list = [16, 21, 60, 104], # [C, T, H, W]
21
+ text_seq_len: int = 512,
22
+ text_dim: int = 4096,
23
+ device: str = "cuda",
24
+ dtype: str = "bfloat16",
25
+ ):
26
+ self.batch_size = batch_size
27
+ self.input_shape = input_shape
28
+ self.text_seq_len = text_seq_len
29
+ self.text_dim = text_dim
30
+ self.device = device
31
+ self._dtype = getattr(torch, dtype) if isinstance(dtype, str) else dtype
32
+
33
+ def __iter__(self):
34
+ return self
35
+
36
+ def __next__(self):
37
+ return {
38
+ "real": torch.randn(
39
+ self.batch_size, *self.input_shape,
40
+ device=self.device, dtype=self._dtype
41
+ ),
42
+ "condition": torch.randn(
43
+ self.batch_size, self.text_seq_len, self.text_dim,
44
+ device=self.device, dtype=self._dtype
45
+ ),
46
+ "neg_condition": torch.zeros(
47
+ self.batch_size, self.text_seq_len, self.text_dim,
48
+ device=self.device, dtype=self._dtype
49
+ ),
50
+ }
51
+
52
+
53
# Config for WAN T2V 480p (832x480, 81 frames -> 21 latent frames)
DummyVideoLoaderConfig = L(DummyVideoLoader)(
    batch_size=1,
    input_shape=[16, 21, 60, 104],  # [C, T, H, W] latent shape for 480p
    text_seq_len=512,
    text_dim=4096,
)

# Config for WAN T2V 720p (1280x720, 81 frames -> 21 latent frames)
DummyVideoLoader720pConfig = L(DummyVideoLoader)(
    batch_size=1,
    input_shape=[16, 21, 90, 160],  # [C, T, H, W] latent shape for 720p
    text_seq_len=512,
    text_dim=4096,
)
FastGen/fastgen/configs/discriminator.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+
6
+ from fastgen.utils import LazyCall as L
7
+ from fastgen.networks.discriminators import (
8
+ Discriminator_EDM,
9
+ Discriminator_SD15,
10
+ Discriminator_SDXL,
11
+ Discriminator_ImageDiT,
12
+ Discriminator_VideoDiT,
13
+ )
14
+
15
+ Discriminator_EDM_CIFAR10_Config: DictConfig = L(Discriminator_EDM)(
16
+ feature_indices={0, 1, 2},
17
+ all_res=[32, 16, 8],
18
+ in_channels=256,
19
+ )
20
+
21
+ Discriminator_EDM_ImageNet64_Config: DictConfig = L(Discriminator_EDM)(
22
+ feature_indices=None,
23
+ all_res=[64, 32, 16, 8],
24
+ in_channels=768,
25
+ )
26
+
27
+ Discriminator_SD15_Res512_Config: DictConfig = L(Discriminator_SD15)(
28
+ feature_indices=None,
29
+ all_res=[32, 16, 8, 8, 8],
30
+ in_channels=1280,
31
+ )
32
+
33
+ Discriminator_SDXL_Res512_Config: DictConfig = L(Discriminator_SDXL)(
34
+ feature_indices=None,
35
+ all_res=[32, 16, 16, 16],
36
+ in_channels=1280,
37
+ )
38
+
39
+ Discriminator_SDXL_Res1024_Config: DictConfig = L(Discriminator_SDXL)(
40
+ feature_indices=None,
41
+ all_res=[64, 32, 32, 32],
42
+ in_channels=1280,
43
+ )
44
+
45
+ # Flux: hidden_dim=3072, 19 joint blocks + 38 single blocks = 57 total
46
+ Discriminator_Flux_Config: DictConfig = L(Discriminator_ImageDiT)(
47
+ feature_indices=None,
48
+ num_blocks=57, # 19 joint + 38 single blocks
49
+ inner_dim=3072, # Flux hidden dimension
50
+ )
51
+
52
+ # 2B patchify: spatial-2, temporal-1; inner_dim=1920; layer=30
53
+ Discriminator_CogVideoX2B_Config = L(Discriminator_VideoDiT)(
54
+ feature_indices=None,
55
+ num_blocks=30,
56
+ disc_type="dit_simple_conv3d",
57
+ inner_dim=1920 // 4,
58
+ )
59
+
60
+ # 5B patchify: spatial-2, temporal-1; inner_dim=3072; layer=42
61
+ Discriminator_CogVideoX5B_Config = L(Discriminator_VideoDiT)(
62
+ feature_indices=None,
63
+ num_blocks=42,
64
+ disc_type="dit_simple_conv3d",
65
+ inner_dim=3072 // 4,
66
+ )
67
+
68
+ # 1.3B patchify: spatial-2, temporal-1; inner_dim=1536; layer=30
69
+ Discriminator_Wan_1_3B_Config: DictConfig = L(Discriminator_VideoDiT)(
70
+ feature_indices=None,
71
+ num_blocks=30,
72
+ disc_type="dit_simple_conv3d",
73
+ inner_dim=1536 // 4,
74
+ )
75
+
76
+ # 14B patchify: spatial-2, temporal-1; inner_dim=5120; layer=40
77
+ Discriminator_Wan_14B_Config: DictConfig = L(Discriminator_VideoDiT)(
78
+ feature_indices=None,
79
+ num_blocks=40,
80
+ disc_type="dit_simple_conv3d",
81
+ inner_dim=5120 // 4,
82
+ )
83
+
84
+ # 5B patchify: spatial-2, temporal-1; inner_dim=3072; layer=30
85
+ Discriminator_Wan22_5B_Config: DictConfig = L(Discriminator_VideoDiT)(
86
+ feature_indices=None,
87
+ num_blocks=30,
88
+ disc_type="dit_simple_conv3d",
89
+ inner_dim=3072 // 4,
90
+ )
91
+
92
+ # Cosmos Predict2.5-2B: patchify spatial-2, temporal-1; inner_dim=2048; layer=28
93
+ Discriminator_CosmosPredict2_2B_Config: DictConfig = L(Discriminator_VideoDiT)(
94
+ feature_indices=None,
95
+ num_blocks=28,
96
+ disc_type="dit_simple_conv3d",
97
+ inner_dim=2048, # Must match model's inner_dim for Cosmos
98
+ )
99
+
100
+ # Cosmos Predict2.5-14B: patchify spatial-2, temporal-1; inner_dim=5120; layer=36
101
+ Discriminator_CosmosPredict2_14B_Config: DictConfig = L(Discriminator_VideoDiT)(
102
+ feature_indices=None,
103
+ num_blocks=36,
104
+ disc_type="dit_simple_conv3d",
105
+ inner_dim=5120, # Must match model's inner_dim for Cosmos
106
+ )
FastGen/fastgen/configs/experiments/CogVideoX/config_dmd2.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+
5
+ from fastgen.configs.discriminator import Discriminator_CogVideoX2B_Config
6
+ import fastgen.configs.methods.config_dmd2 as config_dmd2_default
7
+ from fastgen.configs.data import VideoLatentLoaderConfig
8
+ from fastgen.configs.net import CogVideoXConfig
9
+
10
+ """ Configs for the DMD2 model on CogVideoX model. """
11
+
12
+
13
def create_config():
    """Build the DMD2 distillation config for CogVideoX-2B."""
    config = config_dmd2_default.create_config()
    model = config.model

    # One shared learning rate for generator, discriminator and fake-score nets.
    model.net_optimizer.lr = 1e-5
    model.discriminator_optimizer.lr = 1e-5
    model.fake_score_optimizer.lr = 1e-5

    # CogVideoX latent layout [C, T, H, W].
    model.input_shape = [16, 13, 60, 90]
    model.discriminator = Discriminator_CogVideoX2B_Config
    model.discriminator.feature_indices = [15, 22, 29]
    model.gan_loss_weight_gen = 0.03
    model.net = CogVideoXConfig
    model.guidance_scale = 6.0
    model.enable_preprocessors = False

    # Uniform timestep sampling over (almost) the full unit interval.
    model.sample_t_cfg.time_dist_type = "uniform"
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    model.gan_use_same_t_noise = True
    model.fake_score_pred_type = "x0"
    model.student_sample_type = "ode"

    # 4-step student schedule.
    model.student_sample_steps = 4
    model.sample_t_cfg.t_list = [0.999, 0.937, 0.833, 0.624, 0.0]

    config.dataloader_train = VideoLatentLoaderConfig
    config.dataloader_train.batch_size = 2

    config.trainer.max_iter = 10000
    config.trainer.logging_iter = 100
    config.trainer.save_ckpt_iter = 500

    config.log_config.group = "CogVideoX_dmd2"
    return config
FastGen/fastgen/configs/experiments/CogVideoX/config_kd.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+
5
+ import fastgen.configs.methods.config_kd as config_kd_default
6
+ from fastgen.configs.data import PairLoaderConfig
7
+ from fastgen.configs.net import CogVideoXConfig
8
+
9
+ """ Configs for the KD model on CogVideoX model. """
10
+
11
+
12
def create_config():
    """Build the knowledge-distillation (KD) config for CogVideoX-2B."""
    config = config_kd_default.create_config()
    trainer, model = config.trainer, config.model

    trainer.max_iter = 6000
    trainer.logging_iter = 100
    trainer.save_ckpt_iter = 500

    model.net_optimizer.lr = 1e-4

    # CogVideoX latent layout [C, T, H, W]; preprocessing is skipped because
    # training runs on precomputed latents.
    model.input_shape = [16, 13, 60, 90]
    model.net = CogVideoXConfig
    model.enable_preprocessors = False
    model.precision = "bfloat16"

    config.dataloader_train = PairLoaderConfig
    config.dataloader_train.batch_size = 2

    config.log_config.group = "CogVideoX_kd"
    return config
FastGen/fastgen/configs/experiments/CogVideoX/config_sft.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import fastgen.configs.methods.config_sft as config_sft_default
5
+ from fastgen.configs.data import VideoLatentLoaderConfig
6
+ from fastgen.configs.net import CogVideoXConfig
7
+
8
+ """ Configs for the SFT model on CogVideoX model. """
9
+
10
+
11
def create_config():
    """Build the SFT config for CogVideoX-2B."""
    config = config_sft_default.create_config()
    model = config.model

    config.trainer.logging_iter = 500

    model.net_optimizer.lr = 5e-5

    model.guidance_scale = 6.0
    model.student_sample_steps = 50

    # Uniform timestep sampling.
    model.sample_t_cfg.time_dist_type = "uniform"
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    # CogVideoX latent shape [C, T, H, W] = [16, 13, 60, 90]:
    # 49 frames at 480x720 resolution after VAE encoding.
    model.input_shape = [16, 13, 60, 90]
    model.net = CogVideoXConfig
    model.enable_preprocessors = False  # using precomputed latents

    config.dataloader_train = VideoLatentLoaderConfig
    config.dataloader_train.batch_size = 2

    config.log_config.group = "CogVideoX_sft"
    return config
FastGen/fastgen/configs/experiments/CogVideoX/config_sft_5b.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import fastgen.configs.methods.config_sft as config_sft_default
5
+ from fastgen.configs.data import VideoLatentLoaderConfig
6
+ from fastgen.configs.net import CogVideoX5BConfig
7
+
8
+ """ Configs for the SFT model on CogVideoX-5B model. """
9
+
10
+
11
+ def create_config():
12
+ config = config_sft_default.create_config()
13
+
14
+ config.trainer.logging_iter = 500
15
+
16
+ # Slightly lower LR for larger 5B model
17
+ config.model.net_optimizer.lr = 2e-5
18
+
19
+ config.model.guidance_scale = 6.0
20
+ config.model.student_sample_steps = 50
21
+
22
+ config.model.sample_t_cfg.time_dist_type = "uniform"
23
+ config.model.sample_t_cfg.min_t = 0.001
24
+ config.model.sample_t_cfg.max_t = 0.999
25
+
26
+ config.model.precision = "bfloat16"
27
+
28
+ # CogVideoX latent shape: [C, T, H, W] = [16, 13, 60, 90]
29
+ # Corresponds to 49 frames at 480x720 resolution after VAE encoding
30
+ # Same as 2B variant - they share the same VAE
31
+ config.model.input_shape = [16, 13, 60, 90]
32
+ config.model.net = CogVideoX5BConfig
33
+ config.model.enable_preprocessors = False # Using precomputed latents
34
+
35
+ config.dataloader_train = VideoLatentLoaderConfig
36
+ # Reduced batch size due to larger model
37
+ config.dataloader_train.batch_size = 1
38
+
39
+ config.log_config.group = "CogVideoX5B_sft"
40
+ return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """DMD2 config for Cosmos-Predict2.5-2B model."""
5
+
6
+ import fastgen.configs.methods.config_dmd2 as config_dmd2_default
7
+ from fastgen.configs.data import VideoLoaderConfig
8
+ from fastgen.configs.discriminator import Discriminator_CosmosPredict2_2B_Config
9
+ from fastgen.configs.net import CosmosPredict2_2B_Config, CosmosPredict2_2B_Aggressive_Config, CKPT_ROOT_DIR
10
+
11
+
12
def create_config():
    """Build the DMD2 distillation config for Cosmos-Predict2.5-2B."""
    config = config_dmd2_default.create_config()
    model, trainer = config.model, config.trainer

    trainer.max_iter = 10000
    trainer.logging_iter = 100
    trainer.save_ckpt_iter = 500

    # One shared learning rate for generator, discriminator and fake-score nets.
    model.net_optimizer.lr = 1e-5
    model.discriminator_optimizer.lr = 1e-5
    model.fake_score_optimizer.lr = 1e-5

    model.precision = "bfloat16"

    # Latent shape [C, T_latent, H_latent, W_latent]. The Cosmos VAE compresses
    # 4x temporally and 8x spatially.
    # 256p (320x176 video): [16, 24, 22, 40]
    # full 720p (1280x704 @ 93 frames): [16, 24, 88, 160]
    model.input_shape = [16, 24, 60, 104]  # cthw - 480p, 93 frames

    # Network and discriminator.
    model.net = CosmosPredict2_2B_Config
    model.discriminator = Discriminator_CosmosPredict2_2B_Config
    model.discriminator.disc_type = "multiscale_down_mlp_large"
    model.discriminator.feature_indices = [13, 20, 27]
    # Teacher uses AGGRESSIVE SAC for memory savings.
    model.teacher = CosmosPredict2_2B_Aggressive_Config

    # DMD2 settings.
    model.gan_loss_weight_gen = 0.03
    model.gan_use_same_t_noise = True
    model.fake_score_pred_type = "x0"
    model.student_sample_type = "ode"
    model.guidance_scale = 3.0
    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cosmos_predict2/Cosmos-Predict2.5-2B/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt"

    # Timestep sampling.
    model.sample_t_cfg.time_dist_type = "shifted"
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    # 4-step student schedule.
    model.student_sample_steps = 4
    model.sample_t_cfg.t_list = [0.999, 0.937, 0.833, 0.624, 0.0]

    # Dataloader: video size (W, H) = (latent_W * 8, latent_H * 8) and frame
    # count are derived from the latent shape above.
    config.dataloader_train = VideoLoaderConfig
    config.dataloader_train.batch_size = 1
    config.dataloader_train.img_size = (model.input_shape[-1] * 8, model.input_shape[-2] * 8)
    config.dataloader_train.sequence_length = (model.input_shape[1] - 1) * 4 + 1

    config.log_config.group = "cosmos_predict2_dmd2"

    return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2_14b.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """DMD2 config for Cosmos-Predict2.5-14B model."""
5
+
6
+ import fastgen.configs.experiments.CosmosPredict2.config_dmd2 as config_dmd2_base
7
+ from fastgen.configs.discriminator import Discriminator_CosmosPredict2_14B_Config
8
+ from fastgen.configs.net import CosmosPredict2_14B_Config, CosmosPredict2_14B_Aggressive_Config, CKPT_ROOT_DIR
9
+
10
+
11
def create_config():
    """Build the DMD2 distillation config for Cosmos-Predict2.5-14B."""
    config = config_dmd2_base.create_config()
    model = config.model

    config.trainer.fsdp_cpu_offload = True

    # Latent shape [C, T_latent, H_latent, W_latent]. The Cosmos VAE compresses
    # 4x temporally and 8x spatially.
    # 256p (320x176 video): [16, 24, 22, 40]
    # full 720p (1280x704 @ 93 frames): [16, 24, 88, 160]
    model.input_shape = [16, 24, 60, 104]  # cthw - 480p, 93 frames

    # 14B network and discriminator; teacher uses AGGRESSIVE SAC to save memory.
    model.net = CosmosPredict2_14B_Config
    model.discriminator = Discriminator_CosmosPredict2_14B_Config
    model.teacher = CosmosPredict2_14B_Aggressive_Config

    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cosmos_predict2/Cosmos-Predict2.5-14B/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt"

    # Re-derive dataloader video size (W, H) and frame count from the latent shape.
    config.dataloader_train.img_size = (model.input_shape[-1] * 8, model.input_shape[-2] * 8)
    config.dataloader_train.sequence_length = (model.input_shape[1] - 1) * 4 + 1

    config.log_config.group = "cosmos_predict2_14b_dmd2"

    return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_dmd2_v2w.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """DMD2 config for Cosmos-Predict2.5-2B video2world model."""
5
+
6
+ import fastgen.configs.experiments.CosmosPredict2.config_dmd2 as config_dmd2_base
7
+
8
+
9
def create_config():
    """Build the DMD2 config for Cosmos-Predict2.5-2B in video2world mode."""
    config = config_dmd2_base.create_config()
    net = config.model.net

    # Switch the network into video2world mode, conditioned on a single frame.
    net.is_video2world = True
    net.num_conditioning_frames = 1

    config.log_config.group = "cosmos_predict2_dmd2_v2w"
    return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Configs for SFT on Cosmos-Predict2.5-2B model."""
5
+
6
+ import fastgen.configs.methods.config_sft as config_sft_default
7
+ from fastgen.configs.data import VideoLoaderConfig
8
+ from fastgen.configs.net import CosmosPredict2_2B_Config, CKPT_ROOT_DIR
9
+
10
+
11
def create_config():
    """Build the SFT config for Cosmos-Predict2.5-2B."""
    config = config_sft_default.create_config()
    model, trainer = config.model, config.trainer

    trainer.max_iter = 10000
    trainer.logging_iter = 100
    trainer.save_ckpt_iter = 500

    model.net_optimizer.lr = 1e-5

    # Uniform timestep sampling.
    model.sample_t_cfg.time_dist_type = "uniform"
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    model.precision = "bfloat16"

    # Latent shape [C, T_latent, H_latent, W_latent]. The Cosmos VAE compresses
    # 4x temporally and 8x spatially.
    # 256p: [16, 24, 24, 40]
    # 480p @ 81 frames: [16, 21, 60, 104]
    # full 720p: [16, 24, 88, 160]
    model.input_shape = [16, 24, 60, 104]  # cthw - 480p

    model.net = CosmosPredict2_2B_Config
    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cosmos_predict2/Cosmos-Predict2.5-2B/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt"
    model.guidance_scale = 3.0
    model.student_sample_steps = 35

    # Dataloader: video size (W, H) = (latent_W * 8, latent_H * 8) and frame
    # count are derived from the latent shape above.
    config.dataloader_train = VideoLoaderConfig
    config.dataloader_train.batch_size = 1
    config.dataloader_train.img_size = (model.input_shape[-1] * 8, model.input_shape[-2] * 8)
    config.dataloader_train.sequence_length = (model.input_shape[1] - 1) * 4 + 1

    config.log_config.group = "cosmos_predict2_sft"

    return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft_14b.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Configs for SFT on Cosmos-Predict2.5-14B model."""
5
+
6
+ import fastgen.configs.experiments.CosmosPredict2.config_sft as config_sft_base
7
+ from fastgen.configs.net import CosmosPredict2_14B_Config, CKPT_ROOT_DIR
8
+
9
+
10
+ def create_config():
11
+ config = config_sft_base.create_config()
12
+
13
+ # Network for 14B
14
+ config.model.net = CosmosPredict2_14B_Config
15
+ config.model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cosmos_predict2/Cosmos-Predict2.5-14B/base/post-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt"
16
+
17
+ config.log_config.group = "cosmos_predict2_14b_sft"
18
+
19
+ return config
FastGen/fastgen/configs/experiments/CosmosPredict2/config_sft_v2w.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """Configs for SFT on Cosmos-Predict2.5-2B video2world model."""
5
+
6
+ import fastgen.configs.experiments.CosmosPredict2.config_sft as config_sft_base
7
+
8
+
9
def create_config():
    """Build the SFT config for Cosmos-Predict2.5-2B in video2world mode."""
    config = config_sft_base.create_config()
    net = config.model.net

    # Switch the network into video2world mode, conditioned on a single frame.
    net.is_video2world = True
    net.num_conditioning_frames = 1

    config.log_config.group = "cosmos_predict2_sft_v2w"
    return config
FastGen/fastgen/configs/experiments/DiT/config_mf_b.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+
6
+ import fastgen.configs.methods.config_mean_flow as config_mf_default
7
+ from fastgen.configs.data import ImageNet256_Loader_Config
8
+ from fastgen.configs.net import DiT_IN256_B_Config
9
+ from fastgen.configs.callbacks import EMA_CONST_CALLBACKS
10
+
11
+ """ Configs for the MeanFlow model, on DiT-XL and ImageNet-256 dataset. """
12
+
13
+
14
+ def create_config():
15
+ config = config_mf_default.create_config()
16
+
17
+ # model
18
+ config.model.input_shape = [4, 32, 32]
19
+ config.model.precision_amp = "bfloat16"
20
+ config.model.precision_amp_jvp = "float32"
21
+ config.model.cond_dropout_prob = 0.1
22
+ config.model.guidance_mixture_ratio = 0.5
23
+
24
+ config.model.sample_t_cfg.time_dist_type = "logitnormal"
25
+ config.model.sample_t_cfg.train_p_mean = -0.4
26
+ config.model.sample_t_cfg.train_p_std = 1.0
27
+ config.model.sample_t_cfg.min_t = 0.0
28
+ config.model.sample_t_cfg.max_t = 0.999
29
+ config.model.sample_t_cfg.r_sample_ratio = 0.25
30
+
31
+ config.model.loss_config.norm_method = "poly_1.0"
32
+ config.model.loss_config.norm_const = 1.0
33
+ config.model.loss_config.tangent_warmup_steps = 0
34
+ config.model.loss_config.loss_type = "l2"
35
+
36
+ config.model.net = DiT_IN256_B_Config
37
+ # remove the additional 1000 factor in JVP
38
+ config.model.net.scale_t = False
39
+ config.model.net.r_timestep = True
40
+ config.model.net.time_cond_type = "diff"
41
+
42
+ config.model.net_optimizer.optim_type = "adam"
43
+ config.model.net_optimizer.lr = 1e-4
44
+ config.model.net_optimizer.betas = (0.9, 0.95)
45
+ config.model.net_optimizer.weight_decay = 0.0
46
+
47
+ # ema
48
+ config.model.use_ema = ["ema_9999", "ema_99995", "ema_9996"]
49
+ config.trainer.callbacks = DictConfig(
50
+ {k: v for k, v in config.trainer.callbacks.items() if not k.startswith("ema")}
51
+ )
52
+ config.trainer.callbacks.update(EMA_CONST_CALLBACKS)
53
+
54
+ # Recommended setting for 2-step:
55
+ # config.model.sample_t_cfg.t_list = [0.999, 0.5, 0.0]
56
+ # config.model.student_sample_steps = 2
57
+
58
+ # dataloader
59
+ config.dataloader_train = ImageNet256_Loader_Config
60
+ config.dataloader_train.batch_size = 32
61
+
62
+ # trainer
63
+ config.trainer.batch_size_global = 1024
64
+ config.trainer.max_iter = 1200000
65
+ config.trainer.save_ckpt_iter = 50000
66
+ config.trainer.logging_iter = 10000
67
+
68
+ config.log_config.group = "imagenet256"
69
+
70
+ return config
FastGen/fastgen/configs/experiments/DiT/config_mf_xl.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from fastgen.configs.net import DiT_IN256_XL_Config
5
+ import fastgen.configs.experiments.DiT.config_mf_b as config_mf_default
6
+
7
+
8
+ """ Configs for the MeanFlow model, on DiT-XL and ImageNet-256 dataset. """
9
+
10
+
11
def create_config():
    """Build the MeanFlow training config for DiT-XL on ImageNet-256."""
    config = config_mf_default.create_config()
    model = config.model

    model.guidance_t_end = 0.75
    model.guidance_scale = 0.2
    model.guidance_mixture_ratio = 0.92

    model.net = DiT_IN256_XL_Config
    # remove the additional 1000 factor in JVP
    model.net.scale_t = False
    model.net.r_timestep = True
    model.net.time_cond_type = "diff"

    config.dataloader_train.batch_size = 8

    return config
FastGen/fastgen/configs/experiments/DiT/config_sft_dit_xl.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+
6
+ import fastgen.configs.methods.config_sft as config_sft_default
7
+ from fastgen.configs.data import ImageNet256_Loader_Config
8
+ from fastgen.configs.net import DiT_IN256_XL_Config, CKPT_ROOT_DIR
9
+ from fastgen.configs.callbacks import EMA_CONST_CALLBACKS
10
+
11
+ """Configs for SFT (Supervised Fine-Tuning) on DiT-XL and ImageNet-256 dataset."""
12
+
13
+
14
def create_config():
    """Build the SFT config for DiT-XL/2 on ImageNet-256."""
    config = config_sft_default.create_config()
    model = config.model

    # DiT latent shape [C, H, W] = [4, 32, 32] for 256x256 images (256/8 = 32).
    model.input_shape = [4, 32, 32]
    model.precision_amp = "bfloat16"
    model.cond_dropout_prob = 0.1  # 10% label dropout for CFG training

    # Timestep sampling.
    model.sample_t_cfg.time_dist_type = "logitnormal"
    model.sample_t_cfg.train_p_mean = -0.4
    model.sample_t_cfg.train_p_std = 1.0
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    # DiT-XL/2 ImageNet-256 checkpoint from https://github.com/facebookresearch/DiT
    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/imagenet-256/DiT-XL-2-256x256.pt"

    # The Facebook DiT checkpoint was trained with DDPM (epsilon prediction,
    # learned sigma) on the linear beta schedule, not with flow matching.
    model.net = DiT_IN256_XL_Config
    model.net.learn_sigma = True
    model.net.net_pred_type = "eps"
    model.net.schedule_type = "sd"  # same linear beta schedule as DiT (0.0001 to 0.02)

    # Fine-tuning optimizer (10x lower than the 1e-4 from-scratch LR).
    model.net_optimizer.optim_type = "adamw"
    model.net_optimizer.lr = 1e-5
    model.net_optimizer.betas = (0.9, 0.95)
    model.net_optimizer.weight_decay = 0.0

    # EMA: drop default ema callbacks and install the constant-rate ones.
    model.use_ema = ["ema_9999", "ema_99995"]
    kept = {name: cb for name, cb in config.trainer.callbacks.items() if not name.startswith("ema")}
    config.trainer.callbacks = DictConfig(kept)
    config.trainer.callbacks.update(EMA_CONST_CALLBACKS)

    # Dataloader
    config.dataloader_train = ImageNet256_Loader_Config

    # Sampling steps for visualization (DiT typically uses 50-250).
    model.student_sample_steps = 50

    # Trainer
    config.trainer.batch_size_global = 256
    config.trainer.max_iter = 400000
    config.trainer.save_ckpt_iter = 10000
    config.trainer.logging_iter = 1000

    config.log_config.group = "dit_xl_imagenet256_sft"

    return config
FastGen/fastgen/configs/experiments/DiT/config_sft_sit_xl.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+
6
+ import fastgen.configs.methods.config_sft as config_sft_default
7
+ from fastgen.configs.data import ImageNet256_Loader_Config
8
+ from fastgen.configs.net import DiT_IN256_XL_Config, CKPT_ROOT_DIR
9
+ from fastgen.configs.callbacks import EMA_CONST_CALLBACKS
10
+
11
+ """Configs for SFT (Supervised Fine-Tuning) on SiT-XL and ImageNet-256 dataset.
12
+
13
+ SiT (Scalable Interpolant Transformers) uses the same architecture as DiT but
14
+ is trained with flow matching (rectified flow) instead of DDPM.
15
+
16
+ Reference: https://github.com/willisma/SiT
17
+ """
18
+
19
+
20
def create_config():
    """Build the SFT config for SiT-XL/2 (flow matching) on ImageNet-256."""
    config = config_sft_default.create_config()
    model = config.model

    # SiT latent shape [C, H, W] = [4, 32, 32] for 256x256 images (256/8 = 32).
    model.input_shape = [4, 32, 32]
    model.precision_amp = "bfloat16"
    model.cond_dropout_prob = 0.1  # 10% label dropout for CFG training

    # SiT samples time uniformly during training.
    model.sample_t_cfg.time_dist_type = "uniform"
    model.sample_t_cfg.min_t = 0.001
    model.sample_t_cfg.max_t = 0.999

    # SiT-XL/2 ImageNet-256 checkpoint from https://github.com/willisma/SiT
    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/imagenet-256/SiT-XL-2-256x256.pt"

    # SiT reuses the DiT architecture but was trained with rectified flow.
    model.net = DiT_IN256_XL_Config
    model.net.learn_sigma = True  # checkpoint outputs 8 channels
    model.net.net_pred_type = "flow"  # flow/velocity prediction
    model.net.schedule_type = "rf"
    model.net.use_sit_convention = True  # SiT convention: t -> 1-t, v -> -v
    model.net.scale_t = False  # SiT uses continuous time t in [0, 1]

    # Fine-tuning optimizer (10x lower than the 1e-4 from-scratch LR).
    model.net_optimizer.optim_type = "adamw"
    model.net_optimizer.lr = 1e-5
    model.net_optimizer.betas = (0.9, 0.95)
    model.net_optimizer.weight_decay = 0.0

    # EMA: drop default ema callbacks and install the constant-rate ones.
    model.use_ema = ["ema_9999", "ema_99995"]
    kept = {name: cb for name, cb in config.trainer.callbacks.items() if not name.startswith("ema")}
    config.trainer.callbacks = DictConfig(kept)
    config.trainer.callbacks.update(EMA_CONST_CALLBACKS)

    # Dataloader
    config.dataloader_train = ImageNet256_Loader_Config

    # Sampling config for visualization.
    model.student_sample_steps = 50
    model.guidance_scale = 4.0

    # Trainer
    config.trainer.batch_size_global = 256
    config.trainer.max_iter = 400000
    config.trainer.save_ckpt_iter = 10000

    config.log_config.group = "sit_xl_imagenet256_sft"

    return config
FastGen/fastgen/configs/experiments/EDM/config_cm_cifar10.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+ from fastgen.configs.methods.config_cm import create_config as cm_create_config
6
+ from fastgen.configs.callbacks import EMA_CONST_CALLBACKS
7
+ from fastgen.configs.net import CKPT_ROOT_DIR
8
+
9
+
10
def create_config():
    """Build the consistency-model config for EDM on unconditional CIFAR-10."""
    config = cm_create_config()

    # Recommended setting for 2-step:
    # config.model.sample_t_cfg.t_list = [80.0, 0.821, 0.0]
    # config.model.student_sample_steps = 2

    config.model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cifar10/edm-cifar10-32x32-uncond-vp.pth"

    # EMA: drop default ema callbacks and install the constant-rate ones.
    config.model.use_ema = ["ema_9999", "ema_99995", "ema_9996"]
    kept = {name: cb for name, cb in config.trainer.callbacks.items() if not name.startswith("ema")}
    config.trainer.callbacks = DictConfig(kept)
    config.trainer.callbacks.update(EMA_CONST_CALLBACKS)

    config.trainer.max_iter = 350000
    config.trainer.batch_size_global = 512

    return config
FastGen/fastgen/configs/experiments/EDM/config_cm_cifar10_fast.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from fastgen.configs.methods.config_cm import create_config as cm_create_config
5
+ from fastgen.configs.net import CKPT_ROOT_DIR
6
+
7
+
8
def create_config():
    """Build a fast-iteration CM config for EDM on unconditional CIFAR-10."""
    config = cm_create_config()
    trainer = config.trainer

    # Recommended setting for 2-step:
    # config.model.sample_t_cfg.t_list = [80.0, 0.821, 0.0]
    # config.model.student_sample_steps = 2

    config.model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cifar10/edm-cifar10-32x32-uncond-vp.pth"

    trainer.callbacks.ct_schedule.kimg_per_stage = 512000
    config.dataloader_train.batch_size = 128

    # Short run with frequent checkpoints / logging.
    trainer.max_iter = 8000
    trainer.callbacks.ct_schedule.q = 256.0
    trainer.callbacks.ema.beta = 0.9993
    trainer.save_ckpt_iter = 500
    trainer.logging_iter = 100
    return config
FastGen/fastgen/configs/experiments/EDM/config_cm_in64.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+
6
+ import fastgen.configs.methods.config_cm as config_cm_default
7
+ from fastgen.configs.data import ImageNet64_Loader_Config
8
+ from fastgen.configs.net import EDM_ImageNet64_Config, CKPT_ROOT_DIR
9
+ from fastgen.utils import LazyCall as L
10
+ from fastgen.utils.lr_scheduler import LambdaInverseSquareRootScheduler
11
+ from fastgen.configs.callbacks import EMA_POWER_CALLBACKS
12
+
13
+ """ Configs for the CM model, on EDM2 and ImageNet-64 dataset. """
14
+
15
+
16
def create_config():
    """Build the CM config for EDM on ImageNet-64."""
    config = config_cm_default.create_config()
    model, trainer = config.model, config.trainer

    # trainer / CT schedule
    # recommended setting for ImageNet-64 is max_iter * batch_size // (4 * 1000)
    trainer.callbacks.ct_schedule.kimg_per_stage = 3200
    trainer.callbacks.ct_schedule.q = 4
    trainer.callbacks.ct_schedule.ratio_limit = 0.9961

    # EMA: drop default ema callbacks and install the power-function ones.
    model.use_ema = ["ema_1", "ema_5", "ema_10"]
    kept = {name: cb for name, cb in trainer.callbacks.items() if not name.startswith("ema")}
    trainer.callbacks = DictConfig(kept)
    trainer.callbacks.update(EMA_POWER_CALLBACKS)

    # model: fp16 AMP with a dynamic grad scaler.
    model.precision_amp = "float16"
    model.grad_scaler_enabled = True
    model.grad_scaler_init_scale = 16
    model.grad_scaler_growth_interval = 20000
    model.pretrained_model_path = f"{CKPT_ROOT_DIR}/imagenet-64/edm-imagenet-64x64-cond-adm.pth"
    model.input_shape = [3, 64, 64]
    model.sample_t_cfg.train_p_mean = -0.8
    model.sample_t_cfg.train_p_std = 1.6
    model.loss_config.huber_const = 0.06
    model.loss_config.weighting_ct_loss = "c_out_sq"

    # Recommended setting for 2-step:
    # config.model.sample_t_cfg.t_list = [80.0, 1.526, 0.0]
    # config.model.student_sample_steps = 2

    model.net = EDM_ImageNet64_Config
    model.net.dropout = 0.2
    model.net_optimizer.optim_type = "adam"
    model.net_optimizer.lr = 1e-3
    model.net_optimizer.betas = (0.9, 0.99)
    model.net_optimizer.weight_decay = 0.0
    model.net_scheduler = L(LambdaInverseSquareRootScheduler)(
        warm_up_steps=0,
        decay_steps=2000,
    )
    # During inference, sigma_shift can improve 2-step results:
    # config.model.net.sigma_shift = 0.003

    # dataloader
    config.dataloader_train = ImageNet64_Loader_Config

    config.log_config.group = "edm_imagenet64_cm"
    return config
FastGen/fastgen/configs/experiments/EDM/config_dmd2_cifar10.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from omegaconf import DictConfig
5
+ from fastgen.configs.methods.config_dmd2 import create_config as dmd2_create_config
6
+ from fastgen.configs.callbacks import EMA_CONST_CALLBACKS
7
+ from fastgen.configs.net import CKPT_ROOT_DIR
8
+
9
+
10
def create_config():
    """Build the DMD2 config for EDM on class-conditional CIFAR-10."""
    config = dmd2_create_config()

    config.model.pretrained_model_path = f"{CKPT_ROOT_DIR}/cifar10/edm-cifar10-32x32-cond-vp.pth"

    # EMA: drop default ema callbacks and install the constant-rate ones.
    config.model.use_ema = ["ema_9999", "ema_99995", "ema_9996"]
    kept = {name: cb for name, cb in config.trainer.callbacks.items() if not name.startswith("ema")}
    config.trainer.callbacks = DictConfig(kept)
    config.trainer.callbacks.update(EMA_CONST_CALLBACKS)

    config.trainer.max_iter = 100000
    config.trainer.batch_size_global = 2048

    return config