ArthurY commited on
Commit
c3d0544
·
1 Parent(s): bda03b1

update source

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +70 -1
  2. Dockerfile +3 -0
  3. physics_mcp/mcp_output/requirements.txt +18 -11
  4. physics_mcp/source/.dockerignore +8 -0
  5. physics_mcp/source/.gitattributes +0 -0
  6. physics_mcp/source/.gitignore +176 -0
  7. physics_mcp/source/CHANGELOG.md +556 -0
  8. physics_mcp/source/CITATION.cff +7 -0
  9. physics_mcp/source/CONTRIBUTING.md +251 -0
  10. physics_mcp/source/FAQ.md +60 -0
  11. physics_mcp/source/LICENSE.txt +201 -0
  12. physics_mcp/source/README.md +472 -0
  13. physics_mcp/source/SECURITY.md +34 -0
  14. physics_mcp/source/__init__.py +4 -0
  15. physics_mcp/source/greptile.json +59 -0
  16. physics_mcp/source/physicsnemo/__init__.py +22 -0
  17. physics_mcp/source/physicsnemo/active_learning/README.md +66 -0
  18. physics_mcp/source/physicsnemo/active_learning/__init__.py +35 -0
  19. physics_mcp/source/physicsnemo/active_learning/_registry.py +332 -0
  20. physics_mcp/source/physicsnemo/active_learning/config.py +808 -0
  21. physics_mcp/source/physicsnemo/active_learning/driver.py +1449 -0
  22. physics_mcp/source/physicsnemo/active_learning/logger.py +330 -0
  23. physics_mcp/source/physicsnemo/active_learning/loop.py +534 -0
  24. physics_mcp/source/physicsnemo/active_learning/protocols.py +1394 -0
  25. physics_mcp/source/physicsnemo/constants.py +48 -0
  26. physics_mcp/source/physicsnemo/datapipes/__init__.py +15 -0
  27. physics_mcp/source/physicsnemo/datapipes/benchmarks/__init__.py +15 -0
  28. physics_mcp/source/physicsnemo/datapipes/benchmarks/darcy.py +322 -0
  29. physics_mcp/source/physicsnemo/datapipes/benchmarks/kelvin_helmholtz.py +436 -0
  30. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/__init__.py +15 -0
  31. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/finite_difference.py +139 -0
  32. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/finite_volume.py +759 -0
  33. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/indexing.py +182 -0
  34. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/initialization.py +77 -0
  35. physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/utils.py +141 -0
  36. physics_mcp/source/physicsnemo/datapipes/cae/__init__.py +18 -0
  37. physics_mcp/source/physicsnemo/datapipes/cae/cae_dataset.py +1275 -0
  38. physics_mcp/source/physicsnemo/datapipes/cae/domino_datapipe.py +1334 -0
  39. physics_mcp/source/physicsnemo/datapipes/cae/mesh_datapipe.py +490 -0
  40. physics_mcp/source/physicsnemo/datapipes/cae/readers.py +191 -0
  41. physics_mcp/source/physicsnemo/datapipes/climate/__init__.py +19 -0
  42. physics_mcp/source/physicsnemo/datapipes/climate/climate.py +813 -0
  43. physics_mcp/source/physicsnemo/datapipes/climate/era5_hdf5.py +622 -0
  44. physics_mcp/source/physicsnemo/datapipes/climate/era5_netcdf.py +15 -0
  45. physics_mcp/source/physicsnemo/datapipes/climate/synthetic.py +182 -0
  46. physics_mcp/source/physicsnemo/datapipes/climate/utils/__init__.py +15 -0
  47. physics_mcp/source/physicsnemo/datapipes/climate/utils/invariant.py +139 -0
  48. physics_mcp/source/physicsnemo/datapipes/climate/utils/zenith_angle.py +208 -0
  49. physics_mcp/source/physicsnemo/datapipes/datapipe.py +60 -0
  50. physics_mcp/source/physicsnemo/datapipes/gnn/__init__.py +15 -0
.gitignore CHANGED
@@ -1 +1,70 @@
1
- *.DS_Store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.DS_Store
2
+
3
+ # ===== 源代码中不必要的目录(来自NVIDIA原项目) =====
4
+ # 文档 (102MB)
5
+ physics_mcp/source/docs/
6
+
7
+ # 测试 (28MB)
8
+ physics_mcp/source/test/
9
+
10
+ # 示例 (17MB)
11
+ physics_mcp/source/examples/
12
+
13
+ # ===== Git和CI/CD配置 =====
14
+ physics_mcp/source/.github/
15
+ physics_mcp/source/.gitlab/
16
+ physics_mcp/source/.gitlab-ci.yml
17
+ physics_mcp/source/.pre-commit-config.yaml
18
+ physics_mcp/source/.markdownlint.yaml
19
+
20
+ # ===== 项目配置文件 =====
21
+ physics_mcp/source/Dockerfile
22
+ physics_mcp/source/Makefile
23
+ physics_mcp/source/.gitmodules
24
+
25
+ # ===== Python缓存 =====
26
+ **/__pycache__/
27
+ **/*.py[cod]
28
+ **/*$py.class
29
+ *.so
30
+ .Python
31
+ build/
32
+ develop-eggs/
33
+ dist/
34
+ downloads/
35
+ eggs/
36
+ .eggs/
37
+ lib/
38
+ lib64/
39
+ parts/
40
+ sdist/
41
+ var/
42
+ wheels/
43
+ *.egg-info/
44
+ .installed.cfg
45
+ *.egg
46
+
47
+ # ===== 虚拟环境 =====
48
+ venv/
49
+ env/
50
+ ENV/
51
+ .venv
52
+
53
+ # ===== IDE配置 =====
54
+ .vscode/
55
+ .idea/
56
+ *.swp
57
+ *.swo
58
+ *~
59
+
60
+ # ===== Pytest和覆盖率 =====
61
+ .pytest_cache/
62
+ .coverage
63
+ htmlcov/
64
+
65
+ # ===== 日志和临时文件 =====
66
+ *.log
67
+ *.tmp
68
+ *.tmp.txt
69
+ physics_mcp/mcp_output/mcp_logs/
70
+ physics_mcp/mcp_output/output/
Dockerfile CHANGED
@@ -11,6 +11,9 @@ RUN apt-get update && apt-get install -y \
11
  wget \
12
  && rm -rf /var/lib/apt/lists/*
13
 
 
 
 
14
  # Copy physics_mcp folder
15
  COPY physics_mcp /app/physics_mcp
16
 
 
11
  wget \
12
  && rm -rf /var/lib/apt/lists/*
13
 
14
+ # Copy source directory (original NVIDIA physicsnemo code) - REQUIRED
15
+ COPY physics_mcp/source /app/physics_mcp/source
16
+
17
  # Copy physics_mcp folder
18
  COPY physics_mcp /app/physics_mcp
19
 
physics_mcp/mcp_output/requirements.txt CHANGED
@@ -1,19 +1,26 @@
1
  fastmcp>=0.1.0
2
  pydantic>=2.0.0
3
- torch
4
- numpy
5
- scipy
6
- onnx
7
  tritonclient
8
  matplotlib
9
  pandas
10
- pyyaml
11
- cuml
 
 
 
 
12
 
13
- # Optional Dependencies
14
- # wandb
15
- # mlflow
 
16
  # dgl
17
  # pyg
18
- # vtk
19
- # netCDF4
 
 
 
1
  fastmcp>=0.1.0
2
  pydantic>=2.0.0
3
+ torch>=2.4.0
4
+ numpy>=1.22.4
5
+ scipy>=1.9.0
6
+ onnx>=1.14.0
7
  tritonclient
8
  matplotlib
9
  pandas
10
+ pyyaml>=6.0
11
+ tqdm>=4.60.0
12
+ xarray>=2023.1.0
13
+ zarr>=2.14.2
14
+ s3fs>=2023.5.0
15
+ timm>=1.0.0
16
 
17
+ # Optional Dependencies (can be uncommented as needed)
18
+ # cuml>=24.0.0 (requires RAPIDS conda channel - use scipy fallback instead)
19
+ # wandb>=0.13.7
20
+ # mlflow>=2.1.1
21
  # dgl
22
  # pyg
23
+ # vtk>=9.2.6
24
+ # netCDF4>=1.6.3
25
+ # h5py>=3.7.0
26
+ # nvidia-dali-cuda120>=1.35.0
physics_mcp/source/.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .github
3
+ .gitlab
4
+ .coverage*
5
+ .*cache
6
+ examples
7
+ docs
8
+ test
physics_mcp/source/.gitattributes ADDED
File without changes
physics_mcp/source/.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+ docs/examples/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106
+ __pypackages__/
107
+
108
+ # Celery stuff
109
+ celerybeat-schedule
110
+ celerybeat.pid
111
+
112
+ # SageMath parsed files
113
+ *.sage.py
114
+
115
+ # Environments
116
+ .env
117
+ .venv
118
+ env/
119
+ venv/
120
+ ENV/
121
+ env.bak/
122
+ venv.bak/
123
+
124
+ # Spyder project settings
125
+ .spyderproject
126
+ .spyproject
127
+
128
+ # Rope project settings
129
+ .ropeproject
130
+
131
+ # mkdocs documentation
132
+ /site
133
+
134
+ # mypy
135
+ .mypy_cache/
136
+ .dmypy.json
137
+ dmypy.json
138
+
139
+ # Pyre type checker
140
+ .pyre/
141
+
142
+ # pytype static type analyzer
143
+ .pytype/
144
+
145
+ # Cython debug symbols
146
+ cython_debug/
147
+
148
+ # PyCharm
149
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
150
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
151
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
152
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
153
+ .idea/
154
+
155
+ # VsCode
156
+ .vscode/
157
+ .cursor/
158
+
159
+ # VIM
160
+ *.swp
161
+ *~
162
+
163
+ # Additional stuff
164
+ nsight-systems*
165
+ build/
166
+ mlruns/
167
+ checkpoints/
168
+
169
+ # Hydra
170
+ outputs/
171
+ multirun/
172
+ .hydra/
173
+
174
+ # SLURM
175
+ slurm-*.out
176
+ sbatch_logs/
physics_mcp/source/CHANGELOG.md ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- markdownlint-disable MD024 -->
2
+ # Changelog
3
+
4
+ All notable changes to this project will be documented in this file.
5
+
6
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
7
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
8
+
9
+ ## [1.3.0a0] - 2025-XX-YY
10
+
11
+ ### Added
12
+
13
+ - Added mixture_of_experts for weather example in physicsnemo.examples.weather.
14
+ **⚠️Warning:** - It uses experimental DiT model subject to future API changes.
15
+ Added some modifications to DiT architecture in physicsnemo.experimental.models.dit.
16
+ Added learnable option to PositionalEmbedding in physicsnemo.models.diffusion.layers.
17
+ - Added lead-time aware training support to the StormCast example.
18
+ - Add a device aware kNN method to physicsnemo.utils.neighbors. Works with CPU or GPU
19
+ by dispatching to the proper optimized library, and torch.compile compatible.
20
+ - Added additional testing of the DoMINO datapipe.
21
+ - Examples: added a new example for full-waveform inversion using diffusion
22
+ models. Accessible in `examples/geophysics/diffusion_fwi`.
23
+ - Domain Parallelism: Domain Parallelism is now available for kNN, radius_search,
24
+ and torch.nn.functional.pad.
25
+ - Unified recipe for crash modeling, supporting Transolver and MeshGraphNet,
26
+ and three transient schemes.
27
+ - Added a check to `stochastic_sampler` that helps handle the `EDMPrecond` model,
28
+ which has a specific `.forward()` signature
29
+ - Added abstract interfaces for constructing active learning workflows, contained
30
+ under the `physicsnemo.active_learning` namespace. A preliminary example of how
31
+ to compose and define an active learning workflow is provided in `examples/active_learning`.
32
+ The `moons` example provides a minimal (pedagogical) composition that is meant to
33
+ illustrate how to define the necessary parts of the workflow.
34
+
35
+ ### Changed
36
+
37
+ - Migrated Stokes MGN example to PyTorch Geometric.
38
+ - Migrated Lennard Jones example to PyTorch Geometric.
39
+ - Migrated physicsnemo.utils.sdf.signed_distance_field to a static return,
40
+ torch-only interface. It also now works on distributed meshes and input fields.
41
+ - Refactored DiTBlock to be more modular
42
+ - Added NATTEN 2D neighborhood attention backend for DiTBlock
43
+ - Migrated blood flow example to PyTorch Geometric.
44
+ - Refactored DoMINO model code and examples for performance optimizations and improved readability.
45
+ - Migrated HydroGraphNet example to PyTorch Geometric.
46
+ - Support for saving and loading nested `physicsnemo.Module`s. It is now
47
+ possible to create nested modules with `m = Module(submodule, ...)`, and save
48
+ and load them with `Module.save` and `Module.from_checkpoint`.
49
+ **⚠️Warning:** - The modules have to be `physicsnemo.Module`s, and not
50
+ `torch.nn.Module`s.
51
+ - Support passing custom tokenizer, detokenizer, and attention `Module`s in
52
+ experimental DiT architecture
53
+ - Improved Transolver training recipe's configuration for checkpointing and normalization.
54
+ - Bumped `multi-storage-client` version to 0.33.0 with rust client.
55
+
56
+ ### Deprecated
57
+
58
+ ### Removed
59
+
60
+ ### Fixed
61
+
62
+ - Set `skip_scale` to Python float in U-Net to ensure compilation works.
63
+ - Ensure stream dependencies are handled correctly in physicsnemo.utils.neighbors
64
+ - Fixed the issue with incorrect handling of files with consecutive runs of
65
+ `combine_stl_solids.py` in the X-MGN recipe.
66
+ - Fixed the `RuntimeError: Worker data receiving interrupted` error in the datacenter example.
67
+
68
+ ### Security
69
+
70
+ ### Dependencies
71
+
72
+ ## [1.2.0] - 2025-08-26
73
+
74
+ ### Added
75
+
76
+ - Diffusion Transformer (DiT) model. The DiT model can be accessed in
77
+ `physicsnemo.experimental.models.dit.DiT`. **⚠️Warning:** - Experimental feature
78
+ subject to future API changes.
79
+ - Improved documentation for diffusion models and diffusion utils.
80
+ - Safe API to override `__init__`'s arguments saved in checkpoint file with
81
+ `Module.from_checkpoint("chkpt.mdlus", override_args=set(...))`.
82
+ - PyTorch Geometric MeshGraphNet backend.
83
+ - Functionality in DoMINO to take arbitrary number of `scalar` or `vector`
84
+ global parameters and encode them using `class ParameterModel`
85
+ - TopoDiff model and example.
86
+ - Added ability for DoMINO model to return volume neighbors.
87
+ - Added functionality in DoMINO recipe to introduce physics residual losses.
88
+ - Diffusion models, metrics, and utils: implementation of Student-t
89
+ distribution for EDM-based diffusion models (t-EDM). This feature is adapted
90
+ from the paper [Heavy-Tailed Diffusion Models, Pandey et al.](https://arxiv.org/abs/2410.14171>).
91
+ This includes a new EDM preconditioner (`tEDMPrecondSuperRes`), a loss
92
+ function (`tEDMResidualLoss`), and a new option in corrdiff `diffusion_step`.
93
+ &#9888;&#65039; This is an experimental feature that can be accessed through the
94
+ `physicsnemo.experimental` module; it might also be subjected to API changes
95
+ without notice.
96
+ - Bumped Ruff version from 0.0.290 to 0.12.5. Replaced Black with `ruff-format`.
97
+ - Domino improvements with Unet attention module and user configs
98
+ - Hybrid MeshGraphNet for modeling structural deformation
99
+ - Enabled TransformerEngine backend in the `transolver` model.
100
+ - Inference code for x-meshgraphnet example for external aerodynamics.
101
+ - Added a new example for external_aerodynamics: training `transolver` on
102
+ irregular mesh data for DrivaerML surface data.
103
+ - Added a new example for external aerodynamics for finetuning pretrained models.
104
+
105
+ ### Changed
106
+
107
+ - Diffusion utils: `physicsnemo.utils.generative` renamed into `physicsnemo.utils.diffusion`
108
+ - Diffusion models: in CorrDiff model wrappers (`EDMPrecondSuperResolution` and
109
+ `UNet`), the arguments `profile_mode` and `amp_mode` cannot be overriden by
110
+ `from_checkpoint`. They are now properties that can be dynamically changed
111
+ *after* the model instantiation with, for example, `model.amp_mode = True`
112
+ and `model.profile_mode = False`.
113
+ - Updated healpix data module to use correct `DistributedSampler` target for
114
+ test data loader
115
+ - Existing DGL-based vortex shedding example has been renamed to `vortex_shedding_mgn_dgl`.
116
+ Added new `vortex_shedding_mgn` example that uses PyTorch Geometric instead.
117
+ - HEALPixLayer can now use earth2grid HEALPix padding ops, if desired
118
+ - Migrated Vortex Shedding Reduced Mesh example to PyTorch Geometric.
119
+ - CorrDiff example: fixed bugs when training regression `UNet`.
120
+ - Diffusion models: fixed bugs related to gradient checkpointing on non-square
121
+ images.
122
+ - Diffusion models: created a separate class `Attention` for clarity and
123
+ modularity. Updated `UNetBlock` accordingly to use the `Attention` class
124
+ instead of custom attention logic. This will update the model architecture
125
+ for `SongUNet`-based diffusion models. Changes are not BC-breaking and are
126
+ transparent to the user.
127
+ - &#9888;&#65039; **BC-breaking:** refactored the automatic mixed precision
128
+ (AMP) API in layers and models defined in `physicsnemo/models/diffusion/` for
129
+ improved usability. Note: it is now, not only possible, but *required* to
130
+ explicitly set `model.amp_mode = True` in order to use the model in a
131
+ `torch.autocast` clause. This applies to all `SongUNet`-based models.
132
+ - Diffusion models: fixed and improved API to enable fp16 forward pass in
133
+ `UNet` and `EDMPrecondSuperResolution` model wrappers; fp16 forward pass can
134
+ now be toggled/untoggled by setting `model.use_fp16 = True`.
135
+ - Diffusion models: improved API for Apex group norm. `SongUNet`-based models
136
+ will automatically perform conversion of the input tensors to
137
+ `torch.channels_last` memory format when `model.use_apex_gn` is `True`. New
138
+ warnings are raised when attempting to use Apex group norm on CPU.
139
+ - Diffusion utils: systematic compilation of patching operations in `stochastic_sampler`
140
+ for improved performance.
141
+ - CorrDiff example: added option for Student-t EDM (t-EDM) in `train.py` and
142
+ `generate.py`. When training a CorrDiff diffusion model, this feature can be
143
+ enabled with the hydra overrides `++training.hp.distribution=student_t` and
144
+ `++training.hp.nu_student_t=<nu_value>`. For generation, this feature can be
145
+ enabled with similar overrides: `++generation.distribution=student_t` and
146
+ `++generation.nu_student_t=<nu_value>`.
147
+ - CorrDiff example: the parameters `P_mean` and `P_std` (used to compute the
148
+ noise level `sigma`) are now configurable. They can be set with the hydra
149
+ overrides `++training.hp.P_mean=<P_mean_value>` and
150
+ `++training.hp.P_std=<P_std_value>` for training (and similar ones with
151
+ `training.hp` replaced by `generation` for generation).
152
+ - Diffusion utils: patch-based inference and lead time support with
153
+ deterministic sampler.
154
+ - Existing DGL-based XAeroNet example has been renamed to `xaeronet_dgl`.
155
+ Added new `xaeronet` example that uses PyTorch Geometric instead.
156
+ - Updated the deforming plate example to use the Hybrid MeshGraphNet model.
157
+ - &#9888;&#65039; **BC-breaking:** Refactored the `transolver` model to improve
158
+ readability and performance, and extend to more use cases.
159
+ - Diffusion models: improved lead time support for `SongUNetPosLtEmbd` and
160
+ `EDMLoss`. Lead-time embeddings can now be used with/without positional
161
+ embeddings.
162
+ - Diffusion models: consolidate `ApexGroupNorm` and `GroupNorm` in
163
+ `models/diffusion/layers.py` with a factory `get_group_norm` that can
164
+ be used to instantiate either one of them. `get_group_norm` is now the
165
+ recommended way to instantiate a GroupNorm layer in `SongUNet`-based and
166
+ other diffusion models.
167
+ - Physicsnemo models: improved checkpoint loading API in
168
+ `Module.from_checkpoint` that now exposes a `strict` parameter to raise error
169
+ on missing/unexpected keys, similar to that used in
170
+ `torch.nn.Module.load_state_dict`.
171
+ - Migrated Hybrid MGN and deforming plate example to PyTorch Geometric.
172
+
173
+ ### Fixed
174
+
175
+ - Bug fixes in DoMINO model in sphere sampling and tensor reshaping
176
+ - Bug fixes in DoMINO utils random sampling and test.py
177
+ - Optimized DoMINO config params based on DrivAer ML
178
+
179
+ ## [1.1.1] - 2025-06-16
180
+
181
+ ### Fixed
182
+
183
+ - Fixed an inadvertent change to the deterministic sampler 2nd order correction
184
+ - Bug Fix in Domino model ball query layer
185
+ - Fixed bug models/unet/unet.py: setting num_conv_layers=1 gives errors
186
+
187
+ ## [1.1.0] - 2025-06-05
188
+
189
+ ### Added
190
+
191
+ - Added ReGen score-based data assimilation example
192
+ - General purpose patching API for patch-based diffusion
193
+ - New positional embedding selection strategy for CorrDiff SongUNet models
194
+ - Added Multi-Storage Client to allow checkpointing to/from Object Storage
195
+ - Added a new aerodynamics example using DoMINO to compute design sensitivities
196
+ (e.g., drag adjoint) with respect to underlying input geometry.
197
+
198
+ ### Changed
199
+
200
+ - Simplified CorrDiff config files, updated default values
201
+ - Refactored CorrDiff losses and samplers to use the patching API
202
+ - Support for non-square images and patches in patch-based diffusion
203
+ - ERA5 download example updated to use current file format convention and
204
+ restricts global statistics computation to the training set
205
+ - Support for training custom StormCast models and various other improvements for StormCast
206
+ - Updated CorrDiff training code to support multiple patch iterations to amortize
207
+ regression cost and usage of `torch.compile`
208
+ - Refactored `physicsnemo/models/diffusion/layers.py` to optimize data type
209
+ casting workflow, avoiding unnecessary casting under autocast mode
210
+ - Refactored Conv2d to enable fusion of conv2d with bias addition
211
+ - Refactored GroupNorm, UNetBlock, SongUNet, SongUNetPosEmbd to support usage of
212
+ Apex GroupNorm, fusion of activation with GroupNorm, and AMP workflow.
213
+ - Updated SongUNetPosEmbd to avoid unnecessary HtoD Memcpy of `pos_embd`
214
+ - Updated `from_checkpoint` to accommodate conversion between Apex optimized ckp
215
+ and non-optimized ckp
216
+ - Refactored CorrDiff NVTX annotation workflow to be configurable
217
+ - Refactored `ResidualLoss` to support patch-accumulating training for
218
+ amortizing regression costs
219
+ - Explicit handling of Warp device for ball query and sdf
220
+ - Merged SongUNetPosLtEmb with SongUNetPosEmb, add support for batch>1
221
+ - Add lead time embedding support for `positional_embedding_selector`. Enable
222
+ arbitrary positioning of probabilistic variables
223
+ - Enable lead time aware regression without CE loss
224
+ - Bumped minimum PyTorch version from 2.0.0 to 2.4.0, to minimize
225
+ support surface for `physicsnemo.distributed` functionality.
226
+
227
+ ### Dependencies
228
+
229
+ - Made `nvidia.dali` an optional dependency
230
+
231
+ ## [1.0.1] - 2025-03-25
232
+
233
+ ### Added
234
+
235
+ - Added version checks to ensure compatibility with older PyTorch for distributed
236
+ utilities and ShardTensor
237
+
238
+ ### Fixed
239
+
240
+ - `EntryPoint` error that occurred during physicsnemo checkpoint loading
241
+
242
+ ## [1.0.0] - 2025-03-18
243
+
244
+ ### Added
245
+
246
+ - DoMINO model architecture, datapipe and training recipe
247
+ - Added matrix decomposition scheme to improve graph partitioning
248
+ - DrivAerML dataset support in FIGConvNet example.
249
+ - Retraining recipe for DoMINO from a pretrained model checkpoint
250
+ - Prototype support for domain parallelism of using ShardTensor (new).
251
+ - Enable DeviceMesh initialization via DistributedManager.
252
+ - Added Datacenter CFD use case.
253
+ - Add leave-in profiling utilities to physicsnemo, to easily enable torch/python/nsight
254
+ profiling in all aspects of the codebase.
255
+
256
+ ### Changed
257
+
258
+ - Refactored StormCast training example
259
+ - Enhancements and bug fixes to DoMINO model and training example
260
+ - Enhancement to parameterize DoMINO model with inlet velocity
261
+ - Moved non-dimensionalization out of domino datapipe to datapipe in domino example
262
+ - Updated utils in `physicsnemo.launch.logging` to avoid unnecessary `wandb` and `mlflow`
263
+ imports
264
+ - Moved to experiment-based Hydra config in Lagrangian-MGN example
265
+ - Make data caching optional in `MeshDatapipe`
266
+ - The use of older `importlib_metadata` library is removed
267
+
268
+ ### Deprecated
269
+
270
+ - ProcessGroupConfig is tagged for future deprecation in favor of DeviceMesh.
271
+
272
+ ### Fixed
273
+
274
+ - Update pytests to skip when the required dependencies are not present
275
+ - Bug in data processing script in domino training example
276
+ - Fixed NCCL_ASYNC_ERROR_HANDLING deprecation warning
277
+
278
+ ### Dependencies
279
+
280
+ - Remove the numpy dependency upper bound
281
+ - Moved pytz and nvtx to optional
282
+ - Update the base image for the Dockerfile
283
+ - Introduce Multi-Storage Client (MSC) as an optional dependency.
284
+ - Introduce `wrapt` as an optional dependency, needed when using
285
+ ShardTensor's automatic domain parallelism
286
+
287
+ ## [0.9.0] - 2024-12-04
288
+
289
+ ### Added
290
+
291
+ - Graph Transformer processor for GraphCast/GenCast.
292
+ - Utility to generate STL from Signed Distance Field.
293
+ - Metrics for CAE and CFD domain such as integrals, drag, and turbulence invariances and
294
+ spectrum.
295
+ - Added gradient clipping to StaticCapture utilities.
296
+ - Bistride Multiscale MeshGraphNet example.
297
+ - FIGConvUNet model and example.
298
+ - The Transolver model.
299
+ - The XAeroNet model.
300
+ - Incorporated CorrDiff-GEFS-HRRR model into CorrDiff, with lead-time aware SongUNet and
301
+ cross entropy loss.
302
+ - Option to offload checkpoints to further reduce memory usage
303
+ - Added StormCast model training and simple inference to examples
304
+ - Multi-scale geometry features for DoMINO model.
305
+
306
+ ### Changed
307
+
308
+ - Refactored CorrDiff training recipe for improved usability
309
+ - Fixed timezone calculation in datapipe cosine zenith utility.
310
+ - Refactored EDMPrecondSRV2 preconditioner and fixed the bug related to the metadata
311
+ - Extended the checkpointing utility to store metadata.
312
+ - Corrected missing export of logging function used by transolver model
313
+
314
+ ## [0.8.0] - 2024-09-24
315
+
316
+ ### Added
317
+
318
+ - Graph Transformer processor for GraphCast/GenCast.
319
+ - Utility to generate STL from Signed Distance Field.
320
+ - Metrics for CAE and CFD domain such as integrals, drag, and turbulence invariances and
321
+ spectrum.
322
+ - Added gradient clipping to StaticCapture utilities.
323
+ - Bistride Multiscale MeshGraphNet example.
324
+
325
+ ### Changed
326
+
327
+ - Refactored CorrDiff training recipe for improved usability
328
+ - Fixed timezone calculation in datapipe cosine zenith utility.
329
+
330
+ ## [0.7.0] - 2024-07-23
331
+
332
+ ### Added
333
+
334
+ - Code logging for CorrDiff via Wandb.
335
+ - Augmentation pipeline for CorrDiff.
336
+ - Regression output as additional conditioning for CorrDiff.
337
+ - Learnable positional embedding for CorrDiff.
338
+ - Support for patch-based CorrDiff training and generation (stochastic sampling only)
339
+ - Enable CorrDiff multi-gpu generation
340
+ - Diffusion model for fluid data super-resolution (CMU contribution).
341
+ - The Virtual Foundry GraphNet.
342
+ - A synthetic dataloader for global weather prediction models, demonstrated on GraphCast.
343
+ - Sorted Empirical CDF CRPS algorithm
344
+ - Support for history, cos zenith, and downscaling/upscaling in the ERA5 HDF5 dataloader.
345
+ - An example showing how to train a "tensor-parallel" version of GraphCast on a
346
+ Shallow-Water-Equation example.
347
+ - 3D UNet
348
+ - AeroGraphNet example of training of MeshGraphNet on Ahmed body and DrivAerNet datasets.
349
+ - Warp SDF routine
350
+ - DLWP HEALPix model
351
+ - Pangu Weather model
352
+ - Fengwu model
353
+ - SwinRNN model
354
+ - Modulated AFNO model
355
+
356
+ ### Changed
357
+
358
+ - Raise `PhysicsNeMoUndefinedGroupError` when querying undefined process groups
359
+ - Changed Indexing error in `examples/cfd/swe_nonlinear_pino` for `physicsnemo` loss function
360
+ - Safeguarding against uninitialized usage of `DistributedManager`
361
+
362
+ ### Removed
363
+
364
+ - Remove mlflow from deployment image
365
+
366
+ ### Fixed
367
+
368
+ - Fixed bug in the partitioning logic for distributing graph structures
369
+ intended for distributed message-passing.
370
+ - Fixed bugs for corrdiff diffusion training of `EDMv1` and `EDMv2`
371
+ - Fixed bug when trying to save DDP model trained through unified recipe
372
+
373
+ ### Dependencies
374
+
375
+ - Update DALI to CUDA 12 compatible version.
376
+ - Update minimum python version to 3.10
377
+
378
+ ## [0.6.0] - 2024-04-17
379
+
380
+ ### Added
381
+
382
+ - The citation file.
383
+ - Link to the CWA dataset.
384
+ - ClimateDatapipe: an improved datapipe for HDF5/NetCDF4 formatted climate data
385
+ - Performance optimizations to CorrDiff.
386
+ - Physics-Informed Nonlinear Shallow Water Equations example.
387
+ - Warp neighbor search routine with a minimal example.
388
+ - Strict option for loading PhysicsNeMo checkpoints.
389
+ - Regression only or diffusion only inference for CorrDiff.
390
+ - Support for organization level model files on NGC file system
391
+ - Physics-Informed Magnetohydrodynamics example.
392
+
393
+ ### Changed
394
+
395
+ - Updated Ahmed Body and Vortex Shedding examples to use Hydra config.
396
+ - Added more config options to FCN AFNO example.
397
+ - Moved positional embedding in CorrDiff from the dataloader to network architecture
398
+
399
+ ### Deprecated
400
+
401
+ - `physicsnemo.models.diffusion.preconditioning.EDMPrecondSR`. Use `EDMPrecondSRV2` instead.
402
+
403
+ ### Removed
404
+
405
+ - Pickle dependency for CorrDiff.
406
+
407
+ ### Fixed
408
+
409
+ - Consistent handling of single GPU runs in DistributedManager
410
+ - Output location of objects downloaded with NGC file system
411
+ - Bug in scaling the conditional input in CorrDiff deterministic sampler
412
+
413
+ ### Dependencies
414
+
415
+ - Updated DGL build in Dockerfile
416
+ - Updated default base image
417
+ - Moved Onnx from optional to required dependencies
418
+ - Optional Makani dependency required for SFNO model.
419
+
420
+ ## [0.5.0] - 2024-01-25
421
+
422
+ ### Added
423
+
424
+ - Distributed process group configuration mechanism.
425
+ - DistributedManager utility to instantiate process groups based on a process group config.
426
+ - Helper functions to facilitate distributed training with shared parameters.
427
+ - Brain anomaly detection example.
428
+ - Updated Frechet Inception Distance to use Wasserstein 2-norm with improved stability.
429
+ - Molecular Dynamics example.
430
+ - Improved usage of GraphPartition, added more flexible ways of defining a partitioned graph.
431
+ - Physics-Informed Stokes Flow example.
432
+ - Profiling markers, benchmarking and performance optimizations for CorrDiff inference.
433
+ - Unified weather model training example.
434
+
435
+ ### Changed
436
+
437
+ - MLFLow logging such that only proc 0 logs to MLFlow.
438
+ - FNO given separate methods for constructing lift and spectral encoder layers.
439
+
440
+ ### Removed
441
+
442
+ - The experimental SFNO
443
+
444
+ ### Dependencies
445
+
446
+ - Removed experimental SFNO dependencies
447
+ - Added CorrDiff dependencies (cftime, einops, pyspng, nvtx)
448
+ - Made tqdm a required dependency
449
+
450
+ ## [0.4.0] - 2023-11-20
451
+
452
+ ### Added
453
+
454
+ - Added Stokes flow dataset
455
+ - An experimental version of SFNO to be used in unified training recipe for
456
+ weather models
457
+ - Added distributed FFT utility.
458
+ - Added ruff as a linting tool.
459
+ - Ported utilities from PhysicsNeMo Launch to main package.
460
+ - EDM diffusion models and recipes for training and sampling.
461
+ - NGC model registry download integration into package/filesystem.
462
+ - Denoising diffusion tutorial.
463
+
464
+ ### Changed
465
+
466
+ - The AFNO input argument `img_size` to `inp_shape`
467
+ - Integrated the network architecture layers from PhysicsNeMo-Sym.
468
+ - Updated the SFNO model, and the training and inference recipes.
469
+
470
+ ### Fixed
471
+
472
+ - Fixed physicsnemo.Module `from_checkpoint` to work from custom model classes
473
+
474
+ ### Dependencies
475
+
476
+ - Updated the base container to PyTorch 23.10.
477
+ - Updated examples to use Pydantic v2.
478
+
479
+ ## [0.3.0] - 2023-09-21
480
+
481
+ ### Added
482
+
483
+ - Added ability to compute CRPS(..., dim: int = 0).
484
+ - Added EFI for arbitrary climatological CDF.
485
+ - Added Kernel CRPS implementation (kcrps)
486
+ - Added distributed utilities to create process groups and orthogonal process groups.
487
+ - Added distributed AFNO model implementation.
488
+ - Added distributed utilities for communication of buffers of varying size per rank.
489
+ - Added distributed utilities for message passing across multiple GPUs.
490
+ - Added instructions for docker build on ARM architecture.
491
+ - Added batching support and fix the input time step for the DLWP wrapper.
492
+
493
+ ### Changed
494
+
495
+ - Updating file system cache location to physicsnemo folder
496
+
497
+ ### Fixed
498
+
499
+ - Fixed physicsnemo uninstall in CI docker image
500
+
501
+ ### Security
502
+
503
+ - Handle the tar ball extracts in a safer way.
504
+
505
+ ### Dependencies
506
+
507
+ - Updated the base container to latest PyTorch 23.07.
508
+ - Update DGL version.
509
+ - Updated require installs for python wheel
510
+ - Added optional dependency list for python wheel
511
+
512
+ ## [0.2.1] - 2023-08-08
513
+
514
+ ### Fixed
515
+
516
+ - Added a workaround fix for the CUDA graphs error in multi-node runs
517
+
518
+ ### Security
519
+
520
+ - Update `certifi` package version
521
+
522
+ ## [0.2.0] - 2023-08-07
523
+
524
+ ### Added
525
+
526
+ - Added a CHANGELOG.md
527
+ - Added build support for internal DGL
528
+ - 4D Fourier Neural Operator model
529
+ - Ahmed body dataset
530
+ - Unified Climate Datapipe
531
+
532
+ ### Changed
533
+
534
+ - DGL install changed from pypi to source
535
+ - Updated SFNO to add support for super resolution, flexible checkpointing, etc.
536
+
537
+ ### Fixed
538
+
539
+ - Fixed issue with torch-harmonics version locking
540
+ - Fixed the PhysicsNeMo editable install
541
+ - Fixed AMP bug in static capture
542
+
543
+ ### Security
544
+
545
+ - Fixed security issues with subprocess and urllib in `filesystem.py`
546
+
547
+ ### Dependencies
548
+
549
+ - Updated the base container to latest PyTorch base container which is based on torch 2.0
550
+ - Container now supports CUDA 12, Python 3.10
551
+
552
+ ## [0.1.0] - 2023-05-08
553
+
554
+ ### Added
555
+
556
+ - Initial public release.
physics_mcp/source/CITATION.cff ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ title: "NVIDIA PhysicsNeMo: An open-source framework for physics-based deep learning in science and engineering"
4
+ date-released: "2023-02-24"
5
+ authors:
6
+ - name: "PhysicsNeMo Contributors"
7
+ repository-code: "https://github.com/NVIDIA/physicsnemo"
physics_mcp/source/CONTRIBUTING.md ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PhysicsNeMo Contribution Guide
2
+
3
+ ## Introduction
4
+
5
+ Welcome to Project PhysicsNeMo! We're excited you're here and want to contribute.
6
+ This documentation is intended for individuals and institutions interested in
7
+ contributing to PhysicsNeMo. PhysicsNeMo is an open-source project and, as such, its
8
+ success relies on its community of contributors willing to keep improving it.
9
+ Your contribution will be a valued addition to the code base; we simply ask
10
+ that you read this page and understand our contribution process, whether you
11
+ are a seasoned open-source contributor or whether you are a first-time
12
+ contributor.
13
+
14
+ ### Communicate with Us
15
+
16
+ We are happy to talk with you about your needs for PhysicsNeMo and your ideas for
17
+ contributing to the project. One way to do this is to create an issue discussing
18
+ your thoughts. It might be that a very similar feature is under development or
19
+ already exists, so an issue is a great starting point. If you are looking for an
20
+ issue to resolve that will help, refer to the
21
+ [issue](https://github.com/NVIDIA/physicsnemo/issues) section.
22
+ If you are considering collaborating with NVIDIA PhysicsNeMo team to enhance PhysicsNeMo,
23
+ fill this [proposal form](https://forms.gle/fYsbZEtgRWJUQ3oQ9) and
24
+ we will get back to you.
25
+
26
+ ## Contribute to PhysicsNeMo-Core
27
+
28
+ ### Pull Requests
29
+
30
+ Developer workflow for code contributions is as follows:
31
+
32
+ 1. Developers must first [fork](https://help.github.com/en/articles/fork-a-repo)
33
+ the [upstream](https://github.com/NVIDIA/physicsnemo) PhysicsNeMo repository.
34
+
35
+ 2. Git clone the forked repository and push changes to the personal fork.
36
+
37
+ 3. Once the code changes are staged on the fork and ready for review, a
38
+ [Pull Request](https://help.github.com/en/articles/about-pull-requests) (PR)
39
+ can be [requested](https://help.github.com/en/articles/creating-a-pull-request)
40
+ to merge the changes from a branch of the fork into a selected branch of upstream.
41
+
42
+ - Exercise caution when selecting the source and target branches for the PR.
43
+ - Ensure that you update the [`CHANGELOG.md`](CHANGELOG.md) to reflect your contributions.
44
+ - Creation of a PR kicks off CI and a code review process.
45
+ - At least one PhysicsNeMo engineer will be assigned for the review.
46
+
47
+ 4. The PR will be accepted and the corresponding issue closed after adequate review and
48
+ testing has been completed. Note that every PR should correspond to an open issue and
49
+ should be linked on Github.
50
+
51
+ ### Licensing Information
52
+
53
+ All source code files should start with this paragraph:
54
+
55
+ ```bash
56
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES.
57
+ # SPDX-FileCopyrightText: All rights reserved.
58
+ # SPDX-License-Identifier: Apache-2.0
59
+ #
60
+ # Licensed under the Apache License, Version 2.0 (the "License");
61
+ # you may not use this file except in compliance with the License.
62
+ # You may obtain a copy of the License at
63
+ #
64
+ # http://www.apache.org/licenses/LICENSE-2.0
65
+ #
66
+ # Unless required by applicable law or agreed to in writing, software
67
+ # distributed under the License is distributed on an "AS IS" BASIS,
68
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
69
+ # See the License for the specific language governing permissions and
70
+ # limitations under the License.
71
+ ```
72
+
73
+ ### Signing Your Work
74
+
75
+ - We require that all contributors "sign-off" on their commits. This certifies that the
76
+ contribution is your original work, or you have rights to submit it under the same
77
+ license, or a compatible license.
78
+
79
+ - Any contribution which contains commits that are not Signed-Off will not be accepted.
80
+
81
+ - To sign off on a commit you simply use the `--signoff` (or `-s`) option when
82
+ committing your changes:
83
+
84
+ ```bash
85
+ git commit -s -m "Add cool feature."
86
+ ```
87
+
88
+ This will append the following to your commit message:
89
+
90
+ ```text
91
+ Signed-off-by: Your Name <your@email.com>
92
+ ```
93
+
94
+ - Full text of the DCO:
95
+
96
+ ```text
97
+ Developer Certificate of Origin
98
+ Version 1.1
99
+
100
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
101
+ 1 Letterman Drive
102
+ Suite D4700
103
+ San Francisco, CA, 94129
104
+
105
+ Everyone is permitted to copy and distribute verbatim copies of this license
106
+ document, but changing it is not allowed.
107
+ ```
108
+
109
+ ```text
110
+ Developer's Certificate of Origin 1.1
111
+
112
+ By making a contribution to this project, I certify that:
113
+
114
+ (a) The contribution was created in whole or in part by me and I have the right to
115
+ submit it under the open source license indicated in the file; or
116
+
117
+ (b) The contribution is based upon previous work that, to the best of my knowledge,
118
+ is covered under an appropriate open source license and I have the right under that
119
+ license to submit that work with modifications, whether created in whole or in part
120
+ by me, under the same open source license (unless I am permitted to submit under a
121
+ different license), as indicated in the file; or
122
+
123
+ (c) The contribution was provided directly to me by some other person who certified
124
+ (a), (b) or (c) and I have not modified it.
125
+
126
+ (d) I understand and agree that this project and the contribution are public and
127
+ that a record of the contribution (including all personal information I submit with
128
+ it, including my sign-off) is maintained indefinitely and may be redistributed
129
+ consistent with this project or the open source license(s) involved.
130
+
131
+ ```
132
+
133
+ ### Pre-commit
134
+
135
+ For PhysicsNeMo development, [pre-commit](https://pre-commit.com/) is **required**.
136
+ This will not only help developers pass the CI pipeline, but also accelerate reviews.
137
+ Contributions that have not used pre-commit will *not be reviewed*.
138
+
139
+ `pre-commit` is installed as part of the `dev` optional dependencies defined in `pyproject.toml`.
140
+ To install `pre-commit` in an existing environment, follow the below steps inside the PhysicsNeMo
141
+ repository folder:
142
+
143
+ ```bash
144
+ pip install pre-commit
145
+ pre-commit install
146
+ ```
147
+
148
+ Once the above commands are executed, the pre-commit hooks will be activated and all
149
+ the commits will be checked for appropriate formatting.
150
+
151
+ ### Continuous Integration (CI)
152
+
153
+ To ensure quality of the code, your merge request (MR) will pass through several CI checks.
154
+ It is mandatory for your MRs to pass these pipelines to ensure a successful merge.
155
+ Please keep checking this document for the latest guidelines on pushing code. Currently,
156
+ The pipeline has following stages:
157
+
158
+ 1. `format`
159
+ *Pre-commit will check this for you!* Checks for formatting of your
160
+ Python code, using `ruff format` via [Ruff](https://docs.astral.sh/ruff/).
161
+ If your MR fails this test, run `ruff format <script-name>.py` on
162
+ problematic scripts and Ruff will take care of the rest.
163
+
164
+ 2. `interrogate`
165
+ *Pre-commit will check this for you!*
166
+ Checks if the code being pushed is well documented. The goal is to make the
167
+ documentation live inside code. Very few exceptions are made.
168
+ Elements that are fine to have no documentation include `init-module`, `init-method`,
169
+ `private` and `semiprivate` classes/functions and `dunder` methods. For definitions of
170
+ these, refer [interrogate](https://interrogate.readthedocs.io/en/latest/). Meaning for
171
+ some methods/functions is very explicit and exceptions for these are made. These
172
+ include `forward`, `reset_parameters`, `extra_repr`, `MetaData`. If your MR fails this
173
+ test, add the missing documentation. Take a look at the pipeline output for hints on
174
+ which functions/classes need documentation.
175
+ To test the documentation before making a commit, you can run the following during
176
+ your development
177
+
178
+ ```bash
179
+ interrogate \
180
+ --ignore-init-method \
181
+ --ignore-init-module \
182
+ --ignore-module \
183
+ --ignore-private \
184
+ --ignore-semiprivate \
185
+ --ignore-magic \
186
+ --fail-under 99 \
187
+ --exclude '[setup.py]' \
188
+ --ignore-regex forward \
189
+ --ignore-regex reset_parameters \
190
+ --ignore-regex extra_repr \
191
+ --ignore-regex MetaData \
192
+ -vv \
193
+ --color \
194
+ ./physicsnemo/
195
+ ```
196
+
197
+ 3. `lint`
198
+ *Pre-commit will check this for you!*
199
+ Linters will perform static analysis to check the style, complexity, errors
200
+ and more. For markdown files `markdownlint` is used, it's suggested to use
201
+ the vscode, neovim or sublime
202
+ [extensions](https://github.com/DavidAnson/markdownlint#related).
203
+ PhysicsNeMo uses `ruff check` via [Ruff](https://docs.astral.sh/ruff/) for
204
+ linting of various types. Currently we use flake8/pycodestyle (`E`),
205
+ Pyflakes (`F`), flake8-bandit (`S`), isort (`I`), and performance (`PERF`)
206
+ rules. Many rule violations will be automatically fixed by Ruff; others may
207
+ require manual changes.
208
+
209
+ 4. `license`
210
+ *Pre-commit will check this for you!*
211
+ Checks for correct license headers of all files.
212
+ To run this locally use `make license`.
213
+ See the Licensing Information section above for details about the license header required.
214
+
215
+ 5. `pytest`
216
+ Checks if the test scripts from the `test` folder run and produce desired outputs. It
217
+ is imperative that your changes don't break the existing tests. If your MR fails this
218
+ test, you will have to review your changes and fix the issues.
219
+ To run pytest locally you can simply run `pytest` inside the `test` folder.
220
+
221
+ While writing these tests, we encourage you to make use of the [`@import_or_fail`](https://github.com/NVIDIA/physicsnemo/blob/main/test/pytest_utils.py#L25)
222
+ decorator to appropriately skip your tests for developers and users not having your
223
+ test specific dependencies. This mechanism helps us provide a better developer and
224
+ user experience when working with the unit tests.
225
+
226
+ Some of the tests require test data to be run; otherwise, they will be skipped.
227
+ To get the data (available to NVIDIANs only), set the `TEST_DATA_DIR` environment variable
228
+ to a desired value and run `make get-data`. After that, pytest will use the same
229
+ variable to find the test data. Alternatively, you can pass it explicitly using
230
+ `pytest --nfs-data-dir=<path to test data>`.
231
+
232
+ 6. `doctest`
233
+ Checks if the examples in the docstrings run and produce desired outputs.
234
+ It is highly recommended that you provide simple examples of your functions/classes
235
+ in the code's docstring itself.
236
+ Keep these examples simple and also add the expected outputs.
237
+ Refer [doctest](https://docs.python.org/3/library/doctest.html) for more information.
238
+ If your MR fails this test, check your changes and the docstrings.
239
+ To run doctest locally, you can simply run `pytest --doctest-modules` inside the
240
+ `physicsnemo` folder.
241
+
242
+ 7. `coverage`
243
+ Checks if your code additions have sufficient coverage.
244
+ Refer [coverage](https://coverage.readthedocs.io/en/6.5.0/index.html#) for more details.
245
+ If your MR fails this test, this means that you have not added enough tests to the `test`
246
+ folder for your module/functions.
247
+ Add extensive test scripts to cover different
248
+ branches and lines of your additions.
249
+ Aim for more than 80% code coverage.
250
+ To test coverage locally, run the `get_coverage.sh` script from the `test` folder and
251
+ check the coverage of the module that you added/edited.
physics_mcp/source/FAQ.md ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Frequently Asked Questions about PhysicsNeMo
2
+
3
+ ## Table of contents
4
+
5
+ - [What is the recommended hardware for training using PhysicsNeMo framework?](#what-is-the-recommended-hardware-for-training-using-physicsnemo-framework)
6
+ - [What model architectures are in PhysicsNeMo?](#what-model-architectures-are-in-physicsnemo)
7
+ - [What is the difference between PhysicsNeMo Core and Symbolic?](#what-is-the-difference-between-physicsnemo-core-and-symbolic)
8
+ - [What can I do if I dont see a PDE in PhysicsNeMo?](#what-can-i-do-if-i-dont-see-a-pde-in-physicsnemo)
9
+ - [What is the difference between the pip install and the container?](#what-is-the-difference-between-the-pip-install-and-the-container)
10
+
11
+ ## What is the recommended hardware for training using PhysicsNeMo framework?
12
+
13
+ Please refer to the recommended hardware section:
14
+ [System Requirements](https://docs.nvidia.com/deeplearning/physicsnemo/getting-started/index.html#system-requirements)
15
+
16
+ ## What model architectures are in PhysicsNeMo?
17
+
18
+ NVIDIA PhysicsNeMo is built on top of PyTorch and you can build and train any model
19
+ architecture you want in PhysicsNeMo. PhysicsNeMo however has a catalog of models that
20
+ have been packaged in a configurable form to make it easy to retrain with new data or certain
21
+ config parameters. Examples include GNNs like MeshGraphNet or Neural Operators like FNO.
22
+ PhysicsNeMo samples have more models that illustrate how a specific approach with a specific
23
+ model architecture can be applied to a specific problem.
24
+ These are reference starting points for users to get started.
25
+
26
+ You can find the list of built in model architectures
27
+ [here](https://github.com/NVIDIA/physicsnemo/tree/main/physicsnemo/models) and
28
+ [here](https://github.com/NVIDIA/physicsnemo-sym/tree/main/physicsnemo/sym/models)
29
+
30
+ ## What is the difference between PhysicsNeMo Core and Symbolic?
31
+
32
+ PhysicsNeMo core is the foundational module that provides the core algorithms, network
33
+ architectures and utilities that cover a broad spectrum of Physics-ML approaches.
34
+ PhysicsNeMo Symbolic provides pythonic APIs, algorithms and utilities to be used with
35
+ PhysicsNeMo core, to explicitly physics inform the model training. This includes symbolic
36
+ APIs for PDEs, domain sampling and PDE-based residuals. It also provides higher level
37
+ abstraction to compose a training loop from specification of the geometry, PDEs and
38
+ constraints like boundary conditions using simple symbolic APIs.
39
+ So if you are familiar with PyTorch and want to train model from a dataset, you start
40
+ with PhysicsNeMo core and you import PhysicsNeMo symbolic to bring in explicit domain knowledge.
41
+ Please refer to the [DeepONet example](https://github.com/NVIDIA/physicsnemo/tree/main/examples/cfd/darcy_deeponet_physics)
42
+ that illustrates the concept.
43
+ If you are an engineer or domain expert accustomed to using numerical solvers, you can
44
+ use PhysicsNeMo Symbolic to define your problem at a higher level of abstraction. Please
45
+ refer to the [Lid Driven cavity](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/basics/lid_driven_cavity_flow.html)
46
+ that illustrates the concept.
47
+
48
+ ## What can I do if I dont see a PDE in PhysicsNeMo?
49
+
50
+ PhysicsNeMo Symbolic provides a well documented
51
+ [example](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/foundational/1d_wave_equation.html#writing-custom-pdes-and-boundary-initial-conditions)
52
+ that walks you through how to define a custom PDE. Please see the source [here](https://github.com/NVIDIA/physicsnemo-sym/tree/main/physicsnemo/sym/eq/pdes)
53
+ to see the built-in PDE implementation as an additional reference for your own implementation.
54
+
55
+ ## What is the difference between the pip install and the container?
56
+
57
+ There is no functional difference between the two. This is to simplify the ease of
58
+ installing and setting up the PhysicsNeMo environment. Please refer to the
59
+ [getting started guide](https://docs.nvidia.com/deeplearning/physicsnemo/getting-started/index.html#physicsnemo-with-docker-image-recommended)
60
+ on how to install using Pip or using the container.
physics_mcp/source/LICENSE.txt ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2022 NVIDIA Corporation
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
physics_mcp/source/README.md ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NVIDIA PhysicsNeMo
2
+
3
+ <!-- markdownlint-disable -->
4
+
5
+ 📝 NVIDIA Modulus has been renamed to NVIDIA PhysicsNeMo
6
+
7
+ [![Project Status: Active - The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
8
+ [![GitHub](https://img.shields.io/github/license/NVIDIA/physicsnemo)](https://github.com/NVIDIA/physicsnemo/blob/master/LICENSE.txt)
9
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
10
+ <!-- markdownlint-enable -->
11
+ [**NVIDIA PhysicsNeMo**](#what-is-physicsnemo)
12
+ | [**Documentation**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/index.html)
13
+ | [**Install Guide**](#installation)
14
+ | [**Getting Started**](#getting-started)
15
+ | [**Contributing Guidelines**](#contributing-to-physicsnemo)
16
+ | [**License**](#license)
17
+
18
+ ## What is PhysicsNeMo?
19
+
20
+ NVIDIA PhysicsNeMo is an open-source deep-learning framework for building, training,
21
+ fine-tuning, and inferring Physics AI models using state-of-the-art SciML methods for
22
+ AI4Science and engineering.
23
+
24
+ PhysicsNeMo provides Python modules to compose scalable and optimized training and
25
+ inference pipelines to explore, develop, validate, and deploy AI models that combine
26
+ physics knowledge with data, enabling real-time predictions.
27
+
28
+ Whether you are exploring the use of neural operators, GNNs, or transformers, or are
29
+ interested in Physics-Informed Neural Networks or a hybrid approach in between, PhysicsNeMo
30
+ provides you with an optimized stack that will enable you to train your models at scale.
31
+
32
+ <!-- markdownlint-disable -->
33
+ <p align="center">
34
+ <img src=https://raw.githubusercontent.com/NVIDIA/physicsnemo/main/docs/img/value_prop/Knowledge_guided_models.gif alt="PhysicsNeMo"/>
35
+ </p>
36
+ <!-- markdownlint-enable -->
37
+
38
+ <!-- toc -->
39
+
40
+ - [More About PhysicsNeMo](#more-about-physicsnemo)
41
+ - [Scalable GPU-Optimized Training Library](#scalable-gpu-optimized-training-library)
42
+ - [A Suite of Physics-Informed ML Models](#a-suite-of-physics-informed-ml-models)
43
+ - [Seamless PyTorch Integration](#seamless-pytorch-integration)
44
+ - [Easy Customization and Extension](#easy-customization-and-extension)
45
+ - [AI4Science Library](#ai4science-library)
46
+ - [Domain-Specific Packages](#domain-specific-packages)
47
+ - [Who is Using and Contributing to PhysicsNeMo](#who-is-using-and-contributing-to-physicsnemo)
48
+ - [Why Use PhysicsNeMo](#why-are-they-using-physicsnemo)
49
+ - [Getting Started](#getting-started)
50
+ - [Resources](#resources)
51
+ - [Installation](#installation)
52
+ - [Contributing](#contributing-to-physicsnemo)
53
+ - [Communication](#communication)
54
+ - [License](#license)
55
+
56
+ <!-- tocstop -->
57
+
58
+ ## More About PhysicsNeMo
59
+
60
+ At a granular level, PhysicsNeMo is developed as modular functionality and therefore
61
+ provides built-in composable modules that are packaged into a few key components:
62
+
63
+ <!-- markdownlint-disable -->
64
+ Component | Description |
65
+ ---- | --- |
66
+ [**physicsnemo.models**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.models.html) | A collection of optimized, customizable, and easy-to-use families of model architectures such as Neural Operators, Graph Neural Networks, Diffusion models, Transformer models, and many more|
67
+ [**physicsnemo.datapipes**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.datapipes.html) | Optimized and scalable built-in data pipelines fine-tuned to handle engineering and scientific data structures like point clouds, meshes, etc.|
68
+ [**physicsnemo.distributed**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.distributed.html) | A distributed computing sub-module built on top of `torch.distributed` to enable parallel training with just a few steps|
69
+ [**physicsnemo.curator**](https://github.com/NVIDIA/physicsnemo-curator) | A sub-module to streamline and accelerate the process of data curation for engineering datasets|
70
+ [**physicsnemo.sym.geometry**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/features/csg_and_tessellated_module.html) | A sub-module to handle geometry for DL training using Constructive Solid Geometry modeling and CAD files in STL format|
71
+ [**physicsnemo.sym.eq**](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/features/nodes.html) | A sub-module to use PDEs in your DL training with several implementations of commonly observed equations and easy ways for customization|
72
+ <!-- markdownlint-enable -->
73
+
74
+ For a complete list, refer to the PhysicsNeMo API documentation for
75
+ [PhysicsNeMo](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/index.html).
76
+
77
+ ## AI4Science Library
78
+
79
+ Usually, PhysicsNeMo is used either as:
80
+
81
+ - A complementary tool to PyTorch when exploring AI for SciML and AI4Science applications.
82
+ - A deep learning research platform that provides scale and optimal performance on
83
+ NVIDIA GPUs.
84
+
85
+ ### Domain-Specific Packages
86
+
87
+ The following are packages dedicated to domain experts of specific communities, catering
88
+ to their unique exploration needs:
89
+
90
+ - [PhysicsNeMo CFD](https://github.com/NVIDIA/physicsnemo-cfd): Inference sub-module of PhysicsNeMo
91
+ to enable CFD domain experts to explore, experiment, and validate using pretrained
92
+ AI models for CFD use cases.
93
+ - [PhysicsNeMo Curator](https://github.com/NVIDIA/physicsnemo-curator): Sub-module
94
+ of PhysicsNeMo to streamline and accelerate the process of data curation for engineering
95
+ datasets.
96
+ - [Earth-2 Studio](https://github.com/NVIDIA/earth2studio): Inference sub-module of PhysicsNeMo
97
+ to enable climate researchers and scientists to explore and experiment with pretrained
98
+ AI models for weather and climate.
99
+
100
+ ### Scalable GPU-Optimized Training Library
101
+
102
+ PhysicsNeMo provides a highly optimized and scalable training library for maximizing the
103
+ power of NVIDIA GPUs.
104
+ [Distributed computing](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.distributed.html)
105
+ utilities allow for efficient scaling from a single GPU to multi-node GPU clusters with
106
+ a few lines of code, ensuring that large-scale
107
+ physics-informed machine learning (ML) models can be trained quickly and effectively.
108
+ The framework includes support for advanced
109
+ [optimization utilities](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.utils.html#module-physicsnemo.utils.capture),
110
+ [tailor-made datapipes](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.datapipes.html),
111
+ and [validation utilities](https://github.com/NVIDIA/physicsnemo-sym/tree/main/physicsnemo/sym/eq)
112
+ to enhance end-to-end training speed.
113
+
114
+ ### A Suite of Physics-Informed ML Models
115
+
116
+ PhysicsNeMo offers a library of state-of-the-art models specifically designed
117
+ for Physics-ML applications. Users can build any model architecture by using the underlying
118
+ PyTorch layers and combining them with curated PhysicsNeMo layers.
119
+
120
+ The [Model Zoo](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.models.html#model-zoo)
121
+ includes optimized implementations of families of model architectures such as
122
+ Neural Operators:
123
+
124
+ - [Fourier Neural Operators (FNOs)](physicsnemo/models/fno)
125
+ - [DeepONet](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/neural_operators/deeponet.html)
126
+ - [DoMINO](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/cfd/external_aerodynamics/domino/readme.html)
127
+ - [Graph Neural Networks (GNNs)](physicsnemo/models/gnn_layers)
128
+ - [MeshGraphNet](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/cfd/vortex_shedding_mgn/readme.html)
129
+ - [MeshGraphNet for Lagrangian](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/cfd/lagrangian_mgn/readme.html)
130
+ - [XAeroNet](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/cfd/external_aerodynamics/xaeronet/readme.html)
131
+ - [Diffusion Models](physicsnemo/models/diffusion)
132
+ - [Correction Diffusion Model](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/generative/corrdiff/readme.html)
133
+ - [DDPM](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/generative/diffusion/readme.html)
134
+ - [PhysicsNeMo GraphCast](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/examples/weather/graphcast/readme.html)
135
+ - [Transsolver](https://github.com/NVIDIA/physicsnemo/tree/main/examples/cfd/darcy_transolver)
136
+ - [RNNs](https://github.com/NVIDIA/physicsnemo/tree/main/physicsnemo/models)
137
+ - [SwinVRNN](https://github.com/NVIDIA/physicsnemo/tree/main/physicsnemo/models/swinvrnn)
138
+ - [Physics-Informed Neural Networks (PINNs)](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-sym/user_guide/foundational/1d_wave_equation.html)
139
+
140
+ And many others.
141
+
142
+ These models are optimized for various physics domains, such as computational fluid
143
+ dynamics, structural mechanics, and electromagnetics. Users can download, customize, and
144
+ build upon these models to suit their specific needs, significantly reducing the time
145
+ required to develop high-fidelity simulations.
146
+
147
+ ### Seamless PyTorch Integration
148
+
149
+ PhysicsNeMo is built on top of PyTorch, providing a familiar and user-friendly experience
150
+ for those already proficient with PyTorch.
151
+ This includes a simple Python interface and modular design, making it easy to use
152
+ PhysicsNeMo with existing PyTorch workflows.
153
+ Users can leverage the extensive PyTorch ecosystem, including its libraries and tools,
154
+ while benefiting from PhysicsNeMo's specialized capabilities for physics-ML. This seamless
155
+ integration ensures users can quickly adopt PhysicsNeMo without a steep learning curve.
156
+
157
+ For more information, refer to [Converting PyTorch Models to PhysicsNeMo Models](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.models.html#converting-pytorch-models-to-physicsnemo-models).
158
+
159
+ ### Easy Customization and Extension
160
+
161
+ PhysicsNeMo is designed to be highly extensible, allowing users to add new functionality
162
+ with minimal effort. The framework provides Pythonic APIs for
163
+ defining new physics models, geometries, and constraints, making it easy to extend its
164
+ capabilities to new use cases.
165
+ The adaptability of PhysicsNeMo is further enhanced by key features such as
166
+ [ONNX support](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.deploy.html)
167
+ for flexible model deployment,
168
+ robust [logging utilities](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.launch.logging.html)
169
+ for streamlined error handling,
170
+ and efficient
171
+ [checkpointing](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.launch.utils.html#module-physicsnemo.launch.utils.checkpoint)
172
+ to simplify model loading and saving.
173
+
174
+ This extensibility ensures that PhysicsNeMo can adapt to the evolving needs of researchers
175
+ and engineers, facilitating the development of innovative solutions in the field of physics-ML.
176
+
177
+ Detailed information on features and capabilities can be found in the [PhysicsNeMo documentation](https://docs.nvidia.com/physicsnemo/index.html#core).
178
+
179
+ [Reference samples](examples/README.md) cover a broad spectrum of physics-constrained
180
+ and data-driven
181
+ workflows to suit the diversity of use cases in the science and engineering disciplines.
182
+
183
+ > [!TIP]
184
+ > Have questions about how PhysicsNeMo can assist you? Try our [Experimental] chatbot,
185
+ > [PhysicsNeMo Guide](https://chatgpt.com/g/g-PXrBv20SC-modulus-guide), for answers.
186
+
187
+ ### Hello World
188
+
189
+ You can start using PhysicsNeMo in your PyTorch code as simply as shown here:
190
+
191
+ ```python
192
+ >>> import torch
193
+ >>> from physicsnemo.models.mlp.fully_connected import FullyConnected
194
+ >>> model = FullyConnected(in_features=32, out_features=64)
195
+ >>> input = torch.randn(128, 32)
196
+ >>> output = model(input)
197
+ >>> output.shape
198
+ torch.Size([128, 64])
199
+ ```
200
+
201
+ To use the distributed module, you can do the following (example for
202
+ distributed data parallel training; for a more in-depth tutorial, refer to
203
+ [PhysicsNeMo Distributed](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/api/physicsnemo.distributed.html#)):
204
+
205
+ ```python
206
+ import torch
207
+ from torch.nn.parallel import DistributedDataParallel
208
+ from physicsnemo.distributed import DistributedManager
209
+ from physicsnemo.models.mlp.fully_connected import FullyConnected
210
+
211
+ def main():
212
+ DistributedManager.initialize()
213
+ dist = DistributedManager()
214
+
215
+ arch = FullyConnected(in_features=32, out_features=64).to(dist.device)
216
+
217
+ if dist.distributed:
218
+ ddps = torch.cuda.Stream()
219
+ with torch.cuda.stream(ddps):
220
+ arch = DistributedDataParallel(
221
+ arch,
222
+ device_ids=[dist.local_rank],
223
+ output_device=dist.device,
224
+ broadcast_buffers=dist.broadcast_buffers,
225
+ find_unused_parameters=dist.find_unused_parameters,
226
+ )
227
+ torch.cuda.current_stream().wait_stream(ddps)
228
+
229
+ # Set up the optimizer
230
+ optimizer = torch.optim.Adam(
231
+ arch.parameters(),
232
+ lr=0.001,
233
+ )
234
+
235
+ def training_step(invar, target):
236
+ pred = arch(invar)
237
+ loss = torch.sum(torch.pow(pred - target, 2))
238
+ optimizer.zero_grad()  # clear gradients accumulated from the previous step
+ loss.backward()
239
+ optimizer.step()
240
+ return loss
241
+
242
+ # Sample training loop
243
+ for i in range(20):
244
+ # Random inputs and targets for simplicity
245
+ input = torch.randn(128, 32, device=dist.device)
246
+ target = torch.randn(128, 64, device=dist.device)
247
+
248
+ # Training step
249
+ loss = training_step(input, target)
250
+
251
+ if __name__ == "__main__":
252
+ main()
253
+ ```
254
+
255
+ To use the PDE module, you can do the following:
256
+
257
+ ```python
258
+ >>> from physicsnemo.sym.eq.pdes.navier_stokes import NavierStokes
259
+ >>> ns = NavierStokes(nu=0.01, rho=1, dim=2)
260
+ >>> ns.pprint()
261
+ continuity: u__x + v__y
262
+ momentum_x: u*u__x + v*u__y + p__x + u__t - 0.01*u__x__x - 0.01*u__y__y
263
+ momentum_y: u*v__x + v*v__y + p__y + v__t - 0.01*v__x__x - 0.01*v__y__y
264
+ ```
265
+
266
+ ## Who is Using and Contributing to PhysicsNeMo
267
+
268
+ PhysicsNeMo is an open-source project and gets contributions from researchers in
269
+ the SciML and AI4Science fields. While the PhysicsNeMo team works on optimizing the
270
+ underlying software stack, the community collaborates and contributes model architectures,
271
+ datasets, and reference applications so we can innovate in the pursuit of
272
+ developing generalizable model architectures and algorithms.
273
+
274
+ Some recent examples of community contributors are the [HP Labs 3D Printing team](https://developer.nvidia.com/blog/spotlight-hp-3d-printing-and-nvidia-physicsnemo-collaborate-on-open-source-manufacturing-digital-twin/),
275
+ [Stanford Cardiovascular research team](https://developer.nvidia.com/blog/enabling-greater-patient-specific-cardiovascular-care-with-ai-surrogates/),
276
+ [UIUC team](https://github.com/NVIDIA/physicsnemo/tree/main/examples/cfd/mhd_pino),
277
+ [CMU team](https://github.com/NVIDIA/physicsnemo/tree/main/examples/generative/diffusion),
278
+ etc.
279
+
280
+ Recent examples of research teams using PhysicsNeMo are the
281
+ [ORNL team](https://arxiv.org/abs/2404.05768),
282
+ [TU Munich CFD team](https://www.nvidia.com/en-us/on-demand/session/gtc24-s62237/), etc.
283
+
284
+ Please navigate to this page for a complete list of research work leveraging PhysicsNeMo.
285
+ For a list of enterprises using PhysicsNeMo, refer to the [PhysicsNeMo Webpage](https://developer.nvidia.com/physicsnemo).
286
+
287
+ Using PhysicsNeMo and interested in showcasing your work on
288
+ [NVIDIA Blogs](https://developer.nvidia.com/blog/category/simulation-modeling-design/)?
289
+ Fill out this [proposal form](https://forms.gle/XsBdWp3ji67yZAUF7) and we will get back
290
+ to you!
291
+
292
+ ## Why Are They Using PhysicsNeMo
293
+
294
+ Here are some of the key benefits of PhysicsNeMo for SciML model development:
295
+
296
+ <!-- markdownlint-disable -->
297
+ <img src="docs/img/value_prop/benchmarking.svg" width="100"> | <img src="docs/img/value_prop/recipe.svg" width="100"> | <img src="docs/img/value_prop/performance.svg" width="100">
298
+ ---|---|---|
299
+ |SciML Benchmarking and Validation|Ease of Using Generalized SciML Recipes with Heterogeneous Datasets |Out-of-the-Box Performance and Scalability
300
+ |PhysicsNeMo enables researchers to benchmark their AI models against proven architectures for standard benchmark problems with detailed domain-specific validation criteria.|PhysicsNeMo enables researchers to pick from state-of-the-art SciML architectures and use built-in data pipelines for their use case.| PhysicsNeMo provides out-of-the-box performant training pipelines, including optimized ETL pipelines for heterogeneous engineering and scientific datasets and out-of-the-box scaling across multi-GPU and multi-node GPUs.
301
+ <!-- markdownlint-enable -->
302
+
303
+ See what your peer SciML researchers are saying about PhysicsNeMo (coming soon).
304
+
305
+ ## Getting Started
306
+
307
+ The following resources will help you learn how to use PhysicsNeMo. The best
308
+ way is to start with a reference sample and then update it for your own use case.
309
+
310
+ - [Using PhysicsNeMo with your PyTorch model](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/tutorials/simple_training_example.html#using-custom-models-in-physicsnemo)
311
+ - [Using PhysicsNeMo built-in models](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/tutorials/simple_training_example.html#using-built-in-models)
312
+ - [Getting Started Guide](https://docs.nvidia.com/deeplearning/physicsnemo/getting-started/index.html)
313
+ - [Reference Samples](https://github.com/NVIDIA/physicsnemo/blob/main/examples/README.md)
314
+ - [User Guide Documentation](https://docs.nvidia.com/deeplearning/physicsnemo/physicsnemo-core/index.html)
315
+
316
+ ## Resources
317
+
318
+ - [Getting Started Webinar](https://www.nvidia.com/en-us/on-demand/session/gtc24-dlit61460/?playlistId=playList-bd07f4dc-1397-4783-a959-65cec79aa985)
319
+ - [AI4Science PhysicsNeMo Bootcamp](https://github.com/openhackathons-org/End-to-End-AI-for-Science)
320
+ - [PhysicsNeMo Pretrained Models](https://catalog.ngc.nvidia.com/models?filters=&orderBy=scoreDESC&query=PhysicsNeMo&page=&pageSize=)
321
+ - [PhysicsNeMo Datasets and Supplementary Materials](https://catalog.ngc.nvidia.com/resources?filters=&orderBy=scoreDESC&query=PhysicsNeMo&page=&pageSize=)
322
+ - [Self-Paced PhysicsNeMo DLI Training](https://learn.nvidia.com/courses/course-detail?course_id=course-v1:DLI+S-OV-04+V1)
323
+ - [Deep Learning for Science and Engineering Lecture Series with PhysicsNeMo](https://www.nvidia.com/en-us/on-demand/deep-learning-for-science-and-engineering/)
324
+ - [PhysicsNeMo: Purpose and Usage](https://www.nvidia.com/en-us/on-demand/session/dliteachingkit-setk5002/)
325
+ - [Video Tutorials](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=physicsnemo&sort=relevance&sortDir=desc)
326
+
327
+ ## Installation
328
+
329
+ The following instructions help you install the base PhysicsNeMo modules to get started.
330
+ There are additional optional dependencies for specific models that are listed under
331
+ [optional dependencies](#optional-dependencies).
332
+ The training recipes are not packaged into the pip wheels or the container to keep the
333
+ footprint low. We recommend users clone the appropriate training recipes and use them
334
+ as a starting point. These training recipes may require additional example-specific dependencies,
335
+ as indicated through their associated `requirements.txt` file.
336
+
337
+ ### PyPI
338
+
339
+ The recommended method for installing the latest version of PhysicsNeMo is using PyPI:
340
+
341
+ ```Bash
342
+ pip install nvidia-physicsnemo
343
+ ```
344
+
345
+ The installation can be verified by running the [Hello World](#hello-world) example.
346
+
347
+ #### Optional Dependencies
348
+
349
+ PhysicsNeMo has many optional dependencies that are used in specific components.
350
+ When using pip, all dependencies used in PhysicsNeMo can be installed with
351
+ `pip install nvidia-physicsnemo[all]`. If you are developing PhysicsNeMo, developer dependencies
352
+ can be installed using `pip install nvidia-physicsnemo[dev]`. Otherwise, additional dependencies
353
+ can be installed on a case-by-case basis. Detailed information on installing the
354
+ optional dependencies can be found in the
355
+ [Getting Started Guide](https://docs.nvidia.com/deeplearning/physicsnemo/getting-started/index.html).
356
+
357
+ ### NVCR Container
358
+
359
+ The recommended PhysicsNeMo Docker image can be pulled from the
360
+ [NVIDIA Container Registry](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/physicsnemo/containers/physicsnemo)
361
+ (refer to the NGC registry for the latest tag):
362
+
363
+ ```Bash
364
+ docker pull nvcr.io/nvidia/physicsnemo/physicsnemo:25.06
365
+ ```
366
+
367
+ Inside the container, you can clone the PhysicsNeMo git repositories and get
368
+ started with the examples. The command below shows the instructions to launch
369
+ the PhysicsNeMo container and run examples from this repo:
370
+
371
+ ```bash
372
+ docker run --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --runtime nvidia \
373
+ --rm -it nvcr.io/nvidia/physicsnemo/physicsnemo:25.06 bash
374
+ git clone https://github.com/NVIDIA/physicsnemo.git
375
+ cd physicsnemo/examples/cfd/darcy_fno/
376
+ pip install warp-lang # install NVIDIA Warp to run the Darcy example
377
+ python train_fno_darcy.py
378
+ ```
379
+
380
+ ## From Source
381
+
382
+ ### Package
383
+
384
+ For a local build of the PhysicsNeMo Python package from source, use:
385
+
386
+ ```Bash
387
+ git clone git@github.com:NVIDIA/physicsnemo.git && cd physicsnemo
388
+
389
+ pip install --upgrade pip
390
+ pip install .
391
+ ```
392
+
393
+ ### Source Container
394
+
395
+ To build the PhysicsNeMo Docker image:
396
+
397
+ ```bash
398
+ docker build -t physicsnemo:deploy \
399
+ --build-arg TARGETPLATFORM=linux/amd64 --target deploy -f Dockerfile .
400
+ ```
401
+
402
+ Alternatively, you can run `make container-deploy`.
403
+
404
+ To build the CI image:
405
+
406
+ ```bash
407
+ docker build -t physicsnemo:ci \
408
+ --build-arg TARGETPLATFORM=linux/amd64 --target ci -f Dockerfile .
409
+ ```
410
+
411
+ Alternatively, you can run `make container-ci`.
412
+
413
+ Currently, only `linux/amd64` and `linux/arm64` platforms are supported. If using
414
+ `linux/arm64`, some dependencies like `warp-lang` might not install correctly.
415
+
416
+ ## PhysicsNeMo Migration Guide
417
+
418
+ NVIDIA Modulus has been renamed to NVIDIA PhysicsNeMo. For migration:
419
+
420
+ - Use `pip install nvidia-physicsnemo` rather than `pip install nvidia-modulus`
421
+ for PyPI wheels.
422
+ - Use `nvcr.io/nvidia/physicsnemo/physicsnemo:<tag>` rather than
423
+ `nvcr.io/nvidia/modulus/modulus:<tag>` for Docker containers.
424
+ - Replace `nvidia-modulus` with `nvidia-physicsnemo` in your pip requirements
425
+ files (`requirements.txt`, `setup.py`, `setup.cfg`, `pyproject.toml`, etc.).
426
+ - In your code, change the import statements from `import modulus` to
427
+ `import physicsnemo`.
428
+
429
+ The old PyPI registry and the NGC container registry will be deprecated soon
430
+ and will not receive any bug fixes/updates. The old checkpoints will remain
431
+ compatible with these updates.
432
+
433
+ More details to follow soon.
434
+
435
+ ## DGL to PyTorch Geometric Migration Guide
436
+
437
+ PhysicsNeMo supports a wide range of Graph Neural Networks (GNNs),
438
+ including MeshGraphNet and others.
439
+ Currently, PhysicsNeMo uses the DGL library as its GNN backend,
440
+ with plans to completely transition to PyTorch Geometric (PyG) in a future release.
441
+ For more details, please refer to the [DGL-to-PyG migration guide](https://github.com/NVIDIA/physicsnemo/blob/main/examples/dgl_to_pyg_migration.md).
442
+
443
+ ## Contributing to PhysicsNeMo
444
+
445
+ PhysicsNeMo is an open-source collaboration, and its success is rooted in community
446
+ contributions to further the field of Physics-ML. Thank you for contributing to the
447
+ project so others can build on top of your contributions.
448
+
449
+ For guidance on contributing to PhysicsNeMo, please refer to the
450
+ [contributing guidelines](CONTRIBUTING.md).
451
+
452
+ ## Cite PhysicsNeMo
453
+
454
+ If PhysicsNeMo helped your research and you would like to cite it, please refer to the [guidelines](https://github.com/NVIDIA/physicsnemo/blob/main/CITATION.cff).
455
+
456
+ ## Communication
457
+
458
+ - GitHub Discussions: Discuss new architectures, implementations, Physics-ML research, etc.
459
+ - GitHub Issues: Bug reports, feature requests, install issues, etc.
460
+ - PhysicsNeMo Forum: The [PhysicsNeMo Forum](https://forums.developer.nvidia.com/t/welcome-to-the-physicsnemo-ml-model-framework-forum/178556)
461
+ hosts an audience of new to moderate-level users and developers for general chat, online
462
+ discussions, collaboration, etc.
463
+
464
+ ## Feedback
465
+
466
+ Want to suggest some improvements to PhysicsNeMo? Use our [feedback form](https://docs.google.com/forms/d/e/1FAIpQLSfX4zZ0Lp7MMxzi3xqvzX4IQDdWbkNh5H_a_clzIhclE2oSBQ/viewform?usp=sf_link).
467
+
468
+ ## License
469
+
470
+ PhysicsNeMo is provided under the Apache License 2.0. Please see [LICENSE.txt](./LICENSE.txt)
471
+ for the full license text. Enterprise SLA, support, and preview access are available
472
+ under NVAIE.
physics_mcp/source/SECURITY.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Security
2
+
3
+ NVIDIA is dedicated to the security and trust of our software products and
4
+ services, including all source code repositories managed through our organization.
5
+
6
+ If you need to report a security issue, please use the appropriate contact points
7
+ outlined below. **Please do not report security vulnerabilities through GitHub/GitLab.**
8
+
9
+ ## Reporting Potential Security Vulnerability in an NVIDIA Product
10
+
11
+ To report a potential security vulnerability in any NVIDIA product:
12
+
13
+ - Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html)
14
+ - E-Mail: `psirt@nvidia.com`
15
+ - We encourage you to use the following PGP key for secure email communication:
16
+ [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key)
17
+ - Please include the following information:
18
+ - Product/Driver name and version/branch that contains the vulnerability
19
+ - Type of vulnerability (code execution, denial of service, buffer overflow, etc.)
20
+ - Instructions to reproduce the vulnerability
21
+ - Proof-of-concept or exploit code
22
+ - Potential impact of the vulnerability, including how an attacker could
23
+ exploit the vulnerability
24
+
25
+ While NVIDIA currently does not have a bug bounty program, we do offer
26
+ acknowledgement when an externally reported security issue is addressed under our
27
+ coordinated vulnerability disclosure policy. Please visit our
28
+ [Product Security Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/)
29
+ policies page for more information.
30
+
31
+ ## NVIDIA Product Security
32
+
33
+ For all security-related concerns, please visit NVIDIA's Product Security portal
34
+ at `https://www.nvidia.com/en-us/security`
physics_mcp/source/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ physicsnemo Project Package Initialization File
4
+ """
physics_mcp/source/greptile.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "comment": "",
3
+ "fixWithAI": false,
4
+ "commentTypes": [
5
+ "logic",
6
+ "syntax",
7
+ "style"
8
+ ],
9
+ "instructions": "",
10
+ "excludeAuthors": [
11
+ "dependabot[bot]",
12
+ "renovate[bot]"
13
+ ],
14
+ "ignorePatterns": "greptile.json\n",
15
+ "summarySection": {
16
+ "included": true,
17
+ "collapsible": false,
18
+ "defaultOpen": false
19
+ },
20
+ "triggerOnUpdates": false,
21
+ "updateSummaryOnly": false,
22
+ "issuesTableSection": {
23
+ "included": true,
24
+ "collapsible": false,
25
+ "defaultOpen": false
26
+ },
27
+ "confidenceScoreSection": {
28
+ "included": false,
29
+ "collapsible": false,
30
+ "defaultOpen": false
31
+ },
32
+ "sequenceDiagramSection": {
33
+ "included": false,
34
+ "collapsible": false,
35
+ "defaultOpen": false
36
+ },
37
+ "shouldUpdateDescription": false,
38
+ "customContext": {
39
+ "other": [
40
+ {
41
+ "scope": [],
42
+ "content": ""
43
+ }
44
+ ],
45
+ "rules": [
46
+ {
47
+ "scope": [],
48
+ "rule": ""
49
+ }
50
+ ],
51
+ "files": [
52
+ {
53
+ "scope": [],
54
+ "path": "",
55
+ "description": ""
56
+ }
57
+ ]
58
+ }
59
+ }
physics_mcp/source/physicsnemo/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from .datapipes.datapipe import Datapipe
18
+ from .datapipes.meta import DatapipeMetaData
19
+ from .models.meta import ModelMetaData
20
+ from .models.module import Module
21
+
22
+ __version__ = "1.3.0a0"
physics_mcp/source/physicsnemo/active_learning/README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Active Learning Module
2
+
3
+ The `physicsnemo.active_learning` namespace is used for defining the "scaffolding"
4
+ that can be used to construct automated, end-to-end active learning workflows.
5
+ For areas of science that are difficult to source ground-truths to train on
6
+ (of which there are many), an active learning curriculum attempts to train a
7
+ model with improved data efficiency; better generalization performance but requiring
8
+ fewer training samples.
9
+
10
+ Generally, an active learning workflow can be decomposed into three "phases"
11
+ that are - in the simplest case - run sequentially:
12
+
13
+ - **Training/fine-tuning**: A "learner" or surrogate model is initially trained
14
+ on available data, and in subsequent active learning iterations, is fine-tuned
15
+ with the new data appended on the original dataset.
16
+ - **Querying**: One or more strategies that encode some heuristics for what
17
+ new data is most informative for the learner. Examples of this include
18
+ uncertainty-based methods, which may screen a pool of unlabeled data for
19
+ those the model is least confident with.
20
+ - **Labeling**: A method of obtaining ground truth (labels) for new data
21
+ points, pipelined from the querying stage. This may entail running an
22
+ expensive solver, or acquiring experimental data.
23
+
24
+ The three phases are repeated until the learner converges. Because "convergence"
25
+ may not be easily defined, we define an additional phase which we call
26
+ **metrology**: this represents a phase most similar to querying, but allows
27
+ a user to define some set of criteria to monitor over the course of active
28
+ learning *beyond* simple validation metrics to ensure the model can be used
29
+ with confidence as surrogates (e.g. within a simulation loop).
30
+
31
+ ## How to use this module
32
+
33
+ With the context above in mind, inspecting the `driver` module will give you
34
+ a sense for how the end-to-end workflow functions; the `Driver` class acts
35
+ as an orchestrator for all the phases of active learning we described above.
36
+
37
+ From there, you should realize that `Driver` is written in a highly abstract
38
+ way: we need concrete *strategies* that implement querying, labeling, and metrology
39
+ concepts. The `protocols` module provides the scaffolding to do so - we implement
40
+ various components as `typing.Protocol` which are used for structural sub-typing:
41
+ they can be thought of as abstract classes that define an expected interface
42
+ in a function or class from which you can define your own classes by either
43
+ inheriting from them, or defining your own class that implements the expected
44
+ methods and attributes.
45
+
46
+ In order to perform the training portion of active learning, we provide a
47
+ minimal yet functional `DefaultTrainingLoop` inside the `loop` module. This
48
+ loop simply requires a `protocols.TrainingProtocol` to be passed, which is
49
+ a function that defines the logic for computing the loss per batch/training
50
+ step.
51
+
52
+ ## Configuring workflows
53
+
54
+ The `config` module defines some simple `dataclass`es that can be used
55
+ to configure the behavior of various parts of active learning, e.g. how
56
+ training is conducted, etc. Because `Driver` is designed to be checkpointable,
57
+ with the exception of a few parts such as datasets, everything should be
58
+ JSON-serializable.
59
+
60
+ ## Restarting workflows
61
+
62
+ For classes and functions that are created at runtime, checkpointing requires
63
+ that these components can be recreated when restarting from a checkpoint. To
64
+ that end, the `_registry` module provides a user-friendly way to instantiate
65
+ objects: user-defined strategy classes can be added to the registry to enable
66
+ their creation in checkpoint restarts.
physics_mcp/source/physicsnemo/active_learning/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from physicsnemo.active_learning._registry import registry
18
+ from physicsnemo.active_learning.config import (
19
+ DriverConfig,
20
+ OptimizerConfig,
21
+ StrategiesConfig,
22
+ TrainingConfig,
23
+ )
24
+ from physicsnemo.active_learning.driver import Driver
25
+ from physicsnemo.active_learning.loop import DefaultTrainingLoop
26
+
27
+ __all__ = [
28
+ "registry",
29
+ "Driver",
30
+ "DefaultTrainingLoop",
31
+ "DriverConfig",
32
+ "OptimizerConfig",
33
+ "StrategiesConfig",
34
+ "TrainingConfig",
35
+ ]
physics_mcp/source/physicsnemo/active_learning/_registry.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import annotations
18
+
19
+ import importlib
20
+ import inspect
21
+ from typing import Any, Callable
22
+ from warnings import warn
23
+
24
+ from physicsnemo.active_learning.protocols import ActiveLearningProtocol
25
+
26
+ __all__ = ["registry"]
27
+
28
+
29
+ class ActiveLearningRegistry:
30
+ """
31
+ Registry for active learning protocols.
32
+
33
+ This class provides a centralized registry for user-defined active learning
34
+ protocols that implement the `ActiveLearningProtocol`. It enables string-based
35
+ lookups for checkpointing and provides argument validation when constructing
36
+ protocol instances.
37
+
38
+ The registry supports two primary modes of interaction:
39
+ 1. Registration via decorator: `@registry.register("my_strategy")`
40
+ 2. Construction with validation: `registry.construct("my_strategy", **kwargs)`
41
+
42
+ Attributes
43
+ ----------
44
+ _registry : dict[str, type[ActiveLearningProtocol]]
45
+ Internal dictionary mapping protocol names to their class types.
46
+
47
+ Methods
48
+ -------
49
+ register(cls_name: str) -> Callable[[type[ActiveLearningProtocol]], type[ActiveLearningProtocol]]
50
+ Decorator to register a protocol class with a given name.
51
+ construct(cls_name: str, **kwargs) -> ActiveLearningProtocol
52
+ Construct an instance of a registered protocol with argument validation.
53
+ is_registered(cls_name: str) -> bool
54
+ Check if a protocol name is registered.
55
+
56
+ Properties
57
+ ----------
58
+ registered_names : list[str]
59
+ A list of all registered protocol names, sorted alphabetically.
60
+
61
+ Examples
62
+ --------
63
+ Register a custom strategy:
64
+
65
+ >>> from physicsnemo.active_learning._registry import registry
66
+ >>> @registry.register("my_custom_strategy")
67
+ ... class MyCustomStrategy:
68
+ ... def __init__(self, param1: int, param2: str):
69
+ ... self.param1 = param1
70
+ ... self.param2 = param2
71
+
72
+ Construct an instance with validation:
73
+
74
+ >>> strategy = registry.construct("my_custom_strategy", param1=42, param2="test")
75
+ """
76
+
77
+ def __init__(self) -> None:
78
+ """Initialize an empty registry."""
79
+ self._registry: dict[str, type[ActiveLearningProtocol]] = {}
80
+
81
+ def register(
82
+ self, cls_name: str
83
+ ) -> Callable[[type[ActiveLearningProtocol]], type[ActiveLearningProtocol]]:
84
+ """
85
+ Decorator to register an active learning protocol class.
86
+
87
+ This decorator registers a class implementing the `ActiveLearningProtocol`
88
+ under the given name, allowing it to be retrieved and constructed later
89
+ using the `construct` method.
90
+
91
+ Parameters
92
+ ----------
93
+ cls_name : str
94
+ The name to register the protocol under. This will be used as the
95
+ key for later retrieval.
96
+
97
+ Returns
98
+ -------
99
+ Callable[[type[ActiveLearningProtocol]], type[ActiveLearningProtocol]]
100
+ A decorator function that registers the class and returns it unchanged.
101
+
102
+ Raises
103
+ ------
104
+ ValueError
105
+ If a protocol with the same name is already registered.
106
+
107
+ Examples
108
+ --------
109
+ >>> @registry.register("my_new_strategy")
110
+ ... class MyStrategy:
111
+ ... def __init__(self, param: int):
112
+ ... self.param = param
113
+ """
114
+
115
+ def decorator(
116
+ cls: type[ActiveLearningProtocol],
117
+ ) -> type[ActiveLearningProtocol]:
118
+ """
119
+ Method for decorating a class to registry it with the registry.
120
+ """
121
+ if cls_name in self._registry:
122
+ raise ValueError(
123
+ f"Protocol '{cls_name}' is already registered. "
124
+ f"Existing class: {self._registry[cls_name].__name__}"
125
+ )
126
+ self._registry[cls_name] = cls
127
+ return cls
128
+
129
+ return decorator
130
+
131
+ def construct(
132
+ self, cls_name: str, module_path: str | None = None, **kwargs: Any
133
+ ) -> ActiveLearningProtocol:
134
+ """
135
+ Construct an instance of a registered protocol with argument validation.
136
+
137
+ This method retrieves a registered protocol class by name, validates that
138
+ the provided keyword arguments match the class's constructor signature,
139
+ and returns a new instance of the class.
140
+
141
+ Parameters
142
+ ----------
143
+ cls_name : str
144
+ The name of the registered protocol to construct.
145
+ module_path: str | None
146
+ The path to the module to get the class from.
147
+ **kwargs : Any
148
+ Keyword arguments to pass to the protocol's constructor.
149
+
150
+ Returns
151
+ -------
152
+ ActiveLearningProtocol
153
+ A new instance of the requested protocol class.
154
+
155
+ Raises
156
+ ------
157
+ KeyError
158
+ If the protocol name is not registered.
159
+ TypeError
160
+ If the provided keyword arguments do not match the constructor signature.
161
+ This includes missing required parameters or unexpected parameters.
162
+
163
+ Examples
164
+ --------
165
+ >>> from physicsnemo.active_learning._registry import registry
166
+ >>> @registry.register("my_latest_strategy")
167
+ ... class MyStrategy:
168
+ ... def __init__(self, param: int):
169
+ ... self.param = param
170
+ >>> strategy = registry.construct("my_latest_strategy", param=42)
171
+ """
172
+ cls = self.get_class(cls_name, module_path)
173
+
174
+ # Validate arguments against the class signature
175
+ try:
176
+ sig = inspect.signature(cls.__init__)
177
+ except (ValueError, TypeError) as e:
178
+ raise TypeError(
179
+ f"Could not inspect signature of {cls.__name__}.__init__: {e}"
180
+ )
181
+
182
+ # Get parameters, excluding 'self'
183
+ params = {
184
+ name: param for name, param in sig.parameters.items() if name != "self"
185
+ }
186
+
187
+ # Check if the signature accepts **kwargs
188
+ has_var_keyword = any(
189
+ p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()
190
+ )
191
+
192
+ # Check for missing required parameters
193
+ missing = []
194
+ for name, param in params.items():
195
+ if (
196
+ param.kind
197
+ not in (inspect.Parameter.VAR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
198
+ and param.default is inspect.Parameter.empty
199
+ and name not in kwargs
200
+ ):
201
+ missing.append(name)
202
+
203
+ if missing:
204
+ raise TypeError(
205
+ f"Missing required arguments for {cls.__name__}: {', '.join(missing)}"
206
+ )
207
+
208
+ # Check for unexpected parameters (unless **kwargs is present)
209
+ if not has_var_keyword:
210
+ param_names = {
211
+ name
212
+ for name, param in params.items()
213
+ if param.kind
214
+ not in (inspect.Parameter.VAR_KEYWORD, inspect.Parameter.VAR_POSITIONAL)
215
+ }
216
+ unexpected = [name for name in kwargs if name not in param_names]
217
+
218
+ if unexpected:
219
+ warn(
220
+ f"Unexpected arguments for {cls.__name__}: {', '.join(unexpected)}. "
221
+ f"Valid parameters: {', '.join(sorted(param_names))}"
222
+ )
223
+ return cls(**kwargs)
224
+
225
+ def __getitem__(self, cls_name: str) -> type[ActiveLearningProtocol]:
226
+ """
227
+ Retrieve a registered protocol class by name using dict-like access.
228
+
229
+ This method allows accessing registered protocol classes using square
230
+ bracket notation, e.g., `registry['my_strategy']`.
231
+
232
+ Parameters
233
+ ----------
234
+ cls_name : str
235
+ The name of the registered protocol to retrieve.
236
+
237
+ Returns
238
+ -------
239
+ type[ActiveLearningProtocol]
240
+ The class type of the registered protocol.
241
+
242
+ Raises
243
+ ------
244
+ KeyError
245
+ If the protocol name is not registered.
246
+
247
+ Examples
248
+ --------
249
+ >>> from physicsnemo.active_learning._registry import registry
250
+ >>> @registry.register("my_strategy")
251
+ ... class MyStrategy:
252
+ ... def __init__(self, param: int):
253
+ ... self.param = param
254
+ >>> RetrievedClass = registry['my_strategy']
255
+ >>> instance = RetrievedClass(param=42)
256
+ """
257
+ if cls_name not in self._registry:
258
+ available = ", ".join(self._registry.keys()) if self._registry else "none"
259
+ raise KeyError(
260
+ f"Protocol '{cls_name}' is not registered. "
261
+ f"Available protocols: {available}"
262
+ )
263
+ return self._registry[cls_name]
264
+
265
+ def is_registered(self, cls_name: str) -> bool:
266
+ """
267
+ Check if a protocol name is registered.
268
+
269
+ Parameters
270
+ ----------
271
+ cls_name : str
272
+ The name of the protocol to check.
273
+
274
+ Returns
275
+ -------
276
+ bool
277
+ True if the protocol is registered, False otherwise.
278
+ """
279
+ return cls_name in self._registry
280
+
281
+ @property
282
+ def registered_names(self) -> list[str]:
283
+ """
284
+ A list of all registered protocol names, sorted alphabetically.
285
+
286
+ Returns
287
+ -------
288
+ list[str]
289
+ A list of all registered protocol names, sorted alphabetically.
290
+ """
291
+ return sorted(self._registry.keys())
292
+
293
+ def get_class(self, cls_name: str, module_path: str | None = None) -> type:
294
+ """
295
+ Get a class by name from the registry or from a module path.
296
+
297
+ Parameters
298
+ ----------
299
+ cls_name: str
300
+ The name of the class to get.
301
+ module_path: str | None
302
+ The path to the module to get the class from.
303
+
304
+ Returns
305
+ -------
306
+ type
307
+ The class.
308
+
309
+ Raises
310
+ ------
311
+ NameError: If the class is not found in the registry or module.
312
+ ModuleNotFoundError: If the module is not found with the specified module path.
313
+ """
314
+ if cls_name in self.registered_names:
315
+ return self._registry[cls_name]
316
+ else:
317
+ if module_path:
318
+ module = importlib.import_module(module_path)
319
+ cls = getattr(module, cls_name, None)
320
+ if not cls:
321
+ raise NameError(
322
+ f"Class {cls_name} not found in module {module_path}"
323
+ )
324
+ return cls
325
+ else:
326
+ raise NameError(
327
+ f"Class {cls_name} not found in registry, and no module path was provided."
328
+ )
329
+
330
+
331
+ # Module-level registry instance for global access
332
+ registry = ActiveLearningRegistry()
physics_mcp/source/physicsnemo/active_learning/config.py ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ Configuration dataclasses for the active learning driver.
19
+
20
+ This module provides structured configuration classes that separate different
21
+ concerns in the active learning workflow: optimization, training, strategies,
22
+ and driver orchestration.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import math
28
+ import uuid
29
+ from collections import defaultdict
30
+ from dataclasses import dataclass, field
31
+ from json import dumps
32
+ from pathlib import Path
33
+ from typing import Any
34
+ from warnings import warn
35
+
36
+ import torch
37
+ from torch import distributed as dist
38
+ from torch.optim import AdamW, Optimizer
39
+ from torch.optim.lr_scheduler import _LRScheduler
40
+
41
+ from physicsnemo.active_learning import protocols as p
42
+ from physicsnemo.active_learning._registry import registry
43
+ from physicsnemo.active_learning.loop import DefaultTrainingLoop
44
+ from physicsnemo.distributed import DistributedManager
45
+
46
+
47
+ @dataclass
48
+ class OptimizerConfig:
49
+ """
50
+ Configuration for optimizer and learning rate scheduler.
51
+
52
+ This encapsulates all training optimization parameters, keeping
53
+ them separate from the active learning orchestration logic.
54
+
55
+ Attributes
56
+ ----------
57
+ optimizer_cls: type[Optimizer]
58
+ The optimizer class to use. Defaults to AdamW.
59
+ optimizer_kwargs: dict[str, Any]
60
+ Keyword arguments to pass to the optimizer constructor.
61
+ Defaults to {"lr": 1e-4}.
62
+ scheduler_cls: type[_LRScheduler] | None
63
+ The learning rate scheduler class to use. If None, no
64
+ scheduler will be configured.
65
+ scheduler_kwargs: dict[str, Any]
66
+ Keyword arguments to pass to the scheduler constructor.
67
+ """
68
+
69
+ optimizer_cls: type[Optimizer] = AdamW
70
+ optimizer_kwargs: dict[str, Any] = field(default_factory=lambda: {"lr": 1e-4})
71
+ scheduler_cls: type[_LRScheduler] | None = None
72
+ scheduler_kwargs: dict[str, Any] = field(default_factory=dict)
73
+
74
+ def __post_init__(self) -> None:
75
+ """Validate optimizer configuration."""
76
+ # Validate learning rate if present
77
+ if "lr" in self.optimizer_kwargs:
78
+ lr = self.optimizer_kwargs["lr"]
79
+ if not isinstance(lr, (int, float)) or lr <= 0:
80
+ raise ValueError(f"Learning rate must be positive, got {lr}")
81
+
82
+ # Validate that scheduler_kwargs is only set if scheduler_cls is provided
83
+ if self.scheduler_kwargs and self.scheduler_cls is None:
84
+ raise ValueError(
85
+ "scheduler_kwargs provided but scheduler_cls is None. "
86
+ "Provide a scheduler_cls or remove scheduler_kwargs."
87
+ )
88
+
89
+ def to_dict(self) -> dict[str, Any]:
90
+ """
91
+ Returns a JSON-serializable dictionary representation of the OptimizerConfig.
92
+
93
+ For round-tripping, the registry is used to de-serialize the optimizer and scheduler
94
+ classes.
95
+
96
+ Returns
97
+ -------
98
+ dict[str, Any]
99
+ A dictionary that can be JSON serialized.
100
+ """
101
+ opt = {
102
+ "__name__": self.optimizer_cls.__name__,
103
+ "__module__": self.optimizer_cls.__module__,
104
+ }
105
+ if self.scheduler_cls:
106
+ scheduler = {
107
+ "__name__": self.scheduler_cls.__name__,
108
+ "__module__": self.scheduler_cls.__module__,
109
+ }
110
+ else:
111
+ scheduler = None
112
+ return {
113
+ "optimizer_cls": opt,
114
+ "optimizer_kwargs": self.optimizer_kwargs,
115
+ "scheduler_cls": scheduler,
116
+ "scheduler_kwargs": self.scheduler_kwargs,
117
+ }
118
+
119
+ @classmethod
120
+ def from_dict(cls, data: dict[str, Any]) -> OptimizerConfig:
121
+ """
122
+ Creates an OptimizerConfig instance from a dictionary.
123
+
124
+ This method assumes that the optimizer and scheduler classes are
125
+ included in the ``physicsnemo.active_learning.registry``, or
126
+ a module path is specified to import the class from.
127
+
128
+ Parameters
129
+ ----------
130
+ data: dict[str, Any]
131
+ A dictionary that was previously serialized using the ``to_dict`` method.
132
+
133
+ Returns
134
+ -------
135
+ OptimizerConfig
136
+ A new ``OptimizerConfig`` instance.
137
+ """
138
+ optimizer_cls = registry.get_class(
139
+ data["optimizer_cls"]["__name__"], data["optimizer_cls"]["__module__"]
140
+ )
141
+ if (s := data.get("scheduler_cls")) is not None:
142
+ scheduler_cls = registry.get_class(s["__name__"], s["__module__"])
143
+ else:
144
+ scheduler_cls = None
145
+ return cls(
146
+ optimizer_cls=optimizer_cls,
147
+ optimizer_kwargs=data["optimizer_kwargs"],
148
+ scheduler_cls=scheduler_cls,
149
+ scheduler_kwargs=data["scheduler_kwargs"],
150
+ )
151
+
152
+
153
+ @dataclass
154
+ class TrainingConfig:
155
+ """
156
+ Configuration for the training phase of active learning.
157
+
158
+ This groups all training-related components together, making it
159
+ clear when training is or isn't being used in the AL workflow.
160
+
161
+ Attributes
162
+ ----------
163
+ train_datapool: p.DataPool
164
+ The pool of labeled data to use for training.
165
+ max_training_epochs: int
166
+ The maximum number of epochs to train for. If ``max_fine_tuning_epochs``
167
+ isn't specified, this value is used for all active learning steps.
168
+ val_datapool: p.DataPool | None
169
+ Optional pool of data to use for validation during training.
170
+ optimizer_config: OptimizerConfig
171
+ Configuration for the optimizer and scheduler. Defaults to
172
+ AdamW with lr=1e-4, no scheduler.
173
+ max_fine_tuning_epochs: int | None
174
+ The maximum number of epochs used during fine-tuning steps, i.e. after
175
+ the first active learning step. If ``None``, then the fine-tuning will
176
+ be performed for the duration of the active learning loop.
177
+ train_loop_fn: p.TrainingLoop
178
+ The training loop function that orchestrates the training process.
179
+ This defaults to a concrete implementation, ``DefaultTrainingLoop``,
180
+ which provides a very typical loop that includes the use of static
181
+ capture, etc.
182
+ """
183
+
184
+ train_datapool: p.DataPool
185
+ max_training_epochs: int
186
+ val_datapool: p.DataPool | None = None
187
+ optimizer_config: OptimizerConfig = field(default_factory=OptimizerConfig)
188
+ max_fine_tuning_epochs: int | None = None
189
+ train_loop_fn: p.TrainingLoop = field(default_factory=DefaultTrainingLoop)
190
+
191
+ def __post_init__(self) -> None:
192
+ """Validate training configuration."""
193
+ # Validate datapools have consistent interface
194
+ if not hasattr(self.train_datapool, "__len__"):
195
+ raise ValueError("train_datapool must implement __len__")
196
+ if self.val_datapool is not None and not hasattr(self.val_datapool, "__len__"):
197
+ raise ValueError("val_datapool must implement __len__")
198
+
199
+ # Validate training loop is callable
200
+ if not callable(self.train_loop_fn):
201
+ raise ValueError("train_loop_fn must be callable")
202
+
203
+ # set the same value for fine tuning epochs if not provided
204
+ if self.max_fine_tuning_epochs is None:
205
+ self.max_fine_tuning_epochs = self.max_training_epochs
206
+
207
+ def to_dict(self) -> dict[str, Any]:
208
+ """
209
+ Returns a JSON-serializable dictionary representation of the TrainingConfig.
210
+
211
+ For round-tripping, the registry is used to de-serialize the training loop
212
+ and optimizer configuration. Note that datapools (train_datapool and val_datapool)
213
+ are NOT serialized as they typically contain large datasets, file handles, or other
214
+ non-serializable state.
215
+
216
+ Returns
217
+ -------
218
+ dict[str, Any]
219
+ A dictionary that can be JSON serialized. Excludes datapools.
220
+
221
+ Warnings
222
+ --------
223
+ This method will issue a warning about the exclusion of datapools.
224
+ """
225
+ # Warn about datapool exclusion
226
+ warn(
227
+ "The `train_datapool` and `val_datapool` attributes are not supported for "
228
+ "serialization and will be excluded from the ``TrainingConfig`` dictionary. "
229
+ "You must re-provide these datapools when deserializing."
230
+ )
231
+
232
+ # Serialize optimizer config
233
+ optimizer_dict = self.optimizer_config.to_dict()
234
+
235
+ # Serialize training loop function
236
+ if not hasattr(self.train_loop_fn, "_args"):
237
+ raise ValueError(
238
+ f"Training loop {self.train_loop_fn} does not have an `_args` attribute "
239
+ "which is required for serialization. Make sure your training loop "
240
+ "either subclasses `ActiveLearningProtocol` or implements the `__new__` "
241
+ "method to capture object arguments."
242
+ )
243
+
244
+ train_loop_dict = self.train_loop_fn._args
245
+
246
+ return {
247
+ "max_training_epochs": self.max_training_epochs,
248
+ "max_fine_tuning_epochs": self.max_fine_tuning_epochs,
249
+ "optimizer_config": optimizer_dict,
250
+ "train_loop_fn": train_loop_dict,
251
+ }
252
+
253
+ @classmethod
254
+ def from_dict(cls, data: dict[str, Any], **kwargs: Any) -> TrainingConfig:
255
+ """
256
+ Creates a TrainingConfig instance from a dictionary.
257
+
258
+ This method assumes that the training loop class is included in the
259
+ ``physicsnemo.active_learning.registry``, or a module path is specified
260
+ to import the class from. Note that datapools must be provided via
261
+ kwargs as they are not serialized.
262
+
263
+ Parameters
264
+ ----------
265
+ data: dict[str, Any]
266
+ A dictionary that was previously serialized using the ``to_dict`` method.
267
+ **kwargs: Any
268
+ Additional keyword arguments to pass to the constructor. This is where
269
+ you must provide ``train_datapool`` and optionally ``val_datapool``.
270
+
271
+ Returns
272
+ -------
273
+ TrainingConfig
274
+ A new ``TrainingConfig`` instance.
275
+
276
+ Raises
277
+ ------
278
+ ValueError
279
+ If required datapools are not provided in kwargs, if the data contains
280
+ unexpected keys, or if object construction fails.
281
+ """
282
+ # Ensure required datapools are provided
283
+ if "train_datapool" not in kwargs:
284
+ raise ValueError(
285
+ "``train_datapool`` must be provided in kwargs when deserializing "
286
+ "TrainingConfig, as datapools are not serialized."
287
+ )
288
+
289
+ # Reconstruct optimizer config
290
+ optimizer_config = OptimizerConfig.from_dict(data["optimizer_config"])
291
+
292
+ # Reconstruct training loop function
293
+ train_loop_data = data["train_loop_fn"]
294
+ train_loop_fn = registry.construct(
295
+ train_loop_data["__name__"],
296
+ module_path=train_loop_data["__module__"],
297
+ **train_loop_data["__args__"],
298
+ )
299
+
300
+ # Build the config
301
+ try:
302
+ config = cls(
303
+ max_training_epochs=data["max_training_epochs"],
304
+ max_fine_tuning_epochs=data.get("max_fine_tuning_epochs"),
305
+ optimizer_config=optimizer_config,
306
+ train_loop_fn=train_loop_fn,
307
+ **kwargs,
308
+ )
309
+ except Exception as e:
310
+ raise ValueError(
311
+ "Failed to construct ``TrainingConfig`` from dictionary."
312
+ ) from e
313
+
314
+ return config
315
+
316
+
317
@dataclass
class StrategiesConfig:
    """
    Configuration for active learning strategies and data acquisition.

    This encapsulates the query-label-metrology cycle that is at the
    heart of active learning: strategies for selecting data, labeling it,
    and measuring model uncertainty/performance.

    Attributes
    ----------
    query_strategies: list[p.QueryStrategy]
        The query strategies to use for selecting data to label.
    queue_cls: type[p.AbstractQueue]
        The queue implementation to use for passing data between
        query and labeling phases.
    label_strategy: p.LabelStrategy | None
        The strategy to use for labeling queried data. If None,
        labeling will be skipped.
    metrology_strategies: list[p.MetrologyStrategy] | None
        Strategies for measuring model performance and uncertainty.
        If None, metrology will be skipped.
    unlabeled_datapool: p.DataPool | None
        Pool of unlabeled data that query strategies can sample from.
        Not all strategies require this (some may generate synthetic data).
    """

    query_strategies: list[p.QueryStrategy]
    queue_cls: type[p.AbstractQueue]
    label_strategy: p.LabelStrategy | None = None
    metrology_strategies: list[p.MetrologyStrategy] | None = None
    unlabeled_datapool: p.DataPool | None = None

    def __post_init__(self) -> None:
        """
        Validate the strategies configuration.

        Raises
        ------
        ValueError
            If no query strategies are provided, if any provided strategy
            is not callable, if ``metrology_strategies`` is an empty list,
            or if ``queue_cls`` is not callable.
        """
        # Active learning is meaningless without at least one way to
        # select new data.
        if not self.query_strategies:
            raise ValueError(
                "At least one query strategy must be provided. "
                "Active learning requires a mechanism to select data."
            )

        # All query strategies must be callable
        for strategy in self.query_strategies:
            if not callable(strategy):
                raise ValueError(f"Query strategy {strategy} must be callable")

        # Label strategy must be callable if provided
        if self.label_strategy is not None and not callable(self.label_strategy):
            raise ValueError("label_strategy must be callable")

        # An explicit empty list is ambiguous: the user should either
        # provide strategies or pass None to skip metrology entirely.
        if self.metrology_strategies is not None:
            if not self.metrology_strategies:
                raise ValueError(
                    "metrology_strategies is an empty list. "
                    "Either provide strategies or set to None to skip metrology."
                )
            for strategy in self.metrology_strategies:
                if not callable(strategy):
                    raise ValueError(f"Metrology strategy {strategy} must be callable")

        # NOTE: the previous ``hasattr(self.queue_cls, "__call__")`` check was
        # vacuously true for any class object; ``callable`` expresses the
        # intent directly and still rejects plainly non-callable values.
        if not callable(self.queue_cls):
            raise ValueError("queue_cls must be a callable class")

    @staticmethod
    def _serialized_args(strategy: Any, kind: str) -> dict[str, Any]:
        """
        Return the captured constructor arguments (``_args``) for a strategy.

        Parameters
        ----------
        strategy: Any
            The strategy object being serialized.
        kind: str
            Human-readable strategy kind ("Query", "Label", "Metrology"),
            used only to build the error message.

        Returns
        -------
        dict[str, Any]
            The strategy's ``_args`` mapping.

        Raises
        ------
        ValueError
            If the strategy does not expose an ``_args`` attribute.
        """
        if not hasattr(strategy, "_args"):
            raise ValueError(
                f"{kind} strategy {strategy} does not have an `_args` attribute"
                " which is required for serialization. Make sure your strategy"
                " either subclasses `ActiveLearningProtocol` or implements"
                " the `__new__` method to capture object arguments."
            )
        return strategy._args

    def to_dict(self) -> dict[str, Any]:
        """
        Method that converts the present ``StrategiesConfig`` instance into a dictionary
        that can be JSON serialized.

        This method, for the most part, assumes that strategies are subclasses of
        ``ActiveLearningProtocol`` and/or they have an ``_args`` attribute that
        captures the arguments to the constructor.

        One issue is the inability to reliably serialize the ``unlabeled_datapool``,
        which for the most part, likely does not need serialization as a dataset.
        Regardless, this method will trigger a warning if ``unlabeled_datapool`` is
        not None.

        Returns
        -------
        dict[str, Any]
            A dictionary that can be JSON serialized.

        Raises
        ------
        ValueError
            If any strategy lacks the ``_args`` attribute required for
            serialization.
        """
        output = defaultdict(list)
        for strategy in self.query_strategies:
            output["query_strategies"].append(self._serialized_args(strategy, "Query"))
        if self.label_strategy is not None:
            output["label_strategy"] = self._serialized_args(
                self.label_strategy, "Label"
            )
        # queue_cls is recorded by name/module so it can be re-resolved later
        output["queue_cls"] = {
            "__name__": self.queue_cls.__name__,
            "__module__": self.queue_cls.__module__,
        }
        if self.metrology_strategies is not None:
            for strategy in self.metrology_strategies:
                output["metrology_strategies"].append(
                    self._serialized_args(strategy, "Metrology")
                )
        if self.unlabeled_datapool is not None:
            warn(
                "The `unlabeled_datapool` attribute is not supported for serialization"
                " and will be excluded from the ``StrategiesConfig`` dictionary."
            )
        return output

    @classmethod
    def from_dict(cls, data: dict[str, Any], **kwargs: Any) -> StrategiesConfig:
        """
        Create a ``StrategiesConfig`` instance from a dictionary.

        This method heavily relies on classes being added to the
        ``physicsnemo.active_learning.registry``, which is used to instantiate
        all strategies and custom types used in active learning. As a fall
        back, the `registry.construct` method will try and import the class
        from the module path if it is not found in the registry.

        Parameters
        ----------
        data: dict[str, Any]
            A dictionary that was previously serialized using the ``to_dict`` method.
        **kwargs: Any
            Additional keyword arguments to pass to the constructor.

        Returns
        -------
        StrategiesConfig
            A new ``StrategiesConfig`` instance.

        Raises
        ------
        ValueError:
            If the data contains unexpected or missing keys, or if the
            object construction fails.
        NameError:
            If a class is not found in the registry and no module path is provided.
        ModuleNotFoundError:
            If a module is not found with the specified module path.
        """
        # ensure that the data contains no unexpected keys
        data_keys = set(data.keys())
        expected_keys = set(cls.__dataclass_fields__.keys())
        extra_keys = data_keys - expected_keys
        if extra_keys:
            raise ValueError(
                f"Unexpected keys in data: {extra_keys}. Expected keys are {expected_keys}."
            )
        # fail with an informative ValueError (per the documented contract)
        # instead of a bare KeyError when mandatory entries are absent
        missing_keys = {"query_strategies", "queue_cls"} - data_keys
        if missing_keys:
            raise ValueError(
                f"Missing required keys in data: {missing_keys}. "
                "Expected the output of ``StrategiesConfig.to_dict``."
            )
        # instantiate objects from the serialized data; general strategy is to
        # use `registry.construct` that will try and resolve the class within
        # the registry first, and if not found, then it will try and import the
        # class from the module path.
        output_dict = defaultdict(list)
        for entry in data["query_strategies"]:
            output_dict["query_strategies"].append(
                registry.construct(
                    entry["__name__"],
                    module_path=entry["__module__"],
                    **entry["__args__"],
                )
            )
        if "metrology_strategies" in data:
            for entry in data["metrology_strategies"]:
                output_dict["metrology_strategies"].append(
                    registry.construct(
                        entry["__name__"],
                        module_path=entry["__module__"],
                        **entry["__args__"],
                    )
                )
        if "label_strategy" in data:
            output_dict["label_strategy"] = registry.construct(
                data["label_strategy"]["__name__"],
                module_path=data["label_strategy"]["__module__"],
                **data["label_strategy"]["__args__"],
            )
        output_dict["queue_cls"] = registry.get_class(
            data["queue_cls"]["__name__"], data["queue_cls"]["__module__"]
        )
        # potentially override with keyword arguments
        output_dict.update(kwargs)
        try:
            config = cls(**output_dict)
        except Exception as e:
            raise ValueError(
                "Failed to construct ``StrategiesConfig`` from dictionary."
            ) from e
        return config
522
+
523
+
524
@dataclass
class DriverConfig:
    """
    Configuration for driver orchestration and infrastructure.

    This contains parameters that control the overall loop execution,
    logging, checkpointing, and distributed training setup - orthogonal
    to the specific AL or training logic.

    Attributes
    ----------
    batch_size: int
        The batch size to use for data loaders.
    max_active_learning_steps: int | None, default None
        Maximum number of AL iterations to perform. None means infinite
        (normalized to ``float("inf")`` in ``__post_init__``).
    run_id: str, default auto-generated UUID
        Unique identifier for this run. Auto-generated if not provided.
    fine_tuning_lr: float | None, default None
        Learning rate to switch to after the first AL step for fine-tuning.
    reset_optim_states: bool, default True
        Whether to reset optimizer states between AL steps.
    skip_training: bool, default False
        If True, skip the training phase entirely.
    skip_metrology: bool, default False
        If True, skip the metrology phase entirely.
    skip_labeling: bool, default False
        If True, skip the labeling phase entirely.
    checkpoint_interval: int, default 1
        Save model checkpoint every N AL steps. 0 disables checkpointing.
    checkpoint_on_training: bool, default False
        If True, save checkpoint at the start of the training phase.
    checkpoint_on_metrology: bool, default False
        If True, save checkpoint at the start of the metrology phase.
    checkpoint_on_query: bool, default False
        If True, save checkpoint at the start of the query phase.
    checkpoint_on_labeling: bool, default True
        If True, save checkpoint at the start of the labeling phase.
    model_checkpoint_frequency: int, default 0
        Save model weights every N epochs during training. 0 means only save
        between active learning phases. Useful for mid-training restarts.
    root_log_dir: str | Path, default Path.cwd() / "active_learning_logs"
        Directory to save logs and checkpoints to. Defaults to
        an 'active_learning_logs' directory in the current working directory,
        resolved at instantiation time.
    dist_manager: DistributedManager | None, default None
        Manager for distributed training configuration.
    collate_fn: callable | None, default None
        Custom collate function for batching data.
    num_dataloader_workers: int, default 0
        Number of worker processes for data loading.
    device: str | torch.device | None, default None
        Device to use for model and data. This is intended for single process
        workflows; for distributed workflows, the device should be set in
        ``DistributedManager`` instead. If not specified, then the device
        will default to ``torch.get_default_device()``.
    dtype: torch.dtype | None, default None
        The dtype to use for model and data, and AMP contexts. If not provided,
        then the dtype will default to ``torch.get_default_dtype()``.
    """

    batch_size: int
    max_active_learning_steps: int | None = None
    run_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    fine_tuning_lr: float | None = None  # TODO: move to TrainingConfig
    reset_optim_states: bool = True
    skip_training: bool = False
    skip_metrology: bool = False
    skip_labeling: bool = False
    checkpoint_interval: int = 1
    checkpoint_on_training: bool = False
    checkpoint_on_metrology: bool = False
    checkpoint_on_query: bool = False
    checkpoint_on_labeling: bool = True
    model_checkpoint_frequency: int = 0
    # default_factory (not ``default=Path.cwd() / ...``) so the working
    # directory is resolved when the config is instantiated, not frozen at
    # module import time.
    root_log_dir: str | Path = field(
        default_factory=lambda: Path.cwd() / "active_learning_logs"
    )
    dist_manager: DistributedManager | None = None
    collate_fn: callable | None = None
    num_dataloader_workers: int = 0
    device: str | torch.device | None = None
    dtype: torch.dtype | None = None

    def __post_init__(self) -> None:
        """
        Validate and normalize the driver configuration.

        Raises
        ------
        ValueError
            If any numeric field is out of range or ``collate_fn`` is not
            callable.
        """
        # None means "run forever"; normalize to +inf so numeric
        # comparisons against the step counter work uniformly.
        if self.max_active_learning_steps is None:
            self.max_active_learning_steps = float("inf")

        # never None at this point due to the normalization above
        if self.max_active_learning_steps <= 0:
            raise ValueError(
                "`max_active_learning_steps` must be a positive integer or None."
            )

        if not math.isfinite(self.batch_size) or self.batch_size <= 0:
            raise ValueError("`batch_size` must be a positive integer.")

        if not math.isfinite(self.checkpoint_interval) or self.checkpoint_interval < 0:
            raise ValueError(
                "`checkpoint_interval` must be a non-negative integer. "
                "Use 0 to disable checkpointing."
            )

        if self.fine_tuning_lr is not None and self.fine_tuning_lr <= 0:
            raise ValueError("`fine_tuning_lr` must be positive if provided.")

        if self.num_dataloader_workers < 0:
            raise ValueError("`num_dataloader_workers` must be non-negative.")

        if self.model_checkpoint_frequency < 0:
            raise ValueError("`model_checkpoint_frequency` must be non-negative.")

        # normalize string paths to Path objects
        if isinstance(self.root_log_dir, str):
            self.root_log_dir = Path(self.root_log_dir)

        # Validate collate_fn if provided
        if self.collate_fn is not None and not callable(self.collate_fn):
            raise ValueError("`collate_fn` must be callable if provided.")

        # Device fallback only applies to single-process workflows; when a
        # DistributedManager is supplied, the device is owned by it.
        if self.device is None and not self.dist_manager:
            self.device = torch.get_default_device()
        # The dtype default is independent of device ownership: the documented
        # contract is that an unset dtype falls back to the global default.
        # (Previously this was nested under the device branch, leaving dtype
        # as None whenever a device was explicitly supplied.)
        if self.dtype is None:
            self.dtype = torch.get_default_dtype()

    def to_json(self) -> str:
        """
        Returns a JSON string representation of the ``DriverConfig``.

        Note that certain fields are not serialized and must be provided when
        deserializing: ``dist_manager``, ``collate_fn``.

        Returns
        -------
        str
            A JSON string representation of the config.
        """
        # base dict representation skips non-serializable Python objects
        dict_repr = {
            key: self.__dict__[key]
            for key in self.__dict__
            if key
            not in ["dist_manager", "collate_fn", "root_log_dir", "device", "dtype"]
        }
        # Note: checkpoint flags are included in dict_repr automatically
        dict_repr["default_dtype"] = str(torch.get_default_dtype())
        dict_repr["log_dir"] = str(self.root_log_dir)
        # dtype is serialized via its string repr, e.g. "torch.float32"
        dict_repr["dtype"] = str(self.dtype) if self.dtype is not None else None
        if self.dist_manager is not None:
            dict_repr["world_size"] = self.dist_manager.world_size
            dict_repr["device"] = self.dist_manager.device.type
            dict_repr["dist_manager_init_method"] = (
                self.dist_manager._initialization_method
            )
        else:
            dict_repr["world_size"] = (
                dist.get_world_size() if dist.is_initialized() else 1
            )
            if self.device is not None:
                # str() handles both torch.device objects and plain strings
                dict_repr["device"] = str(self.device)
            else:
                dict_repr["device"] = torch.get_default_device().type
            dict_repr["dist_manager_init_method"] = None
        dict_repr["collate_fn"] = (
            self.collate_fn.__name__ if self.collate_fn is not None else None
        )
        return dumps(dict_repr, indent=2)

    @classmethod
    def from_json(cls, json_str: str, **kwargs: Any) -> DriverConfig:
        """
        Creates a DriverConfig instance from a JSON string.

        This method reconstructs a DriverConfig from JSON. Note that certain
        fields cannot be serialized and must be provided via kwargs:
        - ``dist_manager``: DistributedManager instance (optional)
        - ``collate_fn``: Custom collate function (optional)

        Parameters
        ----------
        json_str: str
            A JSON string that was previously serialized using ``to_json()``.
        **kwargs: Any
            Additional keyword arguments to override or provide non-serializable
            fields like ``dist_manager`` and ``collate_fn``.

        Returns
        -------
        DriverConfig
            A new ``DriverConfig`` instance.

        Raises
        ------
        ValueError
            If the JSON cannot be parsed or required fields are missing.

        Notes
        -----
        The device and dtype fields are reconstructed from their string
        representations. The ``log_dir`` field in JSON is mapped to
        ``root_log_dir`` in the config.
        """
        import json

        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON string: {e}") from e

        # Keys emitted by ``to_json`` that are metadata only, not
        # constructor parameters
        metadata_fields = [
            "default_dtype",
            "world_size",
            "dist_manager_init_method",
            "log_dir",  # handled separately as root_log_dir
        ]
        # Fields that need special reconstruction or cannot be serialized
        non_serializable_fields = [
            "dist_manager",
            "collate_fn",
            "root_log_dir",
            "device",
            "dtype",
        ]

        # Extract serializable fields that map directly
        config_fields = {
            key: value
            for key, value in data.items()
            if key not in metadata_fields + non_serializable_fields
        }

        # Handle root_log_dir (stored as "log_dir" in JSON)
        if "log_dir" in data:
            config_fields["root_log_dir"] = Path(data["log_dir"])

        # Handle device reconstruction from strings like "cuda:0", "cpu"
        if "device" in data and data["device"] is not None:
            config_fields["device"] = torch.device(data["device"])

        # Handle dtype reconstruction from string
        if "dtype" in data and data["dtype"] is not None:
            dtype_str = data["dtype"]
            # Resolve e.g. "torch.float32" -> torch.float32 by attribute
            # lookup on the torch module; this supports every dtype torch
            # exposes rather than a fixed allow-list.
            candidate = getattr(torch, dtype_str.rsplit(".", 1)[-1], None)
            if isinstance(candidate, torch.dtype):
                config_fields["dtype"] = candidate
            else:
                warn(
                    f"Unknown dtype string '{dtype_str}' in JSON. "
                    "Using default dtype instead."
                )

        # Merge with provided kwargs (allows overriding and adding
        # non-serializable fields)
        config_fields.update(kwargs)

        # Create the config
        try:
            config = cls(**config_fields)
        except Exception as e:
            raise ValueError(
                "Failed to construct ``DriverConfig`` from JSON string."
            ) from e

        return config
physics_mcp/source/physicsnemo/active_learning/driver.py ADDED
@@ -0,0 +1,1449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ This module contains the definition for an active learning driver
19
+ class, which is responsible for orchestration and automation of
20
+ the end-to-end active learning process.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import inspect
26
+ import pickle
27
+ from contextlib import contextmanager
28
+ from copy import deepcopy
29
+ from dataclasses import dataclass
30
+ from pathlib import Path
31
+ from typing import Any, Generator
32
+
33
+ import torch
34
+ from torch import distributed as dist
35
+ from torch.nn.parallel import DistributedDataParallel
36
+ from torch.utils.data import DataLoader, DistributedSampler
37
+
38
+ from physicsnemo import Module
39
+ from physicsnemo import __version__ as physicsnemo_version
40
+ from physicsnemo.active_learning import protocols as p
41
+ from physicsnemo.active_learning.config import (
42
+ DriverConfig,
43
+ StrategiesConfig,
44
+ TrainingConfig,
45
+ )
46
+ from physicsnemo.active_learning.logger import (
47
+ ActiveLearningLoggerAdapter,
48
+ setup_active_learning_logger,
49
+ )
50
+ from physicsnemo.distributed import DistributedManager
51
+
52
+
53
@dataclass
class ActiveLearningCheckpoint:
    """
    Metadata associated with an ongoing (or completed) active
    learning experiment.

    The information contained in this metadata should be sufficient
    to restart the active learning experiment at the nearest point:
    for example, training should be able to continue from an epoch,
    while for querying/sampling, etc. we continue from a pre-existing
    queue.
    """

    # Orchestration/infrastructure configuration the run was started with.
    driver_config: DriverConfig
    # Active learning strategies (query/label/metrology) for the run.
    strategies_config: StrategiesConfig
    # Index of the active learning iteration at checkpoint time.
    active_learning_step_idx: int
    # Phase (training/metrology/query/labeling) active at checkpoint time.
    active_learning_phase: p.ActiveLearningPhase
    # Version of physicsnemo that produced the checkpoint; defaults to the
    # version of the module currently imported.
    physicsnemo_version: str = physicsnemo_version
    # Training components; None when training is skipped.
    training_config: TrainingConfig | None = None
    # Optimizer state snapshot, if any (presumably a ``state_dict`` --
    # confirm against Driver's save/load logic).
    optimizer_state: dict[str, Any] | None = None
    # LR scheduler state snapshot, if any (presumably a ``state_dict`` --
    # confirm against Driver's save/load logic).
    lr_scheduler_state: dict[str, Any] | None = None
    # Flags recording whether query/label queue contents accompany this
    # checkpoint (serialized separately -- confirm with Driver's checkpoint
    # writer).
    has_query_queue: bool = False
    has_label_queue: bool = False
76
+
77
+
78
+ class Driver(p.DriverProtocol):
79
+ """
80
+ Provides a simple implementation of the ``DriverProtocol`` used to
81
+ orchestrate an active learning process within PhysicsNeMo.
82
+
83
+ At a high level, the active learning process is broken down into four
84
+ phases: training, metrology, query, and labeling.
85
+
86
+ To understand the orchestration, start by inspecting the
87
+ ``active_learning_step`` method, which defines a single iteration of
88
+ the active learning loop, which is dispatched by the ``run`` method.
89
+ From there, it should be relatively straightforward to trace the
90
+ remaining components.
91
+
92
+ Attributes
93
+ ----------
94
+ config: DriverConfig
95
+ Infrastructure and orchestration configuration.
96
+ learner: Module | p.LearnerProtocol
97
+ The learner module for the active learning process.
98
+ strategies_config: StrategiesConfig
99
+ Active learning strategies (query, label, metrology).
100
+ training_config: TrainingConfig | None
101
+ Training components. None if training is skipped.
102
+ inference_fn: p.InferenceProtocol | None
103
+ Custom inference function.
104
+ active_learning_step_idx: int
105
+ Current iteration index of the active learning loop.
106
+ query_queue: p.AbstractQueue
107
+ Queue populated with data by query strategies.
108
+ label_queue: p.AbstractQueue
109
+ Queue populated with labeled data by the label strategy.
110
+ optimizer: torch.optim.Optimizer | None
111
+ Configured optimizer (set after configure_optimizer is called).
112
+ lr_scheduler: torch.optim.lr_scheduler._LRScheduler | None
113
+ Configured learning rate scheduler.
114
+ logger: logging.Logger
115
+ Persistent logger for the active learning process.
116
+ """
117
+
118
+ # Phase execution order for active learning step (immutable)
119
+ _PHASE_ORDER = [
120
+ p.ActiveLearningPhase.TRAINING,
121
+ p.ActiveLearningPhase.METROLOGY,
122
+ p.ActiveLearningPhase.QUERY,
123
+ p.ActiveLearningPhase.LABELING,
124
+ ]
125
+
126
    def __init__(
        self,
        config: DriverConfig,
        learner: Module | p.LearnerProtocol,
        strategies_config: StrategiesConfig,
        training_config: TrainingConfig | None = None,
        inference_fn: p.InferenceProtocol | None = None,
    ) -> None:
        """
        Initializes the active learning driver.

        At the bare minimum, the driver requires a config, learner, and
        strategies config to be used in a purely querying loop. Additional
        arguments can be provided to enable training and other workflows.

        Parameters
        ----------
        config: DriverConfig
            Orchestration and infrastructure configuration, for example
            the batch size, the log directory, the distributed manager, etc.
        learner: Module | p.LearnerProtocol
            The model to use for active learning.
        strategies_config: StrategiesConfig
            Container for active learning strategies (query, label, metrology).
        training_config: TrainingConfig | None
            Training components. Required if ``skip_training`` is False in
            the ``DriverConfig``.
        inference_fn: p.InferenceProtocol | None
            Custom inference function. If None, uses ``learner.__call__``.
            This is not actually called by the driver, but is stored as an
            attribute for attached strategies to use as needed.

        Raises
        ------
        ValueError
            If the composed configs are inconsistent (see
            ``_validate_config_consistency``).
        """
        # Configs have already validated themselves in __post_init__;
        # only cross-config constraints are checked here.
        self.config = config
        self.learner = learner
        self.strategies_config = strategies_config
        self.training_config = training_config
        self.inference_fn = inference_fn
        # Fresh runs always start at step zero; restarts overwrite this
        # via the checkpoint-loading path.
        self.active_learning_step_idx = 0
        self.current_phase: p.ActiveLearningPhase | None = (
            None  # Track current phase for logging context
        )
        # Populated the first time a checkpoint is written.
        self._last_checkpoint_path: Path | None = None

        # Validate cross-config constraints
        self._validate_config_consistency()

        # Logger must exist before strategies attach, so they can log
        # during attachment.
        self._setup_logger()
        self.attach_strategies()

        # Initialize queues from strategies_config: one for data produced
        # by query strategies, one for data produced by the label strategy.
        self.query_queue = strategies_config.queue_cls()
        self.label_queue = strategies_config.queue_cls()
179
+
180
+ def _validate_config_consistency(self) -> None:
181
+ """
182
+ Validate consistency across configs.
183
+
184
+ Each config validates itself, but this method checks relationships
185
+ between configs that can only be validated when composed together.
186
+ """
187
+ # If training is not skipped, training_config must be provided
188
+ if not self.config.skip_training and self.training_config is None:
189
+ raise ValueError(
190
+ "`training_config` must be provided when `skip_training` is False."
191
+ )
192
+
193
+ # If labeling is not skipped, must have label strategy and train datapool
194
+ if not self.config.skip_labeling:
195
+ if self.strategies_config.label_strategy is None:
196
+ raise ValueError(
197
+ "`label_strategy` must be provided in strategies_config "
198
+ "when `skip_labeling` is False."
199
+ )
200
+ if (
201
+ self.training_config is None
202
+ or self.training_config.train_datapool is None
203
+ ):
204
+ raise ValueError(
205
+ "`train_datapool` must be provided in training_config "
206
+ "when `skip_labeling` is False (labeled data is appended to it)."
207
+ )
208
+
209
+ # If fine-tuning lr is set, must have training enabled
210
+ if self.config.fine_tuning_lr is not None and self.config.skip_training:
211
+ raise ValueError(
212
+ "`fine_tuning_lr` has no effect when `skip_training` is True."
213
+ )
214
+
215
    @property
    def query_strategies(self) -> list[p.QueryStrategy]:
        """Convenience accessor for ``strategies_config.query_strategies``."""
        return self.strategies_config.query_strategies
219
+
220
    @property
    def label_strategy(self) -> p.LabelStrategy | None:
        """Convenience accessor for ``strategies_config.label_strategy``;
        None when labeling is not configured."""
        return self.strategies_config.label_strategy
224
+
225
    @property
    def metrology_strategies(self) -> list[p.MetrologyStrategy] | None:
        """Convenience accessor for ``strategies_config.metrology_strategies``;
        None when metrology is not configured."""
        return self.strategies_config.metrology_strategies
229
+
230
    @property
    def unlabeled_datapool(self) -> p.DataPool | None:
        """Convenience accessor for ``strategies_config.unlabeled_datapool``;
        None when no unlabeled pool was provided."""
        return self.strategies_config.unlabeled_datapool
234
+
235
+ @property
236
+ def train_datapool(self) -> p.DataPool | None:
237
+ """Returns the training datapool from training_config."""
238
+ return self.training_config.train_datapool if self.training_config else None
239
+
240
+ @property
241
+ def val_datapool(self) -> p.DataPool | None:
242
+ """Returns the validation datapool from training_config."""
243
+ return self.training_config.val_datapool if self.training_config else None
244
+
245
+ @property
246
+ def train_loop_fn(self) -> p.TrainingLoop | None:
247
+ """Returns the training loop function from training_config."""
248
+ return self.training_config.train_loop_fn if self.training_config else None
249
+
250
+ @property
251
+ def device(self) -> torch.device:
252
+ """Return a consistent device interface to use across the driver."""
253
+ if self.dist_manager is not None and self.dist_manager.is_initialized():
254
+ return self.dist_manager.device
255
+ else:
256
+ return torch.get_default_device()
257
+
258
    @property
    def run_id(self) -> str:
        """Returns the run id from the ``DriverConfig``.

        The id uniquely identifies this run; by default ``DriverConfig``
        generates it as a UUID4 string.

        Returns
        -------
        str
            The run id.
        """
        return self.config.run_id
268
+
269
    @property
    def log_dir(self) -> Path:
        """Returns the log directory for this specific run.

        Note that this is the ``DriverConfig.root_log_dir`` combined
        with the shortened run ID for the current run.

        Effectively, this means that each run will have its own
        directory for logs, checkpoints, etc.

        Returns
        -------
        Path
            The per-run log directory (not created by this property).
        """
        return self.config.root_log_dir / self.short_run_id
285
+
286
    @property
    def short_run_id(self) -> str:
        """Returns the first 8 characters of the run id.

        The 8 character limit assumes that the run ID is a UUID4.
        This is particularly useful for user-facing interfaces,
        where you do not necessarily want to reference the full UUID.

        Returns
        -------
        str
            The first 8 characters of the run id (or the full id if it is
            shorter than 8 characters).
        """
        return self.run_id[:8]
300
+
301
    @property
    def last_checkpoint(self) -> Path | None:
        """
        Returns path to the most recently saved checkpoint.

        The underlying attribute is initialized to None in ``__init__``
        and updated by the checkpoint-saving logic.

        Returns
        -------
        Path | None
            Path to the last checkpoint directory, or None if no checkpoint
            has been saved yet.
        """
        return self._last_checkpoint_path
313
+
314
+ @property
315
+ def active_learning_step_idx(self) -> int:
316
+ """
317
+ Returns the current active learning step index.
318
+
319
+ This represents the number of times the active learning step
320
+ has been called, i.e. the number of iterations of the loop.
321
+
322
+ Returns
323
+ -------
324
+ int
325
+ The current active learning step index.
326
+ """
327
+ return self._active_learning_step_idx
328
+
329
+ @active_learning_step_idx.setter
330
+ def active_learning_step_idx(self, value: int) -> None:
331
+ """
332
+ Sets the current active learning step index.
333
+
334
+ Parameters
335
+ ----------
336
+ value: int
337
+ The new active learning step index.
338
+
339
+ Raises
340
+ ------
341
+ ValueError
342
+ If the new active learning step index is negative.
343
+ """
344
+ if value < 0:
345
+ raise ValueError("Active learning step index must be non-negative.")
346
+ self._active_learning_step_idx = value
347
+
348
+ @property
349
+ def dist_manager(self) -> DistributedManager | None:
350
+ """Returns the distributed manager, if it was specified as part
351
+ of the `DriverConfig` configuration.
352
+
353
+ Returns
354
+ -------
355
+ DistributedManager | None
356
+ The distributed manager.
357
+ """
358
+ return self.config.dist_manager
359
+
360
+ def configure_optimizer(self) -> None:
361
+ """Setup optimizer and LR schedulers from training_config."""
362
+ if self.training_config is None:
363
+ self.optimizer = None
364
+ self.lr_scheduler = None
365
+ return
366
+
367
+ opt_cfg = self.training_config.optimizer_config
368
+
369
+ if opt_cfg.optimizer_cls is not None:
370
+ try:
371
+ _ = inspect.signature(opt_cfg.optimizer_cls).bind(
372
+ self.learner.parameters(), **opt_cfg.optimizer_kwargs
373
+ )
374
+ except TypeError as e:
375
+ raise ValueError(
376
+ f"Invalid optimizer kwargs for {opt_cfg.optimizer_cls}; {e}"
377
+ )
378
+ self.optimizer = opt_cfg.optimizer_cls(
379
+ self.learner.parameters(), **opt_cfg.optimizer_kwargs
380
+ )
381
+ else:
382
+ self.optimizer = None
383
+ return
384
+
385
+ if opt_cfg.scheduler_cls is not None and self.optimizer is not None:
386
+ try:
387
+ _ = inspect.signature(opt_cfg.scheduler_cls).bind(
388
+ self.optimizer, **opt_cfg.scheduler_kwargs
389
+ )
390
+ except TypeError as e:
391
+ raise ValueError(
392
+ f"Invalid LR scheduler kwargs for {opt_cfg.scheduler_cls}; {e}"
393
+ )
394
+ self.lr_scheduler = opt_cfg.scheduler_cls(
395
+ self.optimizer, **opt_cfg.scheduler_kwargs
396
+ )
397
+ else:
398
+ self.lr_scheduler = None
399
+ # in the case where we want to reset optimizer states between active learning steps
400
+ if self.config.reset_optim_states and self.is_optimizer_configured:
401
+ self._original_optim_state = deepcopy(self.optimizer.state_dict())
402
+
403
+ @property
404
+ def is_optimizer_configured(self) -> bool:
405
+ """Returns whether the optimizer is configured."""
406
+ return getattr(self, "optimizer", None) is not None
407
+
408
+ @property
409
+ def is_lr_scheduler_configured(self) -> bool:
410
+ """Returns whether the LR scheduler is configured."""
411
+ return getattr(self, "lr_scheduler", None) is not None
412
+
413
    def attach_strategies(self) -> None:
        """Calls ``strategy.attach`` for all available strategies.

        Currently delegates unchanged to the parent implementation; the
        override exists so driver-specific attachment behavior has a single
        place to live.
        """
        super().attach_strategies()
416
+
417
+ def _setup_logger(self) -> None:
418
+ """
419
+ Sets up a persistent logger for the driver.
420
+
421
+ This logger is specialized in that it provides additional context
422
+ information depending on the part of the active learning cycle.
423
+ """
424
+ base_logger = setup_active_learning_logger(
425
+ "core.active_learning",
426
+ run_id=self.run_id,
427
+ log_dir=self.log_dir,
428
+ )
429
+ # Wrap with adapter to automatically include iteration context
430
+ self.logger = ActiveLearningLoggerAdapter(base_logger, driver_ref=self)
431
+
432
+ def _should_checkpoint_at_step(self) -> bool:
433
+ """
434
+ Determine if a checkpoint should be saved at the current AL step.
435
+
436
+ Uses the `checkpoint_interval` from config to decide. If interval is 0,
437
+ checkpointing is disabled. Otherwise, checkpoint at step 0 and every
438
+ N steps thereafter.
439
+
440
+ Returns
441
+ -------
442
+ bool
443
+ True if checkpoint should be saved, False otherwise.
444
+ """
445
+ if self.config.checkpoint_interval == 0:
446
+ return False
447
+ # Always checkpoint at step 0, then every checkpoint_interval steps
448
+ return self.active_learning_step_idx % self.config.checkpoint_interval == 0
449
+
450
+ def _serialize_queue(self, queue: p.AbstractQueue, file_path: Path) -> bool:
451
+ """
452
+ Serialize queue to a file.
453
+
454
+ If queue implements `to_list()`, serialize the list. Otherwise, use
455
+ torch.save to serialize the entire queue object.
456
+
457
+ Parameters
458
+ ----------
459
+ queue: p.AbstractQueue
460
+ The queue to serialize.
461
+ file_path: Path
462
+ Path where the queue should be saved.
463
+
464
+ Returns
465
+ -------
466
+ bool
467
+ True if serialization succeeded, False otherwise.
468
+ """
469
+ try:
470
+ if hasattr(queue, "to_list") and callable(getattr(queue, "to_list")):
471
+ # Use custom serialization method
472
+ queue_data = {"type": "list", "data": queue.to_list()}
473
+ else:
474
+ # Fallback to torch.save for the entire queue
475
+ queue_data = {"type": "torch", "data": queue}
476
+
477
+ torch.save(queue_data, file_path)
478
+ return True
479
+ except (TypeError, AttributeError, pickle.PicklingError, RuntimeError) as e:
480
+ # Some queues cannot be pickled, e.g. stdlib queue.Queue with thread locks
481
+ # Clean up any partially written file
482
+ if file_path.exists():
483
+ file_path.unlink()
484
+
485
+ self.logger.warning(
486
+ f"Failed to serialize queue to {file_path}: {e}. Queue state will not be saved. "
487
+ f"Consider implementing to_list()/from_list() methods for custom serialization."
488
+ )
489
+ return False
490
+
491
+ def _deserialize_queue(self, queue: p.AbstractQueue, file_path: Path) -> None:
492
+ """
493
+ Restore queue from a file.
494
+
495
+ Parameters
496
+ ----------
497
+ queue: p.AbstractQueue
498
+ The queue to restore data into.
499
+ file_path: Path
500
+ Path to the saved queue file.
501
+ """
502
+ if not file_path.exists():
503
+ return
504
+
505
+ try:
506
+ queue_data = torch.load(file_path, map_location="cpu", weights_only=False)
507
+
508
+ if queue_data["type"] == "list":
509
+ if hasattr(queue, "from_list") and callable(
510
+ getattr(queue, "from_list")
511
+ ):
512
+ queue.from_list(queue_data["data"])
513
+ else:
514
+ # Manually populate queue from list
515
+ for item in queue_data["data"]:
516
+ queue.put(item)
517
+ elif queue_data["type"] == "torch":
518
+ # Restore from torch-saved queue - copy items to current queue
519
+ restored_queue = queue_data["data"]
520
+ # Copy items from restored queue to current queue
521
+ while not restored_queue.empty():
522
+ queue.put(restored_queue.get())
523
+ except Exception as e:
524
+ self.logger.warning(
525
+ f"Failed to deserialize queue from {file_path}: {e}. "
526
+ f"Queue will be empty."
527
+ )
528
+
529
    def save_checkpoint(
        self, path: str | Path | None = None, training_epoch: int | None = None
    ) -> Path | None:
        """
        Save a checkpoint of the active learning experiment.

        Saves AL orchestration state (configs, queues, step index, phase) and model weights.
        Training-specific state (optimizer, scheduler) is handled by DefaultTrainingLoop
        and saved to training_state.pt during training.

        Parameters
        ----------
        path: str | Path | None
            Path to save checkpoint. If None, creates path based on current
            AL step index and phase: log_dir/checkpoints/step_{idx}/{phase}/
        training_epoch: int | None
            Optional epoch number for mid-training checkpoints.

        Returns
        -------
        Path | None
            Checkpoint directory path, or None if checkpoint not saved (non-rank-0 in distributed).
        """
        # Determine checkpoint directory
        if path is None:
            # before any phase has run, current_phase is falsy -> "init"
            phase_name = self.current_phase if self.current_phase else "init"
            checkpoint_dir = (
                self.log_dir
                / "checkpoints"
                / f"step_{self.active_learning_step_idx}"
                / phase_name
            )
            if training_epoch is not None:
                checkpoint_dir = checkpoint_dir / f"epoch_{training_epoch}"
        else:
            checkpoint_dir = Path(path)

        # Create checkpoint directory
        # NOTE(review): the directory is created on *every* rank before the
        # rank-0 check below, so non-zero ranks also mkdir — presumably so
        # all ranks observe the path; confirm this is intended.
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Only rank 0 saves checkpoint in distributed setting
        if self.dist_manager is not None and self.dist_manager.is_initialized():
            if self.dist_manager.rank != 0:
                return None

        # Serialize configurations
        driver_config_json = self.config.to_json()
        strategies_config_dict = self.strategies_config.to_dict()
        training_config_dict = (
            self.training_config.to_dict() if self.training_config else None
        )

        # Serialize queue states to separate files
        query_queue_file = checkpoint_dir / "query_queue.pt"
        label_queue_file = checkpoint_dir / "label_queue.pt"
        # the has_* flags record whether each queue could actually be pickled
        has_query_queue = self._serialize_queue(self.query_queue, query_queue_file)
        has_label_queue = self._serialize_queue(self.label_queue, label_queue_file)

        # Create checkpoint dataclass (only AL orchestration state)
        checkpoint = ActiveLearningCheckpoint(
            driver_config=driver_config_json,
            strategies_config=strategies_config_dict,
            active_learning_step_idx=self.active_learning_step_idx,
            # no phase recorded yet defaults to TRAINING (first phase)
            active_learning_phase=self.current_phase or p.ActiveLearningPhase.TRAINING,
            physicsnemo_version=physicsnemo_version,
            training_config=training_config_dict,
            optimizer_state=None,  # Training loop handles this
            lr_scheduler_state=None,  # Training loop handles this
            has_query_queue=has_query_queue,
            has_label_queue=has_label_queue,
        )

        # Add training epoch if in mid-training checkpoint
        checkpoint_dict = {
            "checkpoint": checkpoint,
        }
        if training_epoch is not None:
            checkpoint_dict["training_epoch"] = training_epoch

        # Save checkpoint metadata
        checkpoint_path = checkpoint_dir / "checkpoint.pt"
        torch.save(checkpoint_dict, checkpoint_path)

        # Save model weights (separate from training state)
        if isinstance(self.learner, Module):
            # physicsnemo Module: save via its own .mdlus format
            model_name = (
                self.learner.meta.name
                if self.learner.meta
                else self.learner.__class__.__name__
            )
            model_path = checkpoint_dir / f"{model_name}.mdlus"
            self.learner.save(str(model_path))
        elif hasattr(self.learner, "module") and isinstance(
            self.learner.module, Module
        ):
            # Unwrap DDP
            model_name = (
                self.learner.module.meta.name
                if self.learner.module.meta
                else self.learner.module.__class__.__name__
            )
            model_path = checkpoint_dir / f"{model_name}.mdlus"
            self.learner.module.save(str(model_path))
        else:
            # generic learner: fall back to a plain state_dict .pt file
            model_name = self.learner.__class__.__name__
            model_path = checkpoint_dir / f"{model_name}.pt"
            torch.save(self.learner.state_dict(), model_path)

        # Update last checkpoint path
        self._last_checkpoint_path = checkpoint_dir

        # Log successful checkpoint save
        self.logger.info(
            f"Saved checkpoint at step {self.active_learning_step_idx}, "
            f"phase {self.current_phase}: {checkpoint_dir}"
        )

        return checkpoint_dir
647
+
648
    @classmethod
    def load_checkpoint(
        cls,
        checkpoint_path: str | Path,
        learner: Module | p.LearnerProtocol | None = None,
        train_datapool: p.DataPool | None = None,
        val_datapool: p.DataPool | None = None,
        unlabeled_datapool: p.DataPool | None = None,
        **kwargs: Any,
    ) -> Driver:
        """
        Load a Driver instance from a checkpoint.

        Given a checkpoint directory, this method will attempt to reconstruct
        the driver and its associated components from the checkpoint. The
        checkpoint path must contain a ``checkpoint.pt`` file, which contains
        the metadata associated with the experiment.

        Additional parameters that might not be serialized with the checkpointing
        mechanism can/need to be provided to this method; for example when
        using non-`physicsnemo.Module` learners, and any data pools associated
        with the workflow.

        .. important::

            Currently, the strategy states are not reloaded from the checkpoint.
            This will be addressed in a future patch, but for now it is recommended
            to back up your strategy states (e.g. metrology records) manually
            before restarting experiments.

        Parameters
        ----------
        checkpoint_path: str | Path
            Path to checkpoint directory containing checkpoint.pt and model weights.
        learner: Module | p.LearnerProtocol | None
            Learner model to load weights into. If None, will attempt to
            reconstruct from checkpoint (only works for physicsnemo.Module).
        train_datapool: p.DataPool | None
            Training datapool. Required if training_config exists in checkpoint.
        val_datapool: p.DataPool | None
            Validation datapool. Optional.
        unlabeled_datapool: p.DataPool | None
            Unlabeled datapool for query strategies. Optional.
        **kwargs: Any
            Additional keyword arguments to override config values. Recognized
            keys include ``driver_config_overrides``,
            ``strategies_config_overrides``, ``training_config_overrides``,
            and ``inference_fn``.

        Returns
        -------
        Driver
            Reconstructed Driver instance ready to resume execution.

        Raises
        ------
        FileNotFoundError
            If ``checkpoint.pt`` does not exist under ``checkpoint_path``.
        ValueError
            If no learner was provided and none could be reconstructed from
            a ``.mdlus`` file in the checkpoint directory.
        """
        checkpoint_path = Path(checkpoint_path)

        # Load checkpoint file
        checkpoint_file = checkpoint_path / "checkpoint.pt"
        if not checkpoint_file.exists():
            raise FileNotFoundError(f"Checkpoint file not found: {checkpoint_file}")

        # NOTE(review): weights_only=False unpickles arbitrary objects —
        # only load checkpoints from trusted sources.
        checkpoint_dict = torch.load(
            checkpoint_file, map_location="cpu", weights_only=False
        )
        checkpoint: ActiveLearningCheckpoint = checkpoint_dict["checkpoint"]
        training_epoch = checkpoint_dict.get("training_epoch", None)

        # Reconstruct configs
        driver_config = DriverConfig.from_json(
            checkpoint.driver_config, **kwargs.get("driver_config_overrides", {})
        )

        # TODO add strategy state loading from checkpoint
        strategies_config = StrategiesConfig.from_dict(
            checkpoint.strategies_config,
            unlabeled_datapool=unlabeled_datapool,
            **kwargs.get("strategies_config_overrides", {}),
        )

        training_config = None
        if checkpoint.training_config is not None:
            training_config = TrainingConfig.from_dict(
                checkpoint.training_config,
                train_datapool=train_datapool,
                val_datapool=val_datapool,
                **kwargs.get("training_config_overrides", {}),
            )

        # Load or reconstruct learner
        if learner is None:
            # Attempt to reconstruct from checkpoint (only for Module)
            # Try to find any .mdlus file in the checkpoint directory
            mdlus_files = list(checkpoint_path.glob("*.mdlus"))
            if mdlus_files:
                # Use the first .mdlus file found
                model_path = mdlus_files[0]
                learner = Module.from_checkpoint(str(model_path))
            else:
                raise ValueError(
                    "No learner provided and unable to reconstruct from checkpoint. "
                    "Please provide a learner instance."
                )
        else:
            # Load model weights into provided learner
            # Determine expected model filename based on learner type
            if isinstance(learner, Module):
                model_name = (
                    learner.meta.name if learner.meta else learner.__class__.__name__
                )
                model_path = checkpoint_path / f"{model_name}.mdlus"
                if model_path.exists():
                    learner.load(str(model_path))
                else:
                    # Fallback: try to find any .mdlus file
                    mdlus_files = list(checkpoint_path.glob("*.mdlus"))
                    if mdlus_files:
                        learner.load(str(mdlus_files[0]))
            elif hasattr(learner, "module") and isinstance(learner.module, Module):
                # Unwrap DDP
                model_name = (
                    learner.module.meta.name
                    if learner.module.meta
                    else learner.module.__class__.__name__
                )
                model_path = checkpoint_path / f"{model_name}.mdlus"
                if model_path.exists():
                    learner.module.load(str(model_path))
                else:
                    # Fallback: try to find any .mdlus file
                    mdlus_files = list(checkpoint_path.glob("*.mdlus"))
                    if mdlus_files:
                        learner.module.load(str(mdlus_files[0]))
            else:
                # Non-Module learner: look for .pt file with class name
                model_name = learner.__class__.__name__
                model_path = checkpoint_path / f"{model_name}.pt"
                if model_path.exists():
                    state_dict = torch.load(model_path, map_location="cpu")
                    learner.load_state_dict(state_dict)
                else:
                    # Fallback: try to find any .pt file
                    pt_files = list(checkpoint_path.glob("*.pt"))
                    # Filter out checkpoint.pt and queue files
                    model_pt_files = [
                        f
                        for f in pt_files
                        if f.name
                        not in [
                            "checkpoint.pt",
                            "query_queue.pt",
                            "label_queue.pt",
                            "training_state.pt",
                        ]
                    ]
                    if model_pt_files:
                        state_dict = torch.load(model_pt_files[0], map_location="cpu")
                        learner.load_state_dict(state_dict)

        # Instantiate Driver
        driver = cls(
            config=driver_config,
            learner=learner,
            strategies_config=strategies_config,
            training_config=training_config,
            inference_fn=kwargs.get("inference_fn", None),
        )

        # Restore active learning state
        driver.active_learning_step_idx = checkpoint.active_learning_step_idx
        driver.current_phase = checkpoint.active_learning_phase
        driver._last_checkpoint_path = checkpoint_path

        # Load training state (optimizer, scheduler) if training_config exists
        # This delegates to the training loop's checkpoint loading logic
        if driver.training_config is not None:
            driver.configure_optimizer()

            # Use training loop to load training state (including model weights again if needed)
            from physicsnemo.active_learning.loop import DefaultTrainingLoop

            DefaultTrainingLoop.load_training_checkpoint(
                checkpoint_dir=checkpoint_path,
                model=driver.learner,
                optimizer=driver.optimizer,
                lr_scheduler=driver.lr_scheduler
                if hasattr(driver, "lr_scheduler")
                else None,
            )

        # Restore queue states from separate files
        if checkpoint.has_query_queue:
            query_queue_file = checkpoint_path / "query_queue.pt"
            driver._deserialize_queue(driver.query_queue, query_queue_file)

        if checkpoint.has_label_queue:
            label_queue_file = checkpoint_path / "label_queue.pt"
            driver._deserialize_queue(driver.label_queue, label_queue_file)

        driver.logger.info(
            f"Loaded checkpoint from {checkpoint_path} at step "
            f"{checkpoint.active_learning_step_idx}, phase {checkpoint.active_learning_phase}"
        )
        if training_epoch is not None:
            driver.logger.info(f"Resuming from training epoch {training_epoch}")

        return driver
851
+
852
+ def barrier(self) -> None:
853
+ """
854
+ Wrapper to call barrier on the correct device.
855
+
856
+ Becomes a no-op if distributed is not initialized, otherwise
857
+ will attempt to read the local device ID from either the distributed manager
858
+ or the default device.
859
+ """
860
+ if dist.is_initialized():
861
+ if (
862
+ self.dist_manager is not None
863
+ and self.dist_manager.device.type == "cuda"
864
+ ):
865
+ dist.barrier(device_ids=[self.dist_manager.local_rank])
866
+ elif torch.get_default_device().type == "cuda":
867
+ # this might occur if distributed manager is not used
868
+ dist.barrier(device_ids=[torch.cuda.current_device()])
869
+ else:
870
+ dist.barrier()
871
+
872
    def _configure_model(self) -> None:
        """
        Method that encapsulates all the logic for preparing the model
        ahead of time.

        If the distributed manager has been configured and initialized
        with a world size greater than 1, then we wrap the model in DDP.
        Otherwise, we simply move the model to the correct device.

        After the model has been moved to device, we configure the optimizer
        and learning rate scheduler if training is enabled.
        """
        if self.dist_manager is not None and self.dist_manager.is_initialized():
            if self.dist_manager.world_size > 1 and not isinstance(
                self.learner, DistributedDataParallel
            ):
                # wrap the model in DDP
                self.learner = torch.nn.parallel.DistributedDataParallel(
                    self.learner,
                    device_ids=[self.dist_manager.local_rank],
                    output_device=self.dist_manager.device,
                    broadcast_buffers=self.dist_manager.broadcast_buffers,
                    find_unused_parameters=self.dist_manager.find_unused_parameters,
                )
            # NOTE(review): with world_size == 1 the model is neither wrapped
            # nor explicitly moved via config.device here — presumably the
            # dist_manager already placed it; confirm.
        else:
            if self.config.device is not None:
                self.learner = self.learner.to(self.config.device, self.config.dtype)
        # assume all device management is done via the dist_manager, so at this
        # point the model is on the correct device and we can set up the optimizer
        # if we intend to train
        if not self.config.skip_training and not self.is_optimizer_configured:
            self.configure_optimizer()
        # restore the pristine optimizer state captured at configure time
        if self.is_optimizer_configured and self.config.reset_optim_states:
            self.optimizer.load_state_dict(self._original_optim_state)
906
+
907
+ def _get_phase_index(self, phase: p.ActiveLearningPhase | None) -> int:
908
+ """
909
+ Get index of phase in execution order.
910
+
911
+ Parameters
912
+ ----------
913
+ phase: p.ActiveLearningPhase | None
914
+ Phase to find index for. If None, returns 0 (start from beginning).
915
+
916
+ Returns
917
+ -------
918
+ int
919
+ Index in _PHASE_ORDER (0-3).
920
+ """
921
+ if phase is None:
922
+ return 0
923
+ try:
924
+ return self._PHASE_ORDER.index(phase)
925
+ except ValueError:
926
+ self.logger.warning(
927
+ f"Unknown phase {phase}, defaulting to start from beginning"
928
+ )
929
+ return 0
930
+
931
+ def _build_phase_queue(
932
+ self,
933
+ train_step_fn: p.TrainingProtocol | None,
934
+ validate_step_fn: p.ValidationProtocol | None,
935
+ args: tuple,
936
+ kwargs: dict,
937
+ ) -> list[Any]:
938
+ """
939
+ Build list of phase functions to execute for this AL step.
940
+
941
+ If current_phase is set (e.g., from checkpoint), only phases at or after
942
+ current_phase are included. Otherwise, all non-skipped phases are included.
943
+
944
+ Parameters
945
+ ----------
946
+ train_step_fn: p.TrainingProtocol | None
947
+ Training function to pass to training phase.
948
+ validate_step_fn: p.ValidationProtocol | None
949
+ Validation function to pass to training phase.
950
+ args: tuple
951
+ Additional arguments to pass to phase methods.
952
+ kwargs: dict
953
+ Additional keyword arguments to pass to phase methods.
954
+
955
+ Returns
956
+ -------
957
+ list[Callable]
958
+ Queue of phase functions to execute in order.
959
+ """
960
+ # Define all possible phases with their execution conditions
961
+ all_phases = [
962
+ (
963
+ p.ActiveLearningPhase.TRAINING,
964
+ lambda: self._training_phase(
965
+ train_step_fn, validate_step_fn, *args, **kwargs
966
+ ),
967
+ not self.config.skip_training,
968
+ ),
969
+ (
970
+ p.ActiveLearningPhase.METROLOGY,
971
+ lambda: self._metrology_phase(*args, **kwargs),
972
+ not self.config.skip_metrology,
973
+ ),
974
+ (
975
+ p.ActiveLearningPhase.QUERY,
976
+ lambda: self._query_phase(*args, **kwargs),
977
+ True, # Query phase always runs
978
+ ),
979
+ (
980
+ p.ActiveLearningPhase.LABELING,
981
+ lambda: self._labeling_phase(*args, **kwargs),
982
+ not self.config.skip_labeling,
983
+ ),
984
+ ]
985
+
986
+ # Find starting index based on current_phase (resume point)
987
+ start_idx = self._get_phase_index(self.current_phase)
988
+
989
+ if start_idx > 0:
990
+ self.logger.info(
991
+ f"Resuming AL step {self.active_learning_step_idx} from "
992
+ f"{self.current_phase}"
993
+ )
994
+
995
+ # Build queue: only phases from start_idx onwards that should run
996
+ phase_queue = []
997
+ for idx, (phase, phase_fn, should_run) in enumerate(all_phases):
998
+ # Skip phases before current_phase
999
+ if idx < start_idx:
1000
+ self.logger.debug(
1001
+ f"Skipping {phase} (already completed in this AL step)"
1002
+ )
1003
+ continue
1004
+
1005
+ # Add phase to queue if not skipped by config
1006
+ if should_run:
1007
+ phase_queue.append(phase_fn)
1008
+ else:
1009
+ self.logger.debug(f"Skipping {phase} (disabled in config)")
1010
+
1011
+ return phase_queue
1012
+
1013
+ def _construct_dataloader(
1014
+ self, pool: p.DataPool, shuffle: bool = False, drop_last: bool = False
1015
+ ) -> DataLoader:
1016
+ """
1017
+ Helper method to construct a data loader for a given data pool.
1018
+
1019
+ In the case that a distributed manager was provided, then a distributed
1020
+ sampler will be used, which will be bound to the current rank.
1021
+ Otherwise, a regular sampler will be used. Similarly, if your data
1022
+ structure requires a specialized function to construct batches,
1023
+ then this function can be provided via the `collate_fn` argument.
1024
+
1025
+ Parameters
1026
+ ----------
1027
+ pool: p.DataPool
1028
+ The data pool to construct a data loader for.
1029
+ shuffle: bool = False
1030
+ Whether to shuffle the data.
1031
+ drop_last: bool = False
1032
+ Whether to drop the last batch if it is not complete.
1033
+
1034
+ Returns
1035
+ -------
1036
+ DataLoader
1037
+ The constructed data loader.
1038
+ """
1039
+ # if a distributed manager was omitted, then we assume single process
1040
+ if self.dist_manager is not None and self.dist_manager.is_initialized():
1041
+ sampler = DistributedSampler(
1042
+ pool,
1043
+ num_replicas=self.dist_manager.world_size,
1044
+ rank=self.dist_manager.rank,
1045
+ shuffle=shuffle,
1046
+ drop_last=drop_last,
1047
+ )
1048
+ # set to None, because sampler will handle instead
1049
+ shuffle = None
1050
+ else:
1051
+ sampler = None
1052
+ # fully spec out the data loader
1053
+ pin_memory = False
1054
+ if self.dist_manager is not None and self.dist_manager.is_initialized():
1055
+ if self.dist_manager.device.type == "cuda":
1056
+ pin_memory = True
1057
+ loader = DataLoader(
1058
+ pool,
1059
+ shuffle=shuffle,
1060
+ sampler=sampler,
1061
+ collate_fn=self.config.collate_fn,
1062
+ batch_size=self.config.batch_size,
1063
+ num_workers=self.config.num_dataloader_workers,
1064
+ persistent_workers=self.config.num_dataloader_workers > 0,
1065
+ pin_memory=pin_memory,
1066
+ )
1067
+ return loader
1068
+
1069
+ def active_learning_step(
1070
+ self,
1071
+ train_step_fn: p.TrainingProtocol | None = None,
1072
+ validate_step_fn: p.ValidationProtocol | None = None,
1073
+ *args: Any,
1074
+ **kwargs: Any,
1075
+ ) -> None:
1076
+ """
1077
+ Performs a single active learning iteration.
1078
+
1079
+ This method will perform the following sequence of steps:
1080
+ 1. Train the model stored in ``Driver.learner`` by creating data loaders
1081
+ with ``Driver.train_datapool`` and ``Driver.val_datapool``.
1082
+ 2. Run the metrology strategies stored in ``Driver.metrology_strategies``.
1083
+ 3. Run the query strategies stored in ``Driver.query_strategies``, if available.
1084
+ 4. Run the labeling strategy stored in ``Driver.label_strategy``, if available.
1085
+
1086
+ When entering each stage, we check to ensure all components necessary for the
1087
+ minimum function for that stage are available before proceeding.
1088
+
1089
+ If current_phase is set (e.g., from checkpoint resumption), only phases at
1090
+ or after current_phase will be executed. After completing all phases,
1091
+ current_phase is reset to None for the next AL step.
1092
+
1093
+ Parameters
1094
+ ----------
1095
+ train_step_fn: p.TrainingProtocol | None = None
1096
+ The training function to use for training. If not provided, then the
1097
+ ``Driver.train_loop_fn`` will be used.
1098
+ validate_step_fn: p.ValidationProtocol | None = None
1099
+ The validation function to use for validation. If not provided, then
1100
+ validation will not be performed.
1101
+ args: Any
1102
+ Additional arguments to pass to the method. These will be passed to the
1103
+ training loop, metrology strategies, query strategies, and labeling strategies.
1104
+ kwargs: Any
1105
+ Additional keyword arguments to pass to the method. These will be passed to the
1106
+ training loop, metrology strategies, query strategies, and labeling strategies.
1107
+
1108
+ Raises
1109
+ ------
1110
+ ValueError
1111
+ If any of the required components for a stage are not available.
1112
+ """
1113
+ self._setup_active_learning_step()
1114
+
1115
+ # Build queue of phase functions based on current_phase
1116
+ phase_queue = self._build_phase_queue(
1117
+ train_step_fn, validate_step_fn, args, kwargs
1118
+ )
1119
+
1120
+ # Execute each phase in order (de-populate queue)
1121
+ for phase_fn in phase_queue:
1122
+ phase_fn()
1123
+
1124
+ # Reset current_phase after completing all phases in this AL step
1125
+ self.current_phase = None
1126
+
1127
+ self.logger.debug("Entering barrier for synchronization.")
1128
+ self.barrier()
1129
+ self.active_learning_step_idx += 1
1130
+ self.logger.info(
1131
+ f"Completed active learning step {self.active_learning_step_idx}"
1132
+ )
1133
+
1134
+ def _setup_active_learning_step(self) -> None:
1135
+ """Initialize distributed manager and configure model for the active learning step."""
1136
+ if self.dist_manager is not None and not self.dist_manager.is_initialized():
1137
+ self.logger.info(
1138
+ "Distributed manager configured but not initialized; initializing."
1139
+ )
1140
+ self.dist_manager.initialize()
1141
+ self._configure_model()
1142
+ self.logger.info(
1143
+ f"Starting active learning step {self.active_learning_step_idx}"
1144
+ )
1145
+
1146
    def _training_phase(
        self,
        train_step_fn: p.TrainingProtocol | None,
        validate_step_fn: p.ValidationProtocol | None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Execute the training phase of the active learning step.

        Validates prerequisites, builds the train/validation dataloaders,
        optionally switches to a fine-tuning learning rate after the first
        active learning step, resumes from a mid-training checkpoint when
        one exists, and finally delegates epoch execution to
        ``self.train_loop_fn``.

        Parameters
        ----------
        train_step_fn: p.TrainingProtocol | None
            Training-step callable; may be None when the learner implements
            ``training_step`` itself (enforced by validation below).
        validate_step_fn: p.ValidationProtocol | None
            Validation-step callable; validation is skipped when neither
            this nor a learner ``validation_step`` is available.
        args: Any
            Accepted for interface symmetry with the other phases.
            NOTE(review): ``*args`` is never forwarded to ``train_loop_fn``
            (only ``**kwargs`` is) — confirm this is intended.
        kwargs: Any
            Extra keyword arguments forwarded to ``train_loop_fn``.
        """
        self._validate_training_requirements(train_step_fn, validate_step_fn)

        # don't need to barrier because it'll be done at the end of training anyway
        with self._phase_context("training", call_barrier=False):
            # Note: Training phase checkpointing is handled by the training loop itself
            # during epoch execution based on model_checkpoint_frequency

            train_loader = self._construct_dataloader(self.train_datapool, shuffle=True)
            self.logger.info(
                f"There are {len(train_loader)} batches in the training loader."
            )
            # Only build a validation loader when something can consume it.
            val_loader = None
            if self.val_datapool is not None:
                if validate_step_fn or hasattr(self.learner, "validation_step"):
                    val_loader = self._construct_dataloader(
                        self.val_datapool, shuffle=False
                    )
                else:
                    self.logger.warning(
                        "Validation data is available, but no `validate_step_fn` "
                        "or `validation_step` method in Learner is provided."
                    )
            # if a fine-tuning lr is provided, adjust it after the first iteration
            # (only the first optimizer param group is touched)
            if (
                self.config.fine_tuning_lr is not None
                and self.active_learning_step_idx > 0
            ):
                self.optimizer.param_groups[0]["lr"] = self.config.fine_tuning_lr

            # Determine max epochs to train for this AL step: full training
            # budget on the first step, fine-tuning budget afterwards.
            if self.active_learning_step_idx > 0:
                target_max_epochs = self.training_config.max_fine_tuning_epochs
            else:
                target_max_epochs = self.training_config.max_training_epochs

            # Check if resuming from mid-training checkpoint
            start_epoch = 1
            epochs_to_train = target_max_epochs

            if self._last_checkpoint_path and self._last_checkpoint_path.exists():
                training_state_path = self._last_checkpoint_path / "training_state.pt"
                if training_state_path.exists():
                    # weights_only=False: the state file contains arbitrary
                    # pickled objects — only load checkpoints you trust.
                    training_state = torch.load(
                        training_state_path, map_location="cpu", weights_only=False
                    )
                    # "training_epoch" records the last COMPLETED epoch.
                    last_completed_epoch = training_state.get("training_epoch", 0)
                    if last_completed_epoch > 0:
                        start_epoch = last_completed_epoch + 1
                        epochs_to_train = target_max_epochs - last_completed_epoch
                        self.logger.info(
                            f"Resuming training from epoch {start_epoch} "
                            f"({epochs_to_train} epochs remaining)"
                        )

            # Skip training if all epochs already completed
            if epochs_to_train <= 0:
                self.logger.info(
                    f"Training already complete ({target_max_epochs} epochs), "
                    f"skipping training phase"
                )
                return

            # Prefer the distributed manager's device when running distributed.
            device = (
                self.dist_manager.device
                if self.dist_manager is not None
                else self.config.device
            )
            dtype = self.config.dtype

            # Set checkpoint directory and frequency on training loop
            # This allows the training loop to handle training state checkpointing internally
            if hasattr(self.train_loop_fn, "checkpoint_base_dir") and hasattr(
                self.train_loop_fn, "checkpoint_frequency"
            ):
                # Checkpoint base is the current AL step's training directory
                checkpoint_base = (
                    self.log_dir
                    / "checkpoints"
                    / f"step_{self.active_learning_step_idx}"
                    / "training"
                )
                self.train_loop_fn.checkpoint_base_dir = checkpoint_base
                self.train_loop_fn.checkpoint_frequency = (
                    self.config.model_checkpoint_frequency
                )

            self.train_loop_fn(
                self.learner,
                self.optimizer,
                train_step_fn=train_step_fn,
                validate_step_fn=validate_step_fn,
                train_dataloader=train_loader,
                validation_dataloader=val_loader,
                lr_scheduler=self.lr_scheduler,
                max_epochs=epochs_to_train,  # Only remaining epochs
                device=device,
                dtype=dtype,
                **kwargs,
            )
1253
+
1254
+ def _metrology_phase(self, *args: Any, **kwargs: Any) -> None:
1255
+ """Execute the metrology phase of the active learning step."""
1256
+
1257
+ with self._phase_context("metrology"):
1258
+ for strategy in self.metrology_strategies:
1259
+ self.logger.info(
1260
+ f"Running metrology strategy: {strategy.__class__.__name__}"
1261
+ )
1262
+ strategy(*args, **kwargs)
1263
+ self.logger.info(
1264
+ f"Completed metrics for strategy: {strategy.__class__.__name__}"
1265
+ )
1266
+ strategy.serialize_records(*args, **kwargs)
1267
+
1268
+ def _query_phase(self, *args: Any, **kwargs: Any) -> None:
1269
+ """Execute the query phase of the active learning step."""
1270
+ with self._phase_context("query"):
1271
+ for strategy in self.query_strategies:
1272
+ self.logger.info(
1273
+ f"Running query strategy: {strategy.__class__.__name__}"
1274
+ )
1275
+ strategy(self.query_queue, *args, **kwargs)
1276
+
1277
+ if self.query_queue.empty():
1278
+ self.logger.warning(
1279
+ "Querying strategies produced no samples this iteration."
1280
+ )
1281
+
1282
+ def _labeling_phase(self, *args: Any, **kwargs: Any) -> None:
1283
+ """Execute the labeling phase of the active learning step."""
1284
+ self._validate_labeling_requirements()
1285
+
1286
+ if self.query_queue.empty():
1287
+ self.logger.warning("No samples to label. Skipping labeling phase.")
1288
+ return
1289
+
1290
+ with self._phase_context("labeling"):
1291
+ try:
1292
+ self.label_strategy(self.query_queue, self.label_queue, *args, **kwargs)
1293
+ except Exception as e:
1294
+ self.logger.error(f"Exception encountered during labeling: {e}")
1295
+ self.logger.info("Labeling completed. Now appending to training pool.")
1296
+
1297
+ # TODO this is done serially, could be improved with batched writes
1298
+ sample_counter = 0
1299
+ while not self.label_queue.empty():
1300
+ self.train_datapool.append(self.label_queue.get())
1301
+ sample_counter += 1
1302
+ self.logger.info(f"Appended {sample_counter} samples to training pool.")
1303
+
1304
    def _validate_training_requirements(
        self,
        train_step_fn: p.TrainingProtocol | None,
        validate_step_fn: p.ValidationProtocol | None,
    ) -> None:
        """Validate that all required components for training are available.

        Checks run in order and the first failure raises, so earlier
        requirements mask later ones.

        Parameters
        ----------
        train_step_fn: p.TrainingProtocol | None
            Candidate training-step callable; may be None only when the
            learner implements ``training_step`` itself.
        validate_step_fn: p.ValidationProtocol | None
            Candidate validation-step callable; requires ``val_datapool``
            to be configured when provided.

        Raises
        ------
        ValueError
            If the training config, training loop, training datapool, or a
            usable training step function is missing, or if a validation
            step is given without a validation datapool.
        """
        if self.training_config is None:
            raise ValueError(
                "`training_config` must be provided if `skip_training` is False."
            )
        if self.train_loop_fn is None:
            raise ValueError("`train_loop_fn` must be provided in training_config.")
        if self.train_datapool is None:
            raise ValueError("`train_datapool` must be provided in training_config.")
        if not train_step_fn and not hasattr(self.learner, "training_step"):
            raise ValueError(
                "`train_step_fn` must be provided if the model does not implement "
                "the `training_step` method."
            )
        if validate_step_fn and self.val_datapool is None:
            raise ValueError(
                "`val_datapool` must be provided in training_config if "
                "`validate_step_fn` is provided."
            )
1328
+
1329
    def _validate_labeling_requirements(self) -> None:
        """Validate that all required components for labeling are available.

        Raises
        ------
        ValueError
            If no ``label_strategy`` was configured, or if there is no
            training datapool to append freshly labeled samples to.
        """
        if self.label_strategy is None:
            raise ValueError(
                "`label_strategy` must be provided in strategies_config if "
                "`skip_labeling` is False."
            )
        # Labeled samples are appended to the training pool, so labeling
        # cannot proceed without one.
        if self.training_config is None or self.train_datapool is None:
            raise ValueError(
                "`train_datapool` must be provided in training_config for "
                "labeling, as data will be appended to it."
            )
1341
+
1342
    @contextmanager
    def _phase_context(
        self, phase_name: p.ActiveLearningPhase, call_barrier: bool = True
    ) -> Generator[None, Any, None]:
        """
        Context manager for consistent phase tracking, error handling, and synchronization.

        Sets the current phase for logging context, handles exceptions,
        and synchronizes distributed workers with a barrier. Also triggers
        checkpoint saves at the start of each phase if configured.

        Parameters
        ----------
        phase_name: p.ActiveLearningPhase
            A discrete phase of the active learning workflow. Note it is
            interpolated into the config attribute name
            ``checkpoint_on_<phase_name>``, so the phase value must
            stringify to the attribute suffix.
        call_barrier: bool
            Whether to call barrier for synchronization at the end. The
            barrier runs in ``finally``, i.e. even when the phase raised.

        Yields
        ------
        None
            Control is yielded to the phase body; exceptions are logged
            and re-raised.
        """
        self.current_phase = phase_name

        # Save checkpoint at START of phase if configured
        # Exception: training phase handles checkpointing internally
        if phase_name != p.ActiveLearningPhase.TRAINING:
            should_checkpoint = getattr(
                self.config, f"checkpoint_on_{phase_name}", False
            )
            # Check if we should checkpoint based on interval
            if should_checkpoint and self._should_checkpoint_at_step():
                self.save_checkpoint()

        try:
            yield
        except Exception as e:
            # Log with phase context before propagating to the caller.
            self.logger.error(f"Exception encountered during {phase_name}: {e}")
            raise
        finally:
            if call_barrier:
                self.logger.debug("Entering barrier for synchronization.")
                self.barrier()
1381
+
1382
+ def run(
1383
+ self,
1384
+ train_step_fn: p.TrainingProtocol | None = None,
1385
+ validate_step_fn: p.ValidationProtocol | None = None,
1386
+ *args: Any,
1387
+ **kwargs: Any,
1388
+ ) -> None:
1389
+ """
1390
+ Runs the active learning loop until the maximum number of
1391
+ active learning steps is reached.
1392
+
1393
+ Parameters
1394
+ ----------
1395
+ train_step_fn: p.TrainingProtocol | None = None
1396
+ The training function to use for training. If not provided, then the
1397
+ ``Driver.train_loop_fn`` will be used.
1398
+ validate_step_fn: p.ValidationProtocol | None = None
1399
+ The validation function to use for validation. If not provided, then
1400
+ validation will not be performed.
1401
+ args: Any
1402
+ Additional arguments to pass to the method. These will be passed to the
1403
+ training loop, metrology strategies, query strategies, and labeling strategies.
1404
+ kwargs: Any
1405
+ Additional keyword arguments to pass to the method. These will be passed to the
1406
+ training loop, metrology strategies, query strategies, and labeling strategies.
1407
+ """
1408
+ # TODO: refactor initialization logic here instead of inside the step
1409
+ while self.active_learning_step_idx < self.config.max_active_learning_steps:
1410
+ self.active_learning_step(
1411
+ train_step_fn=train_step_fn,
1412
+ validate_step_fn=validate_step_fn,
1413
+ *args,
1414
+ **kwargs,
1415
+ )
1416
+
1417
+ def __call__(
1418
+ self,
1419
+ train_step_fn: p.TrainingProtocol | None = None,
1420
+ validate_step_fn: p.ValidationProtocol | None = None,
1421
+ *args: Any,
1422
+ **kwargs: Any,
1423
+ ) -> None:
1424
+ """
1425
+ Provides syntactic sugar for running the active learning loop.
1426
+
1427
+ Calls ``Driver.run`` internally.
1428
+
1429
+ Parameters
1430
+ ----------
1431
+ train_step_fn: p.TrainingProtocol | None = None
1432
+ The training function to use for training. If not provided, then the
1433
+ ``Driver.train_loop_fn`` will be used.
1434
+ validate_step_fn: p.ValidationProtocol | None = None
1435
+ The validation function to use for validation. If not provided, then
1436
+ validation will not be performed.
1437
+ args: Any
1438
+ Additional arguments to pass to the method. These will be passed to the
1439
+ training loop, metrology strategies, query strategies, and labeling strategies.
1440
+ kwargs: Any
1441
+ Additional keyword arguments to pass to the method. These will be passed to the
1442
+ training loop, metrology strategies, query strategies, and labeling strategies.
1443
+ """
1444
+ self.run(
1445
+ train_step_fn=train_step_fn,
1446
+ validate_step_fn=validate_step_fn,
1447
+ *args,
1448
+ **kwargs,
1449
+ )
physics_mcp/source/physicsnemo/active_learning/logger.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import logging
21
+ from contextlib import contextmanager
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+ from threading import local
25
+ from typing import Any
26
+
27
+ try:
28
+ from termcolor import colored
29
+ except ImportError:
30
+ colored = None
31
+
32
+
33
+ # Thread-local storage for context information
34
+ _context_storage = local()
35
+
36
+
37
class ActiveLearningLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that enriches records with active learning context.

    When constructed with a reference to the driver, each processed message
    gains ``iteration`` (from ``active_learning_step_idx``), ``run_id``,
    and ``phase`` (from ``current_phase``) entries in its ``extra`` mapping
    so downstream formatters can render them.
    """

    def __init__(self, logger: logging.Logger, driver_ref: Any = None):
        """Wrap ``logger``, optionally tracking ``driver_ref`` for context.

        Parameters
        ----------
        logger : logging.Logger
            The underlying logger to adapt
        driver_ref : Any, optional
            Object whose ``active_learning_step_idx`` / ``run_id`` /
            ``current_phase`` attributes are sampled at log time
        """
        super().__init__(logger, {})
        self.driver_ref = driver_ref

    def process(self, msg: str, kwargs: dict[str, Any]) -> tuple[str, dict[str, Any]]:
        """Inject iteration, run ID, and phase context into ``extra``.

        Parameters
        ----------
        msg : str
            The log message
        kwargs : dict[str, Any]
            Additional keyword arguments

        Returns
        -------
        tuple[str, dict[str, Any]]
            The unchanged message and (possibly augmented) kwargs
        """
        driver = self.driver_ref
        if driver is None:
            return msg, kwargs

        extra = kwargs.get("extra", {})
        # Map driver attribute -> extra key, skipping attributes that are
        # absent or currently None.
        for attr_name, extra_key in (
            ("active_learning_step_idx", "iteration"),
            ("run_id", "run_id"),
            ("current_phase", "phase"),
        ):
            value = getattr(driver, attr_name, None)
            if value is not None:
                extra[extra_key] = value

        if extra:
            kwargs["extra"] = extra

        return msg, kwargs
98
+
99
+
100
class JSONFormatter(logging.Formatter):
    """JSON formatter for structured logging to files.

    Converts each record to a single JSON line containing the core fields,
    contextual attributes attached by filters/adapters, and every remaining
    record attribute so nothing is lost for structured analysis.
    """

    def format(self, record: logging.LogRecord) -> str:
        """Format the log record as JSON.

        Parameters
        ----------
        record : logging.LogRecord
            The log record to format

        Returns
        -------
        str
            JSON-formatted log message
        """
        log_entry = {
            "timestamp": datetime.fromtimestamp(record.created).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
            "line": record.lineno,
        }

        # Add contextual information if available
        if hasattr(record, "context"):
            log_entry["context"] = record.context

        if hasattr(record, "caller_object"):
            log_entry["caller_object"] = record.caller_object

        if hasattr(record, "iteration"):
            log_entry["iteration"] = record.iteration

        if hasattr(record, "phase"):
            log_entry["phase"] = record.phase

        # Add any remaining record attributes (levelno, thread info,
        # user-supplied extras, ...) not already present.
        extra_keys = list(filter(lambda x: x not in log_entry, record.__dict__.keys()))
        for key in extra_keys:
            log_entry[key] = record.__dict__[key]

        # Fix: default=str keeps the formatter from raising — record
        # attributes such as ``args`` or ``exc_info`` (and arbitrary
        # user extras) are not necessarily JSON-serializable, and a
        # logging formatter must never fail while emitting a line.
        return json.dumps(log_entry, default=str)
149
+
150
+
151
def _get_context_stack():
    """Return (creating on first use) the current thread's context stack."""
    try:
        return _context_storage.context_stack
    except AttributeError:
        # First access on this thread: lazily initialize an empty stack.
        _context_storage.context_stack = []
        return _context_storage.context_stack
156
+
157
+
158
class ContextFormatter(logging.Formatter):
    """Standard formatter that includes active learning context information with colors.

    Context fragments (object, run, iteration, phase, extras) are appended
    to the formatted message in brackets; the message is colorized by level
    and the context in blue when ``termcolor`` is installed.
    """

    def format(self, record):
        # Collect "key:value" fragments for whichever context attrs are set.
        fragments = []
        caller = getattr(record, "caller_object", None)
        if caller:
            fragments.append(f"obj:{caller}")
        run_id = getattr(record, "run_id", None)
        if run_id:
            fragments.append(f"run:{run_id}")
        # Iteration uses an explicit None check so iteration 0 still shows.
        iteration = getattr(record, "iteration", None)
        if iteration is not None:
            fragments.append(f"iter:{iteration}")
        phase = getattr(record, "phase", None)
        if phase:
            fragments.append(f"phase:{phase}")
        extra_context = getattr(record, "context", None)
        if extra_context:
            for key, value in extra_context.items():
                fragments.append(f"{key}:{value}")

        context_str = f"[{', '.join(fragments)}]" if fragments else ""

        # Delegate base formatting to logging.Formatter.
        base_msg = super().format(record)

        # Colorize by severity when termcolor is available.
        if colored is not None:
            if record.levelno >= logging.ERROR:
                base_msg = colored(base_msg, "red")
            elif record.levelno >= logging.WARNING:
                base_msg = colored(base_msg, "yellow")
            elif record.levelno >= logging.INFO:
                base_msg = colored(base_msg, "white")
            else:  # DEBUG
                base_msg = colored(base_msg, "cyan")

        # Append the (optionally blue) context string.
        if context_str:
            if colored is not None:
                context_str = colored(context_str, "blue")
            base_msg += f" {context_str}"

        return base_msg
200
+
201
+
202
class ContextInjectingFilter(logging.Filter):
    """Filter that injects contextual information into log records.

    Reads the innermost entry of the thread-local context stack (managed by
    ``log_context``) and copies its fields onto the record so formatters can
    render them. Never suppresses records (always returns True).
    """

    def filter(self, record):
        # Add context information from thread-local storage
        context_stack = _get_context_stack()
        if context_stack:
            # Only the innermost (most recently entered) context applies.
            current_context = context_stack[-1]
            # Contexts built by ``log_context`` always carry these keys;
            # "phase" is read defensively with .get for older entries.
            if current_context["caller_object"]:
                record.caller_object = current_context["caller_object"]
            # Explicit None check so an iteration of 0 is still injected.
            if current_context["iteration"] is not None:
                record.iteration = current_context["iteration"]
            if current_context.get("phase"):
                record.phase = current_context["phase"]
            if current_context["context"]:
                record.context = current_context["context"]
        return True
219
+
220
+
221
def setup_active_learning_logger(
    name: str,
    run_id: str,
    log_dir: str | Path = Path("active_learning_logs"),
    level: int = logging.INFO,
) -> logging.Logger:
    """Set up a logger with active learning-specific formatting and handlers.

    Attaches a colorized console handler (``ContextFormatter``) and a JSON
    file handler (``JSONFormatter``) writing to ``<log_dir>/<run_id>.log``;
    both inject thread-local context via ``ContextInjectingFilter``.

    Parameters
    ----------
    name : str
        Logger name
    run_id : str
        Unique identifier for this run, used in log filename. Note the file
        is opened in ``"w"`` mode, so a repeated run_id truncates the
        previous log.
    log_dir : str | Path, optional
        Directory to store log files, by default ``"active_learning_logs"``
        (created if it does not exist)
    level : int, optional
        Logging level of the logger itself, by default logging.INFO.
        Handlers are set to DEBUG, so this level is the effective gate.

    Returns
    -------
    logging.Logger
        Configured standard Python logger (handlers reset, propagation off)

    Example
    -------
    >>> logger = setup_active_learning_logger("experiment", "run_001")
    >>> logger.info("Starting experiment")
    >>> with log_context(caller_object="Trainer", iteration=5):
    ...     logger.info("Training step")
    """
    # Get standard logger
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Clear any existing handlers to avoid duplicates
    logger.handlers.clear()

    # Disable propagation to prevent duplicate messages from parent loggers
    logger.propagate = False

    # Create log directory if it doesn't exist
    if isinstance(log_dir, str):
        log_dir_path = Path(log_dir)
    else:
        log_dir_path = log_dir
    log_dir_path.mkdir(parents=True, exist_ok=True)

    # Set up console handler with standard formatting
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(
        ContextFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    console_handler.addFilter(ContextInjectingFilter())
    logger.addHandler(console_handler)

    # Set up file handler with JSON formatting
    log_file = log_dir_path / f"{run_id}.log"
    file_handler = logging.FileHandler(log_file, mode="w")
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(JSONFormatter())
    file_handler.addFilter(ContextInjectingFilter())
    logger.addHandler(file_handler)

    return logger
287
+
288
+
289
@contextmanager
def log_context(
    caller_object: str | None = None,
    iteration: int | None = None,
    phase: str | None = None,
    **kwargs: Any,
):
    """Context manager for adding contextual information to log messages.

    Pushes a context frame onto a thread-local stack consumed by
    ``ContextInjectingFilter``; frames nest, and only the innermost frame
    is applied to records. The frame is always popped on exit, even when
    the body raises. Being thread-local, contexts do not leak across
    threads (NOTE: nor do they propagate to worker threads).

    Parameters
    ----------
    caller_object : str, optional
        Name or identifier of the object making the log call
    iteration : int, optional
        Current iteration counter
    phase : str, optional
        Current phase of the active learning process
    **kwargs : Any
        Additional contextual key-value pairs

    Example
    -------
    >>> from logging import getLogger
    >>> from physicsnemo.active_learning.logger import log_context
    >>> logger = getLogger("my_logger")
    >>> with log_context(caller_object="Trainer", iteration=5, phase="training", epoch=2):
    ...     logger.info("Processing batch")
    """
    context_info = {
        "caller_object": caller_object,
        "iteration": iteration,
        "phase": phase,
        "context": kwargs,
    }

    context_stack = _get_context_stack()
    context_stack.append(context_info)

    try:
        yield
    finally:
        # Always unwind, even if the body raised.
        context_stack.pop()
physics_mcp/source/physicsnemo/active_learning/loop.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import annotations
18
+
19
+ import inspect
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ import torch
24
+ from torch.optim import Optimizer
25
+ from torch.optim.lr_scheduler import _LRScheduler
26
+ from torch.utils.data import DataLoader
27
+ from tqdm import tqdm
28
+
29
+ from physicsnemo import Module
30
+ from physicsnemo.active_learning import protocols as p
31
+ from physicsnemo.distributed import DistributedManager
32
+ from physicsnemo.launch.logging import LaunchLogger
33
+ from physicsnemo.utils.capture import StaticCaptureEvaluateNoGrad, StaticCaptureTraining
34
+
35
+ __all__ = ["DefaultTrainingLoop"]
36
+
37
+
38
+ def _recursive_data_device_cast(
39
+ data: Any,
40
+ device: torch.device | str | None = None,
41
+ dtype: torch.dtype | None = None,
42
+ **kwargs: Any,
43
+ ) -> Any:
44
+ """
45
+ Recursively moves/cast input data to a specified device and dtype.
46
+
47
+ For iterable objects, we recurse through the elements depending on
48
+ the type of iterable until we reach an object that either has a ``to``
49
+ method that can be called, or just returns the data unchanged.
50
+
51
+ Parameters
52
+ ----------
53
+ data: Any
54
+ The data to move to the device.
55
+ device: torch.device | str | None = None
56
+ The device to move the data to.
57
+ dtype: torch.dtype | None = None
58
+ The dtype to move the data to.
59
+ kwargs: Any
60
+ Additional keyword arguments to pass to the `to` method.
61
+ By default, `non_blocking` is set to `True` to allow
62
+ asynchronous data transfers.
63
+
64
+ Returns
65
+ -------
66
+ Any
67
+ The data moved to the device.
68
+ """
69
+ kwargs.setdefault("non_blocking", True)
70
+ if hasattr(data, "to"):
71
+ # if there is a `to` method, then we can just call it
72
+ return data.to(device=device, dtype=dtype, **kwargs)
73
+ elif isinstance(data, dict):
74
+ return {
75
+ k: _recursive_data_device_cast(v, device, dtype) for k, v in data.items()
76
+ }
77
+ elif isinstance(data, list):
78
+ return [_recursive_data_device_cast(v, device, dtype) for v in data]
79
+ elif isinstance(data, tuple):
80
+ return tuple(_recursive_data_device_cast(v, device, dtype) for v in data)
81
+ else:
82
+ return data
83
+
84
+
85
+ class DefaultTrainingLoop(p.TrainingLoop):
86
    def __new__(cls, *args: Any, **kwargs: Any) -> DefaultTrainingLoop:
        """
        Wrapper for instantiating DefaultTrainingLoop.

        This method captures arguments used to instantiate the loop
        and stores them in the `_args` attribute for serialization.
        This follows the same pattern as `ActiveLearningProtocol.__new__`.

        Parameters
        ----------
        args: Any
            Arguments to pass to the loop's constructor.
        kwargs: Any
            Keyword arguments to pass to the loop's constructor.

        Returns
        -------
        DefaultTrainingLoop
            A new instance with an `_args` attribute for serialization.
        """
        out = super().__new__(cls)

        # Get signature of __init__ function
        sig = inspect.signature(cls.__init__)

        # Bind args and kwargs to signature
        bound_args = sig.bind_partial(
            *([None] + list(args)), **kwargs
        )  # Add None to account for self
        bound_args.apply_defaults()

        # Get args and kwargs (excluding self and unroll kwargs)
        # NOTE(review): zipping sig.parameters with bound_args.arguments
        # assumes both iterate in the same (signature) order — after
        # apply_defaults() this holds for fully-bound calls; verify for
        # partially-bound edge cases.
        instantiate_args = {}
        for param, (k, v) in zip(sig.parameters.values(), bound_args.arguments.items()):
            # Skip self
            if k == "self":
                continue

            # Add args and kwargs to instantiate_args
            if param.kind == param.VAR_KEYWORD:
                # Flatten **kwargs into the top-level mapping.
                instantiate_args.update(v)
            else:
                # Special handling for device: convert torch.device to string
                # (keeps the serialized form JSON/YAML-friendly)
                if k == "device" and isinstance(v, torch.device):
                    instantiate_args[k] = str(v)
                # Special handling for dtype: convert to string representation
                elif k == "dtype" and isinstance(v, torch.dtype):
                    instantiate_args[k] = str(v)
                else:
                    instantiate_args[k] = v

        # Store args needed for instantiation
        out._args = {
            "__name__": cls.__name__,
            "__module__": cls.__module__,
            "__args__": instantiate_args,
        }
        return out
144
+
145
+ def __init__(
146
+ self,
147
+ train_step_fn: p.TrainingProtocol | None = None,
148
+ validate_step_fn: p.ValidationProtocol | None = None,
149
+ enable_static_capture: bool = True,
150
+ use_progress_bars: bool = True,
151
+ device: str | torch.device | None = None,
152
+ dtype: torch.dtype | None = None,
153
+ checkpoint_frequency: int = 0,
154
+ **capture_kwargs: Any,
155
+ ) -> None:
156
+ """
157
+ Initializes the default training loop.
158
+
159
+ The general usage of this loop is to
160
+
161
+ TODO: add support for early stopping
162
+
163
+ Parameters
164
+ ----------
165
+ train_step_fn: TrainingProtocol | None = None
166
+ A callable that implements the logic for performing a single
167
+ training step. See ``protocols.TrainingProtocol`` for the expected
168
+ interface, but ultimately the function should return a scalar loss
169
+ value that has a ``backward`` method.
170
+ validate_step_fn: ValidationProtocol | None = None
171
+ A callable that implements the logic for performing a single
172
+ validation step. See ``protocols.ValidationProtocol`` for the expected
173
+ interface, but in contrast to ``train_step_fn`` this function should
174
+ not return anything.
175
+ enable_static_capture: bool = True
176
+ Whether to enable static capture for the training and validation steps.
177
+ use_progress_bars: bool = True
178
+ Whether to show ``tqdm`` progress bars to display epoch and step progress.
179
+ device: str | torch.device | None = None
180
+ The device used for performing the loop. If not provided, then the device
181
+ will default to the model's device at runtime.
182
+ dtype: torch.dtype | None = None
183
+ The dtype used for performing the loop. If not provided, then the dtype
184
+ will default to ``torch.get_default_dtype()``.
185
+ checkpoint_frequency: int = 0
186
+ How often to save checkpoints during training (every N epochs).
187
+ If 0, no checkpoints are saved during training. Set via Driver before
188
+ training execution.
189
+ capture_kwargs: Any
190
+ Additional keyword arguments to pass to the static capture decorators.
191
+ """
192
+ self.train_step_fn = train_step_fn
193
+ self.validate_step_fn = validate_step_fn
194
+ self.enable_static_capture = enable_static_capture
195
+ if isinstance(device, str):
196
+ device = torch.device(device)
197
+ # check to see if we can rely on DistributedManager
198
+ if device is None and DistributedManager.is_initialized():
199
+ device = DistributedManager.device
200
+ self.device = device
201
+ if dtype is None:
202
+ dtype = torch.get_default_dtype()
203
+ self.dtype = dtype
204
+ self.capture_kwargs = capture_kwargs
205
+ self.use_progress_bars = use_progress_bars
206
+ self.capture_functions = {}
207
+ self.checkpoint_frequency = checkpoint_frequency
208
+ self.checkpoint_base_dir: Path | None = None
209
+
210
    def save_training_checkpoint(
        self,
        checkpoint_dir: Path,
        model: Module | p.LearnerProtocol,
        optimizer: Optimizer,
        lr_scheduler: _LRScheduler | None = None,
        training_epoch: int | None = None,
    ) -> None:
        """
        Save training state to checkpoint directory.

        Model weights are saved separately (``model.mdlus`` for physicsnemo
        Modules, ``model_state.pt`` otherwise). Optimizer, scheduler, and
        epoch metadata are combined into a single ``training_state.pt`` file,
        matching the layout expected by ``load_training_checkpoint`` and the
        driver's resume logic.

        Parameters
        ----------
        checkpoint_dir: Path
            Directory to save checkpoint files (created if missing).
        model: Module | p.LearnerProtocol
            Model to save weights for.
        optimizer: Optimizer
            Optimizer to save state from.
        lr_scheduler: _LRScheduler | None
            Optional LR scheduler to save state from (stored as None when
            absent).
        training_epoch: int | None
            Current training epoch for metadata; interpreted on resume as
            the last completed epoch.
        """
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Save model weights separately
        if isinstance(model, Module):
            # physicsnemo Modules carry their own serialization format.
            model_path = checkpoint_dir / "model.mdlus"
            model.save(str(model_path))
        else:
            # Plain learners fall back to a raw state_dict dump.
            model_path = checkpoint_dir / "model_state.pt"
            torch.save(model.state_dict(), model_path)

        # Combine optimizer, scheduler, and epoch metadata into single file
        training_state = {
            "optimizer_state": optimizer.state_dict(),
            "lr_scheduler_state": lr_scheduler.state_dict() if lr_scheduler else None,
            "training_epoch": training_epoch,
        }
        training_state_path = checkpoint_dir / "training_state.pt"
        torch.save(training_state, training_state_path)
255
+
256
+ @staticmethod
257
+ def load_training_checkpoint(
258
+ checkpoint_dir: Path,
259
+ model: Module | p.LearnerProtocol,
260
+ optimizer: Optimizer,
261
+ lr_scheduler: _LRScheduler | None = None,
262
+ ) -> int | None:
263
+ """
264
+ Load training state from checkpoint directory.
265
+
266
+ Model weights are loaded separately. Optimizer, scheduler, and epoch
267
+ metadata are loaded from the combined training_state.pt file.
268
+
269
+ Parameters
270
+ ----------
271
+ checkpoint_dir: Path
272
+ Directory containing checkpoint files.
273
+ model: Module | p.LearnerProtocol
274
+ Model to load weights into.
275
+ optimizer: Optimizer
276
+ Optimizer to load state into.
277
+ lr_scheduler: _LRScheduler | None
278
+ Optional LR scheduler to load state into.
279
+
280
+ Returns
281
+ -------
282
+ int | None
283
+ Training epoch from metadata if available, else None.
284
+ """
285
+ # Load model weights separately
286
+ if isinstance(model, Module):
287
+ model_path = checkpoint_dir / "model.mdlus"
288
+ if model_path.exists():
289
+ model.load(str(model_path))
290
+ else:
291
+ model_state_path = checkpoint_dir / "model_state.pt"
292
+ if model_state_path.exists():
293
+ state_dict = torch.load(model_state_path, map_location="cpu")
294
+ model.load_state_dict(state_dict)
295
+
296
+ # Load combined training state (optimizer, scheduler, epoch)
297
+ training_state_path = checkpoint_dir / "training_state.pt"
298
+ if training_state_path.exists():
299
+ training_state = torch.load(training_state_path, map_location="cpu")
300
+
301
+ # Restore optimizer state
302
+ if "optimizer_state" in training_state:
303
+ optimizer.load_state_dict(training_state["optimizer_state"])
304
+
305
+ # Restore scheduler state if present
306
+ if lr_scheduler and training_state.get("lr_scheduler_state"):
307
+ lr_scheduler.load_state_dict(training_state["lr_scheduler_state"])
308
+
309
+ # Return epoch metadata
310
+ return training_state.get("training_epoch", None)
311
+
312
+ return None
313
+
314
+ @property
315
+ def amp_type(self) -> torch.dtype:
316
+ if self.dtype in [torch.float16, torch.bfloat16]:
317
+ return self.dtype
318
+ else:
319
+ return torch.float16
320
+
321
    def _create_capture_functions(
        self,
        model: Module | p.LearnerProtocol,
        optimizer: Optimizer,
        train_step_fn: p.TrainingProtocol | None = None,
        validate_step_fn: p.ValidationProtocol | None = None,
    ) -> tuple[p.TrainingProtocol | None, p.ValidationProtocol | None]:
        """
        Attempt to create static capture functions based off training and validation
        functions.

        This uses the Python object IDs to uniquely identify functions, and adds the
        decorated functions to an internal `capture_functions` dictionary. If the
        decorated functions already exist, then this function will be a no-op
        cache lookup.

        Parameters
        ----------
        model: Module | p.LearnerProtocol
            The model to train.
        optimizer: Optimizer
            The optimizer to use for training.
        train_step_fn: p.TrainingProtocol | None = None
            The training function to use for training. Falls back to
            ``self.train_step_fn`` when not provided.
        validate_step_fn: p.ValidationProtocol | None = None
            The validation function to use for validation. Falls back to
            ``self.validate_step_fn`` when not provided.

        Returns
        -------
        tuple[p.TrainingProtocol | None, p.ValidationProtocol | None]
            The training and validation functions with static capture applied.
            The validation entry is None when no validation function exists.
        """
        if not train_step_fn:
            train_step_fn = self.train_step_fn
        # Cache key is the id() of the *undecorated* function, so repeated
        # calls with the same function reuse the same capture wrapper.
        # NOTE(review): id() values can be reused after the original
        # function is garbage-collected — assumes callers keep their step
        # functions alive for the loop's lifetime.
        train_func_id = id(train_step_fn)
        if train_func_id not in self.capture_functions:
            try:
                train_step_fn = StaticCaptureTraining(
                    model=model,
                    optim=optimizer,
                    amp_type=self.amp_type,
                    **self.capture_kwargs,
                )(train_step_fn)
                self.capture_functions[train_func_id] = train_step_fn
            except Exception as e:
                # Surface capture failures with context rather than letting
                # the raw decorator error propagate.
                raise RuntimeError(
                    "Failed to create static capture for `train_step_fn`. "
                ) from e
        else:
            # Cache hit: reuse the previously decorated wrapper.
            train_step_fn = self.capture_functions[train_func_id]
        if not validate_step_fn:
            validate_step_fn = self.validate_step_fn
        # Validation is optional; only wrap when a function is available.
        if validate_step_fn:
            val_func_id = id(validate_step_fn)
            if val_func_id not in self.capture_functions:
                try:
                    # Validation runs without gradients, hence the NoGrad variant.
                    validate_step_fn = StaticCaptureEvaluateNoGrad(
                        model=model, amp_type=self.amp_type, **self.capture_kwargs
                    )(validate_step_fn)
                    self.capture_functions[val_func_id] = validate_step_fn
                except Exception as e:
                    raise RuntimeError(
                        "Failed to create static capture for `validate_step_fn`. "
                    ) from e
            else:
                validate_step_fn = self.capture_functions[val_func_id]
        return train_step_fn, validate_step_fn
387
+
388
    def __call__(
        self,
        model: Module | p.LearnerProtocol,
        optimizer: Optimizer,
        train_dataloader: DataLoader,
        max_epochs: int,
        validation_dataloader: DataLoader | None = None,
        train_step_fn: p.TrainingProtocol | None = None,
        validate_step_fn: p.ValidationProtocol | None = None,
        lr_scheduler: _LRScheduler | None = None,
        device: str | torch.device | None = None,
        dtype: torch.dtype | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Performs ``max_epochs`` epochs of training and optionally validation.

        Some of the arguments, such as ``train_step_fn`` and ``validate_step_fn``,
        are optional only if the ``model`` implements the ``p.LearnerProtocol``.
        If they are passed, however, they will take precedence over the methods
        originally provided to the constructor method.

        The bare minimum required arguments for this loop to work are:
        1. A model to train
        2. An optimizer to step
        3. A training dataloader to iterate over
        4. The maximum number of epochs to train for

        If validation is required, then both ``validation_dataloader`` and
        ``validate_step_fn`` must be specified.

        Parameters
        ----------
        model: Module | p.LearnerProtocol
            The model to train.
        optimizer: torch.optim.Optimizer
            The optimizer to use for training.
        train_dataloader: DataLoader
            The dataloader to use for training.
        max_epochs: int
            The number of epochs to train for.
        validation_dataloader: DataLoader | None
            The dataloader to use for validation. If not provided, then validation
            will not be performed.
        train_step_fn: p.TrainingProtocol | None = None
            The training function to use for training. If passed, it will take
            precedence over the method provided to the constructor method.
        validate_step_fn: p.ValidationProtocol | None = None
            The validation function to use for validation.
        lr_scheduler: torch.optim.lr_scheduler._LRScheduler | None = None
            The learning rate scheduler to use for training.
        device: str | torch.device | None = None
            The device used for performing the loop. If provided, it will
            override the device specified in the constructor. If both values
            are not provided, then we default to PyTorch's default device.
        dtype: torch.dtype | None = None
            The dtype used for performing the loop. If provided, it will
            override the dtype specified in the constructor. If both values
            are not provided, then we default to PyTorch's default dtype.
        args: Any
            Additional arguments to pass to the training and validation
            step functions.
        kwargs: Any
            Additional keyword arguments to pass to the training and validation
            step functions.

        Raises
        ------
        RuntimeError
            If no training step function was provided either here or at
            construction time.
        """
        if not train_step_fn and not self.train_step_fn:
            raise RuntimeError(
                """
                No training step function provided.
                Either provide a `train_step_fn` to this constructor, or
                provide a `train_step_fn` to the `__call__` method.
                """
            )
        # NOTE(review): when `device` is None but `self.device` is set, this
        # leaves the local `device` as None (it is not rebound to
        # `self.device`) — confirm that is intentional before relying on it.
        if not device and not self.device:
            device = torch.get_default_device()
        if not dtype and not self.dtype:
            dtype = torch.get_default_dtype()
        # if a device is specified, move the model
        # NOTE(review): assumes `model` exposes a `.device` attribute —
        # true for physicsnemo Module, not for a bare torch.nn.Module.
        if device and device != model.device:
            # not 100% sure this will trigger issues with the optimizer
            # but allows a potentially different device to be used
            model = model.to(device)
        if self.enable_static_capture:
            # if static capture is enabled, we check for a cache hit based on
            # the incoming function IDs. If we miss, we then create new wrappers.
            train_step_fn, validate_step_fn = self._create_capture_functions(
                model, optimizer, train_step_fn, validate_step_fn
            )
        # Epochs are 1-indexed so checkpoint/frequency arithmetic below is
        # human-friendly (epoch_1, epoch_2, ...).
        epoch_iter = range(1, max_epochs + 1)
        if self.use_progress_bars:
            epoch_iter = tqdm(epoch_iter, desc="Epoch", leave=False, position=0)
        ########### EPOCH LOOP ###########
        for epoch in epoch_iter:
            model.train()
            train_iter = iter(train_dataloader)
            if self.use_progress_bars:
                train_iter = tqdm(
                    train_iter, desc="Training step", leave=False, unit="batch"
                )
            ########### TRAINING STEP LOOP ###########
            with LaunchLogger(
                "train", epoch=epoch, num_mini_batch=len(train_dataloader)
            ) as log:
                for batch in train_iter:
                    # Move/cast every tensor in the (possibly nested) batch.
                    batch = _recursive_data_device_cast(
                        batch, device=device, dtype=dtype
                    )
                    model.zero_grad(set_to_none=True)
                    loss = train_step_fn(model, batch, *args, **kwargs)
                    log.log_minibatch({"train_loss": loss.detach().item()})
                    # normally, static capture will call backward because of AMP;
                    # only drive backward/step manually when capture is off
                    if not self.enable_static_capture:
                        loss.backward()
                        optimizer.step()
            # Scheduler advances once per epoch, after the training pass.
            if lr_scheduler:
                lr_scheduler.step()
            ########### VALIDATION STEP LOOP ###########
            if validate_step_fn and validation_dataloader:
                model.eval()
                val_iter = iter(validation_dataloader)
                if self.use_progress_bars:
                    val_iter = tqdm(
                        val_iter, desc="Validation step", leave=False, unit="batch"
                    )
                with LaunchLogger(
                    "validation", epoch=epoch, num_mini_batch=len(validation_dataloader)
                ) as log:
                    for batch in val_iter:
                        batch = _recursive_data_device_cast(
                            batch, device=device, dtype=dtype
                        )
                        validate_step_fn(model, batch, *args, **kwargs)

            ########### CHECKPOINT SAVE ###########
            # Save training state at specified frequency; a base dir must have
            # been configured (by the driver) for checkpointing to happen.
            if self.checkpoint_base_dir and self.checkpoint_frequency > 0:
                if epoch % self.checkpoint_frequency == 0:
                    epoch_checkpoint_dir = self.checkpoint_base_dir / f"epoch_{epoch}"
                    self.save_training_checkpoint(
                        checkpoint_dir=epoch_checkpoint_dir,
                        model=model,
                        optimizer=optimizer,
                        lr_scheduler=lr_scheduler,
                        training_epoch=epoch,
                    )
physics_mcp/source/physicsnemo/active_learning/protocols.py ADDED
@@ -0,0 +1,1394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ This module contains base classes for active learning protocols.
19
+
20
+ These are protocols intended to be abstract, and importing these
21
+ classes specifically is intended to either be subclassed, or for
22
+ type annotations.
23
+
24
+ Protocol Architecture
25
+ ---------------------
26
+ Python ``Protocol``s are used for structural typing: essentially, they are used to
27
+ describe an expected interface in a way that is helpful for static type checkers
28
+ to make sure concrete implementations provide everything that is needed for a workflow
29
+ to function. ``Protocol``s are not actually enforced at runtime, and inheritance is not
30
+ required for them to function: as long as the implementation provides the expected
31
+ attributes and methods, they will be compatible with the protocol.
32
+
33
+ The active learning framework is built around several key protocol abstractions
34
+ that work together to orchestrate the active learning workflow:
35
+
36
+ **Core Infrastructure Protocols:**
37
+ - `AbstractQueue[T]` - Generic queue protocol for passing data between components
38
+ - `DataPool[T]` - Protocol for data reservoirs that support appending and sampling
39
+ - `ActiveLearningProtocol` - Base protocol providing common interface for all AL strategies
40
+
41
+ **Strategy Protocols (inherit from ActiveLearningProtocol):**
42
+ - `QueryStrategy` - Defines how to select data points for labeling
43
+ - `LabelStrategy` - Defines processes for adding ground truth labels to unlabeled data
44
+ - `MetrologyStrategy` - Defines procedures that assess model improvements beyond validation metrics
45
+
46
+ **Model Interface Protocols:**
47
+ - `TrainingProtocol` - Interface for training step functions
48
+ - `ValidationProtocol` - Interface for validation step functions
49
+ - `InferenceProtocol` - Interface for inference step functions
50
+ - `TrainingLoop` - Interface for complete training loop implementations
51
+ - `LearnerProtocol` - Comprehensive interface for learner modules (combines training/validation/inference)
52
+
53
+ **Orchestration Protocol:**
54
+ - `DriverProtocol` - Main orchestrator that coordinates all components in the active learning loop
55
+
56
+ Protocol Relationships
57
+ ----------------------
58
+
59
+ ```mermaid
60
+ graph TB
61
+ subgraph "Core Infrastructure"
62
+ AQ[AbstractQueue&lt;T&gt;]
63
+ DP[DataPool&lt;T&gt;]
64
+ ALP[ActiveLearningProtocol]
65
+ end
66
+
67
+ subgraph "Strategy Layer"
68
+ QS[QueryStrategy]
69
+ LS[LabelStrategy]
70
+ MS[MetrologyStrategy]
71
+ end
72
+
73
+ subgraph "Model Interface Layer"
74
+ TP[TrainingProtocol]
75
+ VP[ValidationProtocol]
76
+ IP[InferenceProtocol]
77
+ TL[TrainingLoop]
78
+ LP[LearnerProtocol]
79
+ end
80
+
81
+ subgraph "Orchestration Layer"
82
+ Driver[DriverProtocol]
83
+ end
84
+
85
+ %% Inheritance relationships (thick blue arrows)
86
+ ALP ==>|inherits| QS
87
+ ALP ==>|inherits| LS
88
+ ALP ==>|inherits| MS
89
+
90
+ %% Composition relationships (dashed green arrows)
91
+ Driver -.->|uses| LP
92
+ Driver -.->|manages| QS
93
+ Driver -.->|manages| LS
94
+ Driver -.->|manages| MS
95
+ Driver -.->|contains| DP
96
+ Driver -.->|contains| AQ
97
+
98
+ %% Protocol usage relationships (dotted purple arrows)
99
+ TL -.->|can use| TP
100
+ TL -.->|can use| VP
101
+ TL -.->|can use| LP
102
+ LP -.->|implements| TP
103
+ LP -.->|implements| VP
104
+ LP -.->|implements| IP
105
+
106
+ %% Data flow relationships (solid red arrows)
107
+ QS -->|enqueues to| AQ
108
+ AQ -->|consumed by| LS
109
+ LS -->|enqueues to| AQ
110
+
111
+ %% Styling for different relationship types
112
+ linkStyle 0 stroke:#1976d2,stroke-width:4px
113
+ linkStyle 1 stroke:#1976d2,stroke-width:4px
114
+ linkStyle 2 stroke:#1976d2,stroke-width:4px
115
+ linkStyle 3 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
116
+ linkStyle 4 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
117
+ linkStyle 5 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
118
+ linkStyle 6 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
119
+ linkStyle 7 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
120
+ linkStyle 8 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 5 5
121
+ linkStyle 9 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
122
+ linkStyle 10 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
123
+ linkStyle 11 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
124
+ linkStyle 12 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
125
+ linkStyle 13 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
126
+ linkStyle 14 stroke:#7b1fa2,stroke-width:2px,stroke-dasharray: 2 2
127
+ linkStyle 15 stroke:#d32f2f,stroke-width:3px
128
+ linkStyle 16 stroke:#d32f2f,stroke-width:3px
129
+ linkStyle 17 stroke:#d32f2f,stroke-width:3px
130
+
131
+ %% Node styling
132
+ classDef coreInfra fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
133
+ classDef strategy fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
134
+ classDef modelInterface fill:#e8f5e8,stroke:#388e3c,stroke-width:2px
135
+ classDef orchestration fill:#fff3e0,stroke:#f57c00,stroke-width:3px
136
+
137
+ class AQ,DP,ALP coreInfra
138
+ class QS,LS,MS strategy
139
+ class TP,VP,IP,TL,LP modelInterface
140
+ class Driver orchestration
141
+ ```
142
+
143
+ **Relationship Legend:**
144
+ - **Blue thick arrows (==>)**: Inheritance relationships (subclass extends parent)
145
+ - **Green dashed arrows (-.->)**: Composition relationships (object contains/manages other objects)
146
+ - **Purple dotted arrows (-.->)**: Protocol usage relationships (can use or implements interface)
147
+ - **Red solid arrows (-->)**: Data flow relationships (data moves between components)
148
+
149
+ Active Learning Workflow
150
+ ------------------------
151
+
152
+ The typical active learning workflow orchestrated by `DriverProtocol` follows this sequence:
153
+
154
+ 1. **Training Phase**: Use `LearnerProtocol` or `TrainingLoop` to train the model on `training_pool`
155
+ 2. **Metrology Phase** (optional): Apply `MetrologyStrategy` instances to assess model performance
156
+ 3. **Query Phase**: Apply `QueryStrategy` instances to select samples from `unlabeled_pool` → `query_queue`
157
+ 4. **Labeling Phase** (optional): Apply `LabelStrategy` instances to label queued samples → `label_queue`
158
+ 5. **Data Integration**: Move labeled data from `label_queue` to `training_pool`
159
+
160
+ Type Parameters
161
+ ---------------
162
+ - `T`: Data structure containing both inputs and ground truth labels
163
+ - `S`: Data structure containing only inputs (no ground truth labels)
164
+ """
165
+
166
+ from __future__ import annotations
167
+
168
+ import inspect
169
+ import logging
170
+ from enum import StrEnum
171
+ from logging import Logger
172
+ from pathlib import Path
173
+ from typing import Any, Iterator, Protocol, TypeVar
174
+
175
+ import torch
176
+ from torch.optim import Optimizer
177
+ from torch.optim.lr_scheduler import _LRScheduler
178
+ from torch.utils.data import DataLoader
179
+
180
+ from physicsnemo import Module
181
+
182
+ # T is used to denote a data structure that contains inputs for a model and ground truths
183
+ T = TypeVar("T")
184
+ # S is used to denote a data structure that has inputs for a model, but no ground truth labels
185
+ S = TypeVar("S")
186
+
187
+
188
class ActiveLearningPhase(StrEnum):
    """
    An enumeration of the different phases of the active learning workflow.

    This is primarily used in the metadata for restarting an ongoing active
    learning experiment. Being a ``StrEnum``, each member compares equal to
    its lowercase string value.
    """

    # Declaration order mirrors the sequence of phases executed within a
    # single active learning iteration.
    TRAINING = "training"
    METROLOGY = "metrology"
    QUERY = "query"
    LABELING = "labeling"
    DATA_INTEGRATION = "data_integration"
201
+
202
+
203
class AbstractQueue(Protocol[T]):
    """
    Defines a generic queue protocol for data that is passed between active
    learning components.

    This can be a simple local `queue.Queue`, or a more sophisticated
    distributed queue system.

    The primary use case for this is to allow a query strategy to
    enqueue some data structure for the labeling strategy to consume,
    and once the labeling is done, enqueue to a data serialization
    workflow. While there is no explicit restriction on the **type**
    of queue that is implemented, a reasonable assumption to make
    would be a FIFO queue, unless otherwise specified by the concrete
    implementation.

    Optional Serialization Methods
    -------------------------------
    Implementations may optionally provide `to_list()` and `from_list()`
    methods for checkpoint serialization. If not provided, the queue
    will be serialized using `torch.save()` as a fallback.

    Type Parameters
    ---------------
    T
        The type of items that will be stored in the queue.
    """

    def put(self, item: T) -> None:
        """
        Method to put a data structure into the queue.

        Parameters
        ----------
        item: T
            The data structure to put into the queue.
        """
        ...

    def get(self) -> T:
        """
        Method to get a data structure from the queue.

        This method should remove the data structure from the queue,
        and return it to a consumer.

        Returns
        -------
        T
            The data structure that was removed from the queue.
        """
        ...

    def empty(self) -> bool:
        """
        Method to check if the queue is empty/has been depleted.

        Returns
        -------
        bool
            True if the queue is empty, False otherwise.
        """
        ...
266
+
267
+
268
class DataPool(Protocol[T]):
    """
    An abstract protocol for some reservoir of data that is
    used for some part of active learning, parametrized such
    that it will return data structures of an arbitrary type ``T``.

    **All** methods are left abstract, and need to be defined
    by concrete implementations. For the most part, a `torch.utils.data.Dataset`
    would match this protocol, provided that it implements the ``append`` method
    which will allow data to be persisted to a filesystem.

    Methods
    -------
    __getitem__(self, index: int) -> T:
        Method to get a single data structure from the data pool.
    __len__(self) -> int:
        Method to get the length of the data pool.
    __iter__(self) -> Iterator[T]:
        Method to iterate over the data pool.
    append(self, item: T) -> None:
        Method to append a data structure to the data pool.
    """

    def __getitem__(self, index: int) -> T:
        """
        Method to get a data structure from the data pool.

        This method should retrieve an item from the pool by a
        flat index.

        Parameters
        ----------
        index: int
            The index of the data structure to get.

        Returns
        -------
        T
            The data structure at the given index.
        """
        ...

    def __len__(self) -> int:
        """
        Method to get the length of the data pool.

        Returns
        -------
        int
            The length of the data pool.
        """
        ...

    def __iter__(self) -> Iterator[T]:
        """
        Method to iterate over the data pool.

        This method should return an iterator over the data pool.

        Returns
        -------
        Iterator[T]
            An iterator over the data pool.
        """
        ...

    def append(self, item: T) -> None:
        """
        Method to append a data structure to the data pool.

        For persistent storage pools, this will actually mean that the
        ``item`` is serialized to a filesystem.

        Parameters
        ----------
        item: T
            The data structure to append to the data pool.
        """
        ...
347
+
348
+
349
+ class ActiveLearningProtocol(Protocol):
350
+ """
351
+ This protocol acts as a basis for all active learning protocols.
352
+
353
+ This ensures that all protocols have some common interface, for
354
+ example the ability to `attach` to another object for scope
355
+ management.
356
+
357
+ Attributes
358
+ ----------
359
+ __protocol_name__: str
360
+ The name of the protocol. This is primarily used for `repr`
361
+ and `str` f-strings. This should be defined by concrete
362
+ implementations.
363
+ _args: dict[str, Any]
364
+ A dictionary of arguments that were used to instantiate the protocol.
365
+ This is used for serialization and deserialization of the protocol,
366
+ and follows the same pattern as the ``_args`` attribute of
367
+ ``physicsnemo.Module``.
368
+
369
+ Methods
370
+ -------
371
+ attach(self, other: object) -> None:
372
+ This method is used to attach the current object to another,
373
+ allowing the protocol to access the attached object's scope.
374
+ The use case for this is to allow a protocol access to the
375
+ driver's scope to access dataset, model, etc. as needed.
376
+ This needs to be implemented by concrete implementations.
377
+ is_attached: bool
378
+ Whether the current object is attached to another object.
379
+ This is left abstract, as it depends on how ``attach`` is implemented.
380
+ logger: Logger
381
+ The logger for this protocol. This is used to log information
382
+ about the protocol's progress.
383
+ _setup_logger(self) -> None:
384
+ This method is used to setup the logger for the protocol.
385
+ The default implementation is to configure the logger similarly
386
+ to how ``physicsnemo`` loggers are configured.
387
+ """
388
+
389
+ __protocol_name__: str
390
+ __protocol_type__: ActiveLearningPhase
391
+ _args: dict[str, Any]
392
+
393
    def __new__(cls, *args: Any, **kwargs: Any) -> ActiveLearningProtocol:
        """
        Wrapper for instantiating any subclass of `ActiveLearningProtocol`.

        This method will use `inspect` to capture arguments and keyword
        arguments that were used to instantiate the protocol, and stash
        them into the `_args` attribute of the instance, following
        what is done with `physicsnemo.Module`.

        This approach is useful for reconstructing strategies from checkpoints.

        Parameters
        ----------
        args: Any
            Arguments to pass to the protocol's constructor.
        kwargs: Any
            Keyword arguments to pass to the protocol's constructor.

        Returns
        -------
        ActiveLearningProtocol
            A new instance of the protocol class. The instance will have an
            `_args` attribute that contains the keys `__name__`, `__module__`,
            and `__args__` as metadata for the protocol.
        """
        out = super().__new__(cls)

        # Get signature of __init__ function
        sig = inspect.signature(cls.__init__)

        # Bind args and kwargs to signature
        bound_args = sig.bind_partial(
            *([None] + list(args)), **kwargs
        )  # Add None to account for self
        bound_args.apply_defaults()

        # Get args and kwargs (excluding self and unroll kwargs)
        # NOTE(review): this zip assumes `bound_args.arguments` contains an
        # entry for every parameter in signature order; that holds after
        # `apply_defaults()` as long as all required arguments were supplied
        # — confirm for partially-bound cases.
        instantiate_args = {}
        for param, (k, v) in zip(sig.parameters.values(), bound_args.arguments.items()):
            # Skip self
            if k == "self":
                continue

            # Flatten **kwargs into the mapping; everything else is stored
            # under its parameter name.
            if param.kind == param.VAR_KEYWORD:
                instantiate_args.update(v)
            else:
                instantiate_args[k] = v

        # Store args needed for instantiation
        out._args = {
            "__name__": cls.__name__,
            "__module__": cls.__module__,
            "__args__": instantiate_args,
        }
        return out
449
+
450
    def attach(self, other: object) -> None:
        """
        This method is used to attach another object to the current protocol,
        allowing the attached object to access the scope of this protocol.
        The primary reason for this is to allow the protocol to access
        things like the dataset, the learner model, etc. as needed.

        Example use cases would be for a query strategy to access the ``unlabeled_pool``;
        for a metrology strategy to access the ``validation_pool``, and for any
        strategy to be able to access the surrogate/learner model.

        This method can be as simple as setting ``self.driver = other``, but
        is left abstract in case there are other potential use cases
        where multiple protocols could share information.

        Parameters
        ----------
        other: object
            The object to attach to.
        """
        # Protocol stub: concrete implementations must provide a body.
        ...
471
+
472
    @property
    def is_attached(self) -> bool:
        """
        Property to check if the current object is already attached.

        This is left abstract, as it depends on how ``attach`` is implemented.

        Returns
        -------
        bool
            True if the current object is attached, False otherwise.
        """
        # Protocol stub: concrete implementations must provide a body.
        ...
485
+
486
@property
def logger(self) -> Logger:
    """
    Lazily configured logger for this protocol.

    The first access triggers ``_setup_logger`` to create and cache the
    logger; subsequent accesses return the cached ``_logger``.

    Returns
    -------
    Logger
        The logger for this protocol.
    """
    if hasattr(self, "_logger"):
        return self._logger
    # First access: delegate creation to the setup hook, then return
    # whatever it stored.
    self._setup_logger()
    return self._logger

@logger.setter
def logger(self, logger: Logger) -> None:
    """
    Replace the logger used by this protocol.

    Parameters
    ----------
    logger: Logger
        The logger instance to store.
    """
    self._logger = logger
514
+
515
def _setup_logger(self) -> None:
    """
    Create this protocol's logger under the ``core.active_learning``
    hierarchy, named after the protocol, and set its level to WARNING.

    Each protocol gets its own logger so output can be filtered per
    strategy.
    """
    # Assumes `__protocol_name__` is set on the concrete class --
    # TODO confirm where it is assigned.
    self.logger = logging.getLogger(
        f"core.active_learning.{self.__protocol_name__}"
    )
    # Don't add handlers here - let the parent logger handle formatting
    # This prevents duplicate console output
    self.logger.setLevel(logging.WARNING)
527
+
528
@property
def strategy_dir(self) -> Path:
    """
    Return the directory the underlying strategy can use to persist
    data, creating it if necessary.

    The path is ``<driver log_dir>/<protocol phase>/<class name>``.
    Depending on the strategy abstraction, further nesting may be
    required (e.g. active learning step index, phase, etc.).

    Returns
    -------
    Path
        Directory where this strategy may persist its data.

    Raises
    ------
    RuntimeError
        If the strategy is not attached to a driver yet.
    """
    if not self.is_attached:
        raise RuntimeError(
            f"{self.__class__.__name__} is not attached to a driver yet."
        )
    path = (
        self.driver.log_dir / str(self.__protocol_type__) / self.__class__.__name__
    )
    path.mkdir(parents=True, exist_ok=True)
    return path
557
+
558
@property
def checkpoint_dir(self) -> Path:
    """
    Convenience accessor for the checkpoint directory, created on
    demand.

    Useful for (de)serializing data tied to checkpointing; the path
    includes the driver's current active learning step index.

    Returns
    -------
    Path
        ``<driver log_dir>/checkpoints/step_<idx>``.

    Raises
    ------
    RuntimeError
        If the strategy is not attached to a driver yet.
    """
    if not self.is_attached:
        raise RuntimeError(
            f"{self.__class__.__name__} is not attached to a driver yet."
        )
    step_idx = self.driver.active_learning_step_idx
    target = self.driver.log_dir / "checkpoints" / f"step_{step_idx}"
    target.mkdir(parents=True, exist_ok=True)
    return target
586
+
587
+
588
class QueryStrategy(ActiveLearningProtocol):
    """
    Protocol for active learning query strategies.

    A query strategy decides which data points should be sent for
    labeling: concrete implementations choose the selection heuristic
    and how many samples to request.

    Attributes
    ----------
    max_samples: int
        Maximum number of samples to query; either an exact count or an
        upper bound for threshold-based selection.
    """

    max_samples: int
    __protocol_type__ = ActiveLearningPhase.QUERY

    def sample(self, query_queue: AbstractQueue[T], *args: Any, **kwargs: Any) -> None:
        """
        Select samples and enqueue them for labeling.

        The active learning driver supplies ``query_queue``;
        implementations push the chosen samples onto it in place and
        return nothing. Extra positional and keyword arguments carry
        strategy-specific information.

        Parameters
        ----------
        query_queue: AbstractQueue[T]
            Queue onto which samples selected for labeling are pushed.
        args: Any
            Additional positional arguments for the strategy.
        kwargs: Any
            Additional keyword arguments for the strategy.
        """
        ...

    def __call__(
        self, query_queue: AbstractQueue[T], *args: Any, **kwargs: Any
    ) -> None:
        """
        Invoke the strategy as a callable; forwards to :meth:`sample`.

        Parameters
        ----------
        query_queue: AbstractQueue[T]
            Queue onto which samples selected for labeling are pushed.
        args: Any
            Forwarded to :meth:`sample`.
        kwargs: Any
            Forwarded to :meth:`sample`.
        """
        self.sample(query_queue, *args, **kwargs)
651
+
652
+
653
class LabelStrategy(ActiveLearningProtocol):
    """
    Protocol for active learning label strategies.

    A label strategy produces labels for queried data points. It may be
    as lightweight as a plain Python function for demonstrating a
    concept, or wrap an external, potentially slow and complex process.

    Attributes
    ----------
    __is_external_process__: bool
        Whether labeling runs in an external process.
    __provides_fields__: set[str] | None
        Names of the fields this strategy fills in; concrete
        implementations set this and use it to write/map labeled data
        onto fields of the data structure ``T``.
    """

    __is_external_process__: bool
    __provides_fields__: set[str] | None = None
    __protocol_type__ = ActiveLearningPhase.LABELING

    def label(
        self,
        queue_to_label: AbstractQueue[T],
        serialize_queue: AbstractQueue[T],
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Consume queued samples, label them, and enqueue the results.

        The driver passes ``queue_to_label`` (typically populated by
        the query strategies); labeled items are pushed onto
        ``serialize_queue`` for persistence.

        Parameters
        ----------
        queue_to_label: AbstractQueue[T]
            Queue of samples awaiting labels.
        serialize_queue: AbstractQueue[T]
            Queue that receives labeled samples for serialization.
        args: Any
            Additional positional arguments for the strategy.
        kwargs: Any
            Additional keyword arguments for the strategy.
        """
        ...

    def __call__(
        self,
        queue_to_label: AbstractQueue[T],
        serialize_queue: AbstractQueue[T],
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Invoke the strategy as a callable; forwards to :meth:`label`.

        Parameters
        ----------
        queue_to_label: AbstractQueue[T]
            Queue of samples awaiting labels.
        serialize_queue: AbstractQueue[T]
            Queue that receives labeled samples for serialization.
        args: Any
            Forwarded to :meth:`label`.
        kwargs: Any
            Forwarded to :meth:`label`.
        """
        self.label(queue_to_label, serialize_queue, *args, **kwargs)
728
+
729
+
730
class MetrologyStrategy(ActiveLearningProtocol):
    """
    Protocol for active learning metrology strategies.

    Metrology assesses improvement of the underlying model beyond plain
    validation metrics, reflecting the application's requirements
    (which may include running a simulation). Each computed view is
    stored as a record of type ``S``.

    Attributes
    ----------
    records: list[S]
        History of records accumulated over the active learning run, as
        seen through this particular metrology view.
    """

    records: list[S]
    __protocol_type__ = ActiveLearningPhase.METROLOGY

    def append(self, record: S) -> None:
        """
        Add a record to this strategy's history.

        Parameters
        ----------
        record: S
            The record to store.
        """
        self.records.append(record)

    def __len__(self) -> int:
        """
        Number of records accumulated so far.

        Returns
        -------
        int
            The current record count.
        """
        return len(self.records)

    def serialize_records(
        self, path: Path | None = None, *args: Any, **kwargs: Any
    ) -> None:
        """
        Persist the accumulated records.

        Concrete implementations decide the storage format (JSON file,
        database, etc.); ``strategy_dir`` gives a sensible default
        location.

        Parameters
        ----------
        path: Path | None
            Destination for the records. When omitted, implementations
            should fall back to a reasonable default, such as the
            checkpoint directory or ``strategy_dir``.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...

    def load_records(self, path: Path | None = None, *args: Any, **kwargs: Any) -> None:
        """
        Restore records persisted by :meth:`serialize_records`,
        overwriting the ``records`` attribute in place.

        Parameters
        ----------
        path: Path | None
            Location to load records from. When omitted,
            implementations should load the latest records as a
            sensible default.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...

    def compute(self, *args: Any, **kwargs: Any) -> None:
        """
        Compute this metrology view of model performance.

        No data is passed directly; implementations draw what they need
        from the ``DataPool`` instances connected to the driver, format
        the result as a record ``S``, and append it to ``records``.

        Parameters
        ----------
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...

    def __call__(self, *args: Any, **kwargs: Any) -> None:
        """
        Invoke the strategy as a callable; forwards to :meth:`compute`.

        Parameters
        ----------
        args: Any
            Forwarded to :meth:`compute`.
        kwargs: Any
            Forwarded to :meth:`compute`.
        """
        self.compute(*args, **kwargs)

    def reset(self) -> None:
        """
        Reset stateful attributes; by default, empties ``records``.
        """
        self.records = []
863
+
864
+
865
class TrainingProtocol(Protocol):
    """
    Interface for a single training step: given a model and input data,
    return the reduced, differentiable loss tensor.

    Any function matching ``__call__``'s signature satisfies this
    protocol.
    """

    def __call__(
        self, model: Module, data: T, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """
        Run one training step on a sample or batch.

        For a PhysicsNeMo ``Module`` with trainable parameters, the
        returned tensor must be ready for ``backward``. Any logging
        tied to training belongs inside this function. For best
        performance, the implementation should be wrappable with
        ``StaticCaptureTraining``.

        Parameters
        ----------
        model: Module
            The model being trained.
        data: T
            Batch holding both inputs and ground truths for the loss.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.

        Returns
        -------
        torch.Tensor
            The reduced, differentiable loss tensor.

        Example
        -------
        Minimum viable implementation:
        >>> import torch
        >>> def training_step(model, data):
        ...     output = model(data)
        ...     loss = torch.sum(torch.pow(output - data, 2))
        ...     return loss
        """
        ...
916
+
917
+
918
class ValidationProtocol(Protocol):
    """
    Interface for a single validation step: given a model and input
    data, compute metrics of interest and, if relevant, log them.

    Any function matching ``__call__``'s signature satisfies this
    protocol.
    """

    def __call__(self, model: Module, data: T, *args: Any, **kwargs: Any) -> None:
        """
        Run one validation step on a sample or batch.

        Called during validation steps **only** — query and metrology
        steps use ``inference_step`` instead. This function should not
        return anything: it computes metrics over a validation/test set
        and performs any required logging itself.

        If the model's forward pass does not need autograd, consider
        wrapping implementations with ``StaticCaptureEvaluateNoGrad``
        for performance.

        Parameters
        ----------
        model: Module
            The model being validated.
        data: T
            Batch holding both inputs and ground truths.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.

        Example
        -------
        Minimum viable implementation (metrics are logged, not
        returned, to honor the "return nothing" contract):
        >>> import torch
        >>> def validation_step(model, data):
        ...     output = model(data)
        ...     loss = torch.sum(torch.pow(output - data, 2))
        ...     print(f"validation loss: {loss.item()}")
        """
        ...
968
+
969
+
970
class InferenceProtocol(Protocol):
    """
    Interface for a single inference step: given a model and input
    data, return the output of the model's forward pass.

    Any function matching ``__call__``'s signature satisfies this
    protocol.
    """

    def __call__(self, model: Module, data: S, *args: Any, **kwargs: Any) -> Any:
        """
        Run one inference step on a sample or batch.

        Called during query and metrology steps. The output should be
        minimally processed, leaving transformations to the strategies
        that use this protocol. Unlike the training and validation
        protocols, the data structure ``S`` need not carry ground
        truths for a loss.

        As with ``ValidationProtocol``, consider wrapping
        implementations with ``StaticCaptureInference`` when the
        architecture allows it.

        Parameters
        ----------
        model: Module
            The model used for inference.
        data: S
            Batch of inputs only.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.

        Returns
        -------
        Any
            The output of the model's forward pass.

        Example
        -------
        Minimum viable implementation:
        >>> def inference_step(model, data):
        ...     output = model(data)
        ...     return output
        """
        ...
1021
+
1022
+
1023
class TrainingLoop(Protocol):
    """
    Protocol for the training loop invoked during the training phase of
    active learning: train the model for a number of epochs or steps,
    optionally validating along the way.

    When the model implements ``LearnerProtocol``, ``train_fn`` and
    ``validate_fn`` become optional; if supplied anyway, they take
    precedence over the learner's own methods.

    If graph capture/compilation is intended, wrap ``train_fn`` and
    ``validate_fn`` with ``StaticCaptureTraining`` and
    ``StaticCaptureEvaluateNoGrad``, respectively.
    """

    def __call__(
        self,
        model: Module | LearnerProtocol,
        optimizer: Optimizer,
        train_dataloader: DataLoader,
        validation_dataloader: DataLoader | None = None,
        train_step_fn: TrainingProtocol | None = None,
        validate_step_fn: ValidationProtocol | None = None,
        max_epochs: int | None = None,
        max_train_steps: int | None = None,
        max_val_steps: int | None = None,
        lr_scheduler: _LRScheduler | None = None,
        device: str | torch.device | None = None,
        dtype: torch.dtype | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Minimal viable training loop.

        ``model``'s parameters, tracked by ``optimizer``, are updated
        either over ``max_epochs`` full passes of ``train_dataloader``,
        or over ``max_train_steps`` batches (mutually exclusive
        options). ``max_train_steps`` may exceed one epoch, in which
        case the data is recycled until the step budget is exhausted.

        Validation runs at the end of each epoch (or when the training
        step budget is reached), only when both ``validate_step_fn``
        and ``validation_dataloader`` are provided; ``max_val_steps``
        caps the validation batches per epoch.

        Pseudocode for epoch-based training:

        .. code-block:: python

            for epoch in range(max_epochs):
                for train_idx, batch in enumerate(train_dataloader):
                    optimizer.zero_grad()
                    loss = train_step_fn(model, batch)
                    loss.backward()
                    optimizer.step()
                    if train_idx + 1 == max_train_steps:
                        break
                if validate_step_fn and validation_dataloader:
                    for val_idx, batch in enumerate(validation_dataloader):
                        validate_step_fn(model, batch)
                        if val_idx + 1 == max_val_steps:
                            break

        With a ``LearnerProtocol`` model, ``model.training_step(batch)``
        and ``model.validation_step(batch)`` replace the step functions.
        The key difference: a ``train_step_fn`` excludes the backward
        pass and optimizer step, whereas ``LearnerProtocol`` methods
        encapsulate them.

        ``device`` and ``dtype`` select where and how the loop runs;
        when omitted, reasonable defaults such as
        ``torch.get_default_device()`` / ``torch.get_default_dtype()``
        should be used.

        Parameters
        ----------
        model: Module | LearnerProtocol
            The model to train.
        optimizer: Optimizer
            Optimizer tracking the model's parameters.
        train_dataloader: DataLoader
            Source of training batches.
        validation_dataloader: DataLoader | None
            Source of validation batches, if any.
        train_step_fn: TrainingProtocol | None
            Training step function. Optional only when ``model``
            implements ``LearnerProtocol``; when both are present this
            function takes precedence over
            ``LearnerProtocol.training_step``.
        validate_step_fn: ValidationProtocol | None
            Validation step function, used only together with
            ``validation_dataloader``; when ``model`` implements
            ``LearnerProtocol``, this takes precedence over
            ``LearnerProtocol.validation_step``.
        max_epochs: int | None
            Number of epochs to train. Mutually exclusive with
            ``max_train_steps``.
        max_train_steps: int | None
            Number of training batches to run. Mutually exclusive with
            ``max_epochs``; if larger than one epoch, data is recycled.
        max_val_steps: int | None
            Cap on validation batches per training epoch; ``None`` uses
            the full validation set.
        lr_scheduler: _LRScheduler | None
            Optional scheduler used to adjust the optimizer's learning
            rate during training; when omitted, the learning rate is
            left untouched by this function.
        device: str | torch.device | None
            Device for the training loop.
        dtype: torch.dtype | None
            Dtype for the training loop.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...
1164
+
1165
+
1166
class LearnerProtocol:
    """
    Protocol for the learner (surrogate) part of an active learning
    algorithm: a set of trainable parameters that is optimized and then
    used for inference and evaluation.

    The required methods cover every active learning phase. As with the
    other protocols in this module, this is only the required
    interface, not an implementation.
    """

    def training_step(self, data: T, *args: Any, **kwargs: Any) -> torch.Tensor:
        """
        Run the training logic for a single batch.

        Called in training steps **only** — gradients are computed and
        used to update parameters here (the backward pass and optimizer
        step are encapsulated). Mirrors ``TrainingProtocol`` with the
        model being ``self``; accordingly the returned loss is what
        ``TrainingLoop`` consumes via
        ``loss = model.training_step(batch)``.

        Where gradients are not needed, implement ``validation_step``
        instead.

        Parameters
        ----------
        data: T
            Batch of training data (inputs and ground truths).
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.

        Returns
        -------
        torch.Tensor
            The reduced loss for the batch.
        """
        ...

    def validation_step(self, data: T, *args: Any, **kwargs: Any) -> None:
        """
        Run the validation logic for a single batch.

        May match the forward pass, with no weight updates. Called in
        validation steps **only** — query and metrology steps use
        ``inference_step`` instead. Mirrors ``ValidationProtocol`` with
        the model being ``self``.

        Parameters
        ----------
        data: T
            Batch of validation data (inputs and ground truths).
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...

    def inference_step(self, data: T | S, *args: Any, **kwargs: Any) -> Any:
        """
        Run the inference logic for a single batch.

        May match the forward pass exactly, but gives implementations
        the chance to differentiate (or not). Called during query and
        metrology steps. Mirrors ``InferenceProtocol`` with the model
        being ``self``, and so returns the (minimally processed)
        forward-pass output.

        Parameters
        ----------
        data: T | S
            Batch to run inference on; ground truths are not required.
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.

        Returns
        -------
        Any
            The output of the forward pass.
        """
        ...

    @property
    def parameters(self) -> Iterator[torch.Tensor]:
        """
        Iterator over the learner's parameters.

        NOTE(review): declared as a property here, whereas
        ``torch.nn.Module.parameters`` is a method — confirm intended;
        a ``Module`` subclass will expose a callable, not a property.

        Returns
        -------
        Iterator[torch.Tensor]
            An iterator over the parameters of the learner.
        """
        ...

    def forward(self, *args: Any, **kwargs: Any) -> Any:
        """
        Forward pass for a single batch.

        Used across all active learning steps; contains the logic for
        how the model ingests data and produces predictions.

        Parameters
        ----------
        args: Any
            Additional positional arguments for the model.
        kwargs: Any
            Additional keyword arguments for the model.

        Returns
        -------
        Any
            The output of the model's forward pass.
        """
        ...
1288
+
1289
+
1290
class DriverProtocol:
    """
    Reference interface for an active learning driver; see the
    ``driver`` module for a concrete implementation. Provided mostly
    for type hinting without incurring circular imports.

    Attributes
    ----------
    learner: LearnerProtocol
        Learner module used as the surrogate within the loop.
    query_strategies: list[QueryStrategy]
        Strategies applied in sequence to populate ``query_queue`` with
        samples to label.
    query_queue: AbstractQueue[T]
        Queue of samples awaiting labels; ``QueryStrategy`` instances
        enqueue onto it.
    label_strategy: LabelStrategy | None
        Single (optional) labeling strategy — unlike the other
        strategies, only one is supported. It consumes ``query_queue``
        and enqueues labeled data onto ``label_queue``.
    label_queue: AbstractQueue[T] | None
        Queue of freshly labeled data; the driver serializes its
        contents to a persistent format.
    metrology_strategies: list[MetrologyStrategy] | None
        Strategies, applied in sequence, that assess surrogate
        performance.
    training_pool: DataPool[T]
        Pool of training data; mutable, so newly labeled data can be
        added over the course of active learning.
    validation_pool: DataPool[T] | None
        Pool used for both conventional validation and metrology;
        treated as immutable over the run. Optional, as both phases
        are.
    unlabeled_pool: DataPool[T] | None
        Optional pool that query strategies may deplete when selecting
        points to label. Conceptually this can also be a generative
        model — a distribution of data rather than a static dataset.
    """

    learner: LearnerProtocol
    query_strategies: list[QueryStrategy]
    query_queue: AbstractQueue[T]
    label_strategy: LabelStrategy | None
    label_queue: AbstractQueue[T] | None
    metrology_strategies: list[MetrologyStrategy] | None
    training_pool: DataPool[T]
    validation_pool: DataPool[T] | None
    unlabeled_pool: DataPool[T] | None

    def active_learning_step(self, *args: Any, **kwargs: Any) -> None:
        """
        Run a single pass of the active learning loop.

        Intended order: training, metrology, query, labeling — with
        metrology and labeling being optional.

        Parameters
        ----------
        args: Any
            Additional positional arguments.
        kwargs: Any
            Additional keyword arguments.
        """
        ...

    def _setup_logger(self) -> None:
        """
        Configure the driver's logger.

        Concrete implementations should support scoped logging, e.g.
        including the active learning iteration count.
        """
        ...

    def attach_strategies(self) -> None:
        """
        Call ``attach(self)`` on every configured strategy, giving each
        one access to the driver's scope.

        This lets, for example, any non-label strategy reach the
        learner, a query strategy reach ``unlabeled_pool``, and a
        metrology strategy reach ``validation_pool``.
        """
        for strategy in self.query_strategies:
            strategy.attach(self)
        if self.label_strategy:
            self.label_strategy.attach(self)
        for strategy in self.metrology_strategies or []:
            strategy.attach(self)
physics_mcp/source/physicsnemo/constants.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ constant values used by PhysicsNeMo
19
+ """
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
# Separator token used when encoding derivative names as strings.
diff_str: str = "__"


def diff(y: str, x: str, degree: int = 1) -> str:
    """Build the derivative string for ``y`` differentiated w.r.t. ``x``.

    Parameters
    ----------
    y : str
        Name of the dependent variable.
    x : str
        Name of the independent variable.
    degree : int, optional
        Order of the derivative, by default 1.

    Returns
    -------
    str
        The variable names joined by ``diff_str``, e.g.
        ``diff("u", "x", 2) == "u__x__x"``.
    """
    parts = [y]
    for _ in range(degree):
        parts.append(x)
    return diff_str.join(parts)
31
+
32
+
33
# Default tensor / array dtypes used across PhysicsNeMo.
# Change here for float16 or float64 precision.
tf_dt: torch.dtype = torch.float32
np_dt: type = np.float32

# tensorboard naming
TF_SUMMARY: bool = False

# Pytorch Version for which JIT will be default on
# Torch version of NGC container 22.08
JIT_PYTORCH_VERSION: str = "1.13.0a0+d321be6"

# No scaling is needed if using NO_OP_SCALE
# NOTE(review): presumably a (shift, scale) pair — confirm against consumers.
NO_OP_SCALE = (0.0, 1.0)

# If using NO_OP_NORM, it is effectively doing no normalization
# NOTE(review): presumably a (min, max) normalization range — confirm against consumers.
NO_OP_NORM = (-1.0, 1.0)
physics_mcp/source/physicsnemo/datapipes/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
physics_mcp/source/physicsnemo/datapipes/benchmarks/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
physics_mcp/source/physicsnemo/datapipes/benchmarks/darcy.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import sys
18
+ from dataclasses import dataclass
19
+ from typing import Dict, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import warp as wp
24
+
25
+ from ..datapipe import Datapipe
26
+ from ..meta import DatapipeMetaData
27
+ from .kernels.finite_difference import (
28
+ darcy_mgrid_jacobi_iterative_batched_2d,
29
+ mgrid_inf_residual_batched_2d,
30
+ )
31
+ from .kernels.initialization import init_uniform_random_4d
32
+ from .kernels.utils import (
33
+ bilinear_upsample_batched_2d,
34
+ fourier_to_array_batched_2d,
35
+ threshold_3d,
36
+ )
37
+
38
+ Tensor = torch.Tensor
39
+ # TODO unsure if better to remove this. Keeping this in for now
40
+ wp.init()
41
+
42
+
43
@dataclass
class MetaData(DatapipeMetaData):
    """Metadata flags for the Darcy2D datapipe."""

    name: str = "Darcy2D"
    # Optimization
    auto_device: bool = True
    cuda_graphs: bool = True
    # Parallel
    ddp_sharding: bool = False
51
+
52
+
53
class Darcy2D(Datapipe):
    """2D Darcy flow benchmark problem datapipe.

    This datapipe continuously generates solutions to the 2D Darcy equation with
    variable permeability. All samples are generated on the fly and are meant to be
    a benchmark problem for testing data driven models. Permeability is drawn from
    a random Fourier series and thresholded to give a piecewise constant function.
    The solution is obtained using a GPU enabled multi-grid Jacobi iterative method.

    Parameters
    ----------
    resolution : int, optional
        Resolution to run simulation at, by default 256
    batch_size : int, optional
        Batch size of simulations, by default 64
    nr_permeability_freq : int, optional
        Number of frequencies to use for generating random permeability. Higher
        values will give higher freq permeability fields, by default 5
    max_permeability : float, optional
        Max permeability, by default 2.0
    min_permeability : float, optional
        Min permeability, by default 0.5
    max_iterations : int, optional
        Maximum iterations to use for each multi-grid, by default 30000
    convergence_threshold : float, optional
        Solver L-Infinity convergence threshold, by default 1e-6
    iterations_per_convergence_check : int, optional
        Number of Jacobi iterations to run before checking convergence, by default 1000
    nr_multigrids : int, optional
        Number of multi-grid levels, by default 4
    normaliser : Union[Dict[str, Tuple[float, float]], None], optional
        Dictionary with keys ``permeability`` and ``darcy``. The values for these
        keys are two floats corresponding to mean and std ``(mean, std)``.
    device : Union[str, torch.device], optional
        Device for datapipe to place data on, by default "cuda"

    Raises
    ------
    ValueError
        Incompatible multi-grid and resolution settings
    """

    def __init__(
        self,
        resolution: int = 256,
        batch_size: int = 64,
        nr_permeability_freq: int = 5,
        max_permeability: float = 2.0,
        min_permeability: float = 0.5,
        max_iterations: int = 30000,
        convergence_threshold: float = 1e-6,
        iterations_per_convergence_check: int = 1000,
        nr_multigrids: int = 4,
        normaliser: Union[Dict[str, Tuple[float, float]], None] = None,
        device: Union[str, torch.device] = "cuda",
    ):
        super().__init__(meta=MetaData())

        # simulation params
        self.resolution = resolution
        self.batch_size = batch_size
        self.nr_permeability_freq = nr_permeability_freq
        self.max_permeability = max_permeability
        self.min_permeability = min_permeability
        self.max_iterations = max_iterations
        self.convergence_threshold = convergence_threshold
        self.iterations_per_convergence_check = iterations_per_convergence_check
        self.nr_multigrids = nr_multigrids
        self.normaliser = normaliser

        # check normaliser keys
        if self.normaliser is not None:
            if not {"permeability", "darcy"}.issubset(set(self.normaliser.keys())):
                raise ValueError(
                    "normaliser need to have keys permeability and darcy with mean and std"
                )

        # Set up device for warp, warp has same naming convention as torch.
        if isinstance(device, torch.device):
            device = str(device)
        self.device = device

        # spatial dims
        self.dx = 1.0 / (self.resolution + 1)  # pad edges by 1 for multi-grid
        self.dim = (self.batch_size, self.resolution + 1, self.resolution + 1)
        # [cos/sin combinations, batch, freq, freq] coefficients for the
        # random Fourier series that seeds the permeability field
        self.fourier_dim = (
            4,
            self.batch_size,
            self.nr_permeability_freq,
            self.nr_permeability_freq,
        )

        # assert resolution is compatible with multi-grid method
        if (resolution % 2 ** (nr_multigrids - 1)) != 0:
            raise ValueError("Resolution is incompatible with number of sub grids.")

        # allocate arrays for constructing dataset
        self.darcy0 = wp.zeros(self.dim, dtype=float, device=self.device)
        self.darcy1 = wp.zeros(self.dim, dtype=float, device=self.device)
        self.permeability = wp.zeros(self.dim, dtype=float, device=self.device)
        self.rand_fourier = wp.zeros(self.fourier_dim, dtype=float, device=self.device)
        self.inf_residual = wp.zeros([1], dtype=float, device=self.device)

        # Output tensors (created lazily on first iteration, then reused so the
        # yielded tensors are static allocations suitable for CUDA graphs)
        self.output_k = None
        self.output_p = None

    def initialize_batch(self) -> None:
        """Initializes arrays for new batch of simulations"""

        # initialize permeability: draw random Fourier coefficients, evaluate the
        # series on the grid, then threshold to a piecewise constant field
        self.permeability.zero_()
        seed = np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)
        wp.launch(
            kernel=init_uniform_random_4d,
            dim=self.fourier_dim,
            inputs=[self.rand_fourier, -1.0, 1.0, seed],
            device=self.device,
        )
        wp.launch(
            kernel=fourier_to_array_batched_2d,
            dim=self.dim,
            inputs=[
                self.permeability,
                self.rand_fourier,
                self.nr_permeability_freq,
                self.resolution,
                self.resolution,
            ],
            device=self.device,
        )
        # threshold around 0: below -> min_permeability, above -> max_permeability
        wp.launch(
            kernel=threshold_3d,
            dim=self.dim,
            inputs=[
                self.permeability,
                0.0,
                self.min_permeability,
                self.max_permeability,
            ],
            device=self.device,
        )

        # zero darcy arrays
        self.darcy0.zero_()
        self.darcy1.zero_()

    def generate_batch(self) -> None:
        """Solve for new batch of simulations"""

        # initialize tensors with random permeability
        self.initialize_batch()

        # run solver, coarsest grid first, halving the reduction factor each level
        for res in range(self.nr_multigrids):
            # calculate grid reduction factor and reduced dim
            grid_reduction_factor = 2 ** (self.nr_multigrids - res - 1)
            if grid_reduction_factor > 1:
                multigrid_dim = tuple(
                    [self.batch_size] + 2 * [(self.resolution) // grid_reduction_factor]
                )
            else:
                multigrid_dim = self.dim

            # run till max steps is reached
            for k in range(
                self.max_iterations // self.iterations_per_convergence_check
            ):
                # run jacobi iterations
                for s in range(self.iterations_per_convergence_check):
                    # iterate solver
                    wp.launch(
                        kernel=darcy_mgrid_jacobi_iterative_batched_2d,
                        dim=multigrid_dim,
                        inputs=[
                            self.darcy0,
                            self.darcy1,
                            self.permeability,
                            1.0,
                            self.dim[1],
                            self.dim[2],
                            self.dx,
                            grid_reduction_factor,
                        ],
                        device=self.device,
                    )

                    # swap buffers
                    (self.darcy0, self.darcy1) = (self.darcy1, self.darcy0)

                # compute residual (L-infinity norm between successive iterates;
                # .numpy() forces a device sync here)
                self.inf_residual.zero_()
                wp.launch(
                    kernel=mgrid_inf_residual_batched_2d,
                    dim=multigrid_dim,
                    inputs=[
                        self.darcy0,
                        self.darcy1,
                        self.inf_residual,
                        grid_reduction_factor,
                    ],
                    device=self.device,
                )
                normalized_inf_residual = self.inf_residual.numpy()[0]

                # check if converged (threshold relaxed on coarser grids)
                if normalized_inf_residual < (
                    self.convergence_threshold * grid_reduction_factor
                ):
                    break

            # upsample to higher resolution
            if grid_reduction_factor > 1:
                wp.launch(
                    kernel=bilinear_upsample_batched_2d,
                    dim=self.dim,
                    inputs=[
                        self.darcy0,
                        self.dim[1],
                        self.dim[2],
                        grid_reduction_factor,
                    ],
                    device=self.device,
                )

    def __iter__(self) -> Tuple[Tensor, Tensor]:
        """
        Yields
        ------
        Infinite iterator that returns a dict with keys ``permeability`` and
        ``darcy`` whose values are tensors of size [batch, 1, resolution,
        resolution].
        """
        # NOTE(review): each item yielded is a dict, not a tuple — the return
        # annotation above is inaccurate and kept only for compatibility.
        # infinite generator
        while True:
            # run simulation
            self.generate_batch()

            # convert warp arrays to pytorch (shares device memory, no copy)
            permeability = wp.to_torch(self.permeability)
            darcy = wp.to_torch(self.darcy0)

            # add channel dims
            permeability = torch.unsqueeze(permeability, axis=1)
            darcy = torch.unsqueeze(darcy, axis=1)

            # crop edges by 1 from multi-grid TODO messy
            permeability = permeability[:, :, : self.resolution, : self.resolution]
            darcy = darcy[:, :, : self.resolution, : self.resolution]

            # normalize values
            if self.normaliser is not None:
                permeability = (
                    permeability - self.normaliser["permeability"][0]
                ) / self.normaliser["permeability"][1]
                darcy = (darcy - self.normaliser["darcy"][0]) / self.normaliser[
                    "darcy"
                ][1]

            # CUDA graphs static copies: keep yielding the same tensor objects,
            # refreshing their contents in place
            if self.output_k is None:
                self.output_k = permeability
                self.output_p = darcy
            else:
                self.output_k.data.copy_(permeability)
                self.output_p.data.copy_(darcy)

            yield {"permeability": self.output_k, "darcy": self.output_p}

    def __len__(self):
        # effectively infinite: samples are generated on the fly forever
        return sys.maxsize
physics_mcp/source/physicsnemo/datapipes/benchmarks/kelvin_helmholtz.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import sys
18
+ from dataclasses import dataclass
19
+ from typing import Dict, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import warp as wp
24
+
25
+ from ..datapipe import Datapipe
26
+ from ..meta import DatapipeMetaData
27
+ from .kernels.finite_volume import (
28
+ euler_apply_flux_batched_2d,
29
+ euler_conserved_to_primitive_batched_2d,
30
+ euler_extrapolation_batched_2d,
31
+ euler_get_flux_batched_2d,
32
+ euler_primitive_to_conserved_batched_2d,
33
+ initialize_kelvin_helmoltz_batched_2d,
34
+ )
35
+ from .kernels.initialization import init_uniform_random_2d
36
+
37
+ Tensor = torch.Tensor
38
+ # TODO unsure if better to remove this
39
+ wp.init()
40
+
41
+
42
@dataclass
class MetaData(DatapipeMetaData):
    """Metadata flags for the KelvinHelmholtz2D datapipe."""

    name: str = "KelvinHelmholtz2D"
    # Optimization
    auto_device: bool = True
    cuda_graphs: bool = True
    # Parallel
    ddp_sharding: bool = False
50
+
51
+
52
class KelvinHelmholtz2D(Datapipe):
    """Kelvin-Helmholtz instability benchmark problem datapipe.

    This datapipe continuously generates samples with random initial conditions. All
    samples are generated on the fly and are meant to be a benchmark problem for
    testing data driven models. Initial conditions are given in the form of small
    perturbations. The solution is obtained using a GPU enabled Finite Volume Method.

    Parameters
    ----------
    resolution : int, optional
        Resolution to run simulation at, by default 512
    batch_size : int, optional
        Batch size of simulations, by default 16
    seq_length : int, optional
        Sequence length of output samples, by default 8
    nr_perturbation_freq : int, optional
        Number of frequencies to use for generating random initial perturbations, by default 5
    perturbation_range : float, optional
        Range to use for random perturbations. This value will be the max amplitude
        of the initial perturbation, by default 0.1
    nr_snapshots : int, optional
        Number of snapshots of simulation to generate for data generation. This will
        control how long the simulation is run for, by default 256
    iteration_per_snapshot : int, optional
        Number of finite volume steps to take between each snapshot. Each step size
        is fixed as the smallest possible value that satisfies the
        Courant-Friedrichs-Lewy condition, by default 32
    gamma : float, optional
        Heat capacity ratio, by default 5.0/3.0
    normaliser : Union[Dict[str, Tuple[float, float]], None], optional
        Dictionary with keys ``density``, ``velocity``, and ``pressure``. The values
        for these keys are two floats corresponding to mean and std ``(mean, std)``.
    device : Union[str, torch.device], optional
        Device for datapipe to place data on, by default "cuda"
    """

    def __init__(
        self,
        resolution: int = 512,
        batch_size: int = 16,
        seq_length: int = 8,
        nr_perturbation_freq: int = 5,
        perturbation_range: float = 0.1,
        nr_snapshots: int = 256,
        iteration_per_snapshot: int = 32,
        gamma: float = 5.0 / 3.0,
        normaliser: Union[Dict[str, Tuple[float, float]], None] = None,
        device: Union[str, torch.device] = "cuda",
    ):
        super().__init__(meta=MetaData())

        # simulation params
        self.resolution = resolution
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.nr_perturbation_freq = nr_perturbation_freq
        self.perturbation_range = perturbation_range
        self.nr_snapshots = nr_snapshots
        self.iteration_per_snapshot = iteration_per_snapshot
        self.gamma = gamma
        self.courant_fac = 0.4  # hard set
        self.normaliser = normaliser

        # check normaliser keys
        if self.normaliser is not None:
            if not {"density", "velocity", "pressure"}.issubset(
                set(self.normaliser.keys())
            ):
                raise ValueError(
                    "normaliser need to have keys `density`, `velocity` and `pressure` with mean and std"
                )

        # Set up device for warp, warp has same naming convention as torch.
        if isinstance(device, torch.device):
            device = str(device)
        self.device = device

        # spatial dims
        self.dx = 1.0 / resolution
        self.dt = (
            self.courant_fac * self.dx / (np.sqrt(self.gamma * 5.0) + 2.0)
        )  # hard set to smallest possible step needed
        self.vol = self.dx**2  # cell volume (area in 2D)
        self.dim = (self.batch_size, self.resolution, self.resolution)

        # allocate array for initial freq perturbation
        self.w = wp.zeros(
            (self.batch_size, self.nr_perturbation_freq),
            dtype=float,
            device=self.device,
        )

        # allocate conservation quantities (mass, momentum, energy)
        self.mass = wp.zeros(self.dim, dtype=float, device=self.device)
        self.mom = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.e = wp.zeros(self.dim, dtype=float, device=self.device)

        # allocate primitive quantities (density, velocity, pressure)
        self.rho = wp.zeros(self.dim, dtype=float, device=self.device)
        self.vel = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.p = wp.zeros(self.dim, dtype=float, device=self.device)

        # allocate flux values for computation
        self.mass_flux_x = wp.zeros(self.dim, dtype=float, device=self.device)
        self.mass_flux_y = wp.zeros(self.dim, dtype=float, device=self.device)
        self.mom_flux_x = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.mom_flux_y = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.e_flux_x = wp.zeros(self.dim, dtype=float, device=self.device)
        self.e_flux_y = wp.zeros(self.dim, dtype=float, device=self.device)

        # allocate extrapolation values for computation (left/right face
        # extrapolations in x and y for each primitive quantity)
        self.rho_xl = wp.zeros(self.dim, dtype=float, device=self.device)
        self.rho_xr = wp.zeros(self.dim, dtype=float, device=self.device)
        self.rho_yl = wp.zeros(self.dim, dtype=float, device=self.device)
        self.rho_yr = wp.zeros(self.dim, dtype=float, device=self.device)
        self.vel_xl = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.vel_xr = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.vel_yl = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.vel_yr = wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
        self.p_xl = wp.zeros(self.dim, dtype=float, device=self.device)
        self.p_xr = wp.zeros(self.dim, dtype=float, device=self.device)
        self.p_yl = wp.zeros(self.dim, dtype=float, device=self.device)
        self.p_yr = wp.zeros(self.dim, dtype=float, device=self.device)

        # allocate arrays for storing results (one array per snapshot)
        self.seq_rho = [
            wp.zeros(self.dim, dtype=float, device=self.device)
            for _ in range(self.nr_snapshots)
        ]
        self.seq_vel = [
            wp.zeros(self.dim, dtype=wp.vec2, device=self.device)
            for _ in range(self.nr_snapshots)
        ]
        self.seq_p = [
            wp.zeros(self.dim, dtype=float, device=self.device)
            for _ in range(self.nr_snapshots)
        ]

        # Output tensors (created lazily on first iteration, then reused so the
        # yielded tensors are static allocations suitable for CUDA graphs)
        self.output_rho = None
        self.output_vel = None
        self.output_p = None

    def initialize_batch(self) -> None:
        """Initializes arrays for new batch of simulations"""

        # initialize random Fourier freq
        seed = np.random.randint(np.iinfo(np.uint64).max, dtype=np.uint64)
        wp.launch(
            init_uniform_random_2d,
            dim=[self.batch_size, self.nr_perturbation_freq],
            inputs=[self.w, -self.perturbation_range, self.perturbation_range, seed],
            device=self.device,
        )

        # initialize fields
        wp.launch(
            initialize_kelvin_helmoltz_batched_2d,
            dim=self.dim,
            inputs=[
                self.rho,
                self.vel,
                self.p,
                self.w,
                0.05 / np.sqrt(2.0),
                self.dim[1],
                self.dim[2],
                self.nr_perturbation_freq,
            ],
            device=self.device,
        )
        # convert the initial primitive fields to conserved quantities, which is
        # what the finite volume update operates on
        wp.launch(
            euler_primitive_to_conserved_batched_2d,
            dim=self.dim,
            inputs=[
                self.rho,
                self.vel,
                self.p,
                self.mass,
                self.mom,
                self.e,
                self.gamma,
                self.vol,
                self.dim[1],
                self.dim[2],
            ],
            device=self.device,
        )

    def generate_batch(self) -> None:
        """Solve for new batch of simulations"""

        # initialize tensors with random coef
        self.initialize_batch()

        # run solver
        for s in range(self.nr_snapshots):
            # save current primitive fields as the snapshot for this step
            wp.copy(self.seq_rho[s], self.rho)
            wp.copy(self.seq_vel[s], self.vel)
            wp.copy(self.seq_p[s], self.p)

            # iterations
            for i in range(self.iteration_per_snapshot):
                # compute primitives
                wp.launch(
                    euler_conserved_to_primitive_batched_2d,
                    dim=self.dim,
                    inputs=[
                        self.mass,
                        self.mom,
                        self.e,
                        self.rho,
                        self.vel,
                        self.p,
                        self.gamma,
                        self.vol,
                        self.dim[1],
                        self.dim[2],
                    ],
                    device=self.device,
                )

                # compute extrapolations to faces
                wp.launch(
                    euler_extrapolation_batched_2d,
                    dim=self.dim,
                    inputs=[
                        self.rho,
                        self.vel,
                        self.p,
                        self.rho_xl,
                        self.rho_xr,
                        self.rho_yl,
                        self.rho_yr,
                        self.vel_xl,
                        self.vel_xr,
                        self.vel_yl,
                        self.vel_yr,
                        self.p_xl,
                        self.p_xr,
                        self.p_yl,
                        self.p_yr,
                        self.gamma,
                        self.dx,
                        self.dt,
                        self.dim[1],
                        self.dim[2],
                    ],
                    device=self.device,
                )

                # compute fluxes
                wp.launch(
                    euler_get_flux_batched_2d,
                    dim=self.dim,
                    inputs=[
                        self.rho_xl,
                        self.rho_xr,
                        self.rho_yl,
                        self.rho_yr,
                        self.vel_xl,
                        self.vel_xr,
                        self.vel_yl,
                        self.vel_yr,
                        self.p_xl,
                        self.p_xr,
                        self.p_yl,
                        self.p_yr,
                        self.mass_flux_x,
                        self.mass_flux_y,
                        self.mom_flux_x,
                        self.mom_flux_y,
                        self.e_flux_x,
                        self.e_flux_y,
                        self.gamma,
                        self.dim[1],
                        self.dim[2],
                    ],
                    device=self.device,
                )

                # apply fluxes (advance conserved quantities by one dt)
                wp.launch(
                    euler_apply_flux_batched_2d,
                    dim=self.dim,
                    inputs=[
                        self.mass_flux_x,
                        self.mass_flux_y,
                        self.mom_flux_x,
                        self.mom_flux_y,
                        self.e_flux_x,
                        self.e_flux_y,
                        self.mass,
                        self.mom,
                        self.e,
                        self.dx,
                        self.dt,
                        self.dim[1],
                        self.dim[2],
                    ],
                    device=self.device,
                )

    def __iter__(self) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Yields
        ------
        Infinite iterator that returns a dict with keys ``density``,
        ``velocity`` and ``pressure`` whose values are batched timeseries
        tensors of size [batch, seq_length, dim, resolution, resolution]
        """
        # NOTE(review): each item yielded is a dict, not a tuple — the return
        # annotation above is inaccurate and kept only for compatibility.
        # infinite generator
        while True:
            # run simulation
            self.generate_batch()

            # return all samples generated before rerunning simulation;
            # each batch element gets an independently shuffled ordering of
            # valid sequence start indices
            batch_ind = [
                np.arange(self.nr_snapshots - self.seq_length)
                for _ in range(self.batch_size)
            ]
            for b_ind in batch_ind:
                np.random.shuffle(b_ind)
            for bb in range(self.nr_snapshots - self.seq_length):
                # run over batch to gather samples
                batched_seq_rho = []
                batched_seq_vel = []
                batched_seq_p = []
                for b in range(self.batch_size):
                    # gather seq from each batch
                    seq_rho = []
                    seq_vel = []
                    seq_p = []
                    for s in range(self.seq_length):
                        # get variables (shares device memory with warp arrays)
                        rho = wp.to_torch(self.seq_rho[batch_ind[b][bb] + s])[b]
                        vel = wp.to_torch(self.seq_vel[batch_ind[b][bb] + s])[b]
                        p = wp.to_torch(self.seq_p[batch_ind[b][bb] + s])[b]

                        # add channels
                        rho = torch.unsqueeze(rho, 0)
                        vel = torch.permute(vel, (2, 0, 1))
                        p = torch.unsqueeze(p, 0)

                        # normalize values
                        if self.normaliser is not None:
                            rho = (
                                rho - self.normaliser["density"][0]
                            ) / self.normaliser["density"][1]
                            vel = (
                                vel - self.normaliser["velocity"][0]
                            ) / self.normaliser["velocity"][1]
                            p = (p - self.normaliser["pressure"][0]) / self.normaliser[
                                "pressure"
                            ][1]

                        # store for producing seq
                        seq_rho.append(rho)
                        seq_vel.append(vel)
                        seq_p.append(p)

                    # concat seq
                    batched_seq_rho.append(torch.stack(seq_rho, axis=0))
                    batched_seq_vel.append(torch.stack(seq_vel, axis=0))
                    batched_seq_p.append(torch.stack(seq_p, axis=0))

                # CUDA graphs static copies: keep yielding the same tensor
                # objects, refreshing their contents in place
                if self.output_rho is None:
                    # concat batches
                    self.output_rho = torch.stack(batched_seq_rho, axis=0)
                    self.output_vel = torch.stack(batched_seq_vel, axis=0)
                    self.output_p = torch.stack(batched_seq_p, axis=0)
                else:
                    self.output_rho.data.copy_(torch.stack(batched_seq_rho, axis=0))
                    self.output_vel.data.copy_(torch.stack(batched_seq_vel, axis=0))
                    self.output_p.data.copy_(torch.stack(batched_seq_p, axis=0))

                yield {
                    "density": self.output_rho,
                    "velocity": self.output_vel,
                    "pressure": self.output_p,
                }

    def __len__(self):
        # effectively infinite: samples are generated on the fly forever
        return sys.maxsize
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/finite_difference.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ try:
19
+ import warp as wp
20
+ except ImportError:
21
+ print(
22
+ """NVIDIA WARP is required for this datapipe. This package is under the
23
+ NVIDIA Source Code License (NVSCL). To install use:
24
+
25
+ pip install warp-lang
26
+ """
27
+ )
28
+ raise SystemExit(1)
29
+
30
+ from .indexing import index_clamped_edges_batched_2d, index_zero_edges_batched_2d
31
+
32
+
33
@wp.kernel
def darcy_mgrid_jacobi_iterative_batched_2d(
    darcy0: wp.array3d(dtype=float),
    darcy1: wp.array3d(dtype=float),
    permeability: wp.array3d(dtype=float),
    source: float,
    lx: int,
    ly: int,
    dx: float,
    mgrid_reduction_factor: int,
):  # pragma: no cover
    """Multi-grid Jacobi step for the Darcy equation.

    Performs one Jacobi relaxation sweep on a coarsened view of the fine
    grid: ``mgrid_reduction_factor`` maps each thread onto every n-th fine
    cell, so the same kernel serves all multi-grid levels. Reads from
    ``darcy0`` and writes into ``darcy1``; the caller swaps the two buffers
    between iterations.

    Parameters
    ----------
    darcy0 : wp.array3d
        Darcy solution previous step (read)
    darcy1 : wp.array3d
        Darcy solution for next step (written)
    permeability : wp.array3d
        Permeability field for Darcy equation
    source : float
        Constant source value for Darcy equation
    lx : int
        Length of domain in x dim (fine grid)
    ly : int
        Length of domain in y dim (fine grid)
    dx : float
        Fine-grid cell size
    mgrid_reduction_factor : int
        Coarsening factor of the multi-grid level currently being relaxed
    """

    # get index: batch and coarse-grid cell
    b, x, y = wp.tid()

    # map coarse-grid thread index onto the fine grid and scale the cell size
    gx = mgrid_reduction_factor * x + (mgrid_reduction_factor - 1)
    gy = mgrid_reduction_factor * y + (mgrid_reduction_factor - 1)
    gdx = dx * wp.float32(mgrid_reduction_factor)

    # solution stencil; out-of-domain neighbors read as 0 (Dirichlet edges)
    d_0_1 = index_zero_edges_batched_2d(
        darcy0, b, gx - mgrid_reduction_factor, gy, lx, ly
    )
    d_2_1 = index_zero_edges_batched_2d(
        darcy0, b, gx + mgrid_reduction_factor, gy, lx, ly
    )
    d_1_0 = index_zero_edges_batched_2d(
        darcy0, b, gx, gy - mgrid_reduction_factor, lx, ly
    )
    d_1_2 = index_zero_edges_batched_2d(
        darcy0, b, gx, gy + mgrid_reduction_factor, lx, ly
    )

    # permeability stencil; out-of-domain neighbors clamp to the edge value
    p_1_1 = index_clamped_edges_batched_2d(permeability, b, gx, gy, lx, ly)
    p_0_1 = index_clamped_edges_batched_2d(
        permeability, b, gx - mgrid_reduction_factor, gy, lx, ly
    )
    p_2_1 = index_clamped_edges_batched_2d(
        permeability, b, gx + mgrid_reduction_factor, gy, lx, ly
    )
    p_1_0 = index_clamped_edges_batched_2d(
        permeability, b, gx, gy - mgrid_reduction_factor, lx, ly
    )
    p_1_2 = index_clamped_edges_batched_2d(
        permeability, b, gx, gy + mgrid_reduction_factor, lx, ly
    )

    # stencil terms: t_1 is the permeability-weighted Laplacian sum; t_2/t_3
    # couple the permeability and solution differences in x and y.
    # NOTE(review): t_2 and t_3 scale as 1/(2*gdx) rather than the
    # 1/(4*gdx^2) a product of two central differences would give --
    # confirm the intended discretization of div(k grad u).
    dx_squared = gdx * gdx
    t_1 = p_1_1 * (d_0_1 + d_2_1 + d_1_0 + d_1_2) / dx_squared
    t_2 = ((p_2_1 - p_0_1) * (d_2_1 - d_0_1)) / (2.0 * gdx)
    t_3 = ((p_1_2 - p_1_0) * (d_1_2 - d_1_0)) / (2.0 * gdx)

    # jacobi iterative method: solve the stencil equation for the center value
    d_star = (t_1 + t_2 + t_3 + source) / (p_1_1 * 4.0 / dx_squared)

    # buffers get swapped each iteration
    darcy1[b, gx, gy] = d_star
114
+
115
+
116
@wp.kernel
def mgrid_inf_residual_batched_2d(
    phi0: wp.array3d(dtype=float),
    phi1: wp.array3d(dtype=float),
    inf_res: wp.array(dtype=float),
    mgrid_reduction_factor: int,
):  # pragma: no cover
    """Infinity norm for checking multi-grid solutions.

    Atomically accumulates ``max |phi0 - phi1|`` over the coarse-grid cells
    into ``inf_res[0]``. The caller is expected to reset ``inf_res`` before
    launching; the max is taken across all batches.

    Parameters
    ----------
    phi0 : wp.array3d
        Previous solution
    phi1 : wp.array3d
        Current solution
    inf_res : wp.array
        Single-element array that receives the infinity norm
    mgrid_reduction_factor : int
        Coarsening factor of the multi-grid level being checked
    """
    b, x, y = wp.tid()
    # map coarse-grid thread index onto the fine grid (same mapping as the
    # Jacobi kernel above)
    gx = mgrid_reduction_factor * x + (mgrid_reduction_factor - 1)
    gy = mgrid_reduction_factor * y + (mgrid_reduction_factor - 1)
    wp.atomic_max(inf_res, 0, wp.abs(phi0[b, gx, gy] - phi1[b, gx, gy]))
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/finite_volume.py ADDED
@@ -0,0 +1,759 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ try:
18
+ import warp as wp
19
+ except ImportError:
20
+ print(
21
+ """NVIDIA WARP is required for this datapipe. This package is under the
22
+ NVIDIA Source Code License (NVSCL). To install use:
23
+
24
+ pip install warp-lang
25
+ """
26
+ )
27
+ raise SystemExit(1)
28
+
29
+ from .indexing import (
30
+ index_periodic_edges_batched_2d,
31
+ index_vec2_periodic_edges_batched_2d,
32
+ )
33
+
34
+
35
@wp.func
def extrapolate_to_face_2d(
    f: float, f_dx: float, f_dy: float, dx: float
):  # pragma: no cover
    """Extrapolate a cell-centered value to the four face centers.

    First-order Taylor expansion using the supplied gradients, stepping
    half a cell in each direction (cells are square: the same ``dx`` is
    used for x and y).

    Parameters
    ----------
    f : float
        Cell value
    f_dx : float
        X derivative of cell value
    f_dy : float
        Y derivative of cell value
    dx : float
        Cell size

    Returns
    -------
    wp.vec4
        (value on left x, value on right x, value left y, value right y)
    """
    f_xl = f - f_dx * (dx / 2.0)
    f_xr = f + f_dx * (dx / 2.0)
    f_yl = f - f_dy * (dx / 2.0)
    f_yr = f + f_dy * (dx / 2.0)
    return wp.vec4(f_xl, f_xr, f_yl, f_yr)
62
+
63
+
64
@wp.func
def apply_flux_2d(
    f: float,
    flux_f_xl_dx: float,
    flux_f_xr_dx: float,
    flux_f_yl_dy: float,
    flux_f_yr_dy: float,
    dx: float,
    dt: float,
):  # pragma: no cover
    """Apply face fluxes to a scalar cell value.

    Left/bottom face fluxes are subtracted and right/top face fluxes are
    added, each weighted by ``dt * dx`` (time step times face length).

    Parameters
    ----------
    f : float
        Cell value
    flux_f_xl_dx : float
        Left x flux
    flux_f_xr_dx : float
        Right x flux
    flux_f_yl_dy : float
        Left y flux
    flux_f_yr_dy : float
        Right y flux
    dx : float
        Cell size
    dt : float
        Time step size

    Returns
    -------
    float
        Cell value with added flux
    """
    f += -dt * dx * flux_f_xl_dx
    f += dt * dx * flux_f_xr_dx
    f += -dt * dx * flux_f_yl_dy
    f += dt * dx * flux_f_yr_dy
    return f
103
+
104
+
105
@wp.func
def apply_flux_vec2_2d(
    f: wp.vec2,
    flux_f_xl_dx: wp.vec2,
    flux_f_xr_dx: wp.vec2,
    flux_f_yl_dy: wp.vec2,
    flux_f_yr_dy: wp.vec2,
    dx: float,
    dt: float,
):  # pragma: no cover
    """Apply face fluxes to a vector (wp.vec2) cell value.

    Vector counterpart of ``apply_flux_2d``: left/bottom face fluxes are
    subtracted and right/top face fluxes added, weighted by ``dt * dx``.

    Parameters
    ----------
    f : wp.vec2
        Cell vector value
    flux_f_xl_dx : wp.vec2
        Vector flux in x left
    flux_f_xr_dx : wp.vec2
        Vector flux in x right
    flux_f_yl_dy : wp.vec2
        Vector flux in y left
    flux_f_yr_dy : wp.vec2
        Vector flux in y right
    dx : float
        Cell size
    dt : float
        Time step size

    Returns
    -------
    wp.vec2
        Vector cell value with added flux
    """
    f += -dt * dx * flux_f_xl_dx
    f += dt * dx * flux_f_xr_dx
    f += -dt * dx * flux_f_yl_dy
    f += dt * dx * flux_f_yr_dy
    return f
144
+
145
+
146
@wp.func
def euler_flux_2d(
    rho_l: float,
    rho_r: float,
    vx_l: float,
    vx_r: float,
    vy_l: float,
    vy_r: float,
    p_l: float,
    p_r: float,
    gamma: float,
):  # pragma: no cover
    """Compute the Euler flux across a face from left/right states.

    Evaluates the x-direction flux of the 2D compressible Euler equations
    at the arithmetic average of the two states, then adds a dissipation
    term proportional to the larger acoustic wavespeed of the two sides
    (a Lax-Friedrichs / Rusanov-style stabilization).

    Parameters
    ----------
    rho_l : float
        Density left
    rho_r : float
        Density right
    vx_l : float
        X velocity left
    vx_r : float
        X velocity right
    vy_l : float
        Y velocity left
    vy_r : float
        Y velocity right
    p_l : float
        Pressure left
    p_r : float
        Pressure right
    gamma : float
        Gas constant (ratio of specific heats)

    Returns
    -------
    wp.vec4
        Vector containing mass, momentum x, momentum y, and energy flux.
    """
    # total energy density on each side (internal + kinetic)
    e_l = p_l / (gamma - 1.0) + 0.5 * rho_l * (vx_l * vx_l + vy_l * vy_l)
    e_r = p_r / (gamma - 1.0) + 0.5 * rho_r * (vx_r * vx_r + vy_r * vy_r)

    # averaged states at the face
    rho_ave = 0.5 * (rho_l + rho_r)
    momx_ave = 0.5 * (rho_l * vx_l + rho_r * vx_r)
    momy_ave = 0.5 * (rho_l * vy_l + rho_r * vy_r)
    e_ave = 0.5 * (e_l + e_r)
    p_ave = (gamma - 1.0) * (
        e_ave - 0.5 * (momx_ave * momx_ave + momy_ave * momy_ave) / rho_ave
    )

    # physical fluxes evaluated at the averaged state
    flux_mass = momx_ave
    flux_momx = momx_ave * momx_ave / rho_ave + p_ave
    flux_momy = momx_ave * momy_ave / rho_ave
    flux_e = (e_ave + p_ave) * momx_ave / rho_ave

    # max wavespeed of the two sides: sound speed plus normal velocity
    c_l = wp.sqrt(gamma * p_l / rho_l) + wp.abs(vx_l)
    c_r = wp.sqrt(gamma * p_r / rho_r) + wp.abs(vx_r)
    c = wp.max(c_l, c_r)

    # add stabilizing diffusion term proportional to the state jump
    flux_mass -= c * 0.5 * (rho_l - rho_r)
    flux_momx -= c * 0.5 * (rho_l * vx_l - rho_r * vx_r)
    flux_momy -= c * 0.5 * (rho_l * vy_l - rho_r * vy_r)
    flux_e -= c * 0.5 * (e_l - e_r)

    return wp.vec4(flux_mass, flux_momx, flux_momy, flux_e)
217
+
218
+
219
@wp.kernel
def euler_primitive_to_conserved_batched_2d(
    rho: wp.array3d(dtype=float),
    vel: wp.array3d(dtype=wp.vec2),
    p: wp.array3d(dtype=float),
    mass: wp.array3d(dtype=float),
    mom: wp.array3d(dtype=wp.vec2),
    e: wp.array3d(dtype=float),
    gamma: float,
    vol: float,
    lx: int,
    ly: int,
):  # pragma: no cover
    """Convert primitive Euler variables (rho, vel, p) to conserved ones.

    Writes cell-integrated mass, momentum, and total energy (hence the
    multiplication by the cell volume ``vol``).

    Parameters
    ----------
    rho : wp.array3d
        Density (read)
    vel : wp.array3d
        Velocity (read)
    p : wp.array3d
        Pressure (read)
    mass : wp.array3d
        Mass (written)
    mom : wp.array3d
        Momentum (written)
    e : wp.array3d
        Energy (written)
    gamma : float
        Gas constant (ratio of specific heats)
    vol : float
        Volume of cell
    lx : int
        Grid size x dim
    ly : int
        Grid size y dim
    """

    # get index
    b, i, j = wp.tid()

    # read primitive values (periodic indexing; i, j are in-range here)
    rho_i_j = index_periodic_edges_batched_2d(rho, b, i, j, lx, ly)
    vel_i_j = index_vec2_periodic_edges_batched_2d(vel, b, i, j, lx, ly)
    p_i_j = index_periodic_edges_batched_2d(p, b, i, j, lx, ly)

    # compute conserved values (internal + kinetic energy, times volume)
    mass_i_j = rho_i_j * vol
    mom_i_j = vel_i_j * rho_i_j * vol
    e_i_j = (
        p_i_j / (gamma - 1.0)
        + 0.5 * rho_i_j * (vel_i_j[0] * vel_i_j[0] + vel_i_j[1] * vel_i_j[1])
    ) * vol

    # set values
    mass[b, i, j] = mass_i_j
    mom[b, i, j] = mom_i_j
    e[b, i, j] = e_i_j
278
+
279
+
280
@wp.kernel
def euler_conserved_to_primitive_batched_2d(
    mass: wp.array3d(dtype=float),
    mom: wp.array3d(dtype=wp.vec2),
    e: wp.array3d(dtype=float),
    rho: wp.array3d(dtype=float),
    vel: wp.array3d(dtype=wp.vec2),
    p: wp.array3d(dtype=float),
    gamma: float,
    vol: float,
    lx: int,
    ly: int,
):  # pragma: no cover
    """Convert conserved Euler variables (mass, mom, e) to primitive ones.

    Inverse of ``euler_primitive_to_conserved_batched_2d``: divides the
    cell-integrated quantities by the cell volume and removes the kinetic
    contribution from the energy to recover pressure.

    Parameters
    ----------
    mass : wp.array3d
        Mass (read)
    mom : wp.array3d
        Momentum (read)
    e : wp.array3d
        Energy (read)
    rho : wp.array3d
        Density (written)
    vel : wp.array3d
        Velocity (written)
    p : wp.array3d
        Pressure (written)
    gamma : float
        Gas constant (ratio of specific heats)
    vol : float
        Cell volume
    lx : int
        Grid size X dim
    ly : int
        Grid size Y dim
    """

    # get index
    b, i, j = wp.tid()

    # read conserved values (periodic indexing; i, j are in-range here)
    mass_i_j = index_periodic_edges_batched_2d(mass, b, i, j, lx, ly)
    mom_i_j = index_vec2_periodic_edges_batched_2d(mom, b, i, j, lx, ly)
    e_i_j = index_periodic_edges_batched_2d(e, b, i, j, lx, ly)

    # recover primitive values: rho = mass/vol, vel = mom/(rho*vol),
    # p from total minus kinetic energy
    rho_i_j = mass_i_j / vol
    vel_i_j = mom_i_j / rho_i_j / vol
    p_i_j = (
        e_i_j / vol
        - 0.5 * rho_i_j * (vel_i_j[0] * vel_i_j[0] + vel_i_j[1] * vel_i_j[1])
    ) * (gamma - 1.0)

    # set values
    rho[b, i, j] = rho_i_j
    vel[b, i, j] = vel_i_j
    p[b, i, j] = p_i_j
339
+
340
+
341
@wp.kernel
def euler_extrapolation_batched_2d(
    rho: wp.array3d(dtype=float),
    vel: wp.array3d(dtype=wp.vec2),
    p: wp.array3d(dtype=float),
    rho_xl: wp.array3d(dtype=float),
    rho_xr: wp.array3d(dtype=float),
    rho_yl: wp.array3d(dtype=float),
    rho_yr: wp.array3d(dtype=float),
    vel_xl: wp.array3d(dtype=wp.vec2),
    vel_xr: wp.array3d(dtype=wp.vec2),
    vel_yl: wp.array3d(dtype=wp.vec2),
    vel_yr: wp.array3d(dtype=wp.vec2),
    p_xl: wp.array3d(dtype=float),
    p_xr: wp.array3d(dtype=float),
    p_yl: wp.array3d(dtype=float),
    p_yr: wp.array3d(dtype=float),
    gamma: float,
    dx: float,
    dt: float,
    lx: int,
    ly: int,
):  # pragma: no cover
    """Extrapolate Euler primitive values to cell faces.

    Two-stage predictor: first advances rho, vel, and p by half a time step
    using the primitive-variable Euler equations with central-difference
    gradients, then extrapolates the half-step values to the four face
    centers of each cell. All boundaries are periodic.

    Parameters
    ----------
    rho : wp.array3d
        Density
    vel : wp.array3d
        Velocity
    p : wp.array3d
        Pressure
    rho_xl : wp.array3d
        Density x left (written)
    rho_xr : wp.array3d
        Density x right (written)
    rho_yl : wp.array3d
        Density y left (written)
    rho_yr : wp.array3d
        Density y right (written)
    vel_xl : wp.array3d
        Velocity x left (written)
    vel_xr : wp.array3d
        Velocity x right (written)
    vel_yl : wp.array3d
        Velocity y left (written)
    vel_yr : wp.array3d
        Velocity y right (written)
    p_xl : wp.array3d
        Pressure x left (written)
    p_xr : wp.array3d
        Pressure x right (written)
    p_yl : wp.array3d
        Pressure y left (written)
    p_yr : wp.array3d
        Pressure y right (written)
    gamma : float
        Gas constant (ratio of specific heats)
    dx : float
        Cell size
    dt : float
        Time step size
    lx : int
        Grid size x
    ly : int
        Grid size y
    """

    # get index
    b, i, j = wp.tid()

    # density stencil (periodic edges)
    rho_1_1 = index_periodic_edges_batched_2d(rho, b, i, j, lx, ly)
    rho_2_1 = index_periodic_edges_batched_2d(rho, b, i + 1, j, lx, ly)
    rho_1_2 = index_periodic_edges_batched_2d(rho, b, i, j + 1, lx, ly)
    rho_0_1 = index_periodic_edges_batched_2d(rho, b, i - 1, j, lx, ly)
    rho_1_0 = index_periodic_edges_batched_2d(rho, b, i, j - 1, lx, ly)

    # velocity stencil (periodic edges)
    vel_1_1 = index_vec2_periodic_edges_batched_2d(vel, b, i, j, lx, ly)
    vel_2_1 = index_vec2_periodic_edges_batched_2d(vel, b, i + 1, j, lx, ly)
    vel_1_2 = index_vec2_periodic_edges_batched_2d(vel, b, i, j + 1, lx, ly)
    vel_0_1 = index_vec2_periodic_edges_batched_2d(vel, b, i - 1, j, lx, ly)
    vel_1_0 = index_vec2_periodic_edges_batched_2d(vel, b, i, j - 1, lx, ly)

    # pressure stencil (periodic edges)
    p_1_1 = index_periodic_edges_batched_2d(p, b, i, j, lx, ly)
    p_2_1 = index_periodic_edges_batched_2d(p, b, i + 1, j, lx, ly)
    p_1_2 = index_periodic_edges_batched_2d(p, b, i, j + 1, lx, ly)
    p_0_1 = index_periodic_edges_batched_2d(p, b, i - 1, j, lx, ly)
    p_1_0 = index_periodic_edges_batched_2d(p, b, i, j - 1, lx, ly)

    # central-difference density gradient
    rho_dx = (rho_2_1 - rho_0_1) / (2.0 * dx)
    rho_dy = (rho_1_2 - rho_1_0) / (2.0 * dx)

    # central-difference velocity gradient
    vel_dx = (vel_2_1 - vel_0_1) / (2.0 * dx)
    vel_dy = (vel_1_2 - vel_1_0) / (2.0 * dx)

    # central-difference pressure gradient
    p_dx = (p_2_1 - p_0_1) / (2.0 * dx)
    p_dy = (p_1_2 - p_1_0) / (2.0 * dx)

    # half-time-step predictor: continuity equation for density ...
    rho_prime = rho_1_1 - 0.5 * dt * (
        vel_1_1[0] * rho_dx
        + rho_1_1 * vel_dx[0]
        + vel_1_1[1] * rho_dy
        + rho_1_1 * vel_dy[1]
    )
    # ... momentum equations for velocity ...
    vx_prime = vel_1_1[0] - 0.5 * dt * (
        vel_1_1[0] * vel_dx[0] + vel_1_1[1] * vel_dy[0] + (1.0 / rho_1_1) * p_dx
    )
    vy_prime = vel_1_1[1] - 0.5 * dt * (
        vel_1_1[0] * vel_dx[1] + vel_1_1[1] * vel_dy[1] + (1.0 / rho_1_1) * p_dy
    )
    # ... and the pressure equation
    p_prime = p_1_1 - 0.5 * dt * (
        gamma * p_1_1 * (vel_dx[0] + vel_dy[1]) + vel_1_1[0] * p_dx + vel_1_1[1] * p_dy
    )

    # extrapolate the half-step values in space to the face centers
    rho_space_extra = extrapolate_to_face_2d(rho_prime, rho_dx, rho_dy, dx)
    vx_space_extra = extrapolate_to_face_2d(vx_prime, vel_dx[0], vel_dy[0], dx)
    vy_space_extra = extrapolate_to_face_2d(vy_prime, vel_dx[1], vel_dy[1], dx)
    p_space_extra = extrapolate_to_face_2d(p_prime, p_dx, p_dy, dx)

    # store values (vec4 layout: [xl, xr, yl, yr])
    rho_xl[b, i, j] = rho_space_extra[0]
    rho_xr[b, i, j] = rho_space_extra[1]
    rho_yl[b, i, j] = rho_space_extra[2]
    rho_yr[b, i, j] = rho_space_extra[3]
    vel_xl[b, i, j] = wp.vec2(vx_space_extra[0], vy_space_extra[0])
    vel_xr[b, i, j] = wp.vec2(vx_space_extra[1], vy_space_extra[1])
    vel_yl[b, i, j] = wp.vec2(vx_space_extra[2], vy_space_extra[2])
    vel_yr[b, i, j] = wp.vec2(vx_space_extra[3], vy_space_extra[3])
    p_xl[b, i, j] = p_space_extra[0]
    p_xr[b, i, j] = p_space_extra[1]
    p_yl[b, i, j] = p_space_extra[2]
    p_yr[b, i, j] = p_space_extra[3]
482
+
483
+
484
@wp.kernel
def euler_get_flux_batched_2d(
    rho_xl: wp.array3d(dtype=float),
    rho_xr: wp.array3d(dtype=float),
    rho_yl: wp.array3d(dtype=float),
    rho_yr: wp.array3d(dtype=float),
    vel_xl: wp.array3d(dtype=wp.vec2),
    vel_xr: wp.array3d(dtype=wp.vec2),
    vel_yl: wp.array3d(dtype=wp.vec2),
    vel_yr: wp.array3d(dtype=wp.vec2),
    p_xl: wp.array3d(dtype=float),
    p_xr: wp.array3d(dtype=float),
    p_yl: wp.array3d(dtype=float),
    p_yr: wp.array3d(dtype=float),
    mass_flux_x: wp.array3d(dtype=float),
    mass_flux_y: wp.array3d(dtype=float),
    mom_flux_x: wp.array3d(dtype=wp.vec2),
    mom_flux_y: wp.array3d(dtype=wp.vec2),
    e_flux_x: wp.array3d(dtype=float),
    e_flux_y: wp.array3d(dtype=float),
    gamma: float,
    lx: int,
    ly: int,
):  # pragma: no cover
    """Use extrapolated Euler face values to compute fluxes.

    For each cell, pairs its right-face value with the left-face value of
    the next cell (periodic wrap) and evaluates ``euler_flux_2d`` across
    the shared face in both x and y directions.

    Parameters
    ----------
    rho_xl : wp.array3d
        Density x left
    rho_xr : wp.array3d
        Density x right
    rho_yl : wp.array3d
        Density y left
    rho_yr : wp.array3d
        Density y right
    vel_xl : wp.array3d
        Velocity x left
    vel_xr : wp.array3d
        Velocity x right
    vel_yl : wp.array3d
        Velocity y left
    vel_yr : wp.array3d
        Velocity y right
    p_xl : wp.array3d
        Pressure x left
    p_xr : wp.array3d
        Pressure x right
    p_yl : wp.array3d
        Pressure y left
    p_yr : wp.array3d
        Pressure y right
    mass_flux_x : wp.array3d
        Mass flux x (written)
    mass_flux_y : wp.array3d
        Mass flux y (written)
    mom_flux_x : wp.array3d
        Momentum flux x (written)
    mom_flux_y : wp.array3d
        Momentum flux y (written)
    e_flux_x : wp.array3d
        Energy flux x (written)
    e_flux_y : wp.array3d
        Energy flux y (written)
    gamma : float
        Gas constant (ratio of specific heats)
    lx : int
        Grid size x
    ly : int
        Grid size y
    """

    # get index
    b, i, j = wp.tid()

    # face states: neighbor's left face vs. this cell's right face
    rho_xl_1 = index_periodic_edges_batched_2d(rho_xl, b, i + 1, j, lx, ly)
    rho_xr_0 = index_periodic_edges_batched_2d(rho_xr, b, i, j, lx, ly)
    rho_yl_1 = index_periodic_edges_batched_2d(rho_yl, b, i, j + 1, lx, ly)
    rho_yr_0 = index_periodic_edges_batched_2d(rho_yr, b, i, j, lx, ly)
    vel_xl_1 = index_vec2_periodic_edges_batched_2d(vel_xl, b, i + 1, j, lx, ly)
    vel_xr_0 = index_vec2_periodic_edges_batched_2d(vel_xr, b, i, j, lx, ly)
    vel_yl_1 = index_vec2_periodic_edges_batched_2d(vel_yl, b, i, j + 1, lx, ly)
    vel_yr_0 = index_vec2_periodic_edges_batched_2d(vel_yr, b, i, j, lx, ly)
    p_xl_1 = index_periodic_edges_batched_2d(p_xl, b, i + 1, j, lx, ly)
    p_xr_0 = index_periodic_edges_batched_2d(p_xr, b, i, j, lx, ly)
    p_yl_1 = index_periodic_edges_batched_2d(p_yl, b, i, j + 1, lx, ly)
    p_yr_0 = index_periodic_edges_batched_2d(p_yr, b, i, j, lx, ly)

    # x-direction flux
    flux_x = euler_flux_2d(
        rho_xl_1,
        rho_xr_0,
        vel_xl_1[0],
        vel_xr_0[0],
        vel_xl_1[1],
        vel_xr_0[1],
        p_xl_1,
        p_xr_0,
        gamma,
    )
    # y-direction flux: euler_flux_2d is written for the x direction, so the
    # velocity components are swapped on input ...
    flux_y = euler_flux_2d(
        rho_yl_1,
        rho_yr_0,
        vel_yl_1[1],
        vel_yr_0[1],
        vel_yl_1[0],
        vel_yr_0[0],
        p_yl_1,
        p_yr_0,
        gamma,
    )

    # ... and swapped back when storing the y momentum flux below
    mass_flux_x[b, i, j] = flux_x[0]
    mass_flux_y[b, i, j] = flux_y[0]
    mom_flux_x[b, i, j] = wp.vec2(flux_x[1], flux_x[2])
    mom_flux_y[b, i, j] = wp.vec2(flux_y[2], flux_y[1])
    e_flux_x[b, i, j] = flux_x[3]
    e_flux_y[b, i, j] = flux_y[3]
604
+
605
+
606
@wp.kernel
def euler_apply_flux_batched_2d(
    mass_flux_x: wp.array3d(dtype=float),
    mass_flux_y: wp.array3d(dtype=float),
    mom_flux_x: wp.array3d(dtype=wp.vec2),
    mom_flux_y: wp.array3d(dtype=wp.vec2),
    e_flux_x: wp.array3d(dtype=float),
    e_flux_y: wp.array3d(dtype=float),
    mass: wp.array3d(dtype=float),
    mom: wp.array3d(dtype=wp.vec2),
    e: wp.array3d(dtype=float),
    dx: float,
    dt: float,
    lx: int,
    ly: int,
):  # pragma: no cover
    """Apply face fluxes to the conserved Euler values in place.

    Each cell combines its own flux (right/top face) with the flux of the
    previous cell (left/bottom face, periodic wrap) and updates mass,
    momentum, and energy via ``apply_flux_2d`` / ``apply_flux_vec2_2d``.

    Parameters
    ----------
    mass_flux_x : wp.array3d
        Mass flux X
    mass_flux_y : wp.array3d
        Mass flux Y
    mom_flux_x : wp.array3d
        Momentum flux X
    mom_flux_y : wp.array3d
        Momentum flux Y
    e_flux_x : wp.array3d
        Energy flux X
    e_flux_y : wp.array3d
        Energy flux Y
    mass : wp.array3d
        Mass (updated in place)
    mom : wp.array3d
        Momentum (updated in place)
    e : wp.array3d
        Energy (updated in place)
    dx : float
        Cell size
    dt : float
        Time step size
    lx : int
        Grid size x
    ly : int
        Grid size y
    """

    # get index
    b, i, j = wp.tid()

    # update mass: own face fluxes plus the neighbors' shared faces
    mass_1 = index_periodic_edges_batched_2d(mass, b, i, j, lx, ly)
    mass_flux_x_1 = index_periodic_edges_batched_2d(mass_flux_x, b, i, j, lx, ly)
    mass_flux_x_0 = index_periodic_edges_batched_2d(mass_flux_x, b, i - 1, j, lx, ly)
    mass_flux_y_1 = index_periodic_edges_batched_2d(mass_flux_y, b, i, j, lx, ly)
    mass_flux_y_0 = index_periodic_edges_batched_2d(mass_flux_y, b, i, j - 1, lx, ly)
    new_mass = apply_flux_2d(
        mass_1, mass_flux_x_1, mass_flux_x_0, mass_flux_y_1, mass_flux_y_0, dx, dt
    )

    # update momentum (vector-valued)
    mom_1 = index_vec2_periodic_edges_batched_2d(mom, b, i, j, lx, ly)
    mom_flux_x_1 = index_vec2_periodic_edges_batched_2d(mom_flux_x, b, i, j, lx, ly)
    mom_flux_x_0 = index_vec2_periodic_edges_batched_2d(mom_flux_x, b, i - 1, j, lx, ly)
    mom_flux_y_1 = index_vec2_periodic_edges_batched_2d(mom_flux_y, b, i, j, lx, ly)
    mom_flux_y_0 = index_vec2_periodic_edges_batched_2d(mom_flux_y, b, i, j - 1, lx, ly)
    new_mom = apply_flux_vec2_2d(
        mom_1, mom_flux_x_1, mom_flux_x_0, mom_flux_y_1, mom_flux_y_0, dx, dt
    )

    # update energy
    e_1 = index_periodic_edges_batched_2d(e, b, i, j, lx, ly)
    e_flux_x_1 = index_periodic_edges_batched_2d(e_flux_x, b, i, j, lx, ly)
    e_flux_x_0 = index_periodic_edges_batched_2d(e_flux_x, b, i - 1, j, lx, ly)
    e_flux_y_1 = index_periodic_edges_batched_2d(e_flux_y, b, i, j, lx, ly)
    e_flux_y_0 = index_periodic_edges_batched_2d(e_flux_y, b, i, j - 1, lx, ly)
    new_e = apply_flux_2d(e_1, e_flux_x_1, e_flux_x_0, e_flux_y_1, e_flux_y_0, dx, dt)

    # set values
    mass[b, i, j] = new_mass
    mom[b, i, j] = new_mom
    e[b, i, j] = new_e
689
+
690
+
691
@wp.kernel
def initialize_kelvin_helmoltz_batched_2d(
    rho: wp.array3d(dtype=float),
    vel: wp.array3d(dtype=wp.vec2),
    p: wp.array3d(dtype=float),
    w: wp.array2d(dtype=float),
    sigma: float,
    lx: float,
    ly: float,
    nr_freq: int,
):  # pragma: no cover
    """Initialize state for the Kelvin-Helmholtz instability.

    Sets up two counter-flowing horizontal bands of different density with
    a multi-frequency sinusoidal y-velocity perturbation concentrated at
    the two shear layers (y = 0.25 and y = 0.75), and uniform pressure.
    (The ``helmoltz`` spelling in the kernel name is kept for backward
    compatibility.)

    Parameters
    ----------
    rho : wp.array3d
        Density (written)
    vel : wp.array3d
        Velocity (written)
    p : wp.array3d
        Pressure (written)
    w : wp.array2d
        Per-batch perturbation amplitude for each frequency
    sigma : float
        Width of the Gaussian envelope around each shear layer
    lx : float
        Grid size x (used to normalize coordinates to [0, 1))
    ly : float
        Grid size y (used to normalize coordinates to [0, 1))
    nr_freq : int
        Number of frequencies in perturbation
    """

    # normalized cell coords in [0, 1)
    b, i, j = wp.tid()
    x = wp.float(i) / wp.float(lx)
    y = wp.float(j) / wp.float(ly)

    # initial flow bands: dense band moving right, light bands moving left
    if wp.abs(y - 0.5) < 0.25:
        ux = 0.5
        r = 2.0
    else:
        ux = -0.5
        r = 1.0

    # y-velocity perturbation: sum of sine modes, Gaussian-localized at the
    # two shear layers (3.14159 is a low-precision pi literal)
    uy = wp.float32(0.0)
    for f in range(nr_freq):
        ff = wp.float32(f + 1)
        uy += (
            ff
            * w[b, f]
            * wp.sin(4.0 * 3.14159 * x * ff)
            * (
                wp.exp(-(y - 0.25) * (y - 0.25) / (2.0 * sigma * sigma))
                + wp.exp(-(y - 0.75) * (y - 0.75) / (2.0 * sigma * sigma))
            )
        )
    u = wp.vec2(ux, uy)

    # set values (uniform pressure)
    rho[b, i, j] = r
    vel[b, i, j] = u
    p[b, i, j] = 2.5
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/indexing.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ try:
18
+ import warp as wp
19
+ except ImportError:
20
+ print(
21
+ """NVIDIA WARP is required for this datapipe. This package is under the
22
+ NVIDIA Source Code License (NVSCL). To install use:
23
+
24
+ pip install warp-lang
25
+ """
26
+ )
27
+ raise SystemExit(1)
28
+
29
+
30
# TODO bug in warp mod function
@wp.func
def _mod_int(x: int, length: int):  # pragma: no cover
    """Wrap an index into ``[0, length)`` by a single period.

    Hand-rolled replacement for ``%`` (see TODO above). Only corrects one
    period in either direction: assumes ``-length <= x <= 2 * length - 1``.

    Parameters
    ----------
    x : int
        Int to mod
    length : int
        Mod by value

    Returns
    -------
    int
        Mod of x
    """
    if x < 0:
        return x + length
    elif x > length - 1:
        return x - length
    return x
52
+
53
+
54
@wp.func
def index_zero_edges_batched_2d(
    array: wp.array3d(dtype=float), b: int, x: int, y: int, lx: int, ly: int
):  # pragma: no cover
    """Index a batched 2d array with zero on the edges.

    Returns 0.0 only for indices exactly one cell outside the domain
    (x == -1, x == lx, y == -1, or y == ly); in-range indices read through.

    Parameters
    ----------
    array : wp.array3d
        Array to index
    b : int
        Batch index
    x : int
        X index
    y : int
        Y index
    lx : int
        Grid size x
    ly : int
        Grid size y

    Returns
    -------
    float
        Array value, or 0.0 just outside the boundary
    """
    # NOTE(review): indices more than one cell outside (e.g. stencil steps of
    # mgrid_reduction_factor > 1 at the right/top edge) fall through to the
    # raw array access unchecked -- confirm launch dims keep them in range.
    if x == -1:
        return 0.0
    elif x == lx:
        return 0.0
    elif y == -1:
        return 0.0
    elif y == ly:
        return 0.0
    else:
        return array[b, x, y]
90
+
91
+
92
@wp.func
def index_clamped_edges_batched_2d(
    array: wp.array3d(dtype=float), b: int, x: int, y: int, lx: int, ly: int
):  # pragma: no cover
    """Index a batched 2d array with out-of-range indices clamped to the edge.

    Any x or y outside ``[0, lx)`` / ``[0, ly)`` reads the nearest boundary
    cell (zero-gradient extension).

    Parameters
    ----------
    array : wp.array3d
        Array to index
    b : int
        Batch index
    x : int
        X index
    y : int
        Y index
    lx : int
        Grid size x
    ly : int
        Grid size y

    Returns
    -------
    float
        Array value at the clamped index
    """
    x = wp.clamp(x, 0, lx - 1)
    y = wp.clamp(y, 0, ly - 1)
    return array[b, x, y]
121
+
122
+
123
@wp.func
def index_periodic_edges_batched_2d(
    array: wp.array3d(dtype=float), b: int, x: int, y: int, lx: int, ly: int
):  # pragma: no cover
    """Index a batched 2d array with periodic edges.

    Indices are wrapped with ``_mod_int``, which corrects at most one
    period in either direction (sufficient for the +/-1 stencils used by
    the finite-volume kernels).

    Parameters
    ----------
    array : wp.array3d
        Array to index
    b : int
        Batch index
    x : int
        X index
    y : int
        Y index
    lx : int
        Grid size x
    ly : int
        Grid size y

    Returns
    -------
    float
        Array value at the wrapped index
    """
    x = _mod_int(x, lx)
    y = _mod_int(y, ly)
    return array[b, x, y]
152
+
153
+
154
@wp.func
def index_vec2_periodic_edges_batched_2d(
    vec: wp.array3d(dtype=wp.vec2), b: int, x: int, y: int, lx: int, ly: int
):  # pragma: no cover
    """Index a batched 2d array of wp.vec2 with periodic edges.

    Vector counterpart of ``index_periodic_edges_batched_2d``; indices are
    wrapped with ``_mod_int`` (single-period correction only).

    Parameters
    ----------
    vec : wp.array3d
        Array to index
    b : int
        Batch index
    x : int
        X index
    y : int
        Y index
    lx : int
        Grid size x
    ly : int
        Grid size y

    Returns
    -------
    wp.vec2
        Vector value at the wrapped index
    """
    x = _mod_int(x, lx)
    y = _mod_int(y, ly)
    return vec[b, x, y]
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/initialization.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ try:
18
+ import warp as wp
19
+ except ImportError:
20
+ print(
21
+ """NVIDIA WARP is required for this datapipe. This package is under the
22
+ NVIDIA Source Code License (NVSCL). To install use:
23
+
24
+ pip install warp-lang
25
+ """
26
+ )
27
+ raise SystemExit(1)
28
+
29
+
30
@wp.kernel
def init_uniform_random_2d(
    array: wp.array2d(dtype=float),
    min_value: float,
    max_value: float,
    external_seed: int,
): # pragma: no cover
    """Initialize 2d array with uniform random values

    Fills ``array`` in place with one independent uniform draw from
    [min_value, max_value) per element.

    Parameters
    ----------
    array : wp.array2d
        Array to initialize (modified in place)
    min_value : float
        Min random value
    max_value : float
        Max random value
    external_seed : int
        External seed to use
    """
    i, j = wp.tid()
    # NOTE(review): wp.tid() is reused here as the per-thread RNG offset;
    # assumed to provide a unique linear index per thread (matches the 4d
    # kernel in this file) — confirm against the warp docs.
    state = wp.rand_init(external_seed, wp.tid())
    # Fix: sample from [min_value, max_value). The previous code negated the
    # lower bound (-min_value), contradicting the docstring and the 4d kernel.
    array[i, j] = wp.randf(state, min_value, max_value)
53
+
54
+
55
@wp.kernel
def init_uniform_random_4d(
    array: wp.array4d(dtype=float),
    min_value: float,
    max_value: float,
    external_seed: int,
): # pragma: no cover
    """Initialize 4d array with uniform random values

    Fills ``array`` in place with one independent uniform draw from
    [min_value, max_value) per element.

    Parameters
    ----------
    array : wp.array4d
        Array to initialize (modified in place)
    min_value : float
        Min random value
    max_value : float
        Max random value
    external_seed : int
        External seed to use
    """
    b, i, j, k = wp.tid()
    # NOTE(review): wp.tid() is reused here as the per-thread RNG offset;
    # assumed to provide a unique linear index per thread — confirm against
    # the warp docs.
    state = wp.rand_init(external_seed, wp.tid())
    array[b, i, j, k] = wp.randf(state, min_value, max_value)
physics_mcp/source/physicsnemo/datapipes/benchmarks/kernels/utils.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ try:
18
+ import warp as wp
19
+ except ImportError:
20
+ print(
21
+ """NVIDIA WARP is required for this datapipe. This package is under the
22
+ NVIDIA Source Code License (NVSCL). To install use:
23
+
24
+ pip install warp-lang
25
+ """
26
+ )
27
+ raise SystemExit(1)
28
+
29
+ from .indexing import index_zero_edges_batched_2d
30
+
31
+
32
@wp.kernel
def bilinear_upsample_batched_2d(
    array: wp.array3d(dtype=float), lx: int, ly: int, grid_reduction_factor: int
): # pragma: no cover
    """Bilinear upsampling from batch 2d array

    In-place bilinear interpolation for a multi-grid level: each thread
    rebuilds its cell value from the four surrounding coarse-grid samples.

    NOTE(review): the neighbor formula ``x - (x + 1) % grid_reduction_factor``
    implies coarse samples sit at indices where
    ``(i + 1) % grid_reduction_factor == 0`` — confirm against the multi-grid
    layout used by the caller. Out-of-range neighbors contribute zero via
    ``index_zero_edges_batched_2d``.

    Parameters
    ----------
    array : wp.array3d
        Array to perform upsampling on (modified in place)
    lx : int
        Grid size X
    ly : int
        Grid size Y
    grid_reduction_factor : int
        Grid reduction factor for multi-grid
    """
    # get index
    b, x, y = wp.tid()

    # get four neighbors coordinates: nearest coarse samples below/at (x_0,
    # y_0) and above/at (x_1, y_1) the current cell
    x_0 = x - (x + 1) % grid_reduction_factor
    x_1 = x + (x + 1) % grid_reduction_factor
    y_0 = y - (y + 1) % grid_reduction_factor
    y_1 = y + (y + 1) % grid_reduction_factor

    # simple linear upsampling; out-of-bounds neighbors read as zero
    d_0_0 = index_zero_edges_batched_2d(array, b, x_0, y_0, lx, ly)
    d_1_0 = index_zero_edges_batched_2d(array, b, x_1, y_0, lx, ly)
    d_0_1 = index_zero_edges_batched_2d(array, b, x_0, y_1, lx, ly)
    d_1_1 = index_zero_edges_batched_2d(array, b, x_1, y_1, lx, ly)

    # get relative distance from the lower coarse sample, in [0, 1)
    rel_x = wp.float32(x - x_0) / wp.float32(grid_reduction_factor)
    rel_y = wp.float32(y - y_0) / wp.float32(grid_reduction_factor)

    # interpolation in x direction
    d_x_0 = (1.0 - rel_x) * d_0_0 + rel_x * d_1_0
    d_x_1 = (1.0 - rel_x) * d_0_1 + rel_x * d_1_1

    # interpolation in y direction
    d = (1.0 - rel_y) * d_x_0 + rel_y * d_x_1

    # set interpolation
    array[b, x, y] = d
77
+
78
+
79
@wp.kernel
def threshold_3d(
    array: wp.array3d(dtype=float), threshold: float, min_value: float, max_value: float
): # pragma: no cover
    """Threshold 3d array by value. Values below the threshold are set to
    `min_value` and those at or above it are set to `max_value`.

    Parameters
    ----------
    array : wp.array3d
        Array to apply threshold on (modified in place)
    threshold : float
        Threshold value
    min_value : float
        Value to set if below threshold
    max_value : float
        Value to set if at or above threshold
    """
    i, j, k = wp.tid()
    # Binarize in place; the comparison is strict, so values exactly equal to
    # the threshold map to max_value.
    if array[i, j, k] < threshold:
        array[i, j, k] = min_value
    else:
        array[i, j, k] = max_value
101
+
102
+
103
@wp.kernel
def fourier_to_array_batched_2d(
    array: wp.array3d(dtype=float),
    fourier: wp.array4d(dtype=float),
    nr_freq: int,
    lx: int,
    ly: int,
): # pragma: no cover
    """Array of Fourier amplitudes to batched 2d spatial array

    Evaluates a real 2D Fourier series at every grid point and accumulates
    the result into ``array`` via atomic adds. ``fourier`` is indexed as
    ``[term, batch, i, j]`` where term 0..3 holds the sin*sin, cos*sin,
    sin*cos and cos*cos amplitudes respectively. The sum is normalized by
    ``nr_freq ** 2``.

    Parameters
    ----------
    array : wp.array3d
        Spatial array (accumulated into; callers presumably zero it first —
        TODO confirm)
    fourier : wp.array4d
        Array of Fourier amplitudes
    nr_freq : int
        Number of frequencies in Fourier array
    lx : int
        Grid size x
    ly : int
        Grid size y
    """
    b, x, y = wp.tid()
    # 6.28318 is 2*pi (truncated); dx/dy are the grid spacings in phase units
    # so the domain spans one full period in each direction.
    dx = 6.28318 / wp.float32(lx)
    dy = 6.28318 / wp.float32(ly)
    rx = dx * wp.float32(x)
    ry = dy * wp.float32(y)
    for i in range(nr_freq):
        for j in range(nr_freq):
            ri = wp.float32(i)
            rj = wp.float32(j)
            # Four quadrature terms for frequency pair (i, j).
            ss = fourier[0, b, i, j] * wp.sin(ri * rx) * wp.sin(rj * ry)
            cs = fourier[1, b, i, j] * wp.cos(ri * rx) * wp.sin(rj * ry)
            sc = fourier[2, b, i, j] * wp.sin(ri * rx) * wp.cos(rj * ry)
            cc = fourier[3, b, i, j] * wp.cos(ri * rx) * wp.cos(rj * ry)
            # Atomic add: accumulation order is nondeterministic, so float
            # results may vary slightly between runs.
            wp.atomic_add(
                array, b, x, y, 1.0 / (wp.float32(nr_freq) ** 2.0) * (ss + cs + sc + cc)
            )
physics_mcp/source/physicsnemo/datapipes/cae/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from .domino_datapipe import DoMINODataPipe
18
+ from .mesh_datapipe import MeshDatapipe
physics_mcp/source/physicsnemo/datapipes/cae/cae_dataset.py ADDED
@@ -0,0 +1,1275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import pathlib
18
+ import time
19
+ from abc import ABC, abstractmethod
20
+ from concurrent.futures import ThreadPoolExecutor
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.distributed as dist
25
+ import zarr
26
+ from torch.distributed.tensor import Replicate, Shard
27
+
28
+ try:
29
+ import tensorstore as ts
30
+
31
+ TENSORSTORE_AVAILABLE = True
32
+ except ImportError:
33
+ TENSORSTORE_AVAILABLE = False
34
+
35
+ try:
36
+ import pyvista as pv
37
+
38
+ PV_AVAILABLE = True
39
+ except ImportError:
40
+ PV_AVAILABLE = False
41
+
42
+ from physicsnemo.distributed import ShardTensor, ShardTensorSpec
43
+ from physicsnemo.distributed.utils import compute_split_shapes
44
+
45
+ # Abstractions:
46
+ # - want to read npy/npz/.zarr/.stl/.vtp files
47
+ # - Need to share next level abstractions
48
+ # - Domain parallel dataloading is supported: output will be ShardTensor instead.
49
+ # - need to be able to configure preprocessing
50
# - CPU -> GPU transfer happens here; it needs to be isolated in its own stream
51
+ # - Output of dataloader should be torch.Tensor objects.
52
+
53
+
54
+ """
55
+ This datapipe handles reading files from Zarr and piping into torch.Tensor objects.
56
+
57
+ It's expected that the files are organized as groups, with each .zarr
58
+ file representing one training example. To improve IO performance, the files
59
+ should be chunked for each array. The reader takes a list of keys in the
60
+ group to read, and will not read keys that are not specified. The exception
61
+ is if _no_ keys are passed, in which case _all_ keys will be read.
62
+ """
63
+
64
+
65
+ class BackendReader(ABC):
66
+ """
67
+ Abstract base class for backend readers.
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ keys_to_read: list[str] | None,
73
+ keys_to_read_if_available: dict[str, torch.Tensor] | None,
74
+ ) -> None:
75
+ """
76
+ Initialize the backend reader.
77
+ """
78
+ self.keys_to_read = keys_to_read
79
+ self.keys_to_read_if_available = keys_to_read_if_available
80
+
81
+ self.volume_sampling_size = None
82
+
83
+ self.is_volumetric = any(["volume" in key for key in self.keys_to_read])
84
+
85
+ @abstractmethod
86
+ def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
87
+ """
88
+ Read a file and return a dictionary of tensors.
89
+ """
90
+ pass
91
+
92
+ @abstractmethod
93
+ def read_file_sharded(
94
+ self, filename: pathlib.Path, device_mesh: torch.distributed.DeviceMesh
95
+ ) -> tuple[dict[str, torch.Tensor], dict[str, dict]]:
96
+ """
97
+ Read a file and return a dictionary of tensors ready to convert to ShardTensors.
98
+
99
+ NOTE: this function does not actually convert torch tensors to ShardTensors.
100
+ It's possible that the conversion, in some cases, can be a collective function.
101
+ Due to the async nature of the loader, we don't rely on any ordering of
102
+ collectives and defer them to the last possible minute.
103
+
104
+ Additionally, these functions return CPU tensors and we don't actually
105
+ define shard tensors on cpu.
106
+
107
+ So, the dataset itself will convert a local tensor + shard info to shard tensor
108
+ after the cpu-> gpu movement.
109
+ """
110
+ pass
111
+
112
+ def fill_optional_keys(
113
+ self, data: dict[str, torch.Tensor]
114
+ ) -> dict[str, torch.Tensor]:
115
+ """
116
+ Fill missing keys with the keys from the keys_to_read_if_available dictionary.
117
+ """
118
+ for key in self.keys_to_read_if_available:
119
+ if key not in data.keys():
120
+ data[key] = self.keys_to_read_if_available[key]
121
+ return data
122
+
123
+ def _get_slice_boundaries(
124
+ self, array_shape: tuple[int], this_rank: int, n_splits: int, split_dim: int = 0
125
+ ) -> tuple[int, int, tuple | None]:
126
+ """
127
+ For an array, determine the slice boundaries for parallel reading.
128
+
129
+ Args:
130
+ array_shape: The total shape of the target array.
131
+ this_rank: The rank of the distributed process.
132
+ n_splits: The size of the distributed process.
133
+ split_dim: The dimension to split, default is 0.
134
+
135
+ Returns:
136
+ The slice boundaries for parallel reading.
137
+ """
138
+ # Determine what slice this rank should read
139
+
140
+ sections = compute_split_shapes(array_shape[split_dim], n_splits)
141
+
142
+ global_chunk_start = sum(sections[:this_rank])
143
+ global_chunk_stop = global_chunk_start + sections[this_rank]
144
+
145
+ chunk_sizes = tuple(
146
+ array_shape[:split_dim] + (section,) + array_shape[split_dim + 1 :]
147
+ for section in sections
148
+ )
149
+
150
+ return global_chunk_start, global_chunk_stop, chunk_sizes
151
+
152
+ def set_volume_sampling_size(self, volume_sampling_size: int):
153
+ """
154
+ Set the volume sampling size. When set, the readers will
155
+ assume the volumetric data is shuffled on disk and read only
156
+ contiguous chunks of the data up to the sampling size.
157
+
158
+
159
+ Args:
160
+ volume_sampling_size: The total size of the volume sampling.
161
+
162
+ """
163
+ self.volume_sampling_size = volume_sampling_size
164
+
165
+ def select_random_sections_from_slice(
166
+ self,
167
+ slice_start: int,
168
+ slice_stop: int,
169
+ n_points: int,
170
+ ) -> slice:
171
+ """
172
+
173
+ select the contiguous chunks of the volume data to read.
174
+
175
+ Args:
176
+ n_volume_points: The number of points to sample from the volume.
177
+
178
+ Returns:
179
+ A tuple of the start and stop indices of the contiguous chunks.
180
+ """
181
+
182
+ if slice_stop - slice_start < n_points:
183
+ raise ValueError(
184
+ f"Slice size {slice_stop - slice_start} is less than the number of points {n_points}"
185
+ )
186
+
187
+ # Choose a random start point that will fit the entire n_points region:
188
+ start = np.random.randint(slice_start, slice_stop - n_points)
189
+ return slice(start, start + n_points)
190
+
191
+
192
class NpyFileReader(BackendReader):
    """
    Reader for .npy files, each storing a pickled dict of numpy arrays.
    """

    def __init__(
        self,
        keys_to_read: list[str] | None,
        keys_to_read_if_available: dict[str, torch.Tensor] | None,
    ) -> None:
        super().__init__(keys_to_read, keys_to_read_if_available)

    def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
        """
        Read one .npy file into a dict of torch tensors.

        Raises
        ------
        ValueError
            If any required key is missing from the file.
        """
        # allow_pickle is required because the file stores a dict; only use
        # this reader on trusted data.
        data = np.load(filename, allow_pickle=True).item()

        missing_keys = set(self.keys_to_read) - set(data.keys())
        if missing_keys:
            # Name the offending file (previously a hardcoded "(unknown)"
            # placeholder).
            raise ValueError(f"Keys {missing_keys} not found in file {filename}")

        data = {key: torch.from_numpy(data[key]) for key in self.keys_to_read}

        return self.fill_optional_keys(data)

    def read_file_sharded(
        self, filename: pathlib.Path, device_mesh: "torch.distributed.DeviceMesh"
    ) -> "dict[str, ShardTensor]":
        # Sharded reading is not implemented for npy files.
        pass

    def set_volume_sampling_size(self, volume_sampling_size: int):
        """
        This is not supported for npy files.
        """
        raise NotImplementedError(
            "volume sampling directly from disk is not supported for npy files."
        )
231
+
232
+
233
class NpzFileReader(BackendReader):
    """
    Reader for .npz archives of numpy arrays.
    """

    def __init__(
        self,
        keys_to_read: list[str] | None,
        keys_to_read_if_available: dict[str, torch.Tensor] | None,
    ) -> None:
        super().__init__(keys_to_read, keys_to_read_if_available)

    def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
        """
        Read one .npz file into a dict of torch tensors.

        Volumetric keys are sliced with a single shared slice so all
        volumetric arrays stay row-aligned.

        Raises
        ------
        ValueError
            If any required key is missing from the file.
        """
        in_data = np.load(filename)

        keys_missing = set(self.keys_to_read) - set(in_data.keys())
        if keys_missing:
            # Name the offending file (previously a hardcoded "(unknown)"
            # placeholder).
            raise ValueError(f"Keys {keys_missing} not found in file {filename}")

        # Select the slice once, outside the loop, so every volumetric key
        # gets the same rows.
        # NOTE(review): assumes "volume_mesh_centers" is among the keys
        # whenever any key contains "volume" — confirm with callers.
        if self.is_volumetric:
            n_volume = in_data["volume_mesh_centers"].shape[0]
            if self.volume_sampling_size is not None:
                volume_slice = self.select_random_sections_from_slice(
                    0, n_volume, self.volume_sampling_size
                )
            else:
                volume_slice = slice(0, n_volume)

        # This is a slower basic way to do this, to be improved:
        data = {}
        for key in self.keys_to_read:
            if "volume" not in key:
                data[key] = torch.from_numpy(in_data[key][:])
            else:
                data[key] = torch.from_numpy(in_data[key][volume_slice])

        return self.fill_optional_keys(data)

    def read_file_sharded(
        self, filename: pathlib.Path, device_mesh: "torch.distributed.DeviceMesh"
    ) -> "dict[str, ShardTensor]":
        # Sharded reading is not implemented for npz files.
        pass

    def set_volume_sampling_size(self, volume_sampling_size: int):
        """
        This is not supported for npz files.
        """
        raise NotImplementedError(
            "volume sampling directly from disk is not supported for npz files."
        )
291
+
292
+
293
class ZarrFileReader(BackendReader):
    """
    Reader for zarr groups, one group per training example.
    """

    def __init__(
        self,
        keys_to_read: list[str] | None,
        keys_to_read_if_available: dict[str, torch.Tensor] | None,
    ) -> None:
        super().__init__(keys_to_read, keys_to_read_if_available)

    def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
        """
        Read one zarr group into a dict of torch tensors.

        Volumetric keys are sliced with a single shared slice so all
        volumetric arrays stay row-aligned.

        Raises
        ------
        ValueError
            If any required key is missing from the group.
        """
        group = zarr.open_group(filename, mode="r")

        missing_keys = set(self.keys_to_read) - set(group.keys())
        if missing_keys:
            # Name the offending file (previously a hardcoded "(unknown)"
            # placeholder).
            raise ValueError(f"Keys {missing_keys} not found in file {filename}")

        # Select the slice once, outside the loop, so every volumetric key
        # gets the same rows.
        # NOTE(review): assumes "volume_mesh_centers" is among the keys
        # whenever any key contains "volume" — confirm with callers.
        if self.is_volumetric:
            n_volume = group["volume_mesh_centers"].shape[0]
            if self.volume_sampling_size is not None:
                volume_slice = self.select_random_sections_from_slice(
                    0, n_volume, self.volume_sampling_size
                )
            else:
                volume_slice = slice(0, n_volume)

        # This is a slower basic way to do this, to be improved:
        data = {}
        for key in self.keys_to_read:
            if "volume" not in key:
                data[key] = torch.from_numpy(group[key][:])
            else:
                data[key] = torch.from_numpy(group[key][volume_slice])

        return self.fill_optional_keys(data)

    def read_file_sharded(
        self, filename: pathlib.Path, device_mesh: "torch.distributed.DeviceMesh"
    ) -> tuple[dict[str, torch.Tensor], dict[str, dict]]:
        """
        Read this rank's shard of each key from a zarr group.

        Returns the local tensors plus, per key, the (placements,
        chunk_sizes) needed to construct ShardTensors later; the conversion
        itself is deferred until after the CPU -> GPU transfer.
        """
        # Coordinates of this GPU in the domain-parallel mesh:
        this_rank = device_mesh.get_local_rank()
        domain_size = dist.get_world_size(group=device_mesh.get_group())

        group = zarr.open_group(filename, mode="r")

        missing_keys = set(self.keys_to_read) - set(group.keys())
        if missing_keys:
            raise ValueError(f"Keys {missing_keys} not found in file {filename}")

        data = {}
        specs = {}
        for key in self.keys_to_read:
            # Open the array in zarr without reading it and get info:
            zarr_array = group[key]
            array_shape = zarr_array.shape
            if array_shape == ():
                # Scalars: read on every rank and use replicate sharding.
                raw_data = torch.from_numpy(zarr_array[:])
                placement = [Replicate()]
                chunk_sizes = None
            else:
                target_dim = 0
                if array_shape[target_dim] < domain_size:
                    # Too small to split across the ranks: read fully and
                    # replicate.
                    raw_data = torch.from_numpy(zarr_array[:])
                    placement = [Replicate()]
                    chunk_sizes = None
                else:
                    # Read only this rank's rows; shard along dim 0.
                    chunk_start, chunk_stop, chunk_sizes = self._get_slice_boundaries(
                        zarr_array.shape, this_rank, domain_size
                    )
                    raw_data = torch.from_numpy(zarr_array[chunk_start:chunk_stop])
                    placement = [Shard(target_dim)]
                    # Chunk sizes keyed by mesh dim 0:
                    chunk_sizes = {0: chunk_sizes}

            data[key] = raw_data
            specs[key] = (placement, chunk_sizes)

        # Patch in the optional keys; defaults are replicated everywhere.
        data = self.fill_optional_keys(data)
        for key in data.keys():
            if key not in specs:
                specs[key] = ([Replicate()], {})

        return data, specs
407
+
408
+
409
if PV_AVAILABLE:

    class VTKFileReader(BackendReader):
        """
        Reader for VTK-family case directories (.stl / .vtp / .vtu files).

        ``read_file`` receives a directory and opens only the files required
        by the requested keys.
        """

        def __init__(
            self,
            keys_to_read: list[str] | None,
            keys_to_read_if_available: dict[str, torch.Tensor] | None,
        ) -> None:
            super().__init__(keys_to_read, keys_to_read_if_available)

            # Which logical keys come from which file type.
            self.stl_file_keys = [
                "stl_coordinates",
                "stl_centers",
                "stl_faces",
                "stl_areas",
            ]
            self.vtp_file_keys = [
                "surface_mesh_centers",
                "surface_normals",
                "surface_mesh_sizes",
                "CpMeanTrim",
                "pMeanTrim",
                "wallShearStressMeanTrim",
            ]
            self.vtu_file_keys = [
                "volume_mesh_centers",
                "volume_fields",
            ]

            # File-name substrings to skip when locating candidate files.
            self.exclude_patterns = [
                "single_solid",
            ]

        def get_file_name(self, dir_name: pathlib.Path, extension: str) -> pathlib.Path:
            """
            Locate the first file under ``dir_name`` with ``extension`` whose
            name matches no exclude pattern.

            Raises
            ------
            FileNotFoundError
                If no matching file exists.
            """
            matches = [
                p
                for p in dir_name.iterdir()
                if p.suffix == extension
                and not any(pattern in p.name for pattern in self.exclude_patterns)
            ]
            if len(matches) == 0:
                raise FileNotFoundError(f"No {extension} files found in {dir_name}")
            # iterdir() already yields full paths; re-joining the match with
            # dir_name (as before) duplicated the directory component for
            # relative paths.
            return matches[0]

        def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
            """
            Read the requested keys for one case directory.
            """
            # This reader attempts to only read what's necessary, and not
            # more: each file type is opened only if a requested key needs it.
            return_data = {}

            # Note that this reader is, already, running in a background
            # thread. It may or may not help to further thread these calls.
            if any(key in self.stl_file_keys for key in self.keys_to_read):
                stl_path = self.get_file_name(filename, ".stl")
                return_data.update(self.read_data_from_stl(stl_path))
            if any(key in self.vtp_file_keys for key in self.keys_to_read):
                vtp_path = self.get_file_name(filename, ".vtp")
                return_data.update(self.read_data_from_vtp(vtp_path))
            if any(key in self.vtu_file_keys for key in self.keys_to_read):
                raise NotImplementedError("VTU files are not supported yet.")

            return self.fill_optional_keys(return_data)

        def read_file_sharded(
            self, filename: pathlib.Path, device_mesh: "torch.distributed.DeviceMesh"
        ) -> "tuple[dict[str, torch.Tensor], dict[str, ShardTensorSpec]]":
            """
            Not implemented for VTK data.

            Signature aligned with ``BackendReader.read_file_sharded``
            (previously took ``parallel_rank`` / ``parallel_size``,
            inconsistent with the base class contract).
            """
            raise NotImplementedError("Not implemented yet.")

        def read_data_from_stl(
            self,
            stl_path: str,
        ) -> dict:
            """
            Reads surface mesh data from an STL file and prepares a batch
            dictionary for inference.

            Args:
                stl_path (str): Path to the STL file.

            Returns:
                dict: Batch dictionary with mesh faces, coordinates and cell
                normals as torch tensors.
            """
            mesh = pv.read(stl_path)

            batch = {}

            # pyvista stores faces as [n, i0, i1, ...]; the reshape assumes
            # all-triangle cells (n == 3 — TODO confirm inputs), and the
            # leading count column is dropped.
            faces = mesh.faces.reshape(-1, 4)
            faces = faces[:, 1:]

            batch["stl_faces"] = faces.flatten()

            batch["stl_coordinates"] = mesh.points
            batch["surface_normals"] = mesh.cell_normals

            batch = {k: torch.from_numpy(v) for k, v in batch.items()}

            return batch

        def read_data_from_vtp(self, vtp_path: str) -> dict:
            """
            Read vtp file from a file
            """
            raise NotImplementedError("Not implemented yet.")

        def set_volume_sampling_size(self, volume_sampling_size: int):
            """
            This is not supported for vtk files.
            """
            raise NotImplementedError(
                "volume sampling directly from disk is not supported for vtk files."
            )
540
+
541
+
542
if TENSORSTORE_AVAILABLE:

    class TensorStoreZarrReader(BackendReader):
        """
        Reader for zarr stores via TensorStore, using async opens and reads.
        """

        def __init__(
            self,
            keys_to_read: list[str] | None,
            keys_to_read_if_available: dict[str, torch.Tensor] | None,
            cache_bytes_limit: int = 10_000_000,
            data_copy_concurrency: int = 72,
            file_io_concurrency: int = 72,
        ) -> None:
            """
            Parameters
            ----------
            cache_bytes_limit : int
                TensorStore cache pool size in bytes.
            data_copy_concurrency : int
                Concurrency limit for in-memory copies.
            file_io_concurrency : int
                Concurrency limit for file IO.
            """
            super().__init__(keys_to_read, keys_to_read_if_available)

            # Kept for interface compatibility; _make_spec builds the actual
            # per-open specs.
            self.spec_template = {
                "driver": "auto",
                "kvstore": {
                    "driver": "file",
                    "path": None,
                },
            }

            self.context = ts.Context(
                {
                    "cache_pool": {"total_bytes_limit": cache_bytes_limit},
                    "data_copy_concurrency": {"limit": data_copy_concurrency},
                    "file_io_concurrency": {"limit": file_io_concurrency},
                }
            )

        def _make_spec(self, filename: pathlib.Path, key: str) -> dict:
            """
            Build a fresh tensorstore spec for one array.

            The previous ``self.spec_template.copy()`` was a shallow copy, so
            every open shared (and mutated) the nested "kvstore" dict — a
            race when reads run concurrently in the loader's thread pool.
            """
            return {
                "driver": self.spec_template["driver"],
                "kvstore": {
                    "driver": self.spec_template["kvstore"]["driver"],
                    "path": str(filename) + "/" + str(key),
                },
            }

        def read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
            """
            Read one zarr group into a dict of float32 torch tensors.
            """
            # Trigger an async open of each data item:
            open_futures = {}
            for key in self.keys_to_read:
                open_futures[key] = ts.open(
                    self._make_spec(filename, key),
                    create=False,
                    open=True,
                    context=self.context,
                )

            # Wait for all the opens to conclude:
            stores = {key: fut.result() for key, fut in open_futures.items()}

            # Select the slice once, outside the loop, so every volumetric
            # key gets the same rows.
            # NOTE(review): assumes "volume_mesh_centers" is among the keys
            # whenever any key contains "volume" — confirm with callers.
            if self.is_volumetric:
                n_volume = stores["volume_mesh_centers"].shape[0]
                if self.volume_sampling_size is not None:
                    volume_slice = self.select_random_sections_from_slice(
                        0, n_volume, self.volume_sampling_size
                    )
                else:
                    volume_slice = slice(0, n_volume)

            # Trigger an async read of each data item (each result will be a
            # numpy ndarray):
            tensor_futures = {}
            for key in self.keys_to_read:
                if "volume" not in key:
                    tensor_futures[key] = stores[key].read()
                # For the volume data, read only the selected slice:
                else:
                    tensor_futures[key] = stores[key][volume_slice].read()

            # Convert to torch tensors, blocking for each result:
            data = {
                key: torch.as_tensor(tensor_futures[key].result(), dtype=torch.float32)
                for key in self.keys_to_read
            }

            return self.fill_optional_keys(data)

        def read_file_sharded(
            self, filename: pathlib.Path, device_mesh: "torch.distributed.DeviceMesh"
        ) -> tuple[dict[str, torch.Tensor], dict[str, dict]]:
            """
            Read this rank's shard of each key.

            Returns the local tensors plus, per key, the (placements,
            chunk_sizes) needed to construct ShardTensors after the
            CPU -> GPU transfer.
            """
            # Coordinates of this GPU in the domain-parallel mesh:
            this_rank = device_mesh.get_local_rank()
            domain_size = dist.get_world_size(group=device_mesh.get_group())

            # Async-open every array, then wait for the opens:
            open_futures = {}
            for key in self.keys_to_read:
                open_futures[key] = ts.open(
                    self._make_spec(filename, key),
                    create=False,
                    open=True,
                    context=self.context,
                )

            stores = {key: fut.result() for key, fut in open_futures.items()}

            data = {}
            specs = {}
            for key in self.keys_to_read:
                store = stores[key]
                array_shape = store.shape
                if array_shape == ():
                    # Scalars: read on every rank and use replicate sharding.
                    _slice = np.s_[:]
                    placement = [Replicate()]
                    chunk_sizes = None
                else:
                    target_dim = 0
                    if array_shape[target_dim] < domain_size:
                        # Too small to split across the ranks: read fully and
                        # replicate.
                        _slice = np.s_[:]
                        placement = [Replicate()]
                        chunk_sizes = None
                    else:
                        # Read only this rank's rows; shard along dim 0.
                        chunk_start, chunk_stop, chunk_sizes = (
                            self._get_slice_boundaries(
                                store.shape, this_rank, domain_size
                            )
                        )
                        _slice = np.s_[chunk_start:chunk_stop]
                        placement = [Shard(target_dim)]
                        # Chunk sizes keyed by mesh dim 0:
                        chunk_sizes = {0: chunk_sizes}

                # Trigger the reads as async:
                data[key] = store[_slice].read()
                specs[key] = (placement, chunk_sizes)

            # Finally, await the full data read:
            for key in self.keys_to_read:
                data[key] = torch.as_tensor(data[key].result())

            # Patch in the optional keys; defaults are replicated everywhere.
            data = self.fill_optional_keys(data)
            for key in data.keys():
                if key not in specs:
                    specs[key] = ([Replicate()], {})

            return data, specs
712
+
713
+ else:
714
+
715
+ class TensorStoreZarrReader(BackendReader):
716
+ """
717
+ Null reader for tensorstore zarr files.
718
+ """
719
+
720
+ def __init__(
721
+ self,
722
+ keys_to_read: list[str] | None,
723
+ keys_to_read_if_available: dict[str, torch.Tensor] | None,
724
+ ) -> None:
725
+ # Raise an exception on construction if we get here:
726
+ raise NotImplementedError(
727
+ "TensorStoreZarrReader is not available without tensorstore. `pip install tensorstore`."
728
+ )
729
+
730
+
731
def is_vtk_directory(file: pathlib.Path) -> bool:
    """
    Check whether ``file`` is a directory containing only VTK-style files.

    A "VTK directory" is a directory whose every entry has one of the
    suffixes .vtp, .stl, .vtu, .vtk, or .csv.

    Note: an empty directory vacuously satisfies the check (``all`` of an
    empty iterable is True), matching the previous behavior.

    Args:
        file: Path to inspect.

    Returns:
        True if ``file`` is a directory of VTK-style files, False otherwise.
    """
    if not file.is_dir():
        return False
    # Generator + set membership: stops at the first non-VTK entry instead
    # of materializing the whole directory listing in a list.
    return all(
        f.suffix in {".vtp", ".stl", ".vtu", ".vtk", ".csv"} for f in file.iterdir()
    )
738
+
739
+
740
+ class CAEDataset:
741
+ """
742
+ Dataset reader for DrivaerML and similar datasets. In general, this
743
+ dataset supports reading dictionary-like data, and returning a
744
+ dictionary of torch.Tensor objects.
745
+
746
+ When constructed, the user must pass a directory of data examples.
747
+ The dataset will inspect the folder, identify all children, and decide:
748
+ - If every file is a directory ending in .zarr, the zarr reader is used.
749
+ - If every file is .npy, the .npy reader is used.
750
+ - If every file is .npz, the .npz reader is used.
751
+ - If every file is a directory without an extension, it's assumed to be .stl/.vtp/.vtu
752
+
753
+ The user can optionally force one path with a parameter.
754
+
755
+ The flow of this dataset is:
756
+ - Load data from file, using a thread.
757
+ - Each individual file reading tool may or may not have it's own threading
758
+ or multi processing enabled. That's up to it. This just does async
759
+ loading.
760
+ - Data should come out of the readers in dict{str : torch.Tensor} format
761
+ - The data is transferred from CPU to GPU in a separate stream.
762
+
763
+ Users can call __getitem__(i), which will trigger the pipeline,
764
+ or they can call `preload(i)`, which will start the pipeline for index `i`.
765
+ Subsequent calls to `__getitem__(i)` should be faster since the IO is in
766
+ progress or complete.
767
+
768
+ Using the `__iter__` functionality will automatically enable preloading.
769
+
770
+ """
771
+
772
+ def __init__(
773
+ self,
774
+ data_dir: str | pathlib.Path,
775
+ keys_to_read: list[str] | None,
776
+ keys_to_read_if_available: dict[str, torch.Tensor] | None,
777
+ output_device: torch.device,
778
+ preload_depth: int = 2,
779
+ pin_memory: bool = False,
780
+ device_mesh: torch.distributed.DeviceMesh | None = None,
781
+ placements: dict[str, torch.distributed.tensor.Placement] | None = None,
782
+ consumer_stream: torch.cuda.Stream | None = None,
783
+ ) -> None:
784
+ if isinstance(data_dir, str):
785
+ data_dir = pathlib.Path(data_dir)
786
+
787
+ # Verify the data directory exists:
788
+ if not data_dir.exists():
789
+ raise FileNotFoundError(f"Data directory {data_dir} does not exist")
790
+
791
+ # Verify the data directory is a directory:
792
+ if not data_dir.is_dir():
793
+ raise NotADirectoryError(f"Data directory {data_dir} is not a directory")
794
+
795
+ self._keys_to_read = keys_to_read
796
+
797
+ # Make sure the optional keys are on the right device:
798
+ self._keys_to_read_if_available = {
799
+ k: v.to(output_device) for k, v in keys_to_read_if_available.items()
800
+ }
801
+
802
+ self.file_reader, self._filenames = self._infer_file_type_and_filenames(
803
+ data_dir
804
+ )
805
+
806
+ self.pin_memory = pin_memory
807
+
808
+ # Check the file names; some can be read well in parallel, while others
809
+ # are not parallelizable.
810
+
811
+ self._length = len(self._filenames)
812
+
813
+ self.output_device = output_device
814
+ if output_device.type == "cuda":
815
+ self._data_loader_stream = torch.cuda.Stream()
816
+ else:
817
+ self._data_loader_stream = None
818
+
819
+ self.device_mesh = device_mesh
820
+ self.placements = placements
821
+ # This tracks global tensor info
822
+ # so we can convert to ShardTensor at the right time.
823
+ self.shard_spec = {}
824
+
825
+ if self.device_mesh is not None:
826
+ if self.device_mesh.ndim != 1:
827
+ raise ValueError("Device mesh must be one dimensional")
828
+
829
+ # This is thread storage for data preloading:
830
+ self._preload_queue = {}
831
+ self._transfer_events = {}
832
+ self.preload_depth = preload_depth
833
+ self.preload_executor = ThreadPoolExecutor(max_workers=max(1, preload_depth))
834
+
835
+ if consumer_stream is None and self.output_device.type == "cuda":
836
+ consumer_stream = torch.cuda.current_stream()
837
+
838
+ self.consumer_stream = consumer_stream
839
+
840
+ def set_indices(self, indices: list[int]):
841
+ """
842
+ Set the indices for the dataset for this epoch.
843
+ """
844
+
845
+ # TODO - this needs to block while anything is in the preprocess queue.
846
+
847
+ self.indices = indices
848
+
849
+ def idx_to_index(self, idx):
850
+ if hasattr(self, "indices"):
851
+ return self.indices[idx]
852
+
853
+ return idx
854
+
855
    def _infer_file_type_and_filenames(
        self, data_dir: pathlib.Path
    ) -> tuple["BackendReader", list[pathlib.Path]]:
        """
        Inspect the data directory and pick the matching backend reader.

        Every child of ``data_dir`` must share one format:
        - all .npy files         -> NpyFileReader
        - all .npz files         -> NpzFileReader
        - all .zarr directories  -> TensorStoreZarrReader when tensorstore
                                    is installed, else ZarrFileReader
        - all VTK-style subdirs  -> VTKFileReader

        Returns:
            Tuple of (reader instance, list of per-example paths).

        Raises:
            ValueError: If the directory contents match none of the formats.
        """

        # We validated the directory exists and is a directory already.

        # List the files:
        files = list(data_dir.iterdir())

        # Initialize the file reader object
        # Note that for some of these, they could be functions
        # But others benefit from having a state, so we use classes:

        # NOTE(review): an empty directory vacuously matches the first
        # branch (all() of an empty iterable is True) and yields an empty
        # dataset — confirm that is intended.
        if all(file.suffix == ".npy" for file in files):
            file_reader = NpyFileReader(
                self._keys_to_read, self._keys_to_read_if_available
            )
            return file_reader, files
        elif all(file.suffix == ".npz" for file in files):
            file_reader = NpzFileReader(
                self._keys_to_read, self._keys_to_read_if_available
            )
            return file_reader, files
        elif all(file.suffix == ".zarr" and file.is_dir() for file in files):
            # Prefer the async tensorstore backend when available:
            if TENSORSTORE_AVAILABLE:
                file_reader = TensorStoreZarrReader(
                    self._keys_to_read, self._keys_to_read_if_available
                )
            else:
                file_reader = ZarrFileReader(
                    self._keys_to_read, self._keys_to_read_if_available
                )
            return file_reader, files
        elif all(is_vtk_directory(file) for file in files):
            # Each "file" here is a directory of .vtp, stl, etc.
            file_reader = VTKFileReader(
                self._keys_to_read, self._keys_to_read_if_available
            )
            return file_reader, files
        else:
            # TODO - support folders of stl, vtp, vtu.
            raise ValueError(f"Unsupported file type: {files[0]}")
900
+
901
    def _move_to_gpu(
        self, data: dict[str, torch.Tensor], idx: int
    ) -> dict[str, torch.Tensor]:
        """Asynchronously copy a sample's tensors to the output device.

        Copies are enqueued on the side stream ``self._data_loader_stream``
        so they overlap with compute on the consumer stream. A CUDA event is
        recorded per sample into ``self._transfer_events[idx]`` so that
        ``__getitem__`` can synchronize the consumer stream before handing
        the data out.

        Args:
            data: Dictionary of key to torch tensor (host or device).
            idx: Sample index, used as the key for the transfer event.

        Returns:
            Dictionary of key to tensor on the output device. Returned
            unchanged — and with no event recorded — when the output device
            is not CUDA.
        """

        # CPU output: nothing to transfer, no stream/event bookkeeping.
        if self.output_device.type != "cuda":
            return data

        result = {}

        with torch.cuda.stream(self._data_loader_stream):
            for key in data.keys():
                # Already resident on the target device: pass through
                # (record_stream is skipped for these).
                if data[key].device == self.output_device:
                    result[key] = data[key]
                    continue
                if self.pin_memory:
                    # Staging through pinned memory enables a truly
                    # asynchronous host-to-device copy.
                    result[key] = (
                        data[key].pin_memory().to(self.output_device, non_blocking=True)
                    )
                else:
                    result[key] = data[key].to(self.output_device, non_blocking=True)
                # Tell the caching allocator the consumer stream will use
                # this memory, so it is not reused while still in flight.
                result[key].record_stream(self.consumer_stream)

        # Record an event marking the end of this sample's transfers on the
        # loader stream; __getitem__ waits on it from the consumer stream.
        transfer_event = torch.cuda.Event()
        transfer_event.record(self._data_loader_stream)
        self._transfer_events[idx] = transfer_event

        return result
939
+
940
    def _convert_to_shard_tensors(
        self,
        tensors: dict[str, torch.Tensor],
        filename: str,
    ) -> dict[str, "ShardTensor"]:
        """Wrap locally-read tensors into ShardTensors for domain parallelism.

        Uses the sharding spec (placements + per-rank chunk sizes) that
        ``_read_file`` recorded for this ``filename``. The spec entry is
        consumed (popped), so this must be called at most once per read.

        Args:
            tensors: Dictionary of key to this rank's local tensor.
            filename: Key under which ``_read_file`` stored the shard spec.

        Returns:
            The input dict unchanged when no device mesh is configured,
            otherwise a dict of key to ShardTensor.
        """

        # No domain parallelism configured: plain tensors pass through.
        if self.device_mesh is None:
            return tensors

        # The spec is single-use; it was stashed by _read_file for this file.
        spec_dict = self.shard_spec.pop(filename)
        result = {}
        for key in tensors.keys():
            placement, chunk_sizes = spec_dict[key]

            result[key] = ShardTensor.from_local(
                local_tensor=tensors[key],
                device_mesh=self.device_mesh,
                placements=placement,
                sharding_shapes=chunk_sizes,
            )

        return result
970
+
971
+ def preload(self, idx: int) -> None:
972
+ """
973
+ Asynchronously preload the data for the given index (up to CPU, not GPU).
974
+ Only one preload operation is supported at a time.
975
+
976
+ Args:
977
+ idx: Index of the sample to preload.
978
+ """
979
+ if idx in self._preload_queue:
980
+ # Skip items that are already in the queue
981
+ return
982
+
983
+ def _preload_worker():
984
+ data = self._read_file(self._filenames[idx])
985
+ if "stl_faces" in data:
986
+ data["stl_faces"] = data["stl_faces"].to(torch.int32)
987
+ # Convert to torch tensors
988
+ return self._move_to_gpu(data, idx)
989
+
990
+ self._preload_queue[idx] = self.preload_executor.submit(_preload_worker)
991
+
992
+ def get_preloaded(self, idx: int) -> dict[str, torch.Tensor] | None:
993
+ """
994
+ Retrieve the preloaded data (blocking if not ready).
995
+
996
+ Returns:
997
+ (idx, data) tuple where data is a dictionary of key to numpy array or torch tensor.
998
+
999
+ Raises:
1000
+ RuntimeError: If no preload is in progress.
1001
+ Exception: If preload failed.
1002
+ """
1003
+
1004
+ if idx not in self._preload_queue:
1005
+ return None
1006
+
1007
+ result = self._preload_queue[
1008
+ idx
1009
+ ].result() # This will block until the result is ready
1010
+ self._preload_queue.pop(idx) # Clear the future after getting the result
1011
+
1012
+ return result
1013
+
1014
+ def __iter__(self):
1015
+ # When starting the iterator method, start loading the data
1016
+ # at idx = 0, idx = 1
1017
+ # Start preprocessing at idx = 0, when the load completes
1018
+
1019
+ self.i = 0
1020
+
1021
+ N = len(self.indices) if hasattr(self, "indices") else len(self)
1022
+ for i in range(self.preload_depth):
1023
+ # Trigger the dataset to start loading index 0:
1024
+ if N > i + 1:
1025
+ self.preload(self.idx_to_index(self.i + i))
1026
+
1027
+ return self
1028
+
1029
    def __next__(self):
        """Return the current sample and queue upcoming preloads.

        ``self.i`` is the iteration counter; ``idx_to_index`` maps it
        through the optional epoch permutation to a file index.
        """
        # Epoch length: the permutation length if set, else the file count.
        N = len(self.indices) if hasattr(self, "indices") else len(self._filenames)

        # Iteration bounds are based on the counter, not the random-access index
        if self.i >= N:
            # Reset the counter so the dataset can be iterated again.
            self.i = 0
            raise StopIteration

        # This is the file random access index
        target_index = self.idx_to_index(self.i)

        # Before returning, queue the next `preload_depth` samples so their
        # IO overlaps with downstream compute:
        for preload_i in range(self.preload_depth):
            next_iteration_index = self.i + preload_i + 1
            if N > next_iteration_index:
                preload_idx = self.idx_to_index(next_iteration_index)
                self.preload(preload_idx)

        # Send up the random-access data:
        data = self.__getitem__(target_index)

        self.i += 1

        return data
1053
+
1054
+ def __len__(self):
1055
+ return len(self._filenames)
1056
+
1057
+ def _read_file(self, filename: pathlib.Path) -> dict[str, torch.Tensor]:
1058
+ """
1059
+ Read a file and return a dictionary of tensors.
1060
+ """
1061
+ if self.device_mesh is not None:
1062
+ tensor_dict, spec_dict = self.file_reader.read_file_sharded(
1063
+ filename, self.device_mesh
1064
+ )
1065
+ self.shard_spec[filename] = spec_dict
1066
+ return tensor_dict
1067
+ else:
1068
+ return self.file_reader.read_file(filename)
1069
+
1070
    def __getitem__(self, idx: int) -> dict[str, torch.Tensor | ShardTensor]:
        """
        Get a data sample.

        Flow is:
        - Read data, or claim the preloaded future if this idx is preloaded.
        - Move data to GPU, if needed (a preload has already done this).
        - Wait on the sample's transfer event so the consumer stream sees
          completed copies.
        - If domain parallelism is enabled, convert to ShardTensors.
        - Return

        Args:
            idx: Index of the sample to retrieve

        Returns:
            Dictionary containing tensors/ShardTensors for the requested data

        Raises:
            IndexError: If ``idx`` is outside the file list.
        """

        if idx >= len(self._filenames):
            raise IndexError(
                f"Index {idx} out of range for dataset of size {len(self._filenames)}"
            )

        # Attempt to get preloaded data (None when no preload was issued):
        data = self.get_preloaded(idx)
        if data is None:
            # Fall back to a blocking read + transfer.
            data = self._read_file(self._filenames[idx])
            data = self._move_to_gpu(data, idx)

        # Make the consumer stream wait until this sample's host-to-device
        # transfers (recorded in _move_to_gpu) have completed:
        if idx in self._transfer_events:
            self.consumer_stream.wait_event(self._transfer_events[idx])
            self._transfer_events.pop(idx)

        # Convert to ShardTensors if using domain parallelism
        if self.device_mesh is not None:
            data = self._convert_to_shard_tensors(data, self._filenames[idx])

        return data
1110
+
1111
+ def set_volume_sampling_size(self, volume_sampling_size: int):
1112
+ """
1113
+ Set the volume sampling size. When set, the readers will
1114
+ assume the volumetric data is shuffled on disk and read only
1115
+ contiguous chunks of the data up to the sampling size.
1116
+
1117
+ Args:
1118
+ volume_sampling_size: The total size of the volume sampling.
1119
+ """
1120
+ self.file_reader.set_volume_sampling_size(volume_sampling_size)
1121
+
1122
+ def close(self):
1123
+ """
1124
+ Explicitly close the dataset and cleanup resources, including the ThreadPoolExecutor.
1125
+ """
1126
+ if hasattr(self, "preload_executor") and self.preload_executor is not None:
1127
+ self.preload_executor.shutdown(wait=True)
1128
+ self.preload_executor = None
1129
+
1130
+ def __del__(self):
1131
+ """
1132
+ Cleanup resources when the dataset is destroyed.
1133
+ """
1134
+ self.close()
1135
+
1136
+
1137
def compute_mean_std_min_max(
    dataset: "CAEDataset", field_keys: list[str], max_samples: int = 20
) -> tuple[dict, dict, dict, dict]:
    """
    Compute per-channel mean, std, min, and max for the given fields over a
    random subset of the dataset.

    Pass 1 merges per-sample statistics with the parallel (Chan et al.)
    form of Welford's algorithm, which is numerically stable. Pass 2
    computes per-channel min/max while ignoring values more than 9 standard
    deviations from the mean (outlier rejection).

    Args:
        dataset: Indexable dataset yielding dict[str, torch.Tensor] samples;
            each field tensor is shaped (num_points, num_channels).
        field_keys: Keys of the fields to accumulate statistics for.
        max_samples: Maximum number of randomly chosen samples to visit.

    Returns:
        Four dicts keyed by field name — mean, std, min, max — each value a
        tensor with one entry per channel. (std uses Bessel's correction.)
    """
    N = {}
    mean = {}
    M2 = {}  # Sum of squares of differences from the current mean
    min_val = {}
    max_val = {}

    # Read the first data item only to size the per-channel accumulators:
    example_data = dataset[0]
    for key in field_keys:
        device = example_data[key].device
        channels = example_data[key].shape[-1]
        N[key] = torch.zeros(1, dtype=torch.int64, device=device)
        mean[key] = torch.zeros(channels, device=device, dtype=torch.float64)
        M2[key] = torch.zeros(channels, device=device, dtype=torch.float64)
        min_val[key] = torch.full((channels,), float("inf"), device=device)
        max_val[key] = torch.full((channels,), float("-inf"), device=device)

    global_start = time.perf_counter()
    start = time.perf_counter()
    data_list = np.arange(len(dataset))
    np.random.shuffle(data_list)

    # Pass 1: accumulate mean and M2.
    for i, j in enumerate(data_list):
        # Check the budget BEFORE reading: the old code read one extra
        # (discarded) sample past max_samples.
        if i >= max_samples:
            break
        data = dataset[j]

        for field_key in field_keys:
            field_data = data[field_key]

            # Per-sample statistics:
            batch_mean = field_data.mean(axis=(0))
            batch_M2 = ((field_data - batch_mean) ** 2).sum(axis=(0))
            batch_n = field_data.shape[0]

            # Parallel Welford merge (Chan et al.). BUG FIX: the cross term
            # must be delta^2 * n_prev * batch_n / n_new; the old code used
            # the *updated* count in the numerator, which simplifies to
            # delta^2 * batch_n and inflates the variance.
            n_prev = N[field_key].clone()
            delta = batch_mean - mean[field_key]
            N[field_key] += batch_n
            mean[field_key] = mean[field_key] + delta * (batch_n / N[field_key])
            M2[field_key] = (
                M2[field_key]
                + batch_M2
                + delta**2 * (n_prev * batch_n) / N[field_key]
            )

        end = time.perf_counter()
        iteration_time = end - start
        print(
            f"on iteration {i} of {max_samples}, time: {iteration_time:.2f} seconds for file: {j}"
        )
        start = time.perf_counter()

    var = {}
    std = {}
    for field_key in field_keys:
        # Sample variance (Bessel's correction); .item() converts the
        # running count to a Python int for the division.
        var[field_key] = M2[field_key] / (N[field_key].item() - 1)
        std[field_key] = torch.sqrt(var[field_key])

    # Pass 2: per-channel min/max with 9-sigma outlier rejection.
    start = time.perf_counter()
    for i, j in enumerate(data_list):
        if i >= max_samples:
            break
        data = dataset[j]

        for field_key in field_keys:
            field_data = data[field_key]
            mean_sample = mean[field_key]
            std_sample = std[field_key]

            batch_min = []
            batch_max = []
            for v in range(field_data.shape[-1]):
                # Keep values within +/- 9 sigma of the running mean:
                inliers = (
                    field_data[:, v] >= mean_sample[v] - 9.0 * std_sample[v]
                ) & (field_data[:, v] <= mean_sample[v] + 9.0 * std_sample[v])
                batch_min.append(field_data[inliers, v].min())
                batch_max.append(field_data[inliers, v].max())

            batch_min = torch.stack(batch_min)
            batch_max = torch.stack(batch_max)

            min_val[field_key] = torch.minimum(min_val[field_key], batch_min)
            max_val[field_key] = torch.maximum(max_val[field_key], batch_max)

        end = time.perf_counter()
        iteration_time = end - start
        print(
            f"on iteration {i} of {max_samples}, time: {iteration_time:.2f} seconds for file: {j}"
        )
        start = time.perf_counter()

    global_end = time.perf_counter()
    global_time = global_end - global_start

    print(f"Total time: {global_time:.2f} seconds for {max_samples} samples")

    return mean, std, min_val, max_val
physics_mcp/source/physicsnemo/datapipes/cae/domino_datapipe.py ADDED
@@ -0,0 +1,1334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ This code provides the datapipe for reading the processed npy files,
19
+ generating multi-res grids, calculating signed distance fields,
20
+ sampling random points in the volume and on surface,
21
+ normalizing fields and returning the output tensors as a dictionary.
22
+
23
+ This datapipe also non-dimensionalizes the fields, so the order in which the variables should
24
+ be fixed: velocity, pressure, turbulent viscosity for volume variables and
25
+ pressure, wall-shear-stress for surface variables. The different parameters such as
26
+ variable names, domain resolution, sampling size etc. are configurable in config.yaml.
27
+ """
28
+
29
+ from dataclasses import dataclass
30
+ from pathlib import Path
31
+ from typing import Iterable, Literal, Optional, Protocol, Sequence, Union
32
+
33
+ import numpy as np
34
+ import torch
35
+ import torch.cuda.nvtx as nvtx
36
+ import torch.distributed as dist
37
+ from omegaconf import DictConfig
38
+ from torch.distributed.tensor.placement_types import Replicate, Shard
39
+ from torch.utils.data import Dataset
40
+
41
+ from physicsnemo.datapipes.cae.cae_dataset import (
42
+ CAEDataset,
43
+ compute_mean_std_min_max,
44
+ )
45
+ from physicsnemo.distributed import DistributedManager
46
+ from physicsnemo.distributed.shard_tensor import ShardTensor, scatter_tensor
47
+ from physicsnemo.utils.domino.utils import (
48
+ calculate_center_of_mass,
49
+ create_grid,
50
+ get_filenames,
51
+ normalize,
52
+ pad,
53
+ shuffle_array,
54
+ standardize,
55
+ unnormalize,
56
+ unstandardize,
57
+ )
58
+ from physicsnemo.utils.neighbors import knn
59
+ from physicsnemo.utils.profiling import profile
60
+ from physicsnemo.utils.sdf import signed_distance_field
61
+
62
+
63
class BoundingBox(Protocol):
    """
    Structural type for axis-aligned bounding-box limits.

    Any object exposing arraylike ``min`` and ``max`` attributes (one
    coordinate per spatial axis) satisfies this protocol.
    """

    # Lower corner of the box, one coordinate per axis.
    min: Sequence
    # Upper corner of the box, one coordinate per axis.
    max: Sequence
70
+
71
+
72
@dataclass
class DoMINODataConfig:
    """Configuration for DoMINO dataset processing pipeline.

    Attributes:
        data_path: Path to the dataset to load.
        phase: Which phase of data to load ("train", "val", or "test").
        surface_variables: (Surface specific) Names of surface variables.
        surface_points_sample: (Surface specific) Number of surface points to sample per batch.
        num_surface_neighbors: (Surface specific) Number of surface neighbors to consider for nearest neighbors approach.
        surface_sampling_algorithm: (Surface specific) Algorithm to use for surface sampling ("area_weighted" or "random").
        surface_factors: (Surface specific) Non-dimensionalization factors for surface variables.
            If set, and scaling_type is:
            - min_max_scaling -> rescale surface_fields to the min/max set here
            - mean_std_scaling -> rescale surface_fields to the mean and std set here.
        bounding_box_dims_surf: (Surface specific) Dimensions of bounding box. Must be an object with min/max
            attributes that are arraylike.
        volume_variables: (Volume specific) Names of volume variables.
        volume_points_sample: (Volume specific) Number of volume points to sample per batch.
        volume_sample_from_disk: (Volume specific) If the volume data is in a shuffled state on disk,
            read contiguous chunks of the data rather than the entire volume data. This greatly
            accelerates IO in bandwidth limited systems or when the volumetric data is very large.
        volume_factors: (Volume specific) Non-dimensionalization factors for volume variables scaling.
            If set, and scaling_type is:
            - min_max_scaling -> rescale volume_fields to the min/max set here
            - mean_std_scaling -> rescale volume_fields to the mean and std set here.
        bounding_box_dims: (Volume specific) Dimensions of bounding box. Must be an object with min/max
            attributes that are arraylike.
        grid_resolution: Resolution of the latent grid.
        normalize_coordinates: Whether to normalize coordinates based on min/max values.
            For surfaces: uses s_min/s_max, defined from:
            - Surface bounding box, if defined.
            - Min/max of the stl_vertices
            For volumes: uses c_min/c_max, defined from:
            - Volume bounding_box if defined,
            - 1.5x s_min/max otherwise, except c_min[2] = s_min[2] in this case
        sample_in_bbox: Whether to sample points in a specified bounding box.
            Uses the same min/max points as coordinate normalization.
            Only performed if compute_scaling_factors is false.
        sampling: Whether to downsample the full resolution mesh to fit in GPU memory.
            Surface and volume sampling points are configured separately as:
            - surface.points_sample
            - volume.points_sample
        geom_points_sample: Number of STL points sampled per batch.
            Independent of volume.points_sample and surface.points_sample.
        scaling_type: Scaling type for volume variables.
            If used, will rescale the volume_fields and surface fields outputs.
            Requires volume.factor and surface.factor to be set.
        compute_scaling_factors: Whether to compute scaling factors.
            Not available if caching.
            Many preprocessing pieces are disabled if computing scaling factors.
        caching: Whether this is for caching or serving.
        deterministic: Whether to use a deterministic seed for sampling and random numbers.
        gpu_preprocessing: Whether to do preprocessing on the GPU (False for CPU).
        gpu_output: Whether to return output on the GPU as cupy arrays.
            If False, returns numpy arrays.
            You might choose gpu_preprocessing=True and gpu_output=False if caching.
        shard_grid: Whether to shard the grid across GPUs for domain parallelism.
            Applies to the surf_grid and similar tensors.
        shard_points: Whether to shard the points across GPUs for domain parallelism.
            Applies to the volume_fields/surface_fields and similar tensors.
    """

    data_path: Path | None
    phase: Literal["train", "val", "test"]

    # Surface-specific variables:
    surface_variables: Optional[Sequence] = ("pMean", "wallShearStress")
    surface_points_sample: int = 1024
    num_surface_neighbors: int = 11
    # BUG FIX: previously this was annotated `str` and *defaulted to the
    # Literal type object itself* (`= Literal[...]`), so the field held a
    # typing construct rather than a usable algorithm name.
    surface_sampling_algorithm: Literal["area_weighted", "random"] = "area_weighted"
    surface_factors: Optional[Sequence] = None
    bounding_box_dims_surf: Optional[Union[BoundingBox, Sequence]] = None

    # Volume specific variables:
    volume_variables: Optional[Sequence] = ("UMean", "pMean")
    volume_points_sample: int = 1024
    volume_sample_from_disk: bool = False
    volume_factors: Optional[Sequence] = None
    bounding_box_dims: Optional[Union[BoundingBox, Sequence]] = None

    grid_resolution: Sequence = (256, 96, 64)
    normalize_coordinates: bool = False
    sample_in_bbox: bool = False
    sampling: bool = False
    geom_points_sample: int = 300000
    scaling_type: Optional[Literal["min_max_scaling", "mean_std_scaling"]] = None
    compute_scaling_factors: bool = False
    caching: bool = False
    deterministic: bool = False
    gpu_preprocessing: bool = True
    gpu_output: bool = True

    shard_grid: bool = False
    shard_points: bool = False

    def __post_init__(self):
        """Normalize the data path and validate the configuration."""
        if self.data_path is not None:
            # Ensure data_path is a Path object:
            if isinstance(self.data_path, str):
                self.data_path = Path(self.data_path)
            self.data_path = self.data_path.expanduser()

            if not self.data_path.exists():
                raise ValueError(f"Path {self.data_path} does not exist")

            if not self.data_path.is_dir():
                raise ValueError(f"Path {self.data_path} is not a directory")

        # Reject caching configurations that cannot work:
        if self.caching:
            if self.sampling:
                raise ValueError("Sampling should be False for caching")
            if self.compute_scaling_factors:
                raise ValueError("Compute scaling factors should be False for caching")

        if self.phase not in [
            "train",
            "val",
            "test",
        ]:
            raise ValueError(
                f"phase should be one of ['train', 'val', 'test'], got {self.phase}"
            )

        # New validation, enabled by the fixed default above:
        if self.surface_sampling_algorithm not in [
            "area_weighted",
            "random",
        ]:
            raise ValueError(
                f"surface_sampling_algorithm should be one of ['area_weighted', 'random'], "
                f"got {self.surface_sampling_algorithm}"
            )

        if self.scaling_type is not None:
            if self.scaling_type not in [
                "min_max_scaling",
                "mean_std_scaling",
            ]:
                raise ValueError(
                    f"scaling_type should be one of ['min_max_scaling', 'mean_std_scaling'], got {self.scaling_type}"
                )
204
+
205
+
206
+ ##### TODO
207
+ # - The SDF normalization here is based on using a normalized mesh and
208
+ # a normalized coordinate. The alternate method is to normalize to the min/max of the grid.
209
+
210
+
211
+ class DoMINODataPipe(Dataset):
212
+ """
213
+ Datapipe for DoMINO
214
+
215
+ Leverages a dataset for the actual reading of the data, and this
216
+ object is responsible for preprocessing the data.
217
+
218
+ """
219
+
220
+ def __init__(
221
+ self,
222
+ input_path,
223
+ model_type: Literal["surface", "volume", "combined"],
224
+ pin_memory: bool = False,
225
+ **data_config_overrides,
226
+ ):
227
+ # Perform config packaging and validation
228
+ self.config = DoMINODataConfig(data_path=input_path, **data_config_overrides)
229
+
230
+ # Set up the distributed manager:
231
+ if not DistributedManager.is_initialized():
232
+ DistributedManager.initialize()
233
+
234
+ dist = DistributedManager()
235
+
236
+ # Set devices for the preprocessing and IO target
237
+ self.preproc_device = (
238
+ dist.device if self.config.gpu_preprocessing else torch.device("cpu")
239
+ )
240
+ # The cae_dataset will automatically target this device
241
+ # In an async transfer.
242
+ self.output_device = (
243
+ dist.device if self.config.gpu_output else torch.device("cpu")
244
+ )
245
+
246
+ # Model type determines whether we process surface, volume, or both.
247
+ self.model_type = model_type
248
+
249
+ # Update the arrays for bounding boxes:
250
+ if hasattr(self.config.bounding_box_dims, "max") and hasattr(
251
+ self.config.bounding_box_dims, "min"
252
+ ):
253
+ self.config.bounding_box_dims = [
254
+ torch.tensor(
255
+ self.config.bounding_box_dims.max,
256
+ device=self.preproc_device,
257
+ dtype=torch.float32,
258
+ ),
259
+ torch.tensor(
260
+ self.config.bounding_box_dims.min,
261
+ device=self.preproc_device,
262
+ dtype=torch.float32,
263
+ ),
264
+ ]
265
+ self.default_volume_grid = create_grid(
266
+ self.config.bounding_box_dims[0],
267
+ self.config.bounding_box_dims[1],
268
+ self.config.grid_resolution,
269
+ )
270
+
271
+ # And, do the surface bounding box if supplied:
272
+ if hasattr(self.config.bounding_box_dims_surf, "max") and hasattr(
273
+ self.config.bounding_box_dims_surf, "min"
274
+ ):
275
+ self.config.bounding_box_dims_surf = [
276
+ torch.tensor(
277
+ self.config.bounding_box_dims_surf.max,
278
+ device=self.preproc_device,
279
+ dtype=torch.float32,
280
+ ),
281
+ torch.tensor(
282
+ self.config.bounding_box_dims_surf.min,
283
+ device=self.preproc_device,
284
+ dtype=torch.float32,
285
+ ),
286
+ ]
287
+
288
+ self.default_surface_grid = create_grid(
289
+ self.config.bounding_box_dims_surf[0],
290
+ self.config.bounding_box_dims_surf[1],
291
+ self.config.grid_resolution,
292
+ )
293
+
294
+ # Ensure the volume and surface scaling factors are torch tensors
295
+ # and on the right device:
296
+ if self.config.volume_factors is not None:
297
+ if not isinstance(self.config.volume_factors, torch.Tensor):
298
+ self.config.volume_factors = torch.from_numpy(
299
+ self.config.volume_factors
300
+ )
301
+ self.config.volume_factors = self.config.volume_factors.to(
302
+ self.preproc_device, dtype=torch.float32
303
+ )
304
+ if self.config.surface_factors is not None:
305
+ if not isinstance(self.config.surface_factors, torch.Tensor):
306
+ self.config.surface_factors = torch.from_numpy(
307
+ self.config.surface_factors
308
+ )
309
+ self.config.surface_factors = self.config.surface_factors.to(
310
+ self.preproc_device, dtype=torch.float32
311
+ )
312
+
313
+ self.dataset = None
314
+
315
+ def compute_stl_scaling_and_surface_grids(
316
+ self,
317
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
318
+ """
319
+ Compute the min and max for the defining mesh.
320
+
321
+ If the user supplies a bounding box, we use that. Otherwise,
322
+ it raises an error.
323
+
324
+ The returned min/max and grid are used for surface data.
325
+ """
326
+
327
+ # Check the bounding box is not unit length
328
+
329
+ if self.config.bounding_box_dims_surf is not None:
330
+ s_max = self.config.bounding_box_dims_surf[0]
331
+ s_min = self.config.bounding_box_dims_surf[1]
332
+ surf_grid = self.default_surface_grid
333
+ else:
334
+ raise ValueError("Bounding box dimensions are not set in config")
335
+
336
+ return s_min, s_max, surf_grid
337
+
338
+ def compute_volume_scaling_and_grids(
339
+ self,
340
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
341
+ """
342
+ Compute the min and max and grid for volume data.
343
+
344
+ If the user supplies a bounding box, we use that. Otherwise,
345
+ it raises an error.
346
+
347
+ """
348
+
349
+ # Determine the volume min / max locations
350
+ if self.config.bounding_box_dims is not None:
351
+ c_max = self.config.bounding_box_dims[0]
352
+ c_min = self.config.bounding_box_dims[1]
353
+ volume_grid = self.default_volume_grid
354
+ else:
355
+ raise ValueError("Bounding box dimensions are not set in config")
356
+
357
+ return c_min, c_max, volume_grid
358
+
359
+ @profile
360
+ def downsample_geometry(
361
+ self,
362
+ stl_vertices,
363
+ ) -> torch.Tensor:
364
+ """
365
+ Downsample the geometry to the desired number of points.
366
+
367
+ Args:
368
+ stl_vertices: The vertices of the surface.
369
+ """
370
+
371
+ if self.config.sampling:
372
+ geometry_points = self.config.geom_points_sample
373
+
374
+ geometry_coordinates_sampled, idx_geometry = shuffle_array(
375
+ stl_vertices, geometry_points
376
+ )
377
+ if geometry_coordinates_sampled.shape[0] < geometry_points:
378
+ raise ValueError(
379
+ "Surface mesh has fewer points than requested sample size"
380
+ )
381
+ geom_centers = geometry_coordinates_sampled
382
+ else:
383
+ geom_centers = stl_vertices
384
+
385
+ return geom_centers
386
+
387
    def process_surface(
        self,
        s_min: torch.Tensor,
        s_max: torch.Tensor,
        c_min: torch.Tensor,
        c_max: torch.Tensor,
        *,  # Forcing the rest by keyword only since it's a long list ...
        center_of_mass: torch.Tensor,
        surf_grid: torch.Tensor,
        surface_coordinates: torch.Tensor,
        surface_normals: torch.Tensor,
        surface_sizes: torch.Tensor,
        stl_vertices: torch.Tensor,
        stl_indices: torch.Tensor,
        surface_fields: torch.Tensor | None,
    ) -> dict[str, torch.Tensor]:
        """Preprocess surface mesh data for DoMINO.

        Pipeline (order matters): drop degenerate faces, optionally reject
        points outside the *volume* bounding box, optionally subsample,
        gather kNN neighbor features, then normalize coordinates and scale
        targets.

        Args:
            s_min / s_max: Surface bounding-box extrema (used for
                coordinate normalization).
            c_min / c_max: Volume bounding-box extrema (used for the
                in-bbox rejection below).
            center_of_mass: Area-weighted center of mass of the STL surface.
            surf_grid: Surface grid (unused here beyond the signature —
                NOTE(review): appears to be accepted for interface symmetry).
            surface_coordinates / surface_normals / surface_sizes:
                Per-face centers, normals, and areas.
            stl_vertices / stl_indices: Raw STL geometry (unused in the
                visible body; kept for interface symmetry with
                process_volume).
            surface_fields: Ground-truth surface targets, or None at
                inference time.

        Returns:
            Dict of surface tensors; includes "surface_fields" only when
            ground truth was provided.
        """
        # NOTE(review): nx/ny/nz are unpacked but not used in the visible body.
        nx, ny, nz = self.config.grid_resolution

        return_dict = {}

        ########################################################################
        # Remove any faces with area <= 0 (degenerate faces):
        ########################################################################
        idx = surface_sizes > 0
        surface_sizes = surface_sizes[idx]
        surface_normals = surface_normals[idx]
        surface_coordinates = surface_coordinates[idx]
        if surface_fields is not None:
            surface_fields = surface_fields[idx]

        ########################################################################
        # Reject surface points outside of the Bounding Box
        # NOTE - this is using the VOLUME bounding box (c_min/c_max),
        # not the surface box!
        ########################################################################
        if self.config.sample_in_bbox:
            ids_min = surface_coordinates[:] > c_min
            ids_max = surface_coordinates[:] < c_max

            # A point is kept only if it is inside the box on every axis.
            ids_in_bbox = ids_min & ids_max
            ids_in_bbox = ids_in_bbox.all(dim=-1)

            surface_coordinates = surface_coordinates[ids_in_bbox]
            surface_normals = surface_normals[ids_in_bbox]
            surface_sizes = surface_sizes[ids_in_bbox]
            if surface_fields is not None:
                surface_fields = surface_fields[ids_in_bbox]

        ########################################################################
        # Perform down sampling of the surface fields.
        # Note that we snapshot the full surface coordinates for
        # use in the kNN in the next step (neighbors are looked up in the
        # full point set, not the sampled one).
        ########################################################################

        full_surface_coordinates = surface_coordinates
        full_surface_normals = surface_normals
        full_surface_sizes = surface_sizes

        if self.config.sampling:
            # Perform the down sampling; "area_weighted" biases the draw by
            # face area, otherwise the draw is uniform.
            if self.config.surface_sampling_algorithm == "area_weighted":
                weights = surface_sizes
            else:
                weights = None

            surface_coordinates_sampled, idx_surface = shuffle_array(
                surface_coordinates,
                self.config.surface_points_sample,
                weights=weights,
            )

            if surface_coordinates_sampled.shape[0] < self.config.surface_points_sample:
                raise ValueError(
                    "Surface mesh has fewer points than requested sample size"
                )

            # Select out the sampled points for non-neighbor arrays:
            if surface_fields is not None:
                surface_fields = surface_fields[idx_surface]

            # Subsample the normals and sizes:
            surface_normals = surface_normals[idx_surface]
            surface_sizes = surface_sizes[idx_surface]
            # Update the coordinates to the sampled points:
            surface_coordinates = surface_coordinates_sampled

        ########################################################################
        # Perform a kNN on the surface to find the neighbor information
        ########################################################################
        if self.config.num_surface_neighbors > 1:
            # Perform the kNN: queries are the (possibly sampled) points,
            # candidates are the full pre-sampling point set.
            neighbor_indices, neighbor_distances = knn(
                points=full_surface_coordinates,
                queries=surface_coordinates,
                k=self.config.num_surface_neighbors,
            )
            # Pull out the neighbor elements.
            # Note that `neighbor_indices` is the index into the original,
            # full sized tensors (full_surface_coordinates, etc).
            # The [:, 1:] slice drops the first neighbor, which is the query
            # point itself.
            surface_neighbors = full_surface_coordinates[neighbor_indices][:, 1:]
            surface_neighbors_normals = full_surface_normals[neighbor_indices][:, 1:]
            surface_neighbors_sizes = full_surface_sizes[neighbor_indices][:, 1:]
        else:
            # Degenerate k<=1 case: each point is its own "neighborhood".
            surface_neighbors = surface_coordinates
            surface_neighbors_normals = surface_normals
            surface_neighbors_sizes = surface_sizes

        # Better to normalize everything after the kNN and sampling
        # (so distances during kNN are computed in physical units).
        if self.config.normalize_coordinates:
            surface_coordinates = normalize(surface_coordinates, s_max, s_min)
            surface_neighbors = normalize(surface_neighbors, s_max, s_min)
            center_of_mass = normalize(center_of_mass, s_max, s_min)

        # Positional encoding: displacement from each point to the center of mass.
        pos_normals_com_surface = surface_coordinates - center_of_mass

        ########################################################################
        # Apply scaling to the targets, if desired:
        ########################################################################
        if self.config.scaling_type is not None and surface_fields is not None:
            surface_fields = self.scale_model_targets(
                surface_fields, self.config.surface_factors
            )

        return_dict.update(
            {
                "pos_surface_center_of_mass": pos_normals_com_surface,
                "surface_mesh_centers": surface_coordinates,
                "surface_mesh_neighbors": surface_neighbors,
                "surface_normals": surface_normals,
                "surface_neighbors_normals": surface_neighbors_normals,
                "surface_areas": surface_sizes,
                "surface_neighbors_areas": surface_neighbors_sizes,
            }
        )
        if surface_fields is not None:
            return_dict["surface_fields"] = surface_fields

        return return_dict
525
+
526
+ def process_volume(
527
+ self,
528
+ c_min: torch.Tensor,
529
+ c_max: torch.Tensor,
530
+ volume_coordinates: torch.Tensor,
531
+ volume_grid: torch.Tensor,
532
+ center_of_mass: torch.Tensor,
533
+ stl_vertices: torch.Tensor,
534
+ stl_indices: torch.Tensor,
535
+ volume_fields: torch.Tensor | None,
536
+ ) -> dict[str, torch.Tensor]:
537
+ """
538
+ Preprocess the volume data.
539
+
540
+ First, if configured, we reject points not in the volume bounding box.
541
+
542
+ Next, if sampling is enabled, we sample the volume points and apply that
543
+ sampling to the ground truth too, if it's present.
544
+
545
+ """
546
+ ########################################################################
547
+ # Reject points outside the volumetric BBox
548
+ ########################################################################
549
+ if self.config.sample_in_bbox:
550
+ # Remove points in the volume that are outside
551
+ # of the bbox area.
552
+ min_check = volume_coordinates[:] > c_min
553
+ max_check = volume_coordinates[:] < c_max
554
+
555
+ ids_in_bbox = min_check & max_check
556
+ ids_in_bbox = ids_in_bbox.all(dim=1)
557
+
558
+ volume_coordinates = volume_coordinates[ids_in_bbox]
559
+ if volume_fields is not None:
560
+ volume_fields = volume_fields[ids_in_bbox]
561
+
562
+ ########################################################################
563
+ # Apply sampling to the volume coordinates and fields
564
+ ########################################################################
565
+
566
+ # If the volume data has been sampled from disk, directly, then
567
+ # still apply sampling. We over-pull from disk deliberately.
568
+ if self.config.sampling:
569
+ # Generate a series of idx to sample the volume
570
+ # without replacement
571
+ volume_coordinates_sampled, idx_volume = shuffle_array(
572
+ volume_coordinates, self.config.volume_points_sample
573
+ )
574
+ volume_coordinates_sampled = volume_coordinates[idx_volume]
575
+ # In case too few points are in the sampled data (because the
576
+ # inputs were too few), pad the outputs:
577
+ if volume_coordinates_sampled.shape[0] < self.config.volume_points_sample:
578
+ raise ValueError(
579
+ "Volume mesh has fewer points than requested sample size"
580
+ )
581
+
582
+ # Apply the same sampling to the targets, too:
583
+ if volume_fields is not None:
584
+ volume_fields = volume_fields[idx_volume]
585
+
586
+ volume_coordinates = volume_coordinates_sampled
587
+
588
+ ########################################################################
589
+ # Apply normalization to the coordinates, if desired:
590
+ ########################################################################
591
+ if self.config.normalize_coordinates:
592
+ volume_coordinates = normalize(volume_coordinates, c_max, c_min)
593
+ grid = normalize(volume_grid, c_max, c_min)
594
+ normed_vertices = normalize(stl_vertices, c_max, c_min)
595
+ center_of_mass = normalize(center_of_mass, c_max, c_min)
596
+ else:
597
+ grid = volume_grid
598
+ normed_vertices = stl_vertices
599
+ center_of_mass = center_of_mass
600
+
601
+ ########################################################################
602
+ # Apply scaling to the targets, if desired:
603
+ ########################################################################
604
+ if self.config.scaling_type is not None and volume_fields is not None:
605
+ volume_fields = self.scale_model_targets(
606
+ volume_fields, self.config.volume_factors
607
+ )
608
+
609
+ ########################################################################
610
+ # Compute Signed Distance Function for volumetric quantities
611
+ # Note - the SDF happens here, after volume data processing finishes,
612
+ # because we need to use the (maybe) normalized volume coordinates and grid
613
+ ########################################################################
614
+
615
+ # SDF calculation on the volume grid using WARP
616
+ sdf_grid, _ = signed_distance_field(
617
+ normed_vertices,
618
+ stl_indices,
619
+ grid,
620
+ use_sign_winding_number=True,
621
+ )
622
+
623
+ # Get the SDF of all the selected volume coordinates,
624
+ # And keep the closest point to each one.
625
+ sdf_nodes, sdf_node_closest_point = signed_distance_field(
626
+ normed_vertices,
627
+ stl_indices,
628
+ volume_coordinates,
629
+ use_sign_winding_number=True,
630
+ )
631
+ sdf_nodes = sdf_nodes.reshape((-1, 1))
632
+
633
+ # Use the closest point from the mesh to compute the volume encodings:
634
+ pos_normals_closest_vol, pos_normals_com_vol = self.calculate_volume_encoding(
635
+ volume_coordinates, sdf_node_closest_point, center_of_mass
636
+ )
637
+
638
+ return_dict = {
639
+ "volume_mesh_centers": volume_coordinates,
640
+ "sdf_nodes": sdf_nodes,
641
+ "grid": grid,
642
+ "sdf_grid": sdf_grid,
643
+ "pos_volume_closest": pos_normals_closest_vol,
644
+ "pos_volume_center_of_mass": pos_normals_com_vol,
645
+ }
646
+
647
+ if volume_fields is not None:
648
+ return_dict["volume_fields"] = volume_fields
649
+
650
+ return return_dict
651
+
652
+ def calculate_volume_encoding(
653
+ self,
654
+ volume_coordinates: torch.Tensor,
655
+ sdf_node_closest_point: torch.Tensor,
656
+ center_of_mass: torch.Tensor,
657
+ ):
658
+ pos_normals_closest_vol = volume_coordinates - sdf_node_closest_point
659
+ pos_normals_com_vol = volume_coordinates - center_of_mass
660
+
661
+ return pos_normals_closest_vol, pos_normals_com_vol
662
+
663
    @torch.no_grad()
    def process_data(self, data_dict):
        """Preprocess one raw example into DoMINO model inputs.

        Validates required keys, computes the surface-grid SDF, center of
        mass, and geometry downsampling, then dispatches to
        ``process_surface`` / ``process_volume`` according to
        ``self.model_type``.  When domain parallelism is enabled
        (``shard_grid`` / ``shard_points``) the input ShardTensors are
        gathered to full tensors first and the results are re-scattered at
        the end.

        Args:
            data_dict: Mapping of tensor names to (Shard)Tensors; must
                contain at least the keys listed in ``required_keys``.

        Returns:
            Dict of preprocessed tensors (ShardTensors when sharding is on).

        Raises:
            ValueError: if any required key is missing.
        """
        # Validate that all required keys are present in data_dict
        required_keys = [
            "global_params_values",
            "global_params_reference",
            "stl_coordinates",
            "stl_faces",
            "stl_centers",
            "stl_areas",
        ]
        missing_keys = [key for key in required_keys if key not in data_dict]
        if missing_keys:
            raise ValueError(
                f"Missing required keys in data_dict: {missing_keys}. "
                f"Required keys are: {required_keys}"
            )

        # Start building the preprocessed return dict:
        return_dict = {
            "global_params_values": data_dict["global_params_values"],
            "global_params_reference": data_dict["global_params_reference"],
        }

        # DoMINO's sharded datapipe can be tricky - output shapes are not always
        # so simple to calculate, since much of the datapipe is dynamic.
        # The dataset will read in sharded data, to minimize IO.
        # We collect it all locally, here, and then scatter
        # appropriately for the outputs.

        if self.config.shard_grid or self.config.shard_points:
            # Get the device mesh (also reused for the re-scatter at the end):
            mesh = data_dict["stl_coordinates"]._spec.mesh
            local_data_dict = {}
            for key, value in data_dict.items():
                # Gather every ShardTensor to a full local tensor.
                local_data_dict[key] = value.full_tensor()

            data_dict = local_data_dict

        ########################################################################
        # Process the core STL information
        ########################################################################

        # This function gets information about the surface scale,
        # and decides what the surface grid will be:

        s_min, s_max, surf_grid = self.compute_stl_scaling_and_surface_grids()

        # We always need to calculate the SDF on the surface grid:
        # This is for the SDF later:
        if self.config.normalize_coordinates:
            normed_vertices = normalize(data_dict["stl_coordinates"], s_max, s_min)
            surf_grid = normalize(surf_grid, s_max, s_min)
        else:
            normed_vertices = data_dict["stl_coordinates"]

        # For SDF calculations, make sure the mesh_indices_flattened is an integer array:
        mesh_indices_flattened = data_dict["stl_faces"].to(torch.int32)

        # Compute signed distance function for the surface grid:
        sdf_surf_grid, _ = signed_distance_field(
            mesh_vertices=normed_vertices,
            mesh_indices=mesh_indices_flattened,
            input_points=surf_grid,
            use_sign_winding_number=True,
        )
        return_dict["sdf_surf_grid"] = sdf_surf_grid
        return_dict["surf_grid"] = surf_grid

        # Store this only if normalization is active:
        if self.config.normalize_coordinates:
            return_dict["surface_min_max"] = torch.stack([s_min, s_max])

        # This is a center of mass computation for the stl surface,
        # using the size of each mesh point as weight.
        center_of_mass = calculate_center_of_mass(
            data_dict["stl_centers"], data_dict["stl_areas"]
        )

        # This will apply downsampling if needed to the geometry coordinates
        geom_centers = self.downsample_geometry(
            stl_vertices=data_dict["stl_coordinates"],
        )
        return_dict["geometry_coordinates"] = geom_centers

        ########################################################################
        # Determine the volumetric bounds of the data:
        ########################################################################
        # Compute the min/max for volume and the unnormalized grid:
        c_min, c_max, volume_grid = self.compute_volume_scaling_and_grids()

        ########################################################################
        # Process the surface data
        ########################################################################
        if self.model_type == "surface" or self.model_type == "combined":
            # Ground truth may be absent at inference time:
            surface_fields_raw = (
                data_dict["surface_fields"] if "surface_fields" in data_dict else None
            )
            surface_dict = self.process_surface(
                s_min,
                s_max,
                c_min,
                c_max,
                center_of_mass=center_of_mass,
                surf_grid=surf_grid,
                surface_coordinates=data_dict["surface_mesh_centers"],
                surface_normals=data_dict["surface_normals"],
                surface_sizes=data_dict["surface_areas"],
                stl_vertices=data_dict["stl_coordinates"],
                stl_indices=mesh_indices_flattened,
                surface_fields=surface_fields_raw,
            )

            return_dict.update(surface_dict)

        ########################################################################
        # Process the volume data
        ########################################################################
        # For volume data, we store this only if normalizing coordinates:
        if self.model_type == "volume" or self.model_type == "combined":
            if self.config.normalize_coordinates:
                return_dict["volume_min_max"] = torch.stack([c_min, c_max])

        if self.model_type == "volume" or self.model_type == "combined":
            # Ground truth may be absent at inference time:
            volume_fields_raw = (
                data_dict["volume_fields"] if "volume_fields" in data_dict else None
            )
            volume_dict = self.process_volume(
                c_min,
                c_max,
                volume_coordinates=data_dict["volume_mesh_centers"],
                volume_grid=volume_grid,
                center_of_mass=center_of_mass,
                stl_vertices=data_dict["stl_coordinates"],
                stl_indices=mesh_indices_flattened,
                volume_fields=volume_fields_raw,
            )

            return_dict.update(volume_dict)

        # For domain parallelism, shard everything appropriately:
        if self.config.shard_grid or self.config.shard_points:
            # Mesh was defined above!
            output_dict = {}

            # For scattering, we need to know the _global_ index of rank
            # 0 on this mesh.
            # NOTE(review): `dist` here must resolve to a module-level name
            # (e.g. torch.distributed imported as `dist`) — the local `dist`
            # in __init__ is not in scope.  Confirm at the file's imports.
            global_index = dist.get_global_rank(mesh.get_group(), 0)

            for key, value in return_dict.items():
                # Grid-like tensors follow shard_grid; everything else
                # follows shard_points.
                grid_placements = (
                    [
                        Shard(0),
                    ]
                    if self.config.shard_grid
                    else [
                        Replicate(),
                    ]
                )
                point_placements = (
                    [
                        Shard(0),
                    ]
                    if self.config.shard_points
                    else [
                        Replicate(),
                    ]
                )
                if key == "volume_min_max":
                    # Min/max summaries are small; replicate them everywhere.
                    output_dict[key] = ShardTensor.from_local(
                        value,
                        mesh,
                        [
                            Replicate(),
                        ],
                    )
                elif key == "surface_min_max":
                    output_dict[key] = ShardTensor.from_local(
                        value,
                        mesh,
                        [
                            Replicate(),
                        ],
                    )
                elif not isinstance(value, ShardTensor):
                    if "grid" in key:
                        output_dict[key] = scatter_tensor(
                            value.contiguous(),
                            global_index,
                            mesh,
                            grid_placements,
                            global_shape=value.shape,
                            dtype=value.dtype,
                        )
                    else:
                        output_dict[key] = scatter_tensor(
                            value.contiguous(),
                            global_index,
                            mesh,
                            point_placements,
                            global_shape=value.shape,
                            dtype=value.dtype,
                        )
                else:
                    # Already a ShardTensor; pass through unchanged.
                    output_dict[key] = value

            return_dict = output_dict

        return return_dict
872
+
873
+ def scale_model_targets(
874
+ self, fields: torch.Tensor, factors: torch.Tensor
875
+ ) -> torch.Tensor:
876
+ """
877
+ Scale the model targets based on the configured scaling factors.
878
+ """
879
+ if self.config.scaling_type == "mean_std_scaling":
880
+ field_mean = factors[0]
881
+ field_std = factors[1]
882
+ return standardize(fields, field_mean, field_std)
883
+ elif self.config.scaling_type == "min_max_scaling":
884
+ field_min = factors[1]
885
+ field_max = factors[0]
886
+ return normalize(fields, field_max, field_min)
887
+
888
    def unscale_model_outputs(
        self,
        volume_fields: torch.Tensor | None = None,
        surface_fields: torch.Tensor | None = None,
    ):
        """
        Unscale the model outputs based on the configured scaling factors.

        The unscaling is included here to make it a consistent interface
        regardless of the scaling factors and type used.  ShardTensor inputs
        are unwrapped to local tensors, unscaled, and re-wrapped with their
        original spec so sharded and plain tensors take the same path.

        Args:
            volume_fields: Model volume outputs (Tensor or ShardTensor), or None.
            surface_fields: Model surface outputs (Tensor or ShardTensor), or None.

        Returns:
            Tuple of (volume_fields, surface_fields), each unscaled or None.
        """

        # This is a step to make sure we can apply to sharded outputs:
        # remember each ShardTensor's spec so it can be restored afterwards.
        if volume_fields is not None and isinstance(volume_fields, ShardTensor):
            volume_spec = volume_fields._spec
            volume_fields = ShardTensor.to_local(volume_fields)
        else:
            volume_spec = None

        if surface_fields is not None and isinstance(surface_fields, ShardTensor):
            surface_spec = surface_fields._spec
            surface_fields = ShardTensor.to_local(surface_fields)
        else:
            surface_spec = None

        # Inverse of scale_model_targets: factors are (mean, std) for
        # mean_std_scaling and (max, min) for min_max_scaling.
        if volume_fields is not None:
            if self.config.scaling_type == "mean_std_scaling":
                vol_mean = self.config.volume_factors[0]
                vol_std = self.config.volume_factors[1]
                volume_fields = unstandardize(volume_fields, vol_mean, vol_std)
            elif self.config.scaling_type == "min_max_scaling":
                vol_min = self.config.volume_factors[1]
                vol_max = self.config.volume_factors[0]
                volume_fields = unnormalize(volume_fields, vol_max, vol_min)
        if surface_fields is not None:
            if self.config.scaling_type == "mean_std_scaling":
                surf_mean = self.config.surface_factors[0]
                surf_std = self.config.surface_factors[1]
                surface_fields = unstandardize(surface_fields, surf_mean, surf_std)
            elif self.config.scaling_type == "min_max_scaling":
                surf_min = self.config.surface_factors[1]
                surf_max = self.config.surface_factors[0]
                surface_fields = unnormalize(surface_fields, surf_max, surf_min)

        # Re-wrap as ShardTensors with the original mesh/placements/shapes:
        if volume_spec is not None:
            volume_fields = ShardTensor.from_local(
                volume_fields,
                device_mesh=volume_spec.mesh,
                placements=volume_spec.placements,
                sharding_shapes=volume_spec.sharding_shapes(),
            )
        if surface_spec is not None:
            surface_fields = ShardTensor.from_local(
                surface_fields,
                device_mesh=surface_spec.mesh,
                placements=surface_spec.placements,
                sharding_shapes=surface_spec.sharding_shapes(),
            )

        return volume_fields, surface_fields
949
+
950
    def set_dataset(self, dataset: Iterable) -> None:
        """
        Pass a dataset to the datapipe to enable iterating over both in one pass.
        """
        self.dataset = dataset

        if self.config.volume_sample_from_disk:
            # Deliberately over-pull from disk (100x the requested sampling
            # size) so the later in-memory sampling in process_volume still
            # has enough points to draw from.
            self.dataset.set_volume_sampling_size(
                100 * self.config.volume_points_sample
            )
961
+
962
+ def __len__(self):
963
+ if self.dataset is not None:
964
+ return len(self.dataset)
965
+ else:
966
+ return 0
967
+
968
+ def __getitem__(self, idx):
969
+ """
970
+ Function for fetching and processing a single file's data.
971
+
972
+ Domino, in general, expects one example per file and the files
973
+ are relatively large due to the mesh size.
974
+
975
+ Requires the user to have set a dataset via `set_dataset`.
976
+ """
977
+ if self.dataset is None:
978
+ raise ValueError("Dataset is not present")
979
+
980
+ # Get the data from the dataset.
981
+ # Under the hood, this may be fetching preloaded data.
982
+ data_dict = self.dataset[idx]
983
+
984
+ return self.__call__(data_dict)
985
+
986
+ def __call__(self, data_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
987
+ """
988
+ Process the incoming data dictionary.
989
+ - Processes the data
990
+ - moves it to GPU
991
+ - adds a batch dimension
992
+
993
+ Args:
994
+ data_dict: Dictionary containing the data to process as torch.Tensors.
995
+
996
+ Returns:
997
+ Dictionary containing the processed data as torch.Tensors.
998
+
999
+ """
1000
+ data_dict = self.process_data(data_dict)
1001
+
1002
+ # If the data is not on the target device, put it there:
1003
+ for key, value in data_dict.items():
1004
+ if value.device != self.output_device:
1005
+ data_dict[key] = value.to(self.output_device)
1006
+
1007
+ # Add a batch dimension to the data_dict
1008
+ data_dict = {k: v.unsqueeze(0) for k, v in data_dict.items()}
1009
+
1010
+ return data_dict
1011
+
1012
+ def __iter__(self):
1013
+ if self.dataset is None:
1014
+ raise ValueError(
1015
+ "Dataset is not present, can not use the datapipe as an iterator."
1016
+ )
1017
+
1018
+ for i, batch in enumerate(self.dataset):
1019
+ yield self.__call__(batch)
1020
+
1021
+
1022
def compute_scaling_factors(
    cfg: DictConfig,
    input_path: str,
    target_keys: list[str],
    max_samples=20,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Using the dataset at the path, compute the mean, std, min, and max of the target keys.

    Args:
        cfg: Hydra configuration object (unused in the visible implementation;
            kept for interface compatibility).
        input_path: Path to the dataset to load.
        target_keys: List of keys to compute the mean, std, min, and max of.
        max_samples: Maximum number of samples used to estimate the statistics.

    Returns:
        Tuple of (mean, std, min, max) tensors for the target keys.
    """
    # Prefer the GPU when available — statistics are computed over large fields.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    dataset = CAEDataset(
        data_dir=input_path,
        keys_to_read=target_keys,
        keys_to_read_if_available={},
        output_device=device,
    )

    # compute_mean_std_min_max already returns the (mean, std, min, max) tuple.
    return compute_mean_std_min_max(
        dataset,
        field_keys=target_keys,
        max_samples=max_samples,
    )
1054
+
1055
+
1056
+ class CachedDoMINODataset(Dataset):
1057
+ """
1058
+ Dataset for reading cached DoMINO data files, with optional resampling.
1059
+ Acts as a drop-in replacement for DoMINODataPipe.
1060
+ """
1061
+
1062
+ # @nvtx_annotate(message="CachedDoMINODataset __init__")
1063
    def __init__(
        self,
        data_path: Union[str, Path],
        phase: Literal["train", "val", "test"] = "train",
        sampling: bool = False,
        volume_points_sample: Optional[int] = None,
        surface_points_sample: Optional[int] = None,
        geom_points_sample: Optional[int] = None,
        model_type=None,  # Model_type: surface, volume or combined
        deterministic_seed=False,
        surface_sampling_algorithm="area_weighted",
    ):
        """Index a directory of cached (preprocessed) DoMINO .npy files.

        Args:
            data_path: Directory containing the cached files.
            phase: Dataset split label ("train", "val", or "test").
            sampling: Whether __getitem__ should resample points.
            volume_points_sample / surface_points_sample / geom_points_sample:
                Per-category sample sizes used when ``sampling`` is True.
            model_type: surface, volume, or combined.
            deterministic_seed: If True, seed numpy's RNG for reproducibility.
            surface_sampling_algorithm: "area_weighted" or "random".

        Raises:
            AssertionError: if the path is missing, not a directory, or
                contains no cached files.
        """
        super().__init__()

        self.model_type = model_type
        # NOTE(review): this seeds numpy's *global* RNG, which affects every
        # other numpy consumer in the process — confirm that is intended.
        if deterministic_seed:
            np.random.seed(42)

        # Normalize to an expanded pathlib.Path before validating it.
        if isinstance(data_path, str):
            data_path = Path(data_path)
        self.data_path = data_path.expanduser()

        if not self.data_path.exists():
            raise AssertionError(f"Path {self.data_path} does not exist")
        if not self.data_path.is_dir():
            raise AssertionError(f"Path {self.data_path} is not a directory")

        self.deterministic_seed = deterministic_seed
        self.sampling = sampling
        self.volume_points = volume_points_sample
        self.surface_points = surface_points_sample
        self.geom_points = geom_points_sample
        self.surface_sampling_algorithm = surface_sampling_algorithm

        # One cached example per file; directories are excluded.
        self.filenames = get_filenames(self.data_path, exclude_dirs=True)

        total_files = len(self.filenames)

        self.phase = phase
        # Shuffled index table mapping dataset position -> filename.
        self.indices = np.array(range(total_files))

        np.random.shuffle(self.indices)

        if not self.filenames:
            raise AssertionError(f"No cached files found in {self.data_path}")
1109
+ def __len__(self):
1110
+ return len(self.indices)
1111
+
1112
+ # @nvtx_annotate(message="CachedDoMINODataset __getitem__")
1113
+ def __getitem__(self, idx):
1114
+ if self.deterministic_seed:
1115
+ np.random.seed(idx)
1116
+ nvtx.range_push("Load cached file")
1117
+
1118
+ index = self.indices[idx]
1119
+ cfd_filename = self.filenames[index]
1120
+
1121
+ filepath = self.data_path / cfd_filename
1122
+ result = np.load(filepath, allow_pickle=True).item()
1123
+ result = {
1124
+ k: torch.from_numpy(v) if isinstance(v, np.ndarray) else v
1125
+ for k, v in result.items()
1126
+ }
1127
+
1128
+ nvtx.range_pop()
1129
+ if not self.sampling:
1130
+ return result
1131
+
1132
+ nvtx.range_push("Sample points")
1133
+
1134
+ # Sample volume points if present
1135
+ if "volume_mesh_centers" in result and self.volume_points:
1136
+ coords_sampled, idx_volume = shuffle_array(
1137
+ result["volume_mesh_centers"], self.volume_points
1138
+ )
1139
+ if coords_sampled.shape[0] < self.volume_points:
1140
+ coords_sampled = pad(
1141
+ coords_sampled, self.volume_points, pad_value=-10.0
1142
+ )
1143
+
1144
+ result["volume_mesh_centers"] = coords_sampled
1145
+ for key in [
1146
+ "volume_fields",
1147
+ "pos_volume_closest",
1148
+ "pos_volume_center_of_mass",
1149
+ "sdf_nodes",
1150
+ ]:
1151
+ if key in result:
1152
+ result[key] = result[key][idx_volume]
1153
+
1154
+ # Sample surface points if present
1155
+ if "surface_mesh_centers" in result and self.surface_points:
1156
+ if self.surface_sampling_algorithm == "area_weighted":
1157
+ coords_sampled, idx_surface = shuffle_array(
1158
+ points=result["surface_mesh_centers"],
1159
+ n_points=self.surface_points,
1160
+ weights=result["surface_areas"],
1161
+ )
1162
+ else:
1163
+ coords_sampled, idx_surface = shuffle_array(
1164
+ result["surface_mesh_centers"], self.surface_points
1165
+ )
1166
+
1167
+ if coords_sampled.shape[0] < self.surface_points:
1168
+ coords_sampled = pad(
1169
+ coords_sampled, self.surface_points, pad_value=-10.0
1170
+ )
1171
+
1172
+ ii = result["neighbor_indices"]
1173
+ result["surface_mesh_neighbors"] = result["surface_mesh_centers"][ii]
1174
+ result["surface_neighbors_normals"] = result["surface_normals"][ii]
1175
+ result["surface_neighbors_areas"] = result["surface_areas"][ii]
1176
+
1177
+ result["surface_mesh_centers"] = coords_sampled
1178
+
1179
+ for key in [
1180
+ "surface_fields",
1181
+ "surface_areas",
1182
+ "surface_normals",
1183
+ "pos_surface_center_of_mass",
1184
+ "surface_mesh_neighbors",
1185
+ "surface_neighbors_normals",
1186
+ "surface_neighbors_areas",
1187
+ ]:
1188
+ if key in result:
1189
+ result[key] = result[key][idx_surface]
1190
+
1191
+ del result["neighbor_indices"]
1192
+
1193
+ # Sample geometry points if present
1194
+ if "geometry_coordinates" in result and self.geom_points:
1195
+ coords_sampled, _ = shuffle_array(
1196
+ result["geometry_coordinates"], self.geom_points
1197
+ )
1198
+ if coords_sampled.shape[0] < self.geom_points:
1199
+ coords_sampled = pad(coords_sampled, self.geom_points, pad_value=-100.0)
1200
+ result["geometry_coordinates"] = coords_sampled
1201
+
1202
+ nvtx.range_pop()
1203
+ return result
1204
+
1205
+
1206
def create_domino_dataset(
    cfg: DictConfig,
    phase: Literal["train", "val", "test"],
    keys_to_read: list[str],
    keys_to_read_if_available: dict[str, torch.Tensor],
    vol_factors: list[float],
    surf_factors: list[float],
    normalize_coordinates: bool = True,
    sample_in_bbox: bool = True,
    sampling: bool = True,
    device_mesh: torch.distributed.DeviceMesh | None = None,
    placements: dict[str, torch.distributed.tensor.Placement] | None = None,
):
    """Build the DoMINO dataset/datapipe for one split from the Hydra config.

    When ``cfg.data_processor.use_cache`` is set, returns a
    ``CachedDoMINODataset`` over preprocessed files. Otherwise builds a raw
    ``CAEDataset`` (optionally GPU-resident and domain-sharded) and wraps it
    in a ``DoMINODataPipe`` that preprocesses samples on the fly.

    Args:
        cfg: Hydra configuration object containing all parameters.
        phase: Split to build; selects the input path and dataloader config.
        keys_to_read: Keys that must be present in each raw sample.
        keys_to_read_if_available: Optional keys mapped to fallback tensors
            used when a sample lacks them.
        vol_factors: Volume-field normalization factors.
        surf_factors: Surface-field normalization factors.
        normalize_coordinates: Whether the datapipe normalizes coordinates.
        sample_in_bbox: Whether volume sampling is restricted to the bounding box.
        sampling: Whether points are subsampled.
        device_mesh: Optional device mesh for domain-parallel sharded loading.
        placements: Optional per-key tensor placements for sharding.

    Raises:
        ValueError: If ``phase`` is not one of "train", "val", "test".
    """
    model_type = cfg.model.model_type
    # Resolve split-specific input path and dataloader settings.
    if phase == "train":
        input_path = cfg.data.input_dir
        dataloader_cfg = cfg.train.dataloader
    elif phase == "val":
        input_path = cfg.data.input_dir_val
        dataloader_cfg = cfg.val.dataloader
    elif phase == "test":
        input_path = cfg.eval.test_path
        dataloader_cfg = None
    else:
        raise ValueError(f"Invalid phase {phase}")

    if cfg.data_processor.use_cache:
        # Cached path: files were already preprocessed; only (re)sampling
        # happens at load time.
        return CachedDoMINODataset(
            input_path,
            phase=phase,
            sampling=sampling,
            volume_points_sample=cfg.model.volume_points_sample,
            surface_points_sample=cfg.model.surface_points_sample,
            geom_points_sample=cfg.model.geom_points_sample,
            model_type=cfg.model.model_type,
            surface_sampling_algorithm=cfg.model.surface_sampling_algorithm,
        )
    else:
        # The dataset path works in two pieces:
        # There is a core "dataset" which is loading data and moving to GPU
        # And there is the preprocess step, here.

        # Optionally, and for backwards compatibility, the preprocess
        # object can accept a dataset which will enable it as an iterator.
        # The iteration function will loop over the dataset, preprocess the
        # output, and return it.

        # Collect optional datapipe kwargs that only exist in newer configs.
        overrides = {}
        if hasattr(cfg.data, "gpu_preprocessing"):
            overrides["gpu_preprocessing"] = cfg.data.gpu_preprocessing

        if hasattr(cfg.data, "gpu_output"):
            overrides["gpu_output"] = cfg.data.gpu_output

        dm = DistributedManager()

        # When preprocessing on GPU, the dataset hands tensors off on the
        # default CUDA stream of this rank's device.
        if cfg.data.gpu_preprocessing:
            device = dm.device
            consumer_stream = torch.cuda.default_stream()
        else:
            device = torch.device("cpu")
            consumer_stream = None

        if dataloader_cfg is not None:
            preload_depth = dataloader_cfg.preload_depth
            pin_memory = dataloader_cfg.pin_memory
        else:
            # Test phase has no dataloader config; use conservative defaults.
            preload_depth = 1
            pin_memory = False

        dataset = CAEDataset(
            data_dir=input_path,
            keys_to_read=keys_to_read,
            keys_to_read_if_available=keys_to_read_if_available,
            output_device=device,
            preload_depth=preload_depth,
            pin_memory=pin_memory,
            device_mesh=device_mesh,
            placements=placements,
            consumer_stream=consumer_stream,
        )

        # Domain parallelism configuration:
        # (By default, the dataset will shard as aggressively as possible,
        # to improve IO speed and prevent bottlenecks - the datapipe
        # has to reshard to the final shape.)

        # NOTE: we can always capture the mesh and placements from the dataset
        # outputs, so no need to pass them here.
        if cfg.get("domain_parallelism", {}).get("domain_size", 1) > 1:
            shard_grid = cfg.get("domain_parallelism", {}).get("shard_grid", False)
            shard_points = cfg.get("domain_parallelism", {}).get("shard_points", False)
            overrides["shard_grid"] = shard_grid
            overrides["shard_points"] = shard_points

        datapipe = DoMINODataPipe(
            input_path,
            phase=phase,
            grid_resolution=cfg.model.interp_res,
            normalize_coordinates=normalize_coordinates,
            sampling=sampling,
            sample_in_bbox=sample_in_bbox,
            volume_points_sample=cfg.model.volume_points_sample,
            surface_points_sample=cfg.model.surface_points_sample,
            geom_points_sample=cfg.model.geom_points_sample,
            volume_factors=vol_factors,
            surface_factors=surf_factors,
            scaling_type=cfg.model.normalization,
            model_type=model_type,
            bounding_box_dims=cfg.data.bounding_box,
            bounding_box_dims_surf=cfg.data.bounding_box_surface,
            volume_sample_from_disk=cfg.data.volume_sample_from_disk,
            num_surface_neighbors=cfg.model.num_neighbors_surface,
            surface_sampling_algorithm=cfg.model.surface_sampling_algorithm,
            **overrides,
        )

        # Attach the loading dataset so the datapipe can act as an iterator
        # over preprocessed samples.
        datapipe.set_dataset(dataset)

        return datapipe
1326
+
1327
+
1328
+ if __name__ == "__main__":
1329
+ fm_data = DoMINODataPipe(
1330
+ data_path="/code/processed_data/new_models_1/",
1331
+ phase="train",
1332
+ sampling=False,
1333
+ sample_in_bbox=False,
1334
+ )
physics_mcp/source/physicsnemo/datapipes/cae/mesh_datapipe.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import numpy as np
19
+ import torch
20
+ import vtk
21
+
22
+ try:
23
+ import nvidia.dali as dali
24
+ import nvidia.dali.plugin.pytorch as dali_pth
25
+ except ImportError:
26
+ raise ImportError(
27
+ "DALI dataset requires NVIDIA DALI package to be installed. "
28
+ + "The package can be installed at:\n"
29
+ + "https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html"
30
+ )
31
+
32
+ from dataclasses import dataclass
33
+ from pathlib import Path
34
+ from typing import Iterable, List, Tuple, Union
35
+
36
+ from torch import Tensor
37
+
38
+ from physicsnemo.datapipes.datapipe import Datapipe
39
+ from physicsnemo.datapipes.meta import DatapipeMetaData
40
+
41
+ from .readers import read_cgns, read_vtp, read_vtu
42
+
43
+
44
@dataclass
class MetaData(DatapipeMetaData):
    """Datapipe metadata: names the pipeline and declares which framework
    optimizations and parallelism modes it supports."""

    name: str = "MeshDatapipe"
    # Optimization
    auto_device: bool = True  # pipeline may be moved to the best device
    cuda_graphs: bool = True  # compatible with CUDA graph capture
    # Parallel
    ddp_sharding: bool = True  # supports sharding across DDP ranks
52
+
53
+
54
+ class MeshDatapipe(Datapipe):
55
+ """DALI data pipeline for mesh data
56
+
57
+ Parameters
58
+ ----------
59
+ data_dir : str
60
+ Directory where ERA5 data is stored
61
+ variables : List[str, None]
62
+ Ordered list of variables to be loaded from the files
63
+ num_variables : int
64
+ Number of variables to be loaded from the files
65
+ file_format : str, optional
66
+ File format of the data, by default "vtp"
67
+ Supported formats: "vtp", "vtu", "cgns"
68
+ stats_dir : Union[str, None], optional
69
+ Directory where statistics are stored, by default None
70
+ If provided, the statistics are used to normalize the attributes
71
+ batch_size : int, optional
72
+ Batch size, by default 1
73
+ num_steps : int, optional
74
+ Number of timesteps are included in the output variables, by default 1
75
+ shuffle : bool, optional
76
+ Shuffle dataset, by default True
77
+ num_workers : int, optional
78
+ Number of workers, by default 1
79
+ device: Union[str, torch.device], optional
80
+ Device for DALI pipeline to run on, by default cuda
81
+ process_rank : int, optional
82
+ Rank ID of local process, by default 0
83
+ world_size : int, optional
84
+ Number of training processes, by default 1
85
+ cache_data : False, optional
86
+ Whether to cache the data in memory for faster access in subsequent epochs, by default False
87
+ Parallel: True, optional
88
+ Setting parallel=True for an external_source node indicates to the pipeline to run the source in Python worker processes started by DALI.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ data_dir: str,
94
+ variables: List[str],
95
+ num_variables: int,
96
+ file_format: str = "vtp",
97
+ stats_dir: Union[str, None] = None,
98
+ batch_size: int = 1,
99
+ num_samples: int = 1,
100
+ shuffle: bool = True,
101
+ num_workers: int = 1,
102
+ device: Union[str, torch.device] = "cuda",
103
+ process_rank: int = 0,
104
+ world_size: int = 1,
105
+ cache_data: bool = False,
106
+ parallel: bool = True,
107
+ ):
108
+ super().__init__(meta=MetaData())
109
+ self.file_format = file_format
110
+ self.variables = variables
111
+ self.num_variables = num_variables
112
+ self.batch_size = batch_size
113
+ self.num_workers = num_workers
114
+ self.shuffle = shuffle
115
+ self.data_dir = Path(data_dir)
116
+ self.stats_dir = Path(stats_dir) if stats_dir is not None else None
117
+ self.num_samples = num_samples
118
+ self.process_rank = process_rank
119
+ self.world_size = world_size
120
+ self.cache_data = cache_data
121
+ self.parallel = parallel
122
+
123
+ # if self.batch_size > 1:
124
+ # raise NotImplementedError("Batch size greater than 1 is not supported yet")
125
+
126
+ # Set up device, needed for pipeline
127
+ if isinstance(device, str):
128
+ device = torch.device(device)
129
+ # Need a index id if cuda
130
+ if device.type == "cuda" and device.index is None:
131
+ device = torch.device("cuda:0")
132
+ self.device = device
133
+
134
+ # check root directory exists
135
+ if not self.data_dir.is_dir():
136
+ raise IOError(f"Error, data directory {self.data_dir} does not exist")
137
+
138
+ self.parse_dataset_files()
139
+ self.load_statistics()
140
+
141
+ self.pipe = self._create_pipeline()
142
+
143
+ def parse_dataset_files(self) -> None:
144
+ """Parses the data directory for valid files and determines training samples
145
+
146
+ Raises
147
+ ------
148
+ ValueError
149
+ In channels specified or number of samples per year is not valid
150
+ """
151
+ # get all input data files
152
+ match self.file_format:
153
+ case "vtp":
154
+ pattern = "*.vtp"
155
+ case "vtu":
156
+ pattern = "*.vtu"
157
+ case "cgns":
158
+ pattern = "*.cgns"
159
+ case _:
160
+ raise NotImplementedError(
161
+ f"Data type {self.file_format} is not supported yet"
162
+ )
163
+
164
+ self.data_paths = sorted(str(path) for path in self.data_dir.glob(pattern))
165
+
166
+ for data_path in self.data_paths:
167
+ self.logger.info(f"File found: {data_path}")
168
+ self.total_samples = len(self.data_paths)
169
+
170
+ if self.num_samples > self.total_samples:
171
+ raise ValueError(
172
+ "Number of requested samples is greater than the total number of available samples!"
173
+ )
174
+ self.logger.info(
175
+ f"Total number of samples: {self.total_samples}, number of requested samples: {self.num_samples}"
176
+ )
177
+
178
+ def load_statistics(
179
+ self,
180
+ ) -> None: # TODO generalize and combine with climate/era5_hdf5 datapipes
181
+ """Loads statistics from pre-computed numpy files
182
+
183
+ The statistic files should be of name global_means.npy and global_std.npy with
184
+ a shape of [1, C] located in the stat_dir.
185
+
186
+ Raises
187
+ ------
188
+ IOError
189
+ If mean or std numpy files are not found
190
+ AssertionError
191
+ If loaded numpy arrays are not of correct size
192
+ """
193
+ # If no stats dir we just skip loading the stats
194
+ if self.stats_dir is None:
195
+ self.mu = None
196
+ self.std = None
197
+ return
198
+ # load normalisation values
199
+ mean_stat_file = self.stats_dir / Path("global_means.npy")
200
+ std_stat_file = self.stats_dir / Path("global_stds.npy")
201
+
202
+ if not mean_stat_file.exists():
203
+ raise IOError(f"Mean statistics file {mean_stat_file} not found")
204
+ if not std_stat_file.exists():
205
+ raise IOError(f"Std statistics file {std_stat_file} not found")
206
+
207
+ # has shape [1, C]
208
+ self.mu = np.load(str(mean_stat_file))[:, 0 : self.num_variables]
209
+ # has shape [1, C]
210
+ self.sd = np.load(str(std_stat_file))[:, 0 : self.num_variables]
211
+
212
+ if not self.mu.shape == self.sd.shape == (1, self.num_variables):
213
+ raise AssertionError("Error, normalisation arrays have wrong shape")
214
+
215
+ def _create_pipeline(self) -> dali.Pipeline:
216
+ """Create DALI pipeline
217
+
218
+ Returns
219
+ -------
220
+ dali.Pipeline
221
+ Mesh DALI pipeline
222
+ """
223
+ pipe = dali.Pipeline(
224
+ batch_size=self.batch_size,
225
+ num_threads=2,
226
+ prefetch_queue_depth=2,
227
+ py_num_workers=self.num_workers,
228
+ device_id=self.device.index,
229
+ py_start_method="spawn",
230
+ )
231
+
232
+ with pipe:
233
+ source = MeshDaliExternalSource(
234
+ data_paths=self.data_paths,
235
+ file_format=self.file_format,
236
+ variables=self.variables,
237
+ num_samples=self.num_samples,
238
+ batch_size=self.batch_size,
239
+ shuffle=self.shuffle,
240
+ process_rank=self.process_rank,
241
+ world_size=self.world_size,
242
+ cache_data=self.cache_data,
243
+ )
244
+ # Update length of dataset
245
+ self.length = len(source) // self.batch_size
246
+ # Read current batch.
247
+ vertices, attributes, edges = dali.fn.external_source(
248
+ source,
249
+ num_outputs=3,
250
+ parallel=self.parallel,
251
+ batch=False,
252
+ )
253
+
254
+ if self.device.type == "cuda":
255
+ # Move tensors to GPU as external_source won't do that.
256
+ vertices = vertices.gpu()
257
+ attributes = attributes.gpu()
258
+ edges = edges.gpu()
259
+
260
+ # Normalize attributes if statistics are available.
261
+ if self.stats_dir is not None:
262
+ attributes = dali.fn.normalize(attributes, mean=self.mu, stddev=self.sd)
263
+
264
+ # Set outputs.
265
+ pipe.set_outputs(vertices, attributes, edges)
266
+
267
+ return pipe
268
+
269
+ def __iter__(self):
270
+ # Reset the pipeline before creating an iterator to enable epochs.
271
+ self.pipe.reset()
272
+ # Create DALI PyTorch iterator.
273
+ return dali_pth.DALIGenericIterator([self.pipe], ["vertices", "x", "edges"])
274
+
275
+ def __len__(self):
276
+ return self.length
277
+
278
+
279
class MeshDaliExternalSource:
    """DALI Source for lazy-loading with caching of mesh data

    Parameters
    ----------
    data_paths : Iterable[str]
        Paths of the mesh files to serve
    file_format : str
        Mesh format ("vtp", "vtu", or "cgns"); selects reader and parser
    variables : List[str]
        Point-data array names extracted as node attributes
    num_samples : int
        Total number of training samples
    batch_size : int, optional
        Batch size, by default 1
    shuffle : bool, optional
        Shuffle dataset, by default True
    process_rank : int, optional
        Rank ID of local process, by default 0
    world_size : int, optional
        Number of training processes, by default 1
    cache_data : bool, optional
        Whether to cache the data in memory for faster access in subsequent epochs, by default False

    Note
    ----
    For more information about DALI external source operator:
    https://docs.nvidia.com/deeplearning/dali/archives/dali_1_13_0/user-guide/docs/examples/general/data_loading/parallel_external_source.html
    """

    def __init__(
        self,
        data_paths: Iterable[str],
        file_format: str,
        variables: List[str],
        num_samples: int,
        batch_size: int = 1,
        shuffle: bool = True,
        process_rank: int = 0,
        world_size: int = 1,
        cache_data: bool = False,
    ):
        self.data_paths = list(data_paths)
        self.file_format = file_format
        self.variables = variables
        # Will be populated later once each worker starts running in its own process.
        self.poly_data = None
        self.num_samples = num_samples
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.cache_data = cache_data

        # Tracks the last epoch we shuffled for, so we reshuffle once per epoch.
        self.last_epoch = None

        self.indices = np.arange(num_samples)
        # Shard from indices if running in parallel
        self.indices = np.array_split(self.indices, world_size)[process_rank]

        # Get number of full batches, ignore possible last incomplete batch for now.
        # Also, DALI external source does not support incomplete batches in parallel mode.
        self.num_batches = len(self.indices) // self.batch_size

        # Resolve the reader/parser once up front (both depend only on file_format).
        self.mesh_reader_fn = self.mesh_reader()
        self.parse_vtk_data_fn = self.parse_vtk_data()

        if self.cache_data:
            # Make cache for the data
            self.data_cache = {}
            for data_path in self.data_paths:
                self.data_cache[data_path] = None

    def __call__(
        self, sample_info: dali.types.SampleInfo
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Return ``(vertices, attributes, edges)`` for one sample.

        Called by DALI once per sample; raises StopIteration to end the epoch.
        """
        if sample_info.iteration >= self.num_batches:
            raise StopIteration()

        # Shuffle before the next epoch starts.
        if self.shuffle and sample_info.epoch_idx != self.last_epoch:
            # All workers use the same rng seed so the resulting
            # indices are the same across workers.
            np.random.default_rng(seed=sample_info.epoch_idx).shuffle(self.indices)
            self.last_epoch = sample_info.epoch_idx

        # Get local indices from global index.
        idx = self.indices[sample_info.idx_in_epoch]

        # if self.poly_data is None: # TODO check
        # This will be called once per worker. Workers are persistent,
        # so there is no need to explicitly close the files - this will be done
        # when corresponding pipeline/dataset is destroyed.
        if self.cache_data:
            # Lazily fill the per-path cache on first access.
            processed_data = self.data_cache.get(self.data_paths[idx])
            if processed_data is None:
                data = self.mesh_reader_fn(self.data_paths[idx])
                processed_data = self.parse_vtk_data_fn(data, self.variables)
                self.data_cache[self.data_paths[idx]] = processed_data
        else:
            data = self.mesh_reader_fn(self.data_paths[idx])
            processed_data = self.parse_vtk_data_fn(data, self.variables)

        return processed_data

    def __len__(self):
        # Number of samples in this rank's shard.
        return len(self.indices)

    def mesh_reader(self):
        """Return the file-reading function matching ``self.file_format``."""
        if self.file_format == "vtp":
            return read_vtp
        if self.file_format == "vtu":
            return read_vtu
        if self.file_format == "cgns":
            return read_cgns
        else:
            raise NotImplementedError(
                f"Data type {self.file_format} is not supported yet"
            )

    def parse_vtk_data(self):
        """Return the VTK-object parsing function matching ``self.file_format``."""
        if self.file_format == "vtp":
            return _parse_vtk_polydata
        elif self.file_format in ["vtu", "cgns"]:
            return _parse_vtk_unstructuredgrid
        else:
            raise NotImplementedError(
                f"Data type {self.file_format} is not supported yet"
            )
400
+
401
+
402
def _parse_vtk_polydata(polydata, variables):
    """Convert vtkPolyData into ``(vertices, attributes, edges)`` tensors.

    Parameters
    ----------
    polydata : vtk.vtkPolyData
        The polydata to parse.
    variables : List[str]
        Point-data array names to extract and concatenate as node attributes.

    Returns
    -------
    tuple of torch.Tensor
        ``vertices`` (float32, [N, 3]), ``attributes`` (float32, [N, C] — a
        single-zero placeholder when ``variables`` is empty), and ``edges``
        (int64 pairs of vertex ids from all polygon boundaries).

    Raises
    ------
    ValueError
        If points, point data, polygons, or a requested array are missing.
    """
    # Fetch vertices
    points = polydata.GetPoints()
    if points is None:
        raise ValueError("Failed to get points from the polydata.")
    num_points = points.GetNumberOfPoints()
    vertices = torch.tensor(
        np.array([points.GetPoint(i) for i in range(num_points)]),
        dtype=torch.float32,
    )

    # Fetch node attributes  # TODO modularize
    attributes = []
    point_data = polydata.GetPointData()
    if point_data is None:
        # BUGFIX: message previously said "unstructured grid" in this
        # polydata-specific parser.
        raise ValueError("Failed to get point data from the polydata.")
    for array_name in variables:
        # BUGFIX: vtk's GetArray returns None for a missing array instead of
        # raising, so the original try/except ValueError never fired and a
        # missing array crashed later with an opaque AttributeError.
        array = point_data.GetArray(array_name)
        if array is None:
            raise ValueError(f"Failed to get array {array_name} from the polydata.")
        array_data = np.zeros((num_points, array.GetNumberOfComponents()))
        for j in range(num_points):
            array.GetTuple(j, array_data[j])
        attributes.append(torch.tensor(array_data, dtype=torch.float32))
    if variables:
        attributes = torch.cat(attributes, dim=-1)
    else:
        # Placeholder tensor for consistency with _parse_vtk_unstructuredgrid
        # (torch.cat on an empty list would raise).
        attributes = torch.zeros((1,), dtype=torch.float32)
    # TODO torch.cat is usually very inefficient when the number of items is large.
    # If possible, the resulting tensor should be pre-allocated and filled in during the loop.

    # Fetch edges: connect consecutive vertex ids of every polygon (wrapping
    # around), accumulating over ALL cells.
    polys = polydata.GetPolys()
    if polys is None:
        raise ValueError("Failed to get polygons from the polydata.")
    polys.InitTraversal()
    edges = []
    id_list = vtk.vtkIdList()
    for _ in range(polys.GetNumberOfCells()):
        polys.GetNextCell(id_list)
        num_ids = id_list.GetNumberOfIds()
        # BUGFIX: the original reassigned `edges` here each iteration, so only
        # the LAST cell's edges were ever returned; extend instead.
        edges.extend(
            (id_list.GetId(j), id_list.GetId((j + 1) % num_ids))
            for j in range(num_ids)
        )
    edges = torch.tensor(edges, dtype=torch.long)

    return vertices, attributes, edges
450
+
451
+
452
+ def _parse_vtk_unstructuredgrid(grid, variables):
453
+ # Fetch vertices
454
+ points = grid.GetPoints()
455
+ if points is None:
456
+ raise ValueError("Failed to get points from the unstructured grid.")
457
+ vertices = torch.tensor(
458
+ np.array([points.GetPoint(i) for i in range(points.GetNumberOfPoints())]),
459
+ dtype=torch.float32,
460
+ )
461
+
462
+ # Fetch node attributes # TODO modularize
463
+ attributes = []
464
+ point_data = grid.GetPointData()
465
+ if point_data is None:
466
+ raise ValueError("Failed to get point data from the unstructured grid.")
467
+ for array_name in variables:
468
+ try:
469
+ array = point_data.GetArray(array_name)
470
+ except ValueError:
471
+ raise ValueError(
472
+ f"Failed to get array {array_name} from the unstructured grid."
473
+ )
474
+ array_data = np.zeros(
475
+ (points.GetNumberOfPoints(), array.GetNumberOfComponents())
476
+ )
477
+ for j in range(points.GetNumberOfPoints()):
478
+ array.GetTuple(j, array_data[j])
479
+ attributes.append(torch.tensor(array_data, dtype=torch.float32))
480
+ if variables:
481
+ attributes = torch.cat(attributes, dim=-1)
482
+ else:
483
+ attributes = torch.zeros((1,), dtype=torch.float32)
484
+
485
+ # Return a dummy tensor of zeros for edges since they are not directly computable
486
+ return (
487
+ vertices,
488
+ attributes,
489
+ torch.zeros((0, 2), dtype=torch.long),
490
+ ) # Dummy tensor for edges
physics_mcp/source/physicsnemo/datapipes/cae/readers.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import os
18
+ from typing import Any
19
+
20
+ import torch
21
+ import vtk
22
+
23
+ Tensor = torch.Tensor
24
+
25
+
26
def read_vtp(file_path: str) -> Any:  # TODO add support for older format (VTK)
    """Load a VTP (XML PolyData) file.

    Parameters
    ----------
    file_path : str
        Path to the VTP file.

    Returns
    -------
    vtkPolyData
        The polydata read from the VTP file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file lacks a ``.vtp`` extension or cannot be parsed.
    """
    # Validate the path before handing it off to VTK.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} does not exist.")
    if not file_path.endswith(".vtp"):
        raise ValueError(f"Expected a .vtp file, got {file_path}")

    vtp_reader = vtk.vtkXMLPolyDataReader()
    vtp_reader.SetFileName(file_path)
    vtp_reader.Update()

    # VTK signals failure by producing no output object.
    mesh = vtp_reader.GetOutput()
    if mesh is None:
        raise ValueError(f"Failed to read polydata from {file_path}")
    return mesh
60
+
61
+
62
def read_vtu(file_path: str) -> Any:
    """Load a VTU (XML unstructured-grid) file.

    Parameters
    ----------
    file_path : str
        Path to the VTU file.

    Returns
    -------
    vtkUnstructuredGrid
        The unstructured grid data read from the VTU file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file lacks a ``.vtu`` extension or cannot be parsed.
    """
    # Validate the path before handing it off to VTK.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} does not exist.")
    if not file_path.endswith(".vtu"):
        raise ValueError(f"Expected a .vtu file, got {file_path}")

    vtu_reader = vtk.vtkXMLUnstructuredGridReader()
    vtu_reader.SetFileName(file_path)
    vtu_reader.Update()

    # VTK signals failure by producing no output object.
    ugrid = vtu_reader.GetOutput()
    if ugrid is None:
        raise ValueError(f"Failed to read unstructured grid data from {file_path}")
    return ugrid
96
+
97
+
98
def read_cgns(file_path: str) -> Any:
    """Load a CGNS file and return its unstructured grid.

    Parameters
    ----------
    file_path : str
        Path to the CGNS file.

    Returns
    -------
    vtkUnstructuredGrid
        The unstructured grid data read from the CGNS file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file lacks a ``.cgns`` extension, cannot be parsed, or
        contains no unstructured grid.
    """
    # Validate the path before handing it off to VTK.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} does not exist.")
    if not file_path.endswith(".cgns"):
        raise ValueError(f"Expected a .cgns file, got {file_path}")

    cgns_reader = vtk.vtkCGNSReader()
    cgns_reader.SetFileName(file_path)
    cgns_reader.Update()

    # CGNS readers emit a multi-block dataset rather than a grid directly.
    blocks = cgns_reader.GetOutput()
    if blocks is None:
        raise ValueError(f"Failed to read multi-block data from {file_path}")

    # Pull the single vtkUnstructuredGrid out of the multi-block hierarchy.
    return _extract_unstructured_grid(blocks)
133
+
134
+
135
def read_stl(file_path: str) -> vtk.vtkPolyData:
    """Load an STL file.

    Parameters
    ----------
    file_path : str
        Path to the STL file.

    Returns
    -------
    vtkPolyData
        The polydata read from the STL file.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file lacks a ``.stl`` extension or cannot be parsed.
    """
    # Validate the path before handing it off to VTK.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} does not exist.")
    if not file_path.endswith(".stl"):
        raise ValueError(f"Expected a .stl file, got {file_path}")

    stl_reader = vtk.vtkSTLReader()
    stl_reader.SetFileName(file_path)
    stl_reader.Update()

    # VTK signals failure by producing no output object.
    mesh = stl_reader.GetOutput()
    if mesh is None:
        raise ValueError(f"Failed to read polydata from {file_path}")
    return mesh
170
+
171
+
172
def _extract_unstructured_grid(
    multi_block: "vtk.vtkMultiBlockDataSet",
) -> "vtk.vtkUnstructuredGrid":
    """
    Extracts a vtkUnstructuredGrid from a vtkMultiBlockDataSet.

    Only index (0, 0) of the hierarchy is inspected — i.e. the first block of
    the first block, matching the layout vtkCGNSReader produces for
    single-base, single-zone files.

    Parameters
    ----------
    multi_block : vtk.vtkMultiBlockDataSet
        The multi-block dataset containing various data blocks.

    Returns
    -------
    vtk.vtkUnstructuredGrid
        The unstructured grid extracted from the multi-block dataset.

    Raises
    ------
    ValueError
        If no vtkUnstructuredGrid is found at index (0, 0).
    """
    outer = multi_block.GetBlock(0)
    # BUGFIX: GetBlock returns None for an empty/out-of-range index; the
    # original code then failed with an opaque AttributeError instead of the
    # intended ValueError.
    if outer is None:
        raise ValueError("No vtkUnstructuredGrid found in the vtkMultiBlockDataSet.")
    block = outer.GetBlock(0)
    if isinstance(block, vtk.vtkUnstructuredGrid):
        return block
    raise ValueError("No vtkUnstructuredGrid found in the vtkMultiBlockDataSet.")
physics_mcp/source/physicsnemo/datapipes/climate/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from .climate import ClimateDatapipe, ClimateDataSourceSpec
18
+ from .era5_hdf5 import ERA5HDF5Datapipe
19
+ from .synthetic import SyntheticWeatherDataLoader, SyntheticWeatherDataset
physics_mcp/source/physicsnemo/datapipes/climate/climate.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import json
19
+ from abc import ABC, abstractmethod
20
+ from datetime import datetime, timedelta
21
+ from itertools import chain
22
+
23
+ import h5py
24
+ import netCDF4 as nc
25
+ import numpy as np
26
+ import pytz
27
+ import torch
28
+
29
+ try:
30
+ import nvidia.dali as dali
31
+ import nvidia.dali.plugin.pytorch as dali_pth
32
+ except ImportError:
33
+ raise ImportError(
34
+ "DALI dataset requires NVIDIA DALI package to be installed. "
35
+ + "The package can be installed at:\n"
36
+ + "https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html"
37
+ )
38
+
39
+ from dataclasses import dataclass
40
+ from pathlib import Path
41
+ from typing import Callable, Iterable, List, Mapping, Tuple, Union
42
+
43
+ from scipy.io import netcdf_file
44
+
45
+ from physicsnemo.datapipes.climate.utils.invariant import latlon_grid
46
+ from physicsnemo.datapipes.climate.utils.zenith_angle import cos_zenith_angle
47
+ from physicsnemo.datapipes.datapipe import Datapipe
48
+ from physicsnemo.datapipes.meta import DatapipeMetaData
49
+ from physicsnemo.launch.logging import PythonLogger
50
+
51
+ Tensor = torch.Tensor
52
+
53
+
54
@dataclass
class MetaData(DatapipeMetaData):
    # Datapipe metadata flags consumed by the PhysicsNeMo datapipe framework.
    name: str = "Climate"
    # Optimization: pipeline can be auto-placed on a device and captured in
    # CUDA graphs.
    auto_device: bool = True
    cuda_graphs: bool = True
    # Parallel: supports sharding across DDP ranks.
    ddp_sharding: bool = True
62
+
63
+
64
class ClimateDataSourceSpec:
    """
    A data source specification for ClimateDatapipe.

    HDF5 files should contain the following variable with the corresponding
    name:
    `fields`: Tensor of shape (num_timesteps, num_channels, height, width),
    containing climate data. The order of the channels should match the order
    of the channels in the statistics files. The statistics files should be
    `.npy` files with the shape (1, num_channels, 1, 1).
    The names of the variables are found in the metadata file found in
    `metadata_path`.

    NetCDF4 files should contain a variable of shape
    (num_timesteps, height, width) for each variable they provide. Only the
    variables listed in `variables` will be loaded.

    Parameters
    ----------
    data_dir : str
        Directory where climate data is stored
    name: Union[str, None], optional
        The name that is used to label datapipe outputs from this source.
        If None, the datapipe uses the number of the source in sequential order.
    file_type: str
        Type of files to read, supported values are "hdf5" (default) and "netcdf4"
    stats_files: Union[Mapping[str, str], None], optional
        Numpy files to data statistics for normalization. Supports either a channels
        format, in which case the dict should contain the keys "mean" and "std", or a
        named-variable format, in which case the dict should contain the key "norm" .
        If None, no normalization will be used, by default None
    metadata_path: Union[Mapping[str, str], None], optional for NetCDF, required for HDF5
        Path to the metadata JSON file for the dataset (usually called data.json).
    channels : Union[List[int], None], optional
        Defines which climate variables to load, if None will use all in HDF5 file, by default None
    variables: Union[List[str], None], optional for HDF5 files, mandatory for NetCDF4 files
        List of named variables to load. Variables will be read in the order specified
        by this parameter. Must be used for NetCDF4 files. Supported for HDF5 files
        in which case it will override `channels`.
    use_cos_zenith: bool, optional
        If True, the cosine zenith angles corresponding to the coordinates of this
        data source will be produced, default False
    aux_variables : Union[Mapping[str, Callable], None], optional
        A dictionary mapping strings to callables that accept arguments
        (timestamps: numpy.ndarray, latlon: numpy.ndarray). These define any auxiliary
        variables returned from this source.
    num_steps : int, optional
        Number of timesteps to return, by default 1
    stride : int, optional
        Number of steps between input and output variables. For example, if the dataset
        contains data at every 6 hours, a stride 1 = 6 hour delta t and
        stride 2 = 12 hours delta t, by default 1
    """

    def __init__(
        self,
        data_dir: str,
        name: Union[str, None] = None,
        file_type: str = "hdf5",
        stats_files: Union[Mapping[str, str], None] = None,
        metadata_path: Union[str, None] = None,
        channels: Union[List[int], None] = None,
        variables: Union[List[str], None] = None,
        use_cos_zenith: bool = False,
        aux_variables: Union[Mapping[str, Callable], None] = None,
        num_steps: int = 1,
        stride: int = 1,
        backend_kwargs: Union[dict, None] = None,
    ):
        self.data_dir = Path(data_dir)
        self.name = name
        self.file_type = file_type
        # Normalize statistics file paths to Path objects up front so later
        # existence checks and np.load calls are uniform.
        self.stats_files = (
            {k: Path(fn) for (k, fn) in stats_files.items()}
            if stats_files is not None
            else None
        )
        self.metadata_path = Path(metadata_path) if metadata_path is not None else None
        self.channels = channels
        self.variables = variables
        self.use_cos_zenith = use_cos_zenith
        self.aux_variables = aux_variables if aux_variables is not None else {}
        self.num_steps = num_steps
        self.stride = stride
        self.backend_kwargs = {} if backend_kwargs is None else backend_kwargs
        self.logger = PythonLogger()

        # NetCDF4 files are read by named variable, so names are mandatory.
        if file_type == "netcdf4" and not variables:
            raise ValueError("Variables must be specified for a NetCDF4 source.")

        # check root directory exists
        if not self.data_dir.is_dir():
            raise IOError(f"Error, data directory {self.data_dir} does not exist")
        if self.stats_files is None:
            self.logger.warning(
                "Warning, no stats files specified, this will result in no normalisation"
            )

    def dimensions_compatible(self, other) -> bool:
        """
        Basic sanity check to test if two `ClimateDataSourceSpec` are
        compatible.

        Only meaningful after `parse_dataset_files` has been called on both
        specs, since the compared attributes are set there.
        """
        return (
            self.data_shape == other.data_shape
            and self.cropped_data_shape == other.cropped_data_shape
            and self.num_samples_per_year == other.num_samples_per_year
            and self.total_length == other.total_length
            and self.n_years == other.n_years
        )

    def parse_dataset_files(
        self,
        num_samples_per_year: Union[int, None] = None,
        patch_size: Union[int, None] = None,
    ) -> None:
        """Parses the data directory for valid files and determines training samples

        Parameters
        ----------
        num_samples_per_year : int, optional
            Number of samples taken from each year. If None, all will be used, by default None
        patch_size : Union[Tuple[int, int], int, None], optional
            If specified, crops input and output variables so image dimensions are
            divisible by patch_size, by default None

        Raises
        ------
        ValueError
            In channels specified or number of samples per year is not valid
        """
        # get all input data files
        suffix = {"hdf5": "h5", "netcdf4": "nc"}[self.file_type]
        self.data_paths = sorted(self.data_dir.glob(f"*.{suffix}"))
        for data_path in self.data_paths:
            self.logger.info(f"Climate data file found: {data_path}")
        self.n_years = len(self.data_paths)
        self.logger.info(f"Number of years: {self.n_years}")

        # get total number of examples and image shape from the first file,
        # assuming other files have exactly the same format.
        # NOTE(review): assumes at least one matching file exists;
        # data_paths[0] raises IndexError on an empty directory.
        self.logger.info(f"Getting file stats from {self.data_paths[0]}")
        if self.file_type == "hdf5":
            with h5py.File(self.data_paths[0], "r") as f:
                dataset_shape = f["fields"].shape
        else:
            # For NetCDF, synthesize an HDF5-like (time, channels, H, W) shape
            # from the first named variable's (time, H, W) shape.
            with nc.Dataset(self.data_paths[0], "r") as f:
                var_shape = f[self.variables[0]].shape
                dataset_shape = (var_shape[0], len(self.variables)) + var_shape[1:]

        # truncate the dataset to avoid out-of-range sampling
        data_samples_per_year = dataset_shape[0] - (self.num_steps - 1) * self.stride
        self.data_shape = dataset_shape[2:]

        # interpret list of variables into list of channels or vice versa
        if self.file_type == "hdf5":
            # Channel names come from the dataset metadata JSON
            # (required for HDF5 sources).
            with open(self.metadata_path, "r") as f:
                metadata = json.load(f)
                data_vars = metadata["coords"]["channel"]
            if self.variables is not None:
                # `variables` overrides `channels` for HDF5 sources.
                self.channels = [data_vars.index(v) for v in self.variables]
            else:
                if self.channels is None:
                    self.variables = data_vars
                else:
                    self.variables = [data_vars[i] for i in self.channels]

        # If channels not provided, use all of them
        if self.channels is None:
            self.channels = list(range(dataset_shape[1]))

        # If num_samples_per_year use all
        if num_samples_per_year is None:
            num_samples_per_year = data_samples_per_year
        self.num_samples_per_year = num_samples_per_year

        # Adjust image shape if patch_size defined
        if patch_size is not None:
            self.cropped_data_shape = tuple(
                s - s % patch_size[i] for i, s in enumerate(self.data_shape)
            )
        else:
            self.cropped_data_shape = self.data_shape
        self.logger.info(f"Input data shape: {self.cropped_data_shape}")

        # Get total length
        self.total_length = self.n_years * self.num_samples_per_year

        # Sanity checks
        if max(self.channels) >= dataset_shape[1]:
            raise ValueError(
                f"Provided channel has indexes greater than the number \
                of fields {dataset_shape[1]}"
            )

        if self.num_samples_per_year > data_samples_per_year:
            raise ValueError(
                f"num_samples_per_year ({self.num_samples_per_year}) > number of \
                samples available ({data_samples_per_year})!"
            )

        self._load_statistics()

        self.logger.info(f"Number of samples/year: {self.num_samples_per_year}")
        self.logger.info(f"Number of channels available: {dataset_shape[1]}")

    def _load_statistics(self) -> None:
        """Loads climate statistics from pre-computed numpy files

        The statistic files should be of name global_means.npy and global_std.npy with
        a shape of [1, C, 1, 1] located in the stat_dir.

        Raises
        ------
        IOError
            If statistics files are not found
        AssertionError
            If loaded numpy arrays are not of correct size
        """
        # If no stats files we just skip loading the stats
        if self.stats_files is None:
            self.mu = None
            self.sd = None
            return
        # load normalisation values
        if set(self.stats_files) == {"mean", "std"}:  # use mean and std files
            mean_stat_file = self.stats_files["mean"]
            std_stat_file = self.stats_files["std"]

            if not mean_stat_file.exists():
                raise IOError(f"Mean statistics file {mean_stat_file} not found")
            if not std_stat_file.exists():
                raise IOError(f"Std statistics file {std_stat_file} not found")

            # has shape [1, C, 1, 1]; select only the channels in use
            self.mu = np.load(str(mean_stat_file))[:, self.channels]
            # has shape [1, C, 1, 1]
            self.sd = np.load(str(std_stat_file))[:, self.channels]
        elif set(self.stats_files) == {
            "norm",
        }:  # use dict formatted file with named variables
            norm_stat_file = self.stats_files["norm"]
            if not norm_stat_file.exists():
                raise IOError(f"Statistics file {norm_stat_file} not found")

            # Pickled dict: {var_name: {"mean": float, "std": float}, ...}
            norm = np.load(str(norm_stat_file), allow_pickle=True).item()
            mu = np.array([norm[var]["mean"] for var in self.variables])
            self.mu = mu.reshape((1, len(mu), 1, 1))
            sd = np.array([norm[var]["std"] for var in self.variables])
            self.sd = sd.reshape((1, len(sd), 1, 1))
        else:
            raise ValueError(("Invalid statistics file specification"))

        # Shapes must broadcast against (batch, channel, H, W) data.
        if not self.mu.shape == self.sd.shape == (1, len(self.channels), 1, 1):
            raise ValueError("Error, normalisation arrays have wrong shape")
319
+
320
+
321
class ClimateDatapipe(Datapipe):
    """
    A Climate DALI data pipeline. This pipeline loads data from
    HDF5/NetCDF4 files. It can also return additional data such as the
    solar zenith angle for each time step. Additionally, it normalizes
    the data if a statistics file is provided. The pipeline returns a dictionary
    with the following structure, where {name} indicates the name of the data
    source provided:

    - ``state_seq-{name}``: Tensors of shape
      (batch_size, num_steps, num_channels, height, width).
      This sequence is drawn from the data file and normalized if a
      statistics file is provided.
    - ``timestamps-{name}``: Tensors of shape (batch_size, num_steps), containing
      timestamps for each timestep in the sequence.
    - ``{aux_variable}-{name}``: Tensors of shape
      (batch_size, num_steps, aux_channels, height, width),
      containing the auxiliary variables returned by each data source
    - ``cos_zenith-{name}``: Tensors of shape (batch_size, num_steps, 1, height, width),
      containing the cosine of the solar zenith angle if specified.
    - ``{invariant_name}``: Tensors of shape (batch_size, invariant_channels, height, width),
      containing the time-invariant data (depending only on spatial coordinates)
      returned by the datapipe. These can include e.g.
      land-sea mask and geopotential/surface elevation.

    To use this data pipeline, your data directory must be structured as
    follows:
    ```
    data_dir
    ├── 1980.h5
    ├── 1981.h5
    ├── 1982.h5
    ├── ...
    └── 2020.h5
    ```

    The files are assumed have no metadata, such as timestamps.
    Because of this, it's important to specify the `dt` parameter and the
    `start_year` parameter so that the pipeline can compute the correct
    timestamps for each timestep. These timestamps are then used to compute the
    cosine of the solar zenith angle, if specified.

    Parameters
    ----------
    sources: Iterable[ClimateDataSourceSpec]
        A list of data specifications defining the sources for the climate variables
    batch_size : int, optional
        Batch size, by default 1
    dt : float, optional
        Time in hours between each timestep in the dataset, by default 6 hr
    start_year : int, optional
        Start year of dataset, by default 1980
    latlon_bounds : Tuple[Tuple[float, float], Tuple[float, float]], optional
        Bounds of latitude and longitude in the data, in the format
        ((lat_start, lat_end,), (lon_start, lon_end)).
        By default ((90, -90), (0, 360)).
    crop_window: Union[Tuple[Tuple[float, float], Tuple[float, float]], None], optional
        The window to crop the data to, in the format ((i0,i1), (j0,j1)) where the
        first spatial dimension will be cropped to i0:i1 and the second to j0:j1.
        If not given, all data will be used.
    invariants : Mapping[str,Callable], optional
        Specifies the time-invariant data (for example latitude and longitude)
        included in the data samples. Should be a dict where the keys are the
        names of the invariants and the values are the corresponding
        functions. The functions need to accept an argument of the shape
        (2, data_shape[0], data_shape[1]) where the first dimension contains
        latitude and longitude in degrees and the other dimensions corresponding
        to the shape of data in the data files. For example,
        invariants={"trig_latlon": invariants.LatLon()}
        will include the sin/cos of lat/lon in the output.
    num_samples_per_year : int, optional
        Number of samples taken from each year. If None, all will be used, by default None
    shuffle : bool, optional
        Shuffle dataset, by default True
    num_workers : int, optional
        Number of workers, by default 1
    device: Union[str, torch.device], optional
        Device for DALI pipeline to run on, by default cuda
    process_rank : int, optional
        Rank ID of local process, by default 0
    world_size : int, optional
        Number of training processes, by default 1
    """

    def __init__(
        self,
        sources: Iterable[ClimateDataSourceSpec],
        batch_size: int = 1,
        dt: float = 6.0,
        start_year: int = 1980,
        latlon_bounds: Tuple[Tuple[float, float], Tuple[float, float]] = (
            (90, -90),
            (0, 360),
        ),
        crop_window: Union[
            Tuple[Tuple[float, float], Tuple[float, float]], None
        ] = None,
        invariants: Union[Mapping[str, Callable], None] = None,
        num_samples_per_year: Union[int, None] = None,
        shuffle: bool = True,
        num_workers: int = 1,  # TODO: is there a faster good default?
        device: Union[str, torch.device] = "cuda",
        process_rank: int = 0,
        world_size: int = 1,
    ):
        super().__init__(meta=MetaData())
        self.sources = list(sources)
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.dt = dt
        self.start_year = start_year
        self.data_latlon_bounds = latlon_bounds
        self.process_rank = process_rank
        self.world_size = world_size
        self.num_samples_per_year = num_samples_per_year
        self.logger = PythonLogger()

        if invariants is None:
            invariants = {}

        # Determine outputs of pipeline. Order here must match the order in
        # which outputs are produced in _create_pipeline.
        self.pipe_outputs = []
        for i, spec in enumerate(self.sources):
            # Unnamed sources are labeled by their sequential index.
            name = spec.name if spec.name is not None else i
            self.pipe_outputs += [f"state_seq-{name}", f"timestamps-{name}"]
            self.pipe_outputs.extend(
                f"{aux_var}-{name}" for aux_var in spec.aux_variables
            )
            if spec.use_cos_zenith:
                self.pipe_outputs.append(f"cos_zenith-{name}")
        self.pipe_outputs.extend(invariants.keys())

        # Set up device, needed for pipeline
        if isinstance(device, str):
            device = torch.device(device)

        # Need a index id if cuda
        if device.type == "cuda" and device.index is None:
            device = torch.device("cuda:0")
        self.device = device

        # Load all data files and statistics
        # NOTE(review): the loops below index/slice `sources` directly, so a
        # sequence (not a one-shot iterator) is expected here — TODO confirm.
        for spec in sources:
            spec.parse_dataset_files(num_samples_per_year=num_samples_per_year)
        # All sources must agree on data dimensions (pairwise check).
        for i, spec_i in enumerate(sources):
            for spec_j in sources[i + 1 :]:
                if not spec_i.dimensions_compatible(spec_j):
                    raise ValueError("Incompatible data sources")

        # Lat/lon grid (2, H, W) in degrees covering the full data extent.
        self.data_latlon = np.stack(
            latlon_grid(bounds=self.data_latlon_bounds, shape=sources[0].data_shape),
            axis=0,
        )
        if crop_window is None:
            crop_window = (
                (0, sources[0].cropped_data_shape[0]),
                (0, sources[0].cropped_data_shape[1]),
            )
        self.crop_window = crop_window
        self.window_latlon = self._crop_to_window(self.data_latlon)
        self.window_latlon_dali = dali.types.Constant(self.window_latlon)

        # load invariants
        self.invariants = {
            var: callback(self.window_latlon) for (var, callback) in invariants.items()
        }

        # Create pipeline
        self.pipe = self._create_pipeline()

    def _source_cls_from_type(self, source_type: str) -> type:
        """Get the external source class based on a string descriptor."""
        return {
            "hdf5": ClimateHDF5DaliExternalSource,
            "netcdf4": ClimateNetCDF4DaliExternalSource,
        }[source_type]

    def _crop_to_window(self, x):
        # Crop the last two (spatial) dimensions to the configured window.
        cw = self.crop_window
        if isinstance(x, dali.pipeline.DataNode):
            # DALI doesn't support ellipsis notation
            return x[:, :, cw[0][0] : cw[0][1], cw[1][0] : cw[1][1]]
        else:
            return x[..., cw[0][0] : cw[0][1], cw[1][0] : cw[1][1]]

    def _source_outputs(self, spec: ClimateDataSourceSpec) -> List:
        """Create DALI outputs for a given data source specification.

        Parameters
        ----------
        spec: ClimateDataSourceSpec
            The data source specification.
        """
        # HDF5/NetCDF source
        source_cls = self._source_cls_from_type(spec.file_type)
        source = source_cls(
            data_paths=spec.data_paths,
            num_samples=spec.total_length,
            channels=spec.channels,
            latlon=self.data_latlon,
            variables=spec.variables,
            aux_variables=spec.aux_variables,
            stride=spec.stride,
            dt=self.dt,
            start_year=self.start_year,
            num_steps=spec.num_steps,
            num_samples_per_year=spec.num_samples_per_year,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            process_rank=self.process_rank,
            world_size=self.world_size,
        )

        # Update length of dataset
        self.total_length = len(source) // self.batch_size

        # Read current batch
        (state_seq, timestamps, *aux) = dali.fn.external_source(
            source,
            num_outputs=source.num_outputs(),
            parallel=True,
            batch=False,
        )

        # Crop
        state_seq = self._crop_to_window(state_seq)
        aux = (self._crop_to_window(x) for x in aux)

        # Normalize (only when statistics were provided for this source)
        if spec.stats_files is not None:
            state_seq = dali.fn.normalize(state_seq, mean=spec.mu, stddev=spec.sd)

        # Make output list
        outputs = [state_seq, timestamps, *aux]

        # Get cosine zenith angle
        if spec.use_cos_zenith:
            cos_zenith = dali.fn.cast(
                cos_zenith_angle(timestamps, latlon=self.window_latlon_dali),
                dtype=dali.types.FLOAT,
            )
            outputs.append(cos_zenith)

        return outputs

    def _invariant_outputs(self):
        # Yield each invariant as a cropped DALI constant.
        for inv in self.invariants.values():
            if self.crop_window is not None:
                inv = self._crop_to_window(inv)
            yield dali.types.Constant(inv)

    def _create_pipeline(self) -> dali.Pipeline:
        """Create DALI pipeline

        Returns
        -------
        dali.Pipeline
            Climate DALI pipeline
        """
        pipe = dali.Pipeline(
            batch_size=self.batch_size,
            num_threads=2,
            prefetch_queue_depth=2,
            py_num_workers=self.num_workers,
            device_id=self.device.index,
            py_start_method="spawn",
        )

        with pipe:
            # Concatenate outputs from all sources as well as invariants
            outputs = list(
                chain(
                    *(self._source_outputs(spec) for spec in self.sources),
                    self._invariant_outputs(),
                )
            )

            if self.device.type == "cuda":
                # Move tensors to GPU as external_source won't do that
                outputs = [o.gpu() for o in outputs]

            # Set outputs
            pipe.set_outputs(*outputs)

        return pipe

    def __iter__(self):
        # Reset the pipeline before creating an iterator to enable epochs.
        self.pipe.reset()
        # Create DALI PyTorch iterator.
        return dali_pth.DALIGenericIterator([self.pipe], self.pipe_outputs)

    def __len__(self):
        # Number of full batches per epoch (set in _source_outputs).
        return self.total_length
616
+
617
+
618
class ClimateDaliExternalSource(ABC):
    """DALI Source for lazy-loading the HDF5/NetCDF4 climate files

    Parameters
    ----------
    data_paths : Iterable[str]
        Directory where climate data is stored
    num_samples : int
        Total number of training samples
    channels : Iterable[int]
        List representing which climate variables to load
    num_steps : int
        Number of timesteps to load
    stride : int
        Number of steps between input and output variables
    dt : float, optional
        Time in hours between each timestep in the dataset, by default 6 hr
    start_year : int, optional
        Start year of dataset, by default 1980
    num_samples_per_year : int
        Number of samples randomly taken from each year
    variables: Union[List[str], None], optional for HDF5 files, mandatory for NetCDF4 files
        List of named variables to load. Variables will be read in the order specified
        by this parameter.
    aux_variables : Union[Mapping[str, Callable], None], optional
        A dictionary mapping strings to callables that accept arguments
        (timestamps: numpy.ndarray, latlon: numpy.ndarray). These define any auxiliary
        variables returned from this source.
    batch_size : int, optional
        Batch size, by default 1
    shuffle : bool, optional
        Shuffle dataset, by default True
    process_rank : int, optional
        Rank ID of local process, by default 0
    world_size : int, optional
        Number of training processes, by default 1

    Note
    ----
    For more information about DALI external source operator:
    https://docs.nvidia.com/deeplearning/dali/archives/dali_1_13_0/user-guide/docs/examples/general/data_loading/parallel_external_source.html
    """

    def __init__(
        self,
        data_paths: Iterable[str],
        num_samples: int,
        channels: Iterable[int],
        num_steps: int,
        stride: int,
        dt: float,
        start_year: int,
        num_samples_per_year: int,
        latlon: np.ndarray,
        variables: Union[List[str], None] = None,
        # NOTE(review): annotated as a list but consumed via .values() in
        # __call__, so callers are expected to pass a mapping — the empty
        # tuple default would fail there. TODO confirm intended type.
        aux_variables: List[Union[str, Callable]] = (),
        batch_size: int = 1,
        shuffle: bool = True,
        process_rank: int = 0,
        world_size: int = 1,
        backend_kwargs: Union[dict, None] = None,
    ):
        self.data_paths = list(data_paths)
        # Will be populated later once each worker starts running in its own process.
        self.data_files = [None] * len(self.data_paths)
        self.num_samples = num_samples
        self.chans = list(channels)
        self.latlon = latlon
        self.variables = variables
        self.aux_variables = aux_variables
        self.num_steps = num_steps
        self.stride = stride
        self.dt = dt
        self.start_year = start_year
        self.num_samples_per_year = num_samples_per_year
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.backend_kwargs = {} if backend_kwargs is None else backend_kwargs

        # Tracks the last epoch for which indices were shuffled.
        self.last_epoch = None

        self.indices = np.arange(num_samples)
        # Shard from indices if running in parallel
        self.indices = np.array_split(self.indices, world_size)[process_rank]

        # Get number of full batches, ignore possible last incomplete batch for now.
        # Also, DALI external source does not support incomplete batches in parallel mode.
        self.num_batches = len(self.indices) // self.batch_size

    @abstractmethod
    def _load_sequence(self, year_idx: int, idx: int) -> np.array:
        """Write data from year index `year_idx` and sample index `idx` to output"""
        pass

    def __call__(self, sample_info: dali.types.SampleInfo) -> Tuple[Tensor, np.ndarray]:
        # Signal end of epoch once all full batches have been produced.
        if sample_info.iteration >= self.num_batches:
            raise StopIteration()

        # Shuffle before the next epoch starts
        if self.shuffle and sample_info.epoch_idx != self.last_epoch:
            # All workers use the same rng seed so the resulting
            # indices are the same across workers
            np.random.default_rng(seed=sample_info.epoch_idx).shuffle(self.indices)
            self.last_epoch = sample_info.epoch_idx

        # Get local indices from global index
        # TODO: This is very hacky, but it works for now
        idx = self.indices[sample_info.idx_in_epoch]
        year_idx = idx // self.num_samples_per_year
        in_idx = idx % self.num_samples_per_year

        state_seq = self._load_sequence(year_idx, in_idx)

        # Load sequence of timestamps. Files carry no time metadata, so
        # timestamps are reconstructed from start_year, dt and stride.
        year = self.start_year + year_idx
        start_time = datetime(year, 1, 1, tzinfo=pytz.utc) + timedelta(
            hours=int(in_idx) * self.dt
        )
        timestamps = np.array(
            [
                (start_time + timedelta(hours=i * self.stride * self.dt)).timestamp()
                for i in range(self.num_steps)
            ]
        )

        # outputs from auxiliary sources
        aux_outputs = (
            callback(timestamps, self.latlon)
            for callback in self.aux_variables.values()
        )

        return (state_seq, timestamps, *aux_outputs)

    def num_outputs(self):
        # state_seq + timestamps + one output per auxiliary variable.
        return 2 + len(self.aux_variables)

    def __len__(self):
        # Length of this shard (before batching).
        return len(self.indices)
756
+
757
+
758
class ClimateHDF5DaliExternalSource(ClimateDaliExternalSource):
    """DALI source for reading HDF5 formatted climate data files."""

    def _get_data_file(self, year_idx: int) -> h5py.File:
        """Return the opened file for year `year_idx`."""
        handle = self.data_files[year_idx]
        if handle is None:
            # Opened lazily, once per worker process. Workers are persistent,
            # so there is no need to close the handle explicitly — it is
            # released when the corresponding pipeline/dataset is destroyed.
            # Lazy opening also avoids touching files this shard never reads.
            handle = h5py.File(self.data_paths[year_idx], "r")
            self.data_files[year_idx] = handle
        return handle

    def _load_sequence(self, year_idx: int, idx: int) -> np.array:
        """Read a strided window of `num_steps` samples starting at `idx`."""
        # TODO: the data is returned in a weird (time, channels, width, height) shape
        fields = self._get_data_file(year_idx)["fields"]
        stop = idx + self.num_steps * self.stride
        return fields[idx:stop:self.stride, self.chans]
775
+
776
+
777
class ClimateNetCDF4DaliExternalSource(ClimateDaliExternalSource):
    """DALI source for reading NetCDF4 formatted climate data files."""

    def _get_data_file(self, year_idx: int) -> netcdf_file:
        """Return the opened file for year `year_idx`.

        Raises
        ------
        ValueError
            If ``backend_kwargs["reader"]`` is neither "scipy" nor "netcdf4".
        """
        if self.data_files[year_idx] is None:
            # This will be called once per worker. Workers are persistent,
            # so there is no need to explicitly close the files - this will be done
            # when corresponding pipeline/dataset is destroyed
            # Lazy opening avoids unnecessary file open ops when sharding.
            # NOTE: The SciPy NetCDF reader can be used if the netCDF4 library
            # causes crashes.
            reader = self.backend_kwargs.get("reader", "netcdf4")
            if reader == "scipy":
                self.data_files[year_idx] = netcdf_file(self.data_paths[year_idx])
            elif reader == "netcdf4":
                self.data_files[year_idx] = nc.Dataset(self.data_paths[year_idx], "r")
                # Packing attributes are applied manually in _load_sequence.
                self.data_files[year_idx].set_auto_maskandscale(False)
            else:
                # Fail fast: previously an unrecognized reader silently left
                # the slot as None, surfacing later as an opaque AttributeError.
                raise ValueError(
                    f"Unsupported NetCDF reader {reader!r}; "
                    "expected 'scipy' or 'netcdf4'."
                )

        return self.data_files[year_idx]

    def _load_sequence(self, year_idx: int, idx: int) -> np.array:
        """Assemble a (num_steps, num_variables, ...) float32 array for one sample."""
        data_file = self._get_data_file(year_idx)
        # All variables are assumed to share the spatial shape of the first one.
        shape = data_file.variables[self.variables[0]].shape
        shape = (self.num_steps, len(self.variables)) + shape[1:]
        # TODO: this can be optimized to do the NetCDF scale/offset on GPU
        output = np.empty(shape, dtype=np.float32)
        for i, var in enumerate(self.variables):
            v = data_file.variables[var]
            output[:, i] = v[
                idx : idx + self.num_steps * self.stride : self.stride
            ].copy()  # .copy() avoids hanging references
            # Apply CF-style packing manually, since auto mask-and-scale is
            # disabled when the file is opened with the netCDF4 backend.
            if hasattr(v, "scale_factor"):
                output[:, i] *= v.scale_factor
            if hasattr(v, "add_offset"):
                output[:, i] += v.add_offset
        return output
physics_mcp/source/physicsnemo/datapipes/climate/era5_hdf5.py ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import h5py
18
+ import numpy as np
19
+ import torch
20
+
21
+ try:
22
+ import nvidia.dali as dali
23
+ import nvidia.dali.plugin.pytorch as dali_pth
24
+ except ImportError:
25
+ raise ImportError(
26
+ "DALI dataset requires NVIDIA DALI package to be installed. "
27
+ + "The package can be installed at:\n"
28
+ + "https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html"
29
+ )
30
+
31
+ from dataclasses import dataclass
32
+ from datetime import datetime, timedelta
33
+ from pathlib import Path
34
+ from typing import Dict, Iterable, List, Tuple, Union
35
+
36
+ import pytz
37
+
38
+ from physicsnemo.datapipes.climate.utils.invariant import latlon_grid
39
+ from physicsnemo.datapipes.climate.utils.zenith_angle import cos_zenith_angle
40
+
41
+ from ..datapipe import Datapipe
42
+ from ..meta import DatapipeMetaData
43
+
44
+ Tensor = torch.Tensor
45
+
46
+
47
@dataclass
class MetaData(DatapipeMetaData):
    """Capability metadata for the ERA5 HDF5 datapipe.

    NOTE(review): flag semantics are defined by the base ``DatapipeMetaData``;
    the values below declare this datapipe's supported features.
    """

    name: str = "ERA5HDF5"
    # Optimization
    auto_device: bool = True  # declares automatic device placement support
    cuda_graphs: bool = True  # declares CUDA-graph compatibility
    # Parallel
    ddp_sharding: bool = True  # declares support for sharding across DDP ranks
55
+
56
+
57
class ERA5HDF5Datapipe(Datapipe):
    """ERA5 DALI data pipeline for HDF5 files

    Parameters
    ----------
    data_dir : str
        Directory where ERA5 data is stored
    stats_dir : Union[str, None], optional
        Directory to data statistic numpy files for normalization, if None, no normalization
        will be used, by default None
    channels : Union[List[int], None], optional
        Defines which ERA5 variables to load, if None will use all in HDF5 file, by default None
    batch_size : int, optional
        Batch size, by default 1
    stride : int, optional
        Number of steps between input and output variables. For example, if the dataset
        contains data at every 6 hours, a stride 1 = 6 hour delta t and
        stride 2 = 12 hours delta t, by default 1
    num_steps : int, optional
        Number of timesteps are included in the output variables, by default 1
    num_history : int, optional
        Number of previous timesteps included in the input variables, by default 0
    latlon_resolution: Tuple[int, int], optional
        The resolution for the latitude-longitude grid (H, W). Needs to be specified
        for cos zenith angle computation, or interpolation. By default None
    interpolation_type: str, optional
        Interpolation type for resizing. Supports ["INTERP_NN", "INTERP_LINEAR", "INTERP_CUBIC",
        "INTERP_LANCZOS3", "INTERP_TRIANGULAR", "INTERP_GAUSSIAN"]. By default None
        (no interpolation is done)
    patch_size : Union[Tuple[int, int], int, None], optional
        If specified, crops input and output variables so image dimensions are
        divisible by patch_size, by default None
    num_samples_per_year : int, optional
        Number of samples randomly taken from each year. If None, all will be used, by default None
    use_cos_zenith: bool, optional
        If True, the cosine zenith angles corresponding to the coordinates will be produced,
        by default False
    cos_zenith_args: Dict, optional
        Dictionary containing the following:

        dt: float, optional
            Time in hours between each timestep in the dataset, by default 6 hr

        start_year: int, optional
            Start year of dataset, by default 1980

        latlon_bounds : Tuple[Tuple[float, float], Tuple[float, float]], optional
            Bounds of latitude and longitude in the data, in the format
            ((lat_start, lat_end,), (lon_start, lon_end)).
            By default ((90, -90), (0, 360)).

        Defaults are only applicable if use_cos_zenith is True. Otherwise, defaults to {}.
    use_time_of_year_index: bool
        If true, also returns the index that can be used to determine the time of the year
        corresponding to each sample. By default False.
    shuffle : bool, optional
        Shuffle dataset, by default True
    num_workers : int, optional
        Number of workers, by default 1
    device: Union[str, torch.device], optional
        Device for DALI pipeline to run on, by default cuda
    process_rank : int, optional
        Rank ID of local process, by default 0
    world_size : int, optional
        Number of training processes, by default 1
    """

    def __init__(
        self,
        data_dir: str,
        stats_dir: Union[str, None] = None,
        channels: Union[List[int], None] = None,
        batch_size: int = 1,
        num_steps: int = 1,
        num_history: int = 0,
        stride: int = 1,
        latlon_resolution: Union[Tuple[int, int], None] = None,
        interpolation_type: Union[str, None] = None,
        patch_size: Union[Tuple[int, int], int, None] = None,
        num_samples_per_year: Union[int, None] = None,
        use_cos_zenith: bool = False,
        cos_zenith_args: Dict = {},  # noqa: B006 - copied below before any mutation
        use_time_of_year_index: bool = False,
        shuffle: bool = True,
        num_workers: int = 1,
        device: Union[str, torch.device] = "cuda",
        process_rank: int = 0,
        world_size: int = 1,
    ):
        super().__init__(meta=MetaData())
        # Copy so that filling in cos zenith defaults below does not mutate the
        # caller's dict (or the shared mutable default argument).
        cos_zenith_args = dict(cos_zenith_args)
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.shuffle = shuffle
        self.data_dir = Path(data_dir)
        self.stats_dir = Path(stats_dir) if stats_dir is not None else None
        self.channels = channels
        self.stride = stride
        self.latlon_resolution = latlon_resolution
        self.interpolation_type = interpolation_type
        self.num_steps = num_steps
        self.num_history = num_history
        self.num_samples_per_year = num_samples_per_year
        self.use_cos_zenith = use_cos_zenith
        self.cos_zenith_args = cos_zenith_args
        self.use_time_of_year_index = use_time_of_year_index
        self.process_rank = process_rank
        self.world_size = world_size

        # cos zenith defaults
        if use_cos_zenith:
            cos_zenith_args["dt"] = cos_zenith_args.get("dt", 6.0)
            cos_zenith_args["start_year"] = cos_zenith_args.get("start_year", 1980)
            cos_zenith_args["latlon_bounds"] = cos_zenith_args.get(
                "latlon_bounds",
                (
                    (90, -90),
                    (0, 360),
                ),
            )
            self.latlon_bounds = cos_zenith_args.get("latlon_bounds")

        if isinstance(patch_size, int):
            patch_size = (patch_size, patch_size)
        self.patch_size = patch_size

        # Set up device, needed for pipeline
        if isinstance(device, str):
            device = torch.device(device)
        # Need a index id if cuda
        if device.type == "cuda" and device.index is None:
            device = torch.device("cuda:0")
        self.device = device

        # check root directory exists
        if not self.data_dir.is_dir():
            raise IOError(f"Error, data directory {self.data_dir} does not exist")
        if self.stats_dir is not None and not self.stats_dir.is_dir():
            raise IOError(f"Error, stats directory {self.stats_dir} does not exist")

        # Check interpolation type
        if self.interpolation_type is not None:
            valid_interpolation = [
                "INTERP_NN",
                "INTERP_LINEAR",
                "INTERP_CUBIC",
                "INTERP_LANCZOS3",
                "INTERP_TRIANGULAR",
                "INTERP_GAUSSIAN",
            ]
            if self.interpolation_type not in valid_interpolation:
                raise ValueError(
                    f"Interpolation type {self.interpolation_type} not supported"
                )
            self.interpolation_type = getattr(dali.types, self.interpolation_type)

        # Layout
        # Avoiding API change for self.num_history == 0.
        # Need to use FCHW layout in the future regardless of the num_history.
        if self.num_history == 0:
            self.layout = ["CHW", "FCHW"]
        else:
            self.layout = ["FCHW", "FCHW"]

        self.output_keys = ["invar", "outvar"]

        # Get latlon for zenith angle
        if self.use_cos_zenith:
            if not self.latlon_resolution:
                raise ValueError("latlon_resolution must be set for cos zenith angle")
            self.data_latlon = np.stack(
                latlon_grid(bounds=self.latlon_bounds, shape=self.latlon_resolution),
                axis=0,
            )
            self.latlon_dali = dali.types.Constant(self.data_latlon)
            self.output_keys += ["cos_zenith"]

        if self.use_time_of_year_index:
            self.output_keys += ["time_of_year_idx"]

        self.parse_dataset_files()
        self.load_statistics()

        self.pipe = self._create_pipeline()

    def parse_dataset_files(self) -> None:
        """Parses the data directory for valid HDF5 files and determines training samples

        Raises
        ------
        ValueError
            In channels specified or number of samples per year is not valid
        """
        # get all input data files
        self.data_paths = sorted(self.data_dir.glob("????.h5"))
        for data_path in self.data_paths:
            self.logger.info(f"ERA5 file found: {data_path}")
        self.n_years = len(self.data_paths)
        self.logger.info(f"Number of years: {self.n_years}")

        # get total number of examples and image shape from the first file,
        # assuming other files have exactly the same format.
        self.logger.info(f"Getting file stats from {self.data_paths[0]}")
        with h5py.File(self.data_paths[0], "r") as f:
            # truncate the dataset to avoid out-of-range sampling and ensure each
            # rank has same number of samples (to avoid deadlocks)
            data_samples_per_year = (
                (
                    f["fields"].shape[0]
                    - (self.num_steps + self.num_history) * self.stride
                )
                // self.world_size
            ) * self.world_size
            if data_samples_per_year < 1:
                raise ValueError(
                    f"Not enough number of samples per year ({data_samples_per_year})"
                )
            self.img_shape = f["fields"].shape[2:]

            # If channels not provided, use all of them
            if self.channels is None:
                self.channels = [i for i in range(f["fields"].shape[1])]

            # If num_samples_per_year use all
            if self.num_samples_per_year is None:
                self.num_samples_per_year = data_samples_per_year

            # Adjust image shape if patch_size defined
            if self.patch_size is not None:
                if self.use_cos_zenith:
                    raise ValueError("Patching is not supported with cos zenith angle")
                self.img_shape = [
                    s - s % self.patch_size[i] for i, s in enumerate(self.img_shape)
                ]
            self.logger.info(f"Input image shape: {self.img_shape}")

            # Get total length
            self.total_length = self.n_years * self.num_samples_per_year
            self.length = self.total_length

            # Sanity checks
            if max(self.channels) >= f["fields"].shape[1]:
                raise ValueError(
                    f"Provided channel has indexes greater than the number \
                of fields {f['fields'].shape[1]}"
                )

            if self.num_samples_per_year > data_samples_per_year:
                raise ValueError(
                    f"num_samples_per_year ({self.num_samples_per_year}) > number of \
                samples available ({data_samples_per_year})!"
                )

            self.logger.info(f"Number of samples/year: {self.num_samples_per_year}")
            self.logger.info(f"Number of channels available: {f['fields'].shape[1]}")

    def load_statistics(self) -> None:
        """Loads ERA5 statistics from pre-computed numpy files

        The statistic files should be of name global_means.npy and global_stds.npy with
        a shape of [1, C, 1, 1] located in the stat_dir.

        Raises
        ------
        IOError
            If mean or std numpy files are not found
        AssertionError
            If loaded numpy arrays are not of correct size
        """
        # If no stats dir we just skip loading the stats
        if self.stats_dir is None:
            self.mu = None
            # Fixed: this previously assigned `self.std`, leaving `self.sd`
            # (the attribute used everywhere else) undefined when stats_dir is None.
            self.sd = None
            return
        # load normalisation values
        mean_stat_file = self.stats_dir / Path("global_means.npy")
        std_stat_file = self.stats_dir / Path("global_stds.npy")

        if not mean_stat_file.exists():
            raise IOError(f"Mean statistics file {mean_stat_file} not found")
        if not std_stat_file.exists():
            raise IOError(f"Std statistics file {std_stat_file} not found")

        # has shape [1, C, 1, 1]
        self.mu = np.load(str(mean_stat_file))[:, self.channels]
        # has shape [1, C, 1, 1]
        self.sd = np.load(str(std_stat_file))[:, self.channels]

        if not self.mu.shape == self.sd.shape == (1, len(self.channels), 1, 1):
            raise AssertionError("Error, normalisation arrays have wrong shape")

    def _create_pipeline(self) -> dali.Pipeline:
        """Create DALI pipeline

        Returns
        -------
        dali.Pipeline
            HDF5 DALI pipeline
        """
        pipe = dali.Pipeline(
            batch_size=self.batch_size,
            num_threads=2,
            prefetch_queue_depth=2,
            py_num_workers=self.num_workers,
            device_id=self.device.index,
            py_start_method="spawn",
        )

        with pipe:
            source = ERA5DaliExternalSource(
                data_paths=self.data_paths,
                num_samples=self.total_length,
                channels=self.channels,
                stride=self.stride,
                num_steps=self.num_steps,
                num_history=self.num_history,
                num_samples_per_year=self.num_samples_per_year,
                use_cos_zenith=self.use_cos_zenith,
                cos_zenith_args=self.cos_zenith_args,
                use_time_of_year_index=self.use_time_of_year_index,
                batch_size=self.batch_size,
                shuffle=self.shuffle,
                process_rank=self.process_rank,
                world_size=self.world_size,
            )
            # Update length of dataset
            self.length = len(source) // self.batch_size
            # Read current batch.
            invar, outvar, timestamps, time_of_year_idx = dali.fn.external_source(
                source,
                num_outputs=4,
                parallel=True,
                batch=False,
                layout=self.layout,
            )
            if self.device.type == "cuda":
                # Move tensors to GPU as external_source won't do that.
                invar = invar.gpu()
                outvar = outvar.gpu()

            # Crop.
            h, w = self.img_shape
            if self.num_history == 0:
                invar = invar[:, :h, :w]
            else:
                invar = invar[:, :, :h, :w]
            outvar = outvar[:, :, :h, :w]

            # Standardize.
            if self.stats_dir is not None:
                if self.num_history == 0:
                    invar = dali.fn.normalize(invar, mean=self.mu[0], stddev=self.sd[0])
                else:
                    invar = dali.fn.normalize(invar, mean=self.mu, stddev=self.sd)
                outvar = dali.fn.normalize(outvar, mean=self.mu, stddev=self.sd)

            # Resize.
            if self.interpolation_type is not None:
                invar = dali.fn.resize(
                    invar,
                    resize_x=self.latlon_resolution[1],
                    resize_y=self.latlon_resolution[0],
                    interp_type=self.interpolation_type,
                    antialias=False,
                )
                outvar = dali.fn.resize(
                    outvar,
                    resize_x=self.latlon_resolution[1],
                    resize_y=self.latlon_resolution[0],
                    interp_type=self.interpolation_type,
                    antialias=False,
                )

            # cos zenith angle
            if self.use_cos_zenith:
                cos_zenith = dali.fn.cast(
                    cos_zenith_angle(timestamps, latlon=self.latlon_dali),
                    dtype=dali.types.FLOAT,
                )
                if self.device.type == "cuda":
                    cos_zenith = cos_zenith.gpu()

            # Set outputs.
            outputs = (invar, outvar)
            if self.use_cos_zenith:
                outputs += (cos_zenith,)
            if self.use_time_of_year_index:
                outputs += (time_of_year_idx,)
            pipe.set_outputs(*outputs)

        return pipe

    def __iter__(self):
        # Reset the pipeline before creating an iterator to enable epochs.
        self.pipe.reset()
        # Create DALI PyTorch iterator.
        return dali_pth.DALIGenericIterator([self.pipe], self.output_keys)

    def __len__(self):
        return self.length
462
+
463
+
464
class ERA5DaliExternalSource:
    """DALI Source for lazy-loading the HDF5 ERA5 files

    Parameters
    ----------
    data_paths : Iterable[str]
        Directory where ERA5 data is stored
    num_samples : int
        Total number of training samples
    channels : Iterable[int]
        List representing which ERA5 variables to load
    num_steps : int
        Number of timesteps are included in the output variables
    num_history : int
        Number of previous timesteps included in the input variables
    stride : int
        Number of steps between input and output variables
    num_samples_per_year : int
        Number of samples randomly taken from each year
    use_cos_zenith: bool
        If True, the cosine zenith angles corresponding to the coordinates will be produced
    cos_zenith_args: Dict
        Dictionary containing the following:

        dt: float
            Time in hours between each timestep in the dataset

        start_year: int
            Start year of dataset
    use_time_of_year_index: bool
        If True, the per-year sample index of each sample is returned alongside
        the data (otherwise -1 is returned in its place)
    batch_size : int, optional
        Batch size, by default 1
    shuffle : bool, optional
        Shuffle dataset, by default True
    process_rank : int, optional
        Rank ID of local process, by default 0
    world_size : int, optional
        Number of training processes, by default 1

    Note
    ----
    For more information about DALI external source operator:
    https://docs.nvidia.com/deeplearning/dali/archives/dali_1_13_0/user-guide/docs/examples/general/data_loading/parallel_external_source.html
    """

    def __init__(
        self,
        data_paths: Iterable[str],
        num_samples: int,
        channels: Iterable[int],
        num_steps: int,
        num_history: int,
        stride: int,
        num_samples_per_year: int,
        use_cos_zenith: bool,
        cos_zenith_args: Dict,
        use_time_of_year_index: bool,
        batch_size: int = 1,
        shuffle: bool = True,
        process_rank: int = 0,
        world_size: int = 1,
    ):
        self.data_paths = list(data_paths)
        # Will be populated later once each worker starts running in its own process.
        self.data_files = None
        self.num_samples = num_samples
        self.chans = list(channels)
        self.num_steps = num_steps
        self.num_history = num_history
        self.stride = stride
        self.num_samples_per_year = num_samples_per_year
        self.use_cos_zenith = use_cos_zenith
        self.use_time_of_year_index = use_time_of_year_index
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Tracks the epoch for which `self.indices` was last shuffled.
        self.last_epoch = None

        self.indices = np.arange(num_samples)
        # Shard from indices if running in parallel
        self.indices = np.array_split(self.indices, world_size)[process_rank]

        # Get number of full batches, ignore possible last incomplete batch for now.
        # Also, DALI external source does not support incomplete batches in parallel mode.
        self.num_batches = len(self.indices) // self.batch_size

        # cos zenith args
        if self.use_cos_zenith:
            self.dt: float = cos_zenith_args.get("dt")
            self.start_year: int = cos_zenith_args.get("start_year")

    def __call__(
        self, sample_info: dali.types.SampleInfo
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Produce one sample: (invar, outvar, timestamps, time_of_year_idx).

        `timestamps` is empty unless `use_cos_zenith` is set, and
        `time_of_year_idx` holds -1 unless `use_time_of_year_index` is set.
        """
        # Stop after all full batches are produced (see num_batches above).
        if sample_info.iteration >= self.num_batches:
            raise StopIteration()

        if self.data_files is None:
            # This will be called once per worker. Workers are persistent,
            # so there is no need to explicitly close the files - this will be done
            # when corresponding pipeline/dataset is destroyed.
            self.data_files = [h5py.File(path, "r") for path in self.data_paths]

        # Shuffle before the next epoch starts.
        if self.shuffle and sample_info.epoch_idx != self.last_epoch:
            # All workers use the same rng seed so the resulting
            # indices are the same across workers.
            np.random.default_rng(seed=sample_info.epoch_idx).shuffle(self.indices)
            self.last_epoch = sample_info.epoch_idx

        # Get local indices from global index.
        idx = self.indices[sample_info.idx_in_epoch]
        year_idx = idx // self.num_samples_per_year
        in_idx = idx % self.num_samples_per_year

        # Load sequence of timestamps
        if self.use_cos_zenith:
            # NOTE(review): assumes years are contiguous starting at start_year
            # and each file holds exactly num_samples_per_year usable samples.
            year = self.start_year + year_idx
            start_time = datetime(year, 1, 1, tzinfo=pytz.utc) + timedelta(
                hours=int(in_idx) * self.dt
            )
            # One timestamp per input (history + current) and output frame.
            timestamps = np.array(
                [
                    (
                        start_time + timedelta(hours=i * self.stride * self.dt)
                    ).timestamp()
                    for i in range(self.num_history + self.num_steps + 1)
                ]
            )
        else:
            timestamps = np.array([])
        if self.use_time_of_year_index:
            time_of_year_idx = in_idx
        else:
            time_of_year_idx = -1

        data = self.data_files[year_idx]["fields"]
        if self.num_history == 0:
            # Has [C,H,W] shape.
            invar = data[in_idx, self.chans]
        else:
            # Has [T,C,H,W] shape.
            invar = data[
                in_idx : in_idx + (self.num_history + 1) * self.stride : self.stride,
                self.chans,
            ]

        # Has [T,C,H,W] shape.
        outvar = np.empty((self.num_steps,) + invar.shape[-3:], dtype=invar.dtype)

        for i in range(self.num_steps):
            out_idx = in_idx + (self.num_history + i + 1) * self.stride
            outvar[i] = data[out_idx, self.chans]

        return invar, outvar, timestamps, np.array([time_of_year_idx])

    def __len__(self):
        # Number of samples in this rank's shard (not number of batches).
        return len(self.indices)
physics_mcp/source/physicsnemo/datapipes/climate/era5_netcdf.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
physics_mcp/source/physicsnemo/datapipes/climate/synthetic.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ import time
19
+ from typing import Any, Dict, List, Tuple
20
+
21
+ import numpy as np
22
+ import torch
23
+ from torch.utils.data import DataLoader, Dataset
24
+
25
+
26
class SyntheticWeatherDataLoader(DataLoader):
    """DataLoader that constructs a :class:`SyntheticWeatherDataset` from its
    arguments.

    Every positional and keyword argument is forwarded to the dataset; the
    DataLoader-specific options (``batch_size``, ``shuffle``, ``num_workers``,
    ``pin_memory``, ``drop_last``) are additionally picked out of ``kwargs``,
    falling back to the usual PyTorch defaults.
    """

    def __init__(self, *args, **kwargs):
        loader_options = {
            "batch_size": kwargs.get("batch_size", 1),
            "shuffle": kwargs.get("shuffle", False),
            "num_workers": kwargs.get("num_workers", 0),
            "pin_memory": kwargs.get("pin_memory", False),
            "drop_last": kwargs.get("drop_last", False),
        }
        super().__init__(
            dataset=SyntheticWeatherDataset(*args, **kwargs), **loader_options
        )
41
+
42
+
43
+ class SyntheticWeatherDataset(Dataset):
44
+ """
45
+ A dataset for generating synthetic temperature data on a latitude-longitude grid for multiple atmospheric layers.
46
+
47
+ Args:
48
+ channels (list): List of channels representing different atmospheric layers.
49
+ num_samples_per_year (int): Total number of days to simulate per year.
50
+ num_steps (int): Number of consecutive days in each training sample.
51
+ grid_size (tuple): Latitude by longitude dimensions of the temperature grid.
52
+ base_temp (float): Base temperature around which variations are simulated.
53
+ amplitude (float): Amplitude of the sinusoidal temperature variation.
54
+ noise_level (float): Standard deviation of the noise added to temperature data.
55
+ **kwargs: Additional keyword arguments for advanced configurations.
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ channels: List[int],
61
+ num_samples_per_year: int,
62
+ num_steps: int,
63
+ device: str | torch.device = "cuda",
64
+ grid_size: Tuple[int, int] = (721, 1440),
65
+ base_temp: float = 15,
66
+ amplitude: float = 10,
67
+ noise_level: float = 2,
68
+ **kwargs: Any,
69
+ ):
70
+ self.num_days: int = num_samples_per_year
71
+ self.num_steps: int = num_steps
72
+ self.num_channels: int = len(channels)
73
+ self.device = device
74
+ self.grid_size: Tuple[int, int] = grid_size
75
+ start_time = time.time()
76
+ self.temperatures: np.ndarray = self.generate_data(
77
+ self.num_days,
78
+ self.num_channels,
79
+ self.grid_size,
80
+ base_temp,
81
+ amplitude,
82
+ noise_level,
83
+ )
84
+ print(
85
+ f"Generated synthetic temperature data in {time.time() - start_time:.2f} seconds."
86
+ )
87
+ self.extra_args: Dict[str, Any] = kwargs
88
+
89
+ def generate_data(
90
+ self,
91
+ num_days: int,
92
+ num_channels: int,
93
+ grid_size: Tuple[int, int],
94
+ base_temp: float,
95
+ amplitude: float,
96
+ noise_level: float,
97
+ ) -> np.ndarray:
98
+ """
99
+ Generates synthetic temperature data over a specified number of days for multiple atmospheric layers.
100
+
101
+ Args:
102
+ num_days (int): Number of days to generate data for.
103
+ num_channels (int): Number of channels representing different layers.
104
+ grid_size (tuple): Grid size (latitude, longitude).
105
+ base_temp (float): Base mean temperature for the data.
106
+ amplitude (float): Amplitude of temperature variations.
107
+ noise_level (float): Noise level to add stochasticity to the temperature.
108
+
109
+ Returns:
110
+ numpy.ndarray: A 4D array of temperature values across days, channels, latitudes, and longitudes.
111
+ """
112
+ days = np.arange(num_days)
113
+ latitudes, longitudes = grid_size
114
+
115
+ # Create altitude effect and reshape
116
+ altitude_effect = np.arange(num_channels) * -0.5
117
+ altitude_effect = altitude_effect[
118
+ :, np.newaxis, np.newaxis
119
+ ] # Shape: (num_channels, 1, 1)
120
+ altitude_effect = np.tile(
121
+ altitude_effect, (1, latitudes, longitudes)
122
+ ) # Shape: (num_channels, latitudes, longitudes)
123
+ altitude_effect = altitude_effect[
124
+ np.newaxis, :, :, :
125
+ ] # Shape: (1, num_channels, latitudes, longitudes)
126
+ altitude_effect = np.tile(
127
+ altitude_effect, (num_days, 1, 1, 1)
128
+ ) # Shape: (num_days, num_channels, latitudes, longitudes)
129
+
130
+ # Create latitude variation and reshape
131
+ lat_variation = np.linspace(-amplitude, amplitude, latitudes)
132
+ lat_variation = lat_variation[:, np.newaxis] # Shape: (latitudes, 1)
133
+ lat_variation = np.tile(
134
+ lat_variation, (1, longitudes)
135
+ ) # Shape: (latitudes, longitudes)
136
+ lat_variation = lat_variation[
137
+ np.newaxis, np.newaxis, :, :
138
+ ] # Shape: (1, 1, latitudes, longitudes)
139
+ lat_variation = np.tile(
140
+ lat_variation, (num_days, num_channels, 1, 1)
141
+ ) # Shape: (num_days, num_channels, latitudes, longitudes)
142
+
143
+ # Create time effect and reshape
144
+ time_effect = np.sin(2 * np.pi * days / 365)
145
+ time_effect = time_effect[
146
+ :, np.newaxis, np.newaxis, np.newaxis
147
+ ] # Shape: (num_days, 1, 1, 1)
148
+ time_effect = np.tile(
149
+ time_effect, (1, num_channels, latitudes, longitudes)
150
+ ) # Shape: (num_days, num_channels, latitudes, longitudes)
151
+
152
+ # Generate noise
153
+ noise = np.random.normal(
154
+ scale=noise_level, size=(num_days, num_channels, latitudes, longitudes)
155
+ )
156
+
157
+ # Calculate daily temperatures
158
+ daily_temps = base_temp + altitude_effect + lat_variation + time_effect + noise
159
+
160
+ return daily_temps
161
+
162
+ def __len__(self) -> int:
163
+ """
164
+ Returns the number of samples available in the dataset.
165
+ """
166
+ return self.num_days - self.num_steps
167
+
168
+ def __getitem__(self, idx: int) -> torch.Tensor:
169
+ """
170
+ Retrieves a sample from the dataset at the specified index.
171
+ """
172
+ return [
173
+ {
174
+ "invar": torch.tensor(self.temperatures[idx], dtype=torch.float32).to(
175
+ self.device
176
+ ),
177
+ "outvar": torch.tensor(
178
+ self.temperatures[idx + 1 : idx + self.num_steps + 1],
179
+ dtype=torch.float32,
180
+ ).to(self.device),
181
+ }
182
+ ]
physics_mcp/source/physicsnemo/datapipes/climate/utils/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
physics_mcp/source/physicsnemo/datapipes/climate/utils/invariant.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from abc import ABC, abstractmethod
18
+ from typing import List, Tuple
19
+
20
+ import numpy as np
21
+ import xarray as xr
22
+
23
+
24
def latlon_grid(
    bounds: Tuple[Tuple[float, float], Tuple[float, float]] = (
        (90, -90),
        (0, 360),
    ),
    shape: Tuple[int, int] = (1440, 721),
) -> List[np.ndarray]:
    """Infer latitude and longitude coordinates from bounds and data shape on an
    equirectangular grid.

    Parameters
    ----------
    bounds: Tuple[Tuple[float, float], Tuple[float, float]]
        ``((lat_start, lat_end), (lon_start, lon_end))`` in degrees.
    shape: Tuple[int, int]
        ``(num_lat, num_lon)`` grid points.
        NOTE(review): the default ``(1440, 721)`` means 1440 latitude and 721
        longitude points; common 0.25-degree products use ``(721, 1440)`` —
        confirm the intended ordering with callers before relying on the
        default.

    Returns
    -------
    List[np.ndarray]
        ``[lat, lon]`` 2D arrays of shape ``shape`` (``np.meshgrid`` output).
        The original annotation of ``np.ndarray`` was incorrect.
    """

    # get latitudes from data shape
    lat = np.linspace(*bounds[0], shape[0], dtype=np.float32)

    # If the longitude bounds describe a full circle, the last sample would
    # duplicate the first, so sample one extra point and drop the endpoint.
    lon_wraparound = (bounds[1][0] % 360) == (bounds[1][1] % 360)
    if lon_wraparound:
        lon = np.linspace(*bounds[1], shape[1] + 1, dtype=np.float32)[:-1]
    else:
        lon = np.linspace(*bounds[1], shape[1], dtype=np.float32)

    return np.meshgrid(lat, lon, indexing="ij")
46
+
47
+
48
class Invariant(ABC):
    """Abstract base class for data that is invariant to inputs on load.

    Subclasses implement ``__call__`` taking a ``(lat, lon)`` grid pair and
    returning the invariant channel(s) evaluated on that grid.
    """

    @abstractmethod
    def __call__(self, latlon: np.ndarray):
        """Evaluate the invariant on the given latitude/longitude grids."""
        pass


class LatLon(Invariant):
    """Time invariant latitude and longitude coordinates and trig functions"""

    def __init__(
        self, outputs: List[str] = ("sin_lat", "cos_lat", "sin_lon", "cos_lon")
    ):
        """
        Outputs latitude and longitude and their trigonometric functions.

        Parameters
        ----------
        outputs: List[str]
            List of outputs. Supported values are
            `{"lat", "lon", "sin_lat", "cos_lat", "sin_lon", "cos_lon"}`
        """
        self.outputs = outputs

    def __call__(self, latlon: np.ndarray):
        """Stack the requested channels along a new leading axis.

        Parameters
        ----------
        latlon: np.ndarray
            Pair of 2D arrays ``(lat, lon)`` in degrees.

        Returns
        -------
        np.ndarray
            Array of shape ``(len(outputs), *lat.shape)`` with channels in
            the order given by ``self.outputs``.
        """
        (lat, lon) = latlon

        # Renamed from `vars`, which shadowed the builtin of the same name.
        channels = {"lat": lat, "lon": lon}

        # Trigonometric channels are computed only when requested.
        if "sin_lat" in self.outputs:
            channels["sin_lat"] = np.sin(np.deg2rad(lat))
        if "cos_lat" in self.outputs:
            channels["cos_lat"] = np.cos(np.deg2rad(lat))
        if "sin_lon" in self.outputs:
            channels["sin_lon"] = np.sin(np.deg2rad(lon))
        if "cos_lon" in self.outputs:
            channels["cos_lon"] = np.cos(np.deg2rad(lon))

        # Unsupported names raise KeyError here, matching prior behavior.
        return np.stack([channels[o] for o in self.outputs], axis=0)
89
+
90
+
91
class FileInvariant(Invariant):
    """
    Loads a time-invariant variable from a NetCDF4 file. The file should
    contain one or more data variables of dimensions
    `(channels, latitude, longitude)` as well as variables `latitude` and
    `longitude` specifying these coordinates. `latitude` and `longitude`
    can be either 2D or 1D.

    Parameters
    ----------
    filename: str
        Path to the file containing the variable
    var_name: str
        The variable in the file containing the data
    normalize: bool, optional
        If True, normalize the data to zero-mean and unit variance.
        Default False.
    interp_method: str, optional
        Any argument accepted by xarray.DataArray.interp.
        Default 'linear'.
    """

    def __init__(
        self,
        filename: str,
        var_name: str,
        normalize: bool = False,
        interp_method: str = "linear",
    ):
        with xr.open_dataset(filename) as ds:
            # `.load()` forces the (possibly lazily-backed) variable into
            # memory so it remains usable after the dataset file is closed
            # on exiting this `with` block; previously a lazy backend could
            # fail on first access after close.
            self.data = ds[var_name].astype(np.float32).load()
            self.lat = ds["latitude"].to_numpy().astype(np.float32)
            self.lon = ds["longitude"].to_numpy().astype(np.float32)

        # Expand 1D coordinate vectors to 2D grids for interpolation.
        if self.lat.ndim == 1:
            (self.lat, self.lon) = np.meshgrid(self.lat, self.lon, indexing="ij")

        if normalize:
            self.data = (self.data - self.data.mean()) / self.data.std()

        self.interp_method = interp_method

    def __call__(self, latlon: np.ndarray):
        """Interpolate the stored field onto the target (lat, lon) grids.

        Parameters
        ----------
        latlon: np.ndarray
            Pair of 2D arrays ``(lat, lon)`` in degrees.

        Returns
        -------
        np.ndarray
            The field interpolated onto the target grid using
            ``self.interp_method``.
        """
        (lat, lon) = latlon
        lat = xr.DataArray(lat, dims=["latitude", "longitude"])
        lon = xr.DataArray(lon, dims=["latitude", "longitude"])
        return self.data.interp(
            method=self.interp_method, latitude=lat, longitude=lon
        ).to_numpy()
physics_mcp/source/physicsnemo/datapipes/climate/utils/zenith_angle.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore_header_test
2
+
3
+ # climt/LICENSE
4
+ # @mcgibbon
5
+ # BSD License
6
+ # Copyright (c) 2016, Rodrigo Caballero
7
+ # All rights reserved.
8
+ # Redistribution and use in source and binary forms, with or without modification,
9
+ # are permitted provided that the following conditions are met:
10
+ # * Redistributions of source code must retain the above copyright notice, this
11
+ # list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice, this
13
+ # list of conditions and the following disclaimer in the documentation and/or
14
+ # other materials provided with the distribution.
15
+ # * Neither the name of the copyright holder nor the names of its
16
+ # contributors may be used to endorse or promote products derived from this
17
+ # software without specific prior written permission.
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19
+ # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21
+ # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
22
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23
+ # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
26
+ # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
27
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+
30
+ import datetime
31
+
32
+ import numpy as np
33
+ import pytz
34
+
35
+ try:
36
+ import nvidia.dali as dali
37
+ except ImportError:
38
+ raise ImportError(
39
+ "DALI dataset requires NVIDIA DALI package to be installed. "
40
+ + "The package can be installed at:\n"
41
+ + "https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html"
42
+ )
43
+
44
# Degrees-to-radians conversion factor.
RAD_PER_DEG = np.pi / 180.0
# POSIX timestamp of the J2000 reference epoch, 2000-01-01 12:00:00 UTC.
DATETIME_2000 = datetime.datetime(2000, 1, 1, 12, 0, 0, tzinfo=pytz.utc).timestamp()
46
+
47
+
48
def _dali_mod(a, b):
    """Floored modulo ``a mod b`` expressed with DALI graph primitives,
    via the identity ``a - b * floor(a / b)``."""
    quotient = dali.math.floor(a / b)
    return a - b * quotient
50
+
51
+
52
def cos_zenith_angle(
    time: dali.types.DALIDataType,
    latlon: dali.types.DALIDataType,
):
    """
    Dali datapipe for computing Cosine of sun-zenith angle for lon, lat at time (UTC).

    Parameters
    ----------
    time : dali.types.DALIDataType
        Time in seconds since 2000-01-01 12:00:00 UTC. Shape `(seq_length,)`.
    latlon : dali.types.DALIDataType
        Latitude and longitude in degrees. Shape `(2, nr_lat, nr_lon)`.

    Returns
    -------
    dali.types.DALIDataType
        Cosine of sun-zenith angle. Shape `(seq_length, 1, nr_lat, nr_lon)`.
    """
    # Split the stacked input into separate lat/lon planes, convert degrees
    # to radians, and prepend a time axis: shape (1, 1, nr_lat, nr_lon).
    lat = latlon[dali.newaxis, 0:1, :, :] * RAD_PER_DEG
    lon = latlon[dali.newaxis, 1:2, :, :] * RAD_PER_DEG
    # Reshape time to (seq_length, 1, 1, 1) so it broadcasts against lat/lon.
    time = time[:, dali.newaxis, dali.newaxis, dali.newaxis]
    return _star_cos_zenith(time, lat, lon)
75
+
76
+
77
def _days_from_2000(model_time):  # pragma: no cover
    """Convert a time in seconds since the J2000 epoch to days elapsed."""
    seconds_per_day = 24.0 * 3600.0
    return (model_time - DATETIME_2000) / seconds_per_day
80
+
81
+
82
def _greenwich_mean_sidereal_time(model_time):
    """
    Greenwich mean sidereal time, in radians.

    Parameters
    ----------
    model_time
        Time in seconds since 2000-01-01 12:00:00 UTC (J2000 epoch).

    Returns
    -------
    GMST wrapped to [0, 2*pi), in radians.

    Reference:
        The AIAA 2006 implementation:
        http://www.celestrak.com/publications/AIAA/2006-6753/
    """
    jul_centuries = _days_from_2000(model_time) / 36525.0
    # GMST polynomial in seconds of time (AIAA 2006-6753). The cubic
    # coefficient is 6.2e-6; the previous code wrote it as `6.2 * 10e-6`
    # (= 6.2e-5), ten times the reference value. The numerical impact is
    # tiny for |T| < 1 century, but the constant now matches the reference.
    theta = 67310.54841 + jul_centuries * (
        876600 * 3600
        + 8640184.812866
        + jul_centuries * (0.093104 - jul_centuries * 6.2e-6)
    )

    # 240 seconds of time = 1 degree; convert to radians and wrap to [0, 2*pi).
    theta_radians = _dali_mod((theta / 240.0) * RAD_PER_DEG, 2 * np.pi)
    return theta_radians
98
+
99
+
100
def _local_mean_sidereal_time(model_time, longitude):
    """
    Local mean sidereal time. requires longitude in radians.
    Ref:
        http://www.setileague.org/askdr/lmst.htm
    """
    # LMST is GMST offset by the observer's longitude (radians).
    gmst = _greenwich_mean_sidereal_time(model_time)
    return gmst + longitude
107
+
108
+
109
def _sun_ecliptic_longitude(model_time):
    """
    Ecliptic longitude of the sun, in radians.
    Reference:
        http://www.geoastro.de/elevaz/basics/meeus.htm
    """
    julian_centuries = _days_from_2000(model_time) / 36525.0

    # mean anomaly calculation (cubic polynomial in Julian centuries, degrees
    # converted to radians)
    mean_anomaly = (
        357.52910
        + 35999.05030 * julian_centuries
        - 0.0001559 * julian_centuries * julian_centuries
        - 0.00000048 * julian_centuries * julian_centuries * julian_centuries
    ) * RAD_PER_DEG

    # mean longitude
    mean_longitude = (
        280.46645 + 36000.76983 * julian_centuries + 0.0003032 * (julian_centuries**2)
    ) * RAD_PER_DEG

    # equation-of-center correction: difference between true and mean
    # longitude, built from sine harmonics of the mean anomaly
    d_l = (
        (1.914600 - 0.004817 * julian_centuries - 0.000014 * (julian_centuries**2))
        * dali.math.sin(mean_anomaly)
        + (0.019993 - 0.000101 * julian_centuries) * dali.math.sin(2 * mean_anomaly)
        + 0.000290 * dali.math.sin(3 * mean_anomaly)
    ) * RAD_PER_DEG

    # true longitude
    return mean_longitude + d_l
139
+
140
+
141
def _obliquity_star(julian_centuries):
    """
    return obliquity of the sun, in radians
    Use 5th order equation from
    https://en.wikipedia.org/wiki/Ecliptic#Obliquity_of_the_ecliptic
    """
    # Base value 23 deg 26' 21.406" plus a 5th-order polynomial correction in
    # Julian centuries; the correction terms are in arcseconds, hence the
    # division by 3600 to degrees before conversion to radians.
    return (
        23.0
        + 26.0 / 60
        + 21.406 / 3600.0
        - (
            46.836769 * julian_centuries
            - 0.0001831 * (julian_centuries**2)
            + 0.00200340 * (julian_centuries**3)
            - 0.576e-6 * (julian_centuries**4)
            - 4.34e-8 * (julian_centuries**5)
        )
        / 3600.0
    ) * RAD_PER_DEG
160
+
161
+
162
def _right_ascension_declination(model_time):
    """
    Right ascension and declination of the sun, in radians.
    """
    julian_centuries = _days_from_2000(model_time) / 36525.0
    eps = _obliquity_star(julian_centuries)

    eclon = _sun_ecliptic_longitude(model_time)
    # Unit vector toward the sun in ecliptic coordinates, rotated by the
    # obliquity `eps` into equatorial coordinates (sun's ecliptic latitude
    # is approximated as zero).
    x = dali.math.cos(eclon)
    y = dali.math.cos(eps) * dali.math.sin(eclon)
    z = dali.math.sin(eps) * dali.math.sin(eclon)
    # projection of the sun direction onto the equatorial plane
    r = dali.math.sqrt(1.0 - z * z)
    # sun declination
    declination = dali.math.atan2(z, r)
    # right ascension (half-angle form of atan2(y, x))
    right_ascension = 2 * dali.math.atan2(y, (x + r))
    return right_ascension, declination
179
+
180
+
181
def _local_hour_angle(model_time, longitude, right_ascension):
    """
    Hour angle at model_time for the given longitude and right_ascension
    longitude in radians
    Ref:
        https://en.wikipedia.org/wiki/Hour_angle#Relation_with_the_right_ascension
    """
    # H = LMST - RA
    lmst = _local_mean_sidereal_time(model_time, longitude)
    return lmst - right_ascension
189
+
190
+
191
def _star_cos_zenith(model_time, lat, lon):
    """
    Return cosine of star zenith angle
    lon,lat in radians
    Ref:
        Azimuth:
            https://en.wikipedia.org/wiki/Solar_azimuth_angle#Formulas
        Zenith:
            https://en.wikipedia.org/wiki/Solar_zenith_angle
    """
    right_ascension, declination = _right_ascension_declination(model_time)
    hour_angle = _local_hour_angle(model_time, lon, right_ascension)

    # cos(zenith) = sin(lat)*sin(dec) + cos(lat)*cos(dec)*cos(H)
    polar_term = dali.math.sin(lat) * dali.math.sin(declination)
    equatorial_term = (
        dali.math.cos(lat) * dali.math.cos(declination) * dali.math.cos(hour_angle)
    )
    return polar_term + equatorial_term
physics_mcp/source/physicsnemo/datapipes/datapipe.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import logging
18
+
19
+ from physicsnemo.datapipes.meta import DatapipeMetaData
20
+
21
+
22
class Datapipe:
    """The base class for all datapipes in PhysicsNeMo.

    Parameters
    ----------
    meta : DatapipeMetaData, optional
        Meta data class for storing info regarding model, by default None
    """

    def __init__(self, meta: DatapipeMetaData = None):
        super().__init__()

        # Fall back to default metadata when none (or the wrong type) is given.
        if not meta or not isinstance(meta, DatapipeMetaData):
            self.meta = DatapipeMetaData()
        else:
            self.meta = meta

        self.logger = logging.getLogger("core.datapipe")
        # `getLogger` returns one shared logger per name, so attach the
        # stream handler only once: previously every Datapipe instantiation
        # added another handler, and each message was emitted multiple times.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "[%(asctime)s - %(levelname)s] %(message)s", datefmt="%H:%M:%S"
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.WARNING)

    def debug(self):
        """Turn on debug logging"""
        # Replace any existing handlers with a single debug-format handler.
        self.logger.handlers.clear()
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            f"[%(asctime)s - %(levelname)s - {self.meta.name}] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
        # TODO: set up debug log
        # fh = logging.FileHandler(f'physicsnemo-core-{self.meta.name}.log')
physics_mcp/source/physicsnemo/datapipes/gnn/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
2
+ # SPDX-FileCopyrightText: All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.