amael-apple committed · Commit c20d7cc · 0 Parent(s)

Initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +166 -0
  2. .pre-commit-config.yaml +23 -0
  3. .python-version +1 -0
  4. ACKNOWLEDGEMENTS +214 -0
  5. CODE_OF_CONDUCT.md +70 -0
  6. CONTRIBUTING.md +11 -0
  7. LICENSE +47 -0
  8. LICENSE_MODEL +88 -0
  9. README.md +95 -0
  10. pyproject.toml +69 -0
  11. requirements.in +1 -0
  12. requirements.txt +172 -0
  13. src/sharp/__init__.py +4 -0
  14. src/sharp/cli/__init__.py +19 -0
  15. src/sharp/cli/predict.py +206 -0
  16. src/sharp/cli/render.py +120 -0
  17. src/sharp/models/__init__.py +79 -0
  18. src/sharp/models/alignment.py +126 -0
  19. src/sharp/models/blocks.py +210 -0
  20. src/sharp/models/composer.py +251 -0
  21. src/sharp/models/decoders/__init__.py +22 -0
  22. src/sharp/models/decoders/base_decoder.py +21 -0
  23. src/sharp/models/decoders/monodepth_decoder.py +37 -0
  24. src/sharp/models/decoders/multires_conv_decoder.py +116 -0
  25. src/sharp/models/decoders/unet_decoder.py +113 -0
  26. src/sharp/models/encoders/__init__.py +24 -0
  27. src/sharp/models/encoders/base_encoder.py +25 -0
  28. src/sharp/models/encoders/monodepth_encoder.py +123 -0
  29. src/sharp/models/encoders/spn_encoder.py +369 -0
  30. src/sharp/models/encoders/unet_encoder.py +117 -0
  31. src/sharp/models/encoders/vit_encoder.py +111 -0
  32. src/sharp/models/gaussian_decoder.py +267 -0
  33. src/sharp/models/heads.py +53 -0
  34. src/sharp/models/initializer.py +297 -0
  35. src/sharp/models/monodepth.py +268 -0
  36. src/sharp/models/normalizers.py +80 -0
  37. src/sharp/models/params.py +203 -0
  38. src/sharp/models/predictor.py +201 -0
  39. src/sharp/models/presets/__init__.py +23 -0
  40. src/sharp/models/presets/monodepth.py +21 -0
  41. src/sharp/models/presets/vit.py +58 -0
  42. src/sharp/utils/__init__.py +5 -0
  43. src/sharp/utils/camera.py +386 -0
  44. src/sharp/utils/color_space.py +88 -0
  45. src/sharp/utils/gaussians.py +480 -0
  46. src/sharp/utils/gsplat.py +191 -0
  47. src/sharp/utils/io.py +213 -0
  48. src/sharp/utils/linalg.py +104 -0
  49. src/sharp/utils/logging.py +45 -0
  50. src/sharp/utils/math.py +183 -0
.gitignore ADDED
@@ -0,0 +1,166 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ .DS_Store
+ *.pt
+ .aider*
.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
+ exclude: |
+   (?x)(
+     ^src/sharp/external
+   )
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.5.0
+     hooks:
+       - id: trailing-whitespace
+       - id: end-of-file-fixer
+       # - id: no-commit-to-branch
+       #   args: ['--branch', 'main']
+   - repo: https://github.com/charliermarsh/ruff-pre-commit
+     rev: v0.1.7
+     hooks:
+       - id: ruff
+         args: [--fix, --exit-non-zero-on-fix]
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.7.1
+     hooks:
+       - id: mypy
+         additional_dependencies: [ types-PyYAML ]
.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
ACKNOWLEDGEMENTS ADDED
@@ -0,0 +1,214 @@
+ Acknowledgements
+ Portions of this Software may utilize the following copyrighted
+ material, the use of which is hereby acknowledged.
+
+ ---------------------------------------------------------------------------------
+
+ TIMM - Pytorch Image Models library
+
+ https://github.com/huggingface/pytorch-image-models
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2019 Ross Wightman
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+
+ -------
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,70 @@
+ # Code of Conduct
+
+ ## Our Pledge
+
+ In the interest of fostering an open and welcoming environment, we as
+ contributors and maintainers pledge to making participation in our project and
+ our community a harassment-free experience for everyone, regardless of age, body
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
+ level of experience, education, socio-economic status, nationality, personal
+ appearance, race, religion, or sexual identity and orientation.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to creating a positive environment
+ include:
+
+ * Using welcoming and inclusive language
+ * Being respectful of differing viewpoints and experiences
+ * Gracefully accepting constructive criticism
+ * Focusing on what is best for the community
+ * Showing empathy towards other community members
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery and unwelcome sexual attention or
+   advances
+ * Trolling, insulting/derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or electronic
+   address, without explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Our Responsibilities
+
+ Project maintainers are responsible for clarifying the standards of acceptable
+ behavior and are expected to take appropriate and fair corrective action in
+ response to any instances of unacceptable behavior.
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ ## Scope
+
+ This Code of Conduct applies within all project spaces, and it also applies when
+ an individual is representing the project or its community in public spaces.
+ Examples of representing a project or community include using an official
+ project e-mail address, posting via an official social media account, or acting
+ as an appointed representative at an online or offline event. Representation of
+ a project may be further defined and clarified by project maintainers.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. The project team is
+ obligated to maintain confidentiality with regard to the reporter of an incident.
+ Further details of specific enforcement policies may be posted separately.
+
+ Project maintainers who do not follow or enforce the Code of Conduct in good
+ faith may face temporary or permanent repercussions as determined by other
+ members of the project's leadership.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
CONTRIBUTING.md ADDED
@@ -0,0 +1,11 @@
+ # Contribution Guide
+
+ Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
+
+ While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
+
+ ## Before you get started
+
+ By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
+
+ We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).
LICENSE ADDED
@@ -0,0 +1,47 @@
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+
+ Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple
+ Inc. ("Apple") in consideration of your agreement to the following
+ terms, and your use, installation, modification or redistribution of
+ this Apple software constitutes acceptance of these terms. If you do
+ not agree with these terms, please do not use, install, modify or
+ redistribute this Apple software.
+
+ In consideration of your agreement to abide by the following terms, and
+ subject to these terms, Apple grants you a personal, non-exclusive
+ license, under Apple's copyrights in this original Apple software (the
+ "Apple Software"), to use, reproduce, modify and redistribute the Apple
+ Software, with or without modifications, in source and/or binary forms;
+ provided that if you redistribute the Apple Software in its entirety and
+ without modifications, you must retain this notice and the following
+ text and disclaimers in all such redistributions of the Apple Software.
+ Neither the name, trademarks, service marks or logos of Apple Inc. may
+ be used to endorse or promote products derived from the Apple Software
+ without specific prior written permission from Apple. Except as
+ expressly stated in this notice, no other rights or licenses, express or
+ implied, are granted by Apple herein, including but not limited to any
+ patent rights that may be infringed by your derivative works or by other
+ works in which the Apple Software may be incorporated.
+
+ The Apple Software is provided by Apple on an "AS IS" basis. APPLE
+ MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+ THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+ FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+ OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+ IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+ OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+ MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+ AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+ STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+
+ -------------------------------------------------------------------------------
+ SOFTWARE DISTRIBUTED IN THIS REPOSITORY:
+
+ This software includes a number of subcomponents with separate
+ copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
+ -------------------------------------------------------------------------------
LICENSE_MODEL ADDED
@@ -0,0 +1,88 @@
+ Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is
+ specifically developed and released by Apple Inc. ("Apple") for the sole purpose
+ of scientific research of artificial intelligence and machine-learning
+ technology. “Apple Machine Learning Research Model” means the model, including
+ but not limited to algorithms, formulas, trained model weights, parameters,
+ configurations, checkpoints, and any related materials (including
+ documentation).
+
+ This Apple Machine Learning Research Model is provided to You by
+ Apple in consideration of your agreement to the following terms, and your use,
+ modification, creation of Model Derivatives, and or redistribution of the Apple
+ Machine Learning Research Model constitutes acceptance of this Agreement. If You
+ do not agree with these terms, please do not use, modify, create Model
+ Derivatives of, or distribute this Apple Machine Learning Research Model or
+ Model Derivatives.
+
+ * License Scope: In consideration of your agreement to abide by the following
+   terms, and subject to these terms, Apple hereby grants you a personal,
+   non-exclusive, worldwide, non-transferable, royalty-free, revocable, and
+   limited license, to use, copy, modify, distribute, and create Model
+   Derivatives (defined below) of the Apple Machine Learning Research Model
+   exclusively for Research Purposes. You agree that any Model Derivatives You
+   may create or that may be created for You will be limited to Research Purposes
+   as well. “Research Purposes” means non-commercial scientific research and
+   academic development activities, such as experimentation, analysis, testing
+   conducted by You with the sole intent to advance scientific knowledge and
+   research. “Research Purposes” does not include any commercial exploitation,
+   product development or use in any commercial product or service.
+
+ * Distribution of Apple Machine Learning Research Model and Model Derivatives:
+   If you choose to redistribute Apple Machine Learning Research Model or its
+   Model Derivatives, you must provide a copy of this Agreement to such third
+   party, and ensure that the following attribution notice be provided: “Apple
+   Machine Learning Research Model is licensed under the Apple Machine Learning
+   Research Model License Agreement.” Additionally, all Model Derivatives must
+   clearly be identified as such, including disclosure of modifications and
+   changes made to the Apple Machine Learning Research Model. The name,
+   trademarks, service marks or logos of Apple may not be used to endorse or
+   promote Model Derivatives or the relationship between You and Apple. “Model
+   Derivatives” means any models or any other artifacts created by modifications,
+   improvements, adaptations, alterations to the architecture, algorithm or
+   training processes of the Apple Machine Learning Research Model, or by any
+   retraining, fine-tuning of the Apple Machine Learning Research Model.
+
+ * No Other License: Except as expressly stated in this notice, no other rights
+   or licenses, express or implied, are granted by Apple herein, including but
+   not limited to any patent, trademark, and similar intellectual property rights
+   worldwide that may be infringed by the Apple Machine Learning Research Model,
+   the Model Derivatives or by other works in which the Apple Machine Learning
+   Research Model may be incorporated.
+
+ * Compliance with Laws: Your use of Apple Machine Learning Research Model must
+   be in compliance with all applicable laws and regulations.
+
+ * Term and Termination: The term of this Agreement will begin upon your
+   acceptance of this Agreement or use of the Apple Machine Learning Research
+   Model and will continue until terminated in accordance with the following
+   terms. Apple may terminate this Agreement at any time if You are in breach of
+   any term or condition of this Agreement. Upon termination of this Agreement,
+   You must cease to use all Apple Machine Learning Research Models and Model
+   Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will
+   survive termination.
+
+ * Disclaimer and Limitation of Liability: This Apple Machine Learning Research
+   Model and any outputs generated by the Apple Machine Learning Research Model
+   are provided on an “AS IS” basis. APPLE MAKES NO WARRANTIES, EXPRESS OR
+   IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+   NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE,
+   REGARDING THE APPLE MACHINE LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY
+   THE APPLE MACHINE LEARNING RESEARCH MODEL. You are solely responsible for
+   determining the appropriateness of using or redistributing the Apple Machine
+   Learning Research Model and any outputs of the Apple Machine Learning Research
+   Model and assume any risks associated with Your use of the Apple Machine
+   Learning Research Model and any output and results. IN NO EVENT SHALL APPLE BE
+   LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+   IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF
+   THE APPLE MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE MACHINE
+   LEARNING RESEARCH MODEL, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT,
+   TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS
+   BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ * Governing Law: This Agreement will be governed by and construed under the laws
+   of the State of California without regard to its choice of law principles. The
+   Convention on Contracts for the International Sale of Goods shall not apply to
+   the Agreement except that the arbitration clause and any arbitration hereunder
+   shall be governed by the Federal Arbitration Act, Chapters 1 and 2.
+
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
README.md ADDED
@@ -0,0 +1,95 @@
+ # Sharp Monocular View Synthesis in Less Than a Second
+
+ [![Project Page](https://img.shields.io/badge/Project-Page-green)](https://apple.github.io/ml-sharp/)
+ [![arXiv](https://img.shields.io/badge/arXiv-2512.10685-b31b1b.svg)](https://arxiv.org/abs/2512.10685)
+
+ This software project accompanies the research paper: _Sharp Monocular View Synthesis in Less Than a Second_
+ by _Lars Mescheder, Wei Dong, Shiwei Li, Xuyang Bai, Marcel Santos, Peiyun Hu, Bruno Lecouat, Mingmin Zhen, Amaël Delaunoy,
+ Tian Fang, Yanghai Tsin, Stephan Richter and Vladlen Koltun_.
+
+ ![](data/teaser.jpg)
+
+ We present SHARP, an approach to photorealistic view synthesis from a single image. Given a single photograph, SHARP regresses the parameters of a 3D Gaussian representation of the depicted scene. This is done in less than a second on a standard GPU via a single feedforward pass through a neural network. The 3D Gaussian representation produced by SHARP can then be rendered in real time, yielding high-resolution photorealistic images for nearby views. The representation is metric, with absolute scale, supporting metric camera movements. Experimental results demonstrate that SHARP delivers robust zero-shot generalization across datasets. It sets a new state of the art on multiple datasets, reducing LPIPS by 25–34% and DISTS by 21–43% versus the best prior model, while lowering the synthesis time by three orders of magnitude.
+
+ ## Getting started
+
+ We recommend first creating a Python environment:
+
+ ```
+ conda create -n sharp python=3.13
+ ```
+
+ Afterwards, you can install the project using:
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ To test the installation, run:
+
+ ```
+ sharp --help
+ ```
+
+ ## Using the CLI
+
+ To run prediction:
+
+ ```
+ sharp predict -i /path/to/input/images -o /path/to/output/gaussians
+ ```
+
+ The model checkpoint will be downloaded automatically on first run and cached locally at `~/.cache/torch/hub/checkpoints/`.
+
+ Alternatively, you can download the model directly:
+
+ ```
+ wget https://ml-site.cdn-apple.com/models/sharp/sharp_2572gikvuh.pt
+ ```
+
+ To use a manually downloaded checkpoint, specify it with the `-c` flag:
+
+ ```
+ sharp predict -i /path/to/input/images -o /path/to/output/gaussians -c sharp_2572gikvuh.pt
+ ```
+
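+ You can also call the same building blocks from Python. The following is a minimal, untested sketch based on the helpers in `src/sharp/cli/predict.py`; the function names come from this repository, while the file paths are placeholders:
+
+ ```python
+ from pathlib import Path
+
+ import torch
+
+ from sharp.cli.predict import predict_image
+ from sharp.models import PredictorParams, create_predictor
+ from sharp.utils import io
+ from sharp.utils.gaussians import save_ply
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Build the predictor and load a locally downloaded checkpoint.
+ predictor = create_predictor(PredictorParams())
+ predictor.load_state_dict(torch.load("sharp_2572gikvuh.pt", weights_only=True))
+ predictor.eval().to(device)
+
+ # Load an image together with its estimated focal length in pixels.
+ image, _, f_px = io.load_rgb(Path("example.jpg"))
+ height, width = image.shape[:2]
+
+ # Regress metric 3D Gaussians and save them as a .ply file.
+ gaussians = predict_image(predictor, image, f_px, device)
+ save_ply(gaussians, f_px, (height, width), Path("example.ply"))
+ ```
+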
+ The results are 3D Gaussian splats (3DGS) written to the output folder. The 3DGS `.ply` files are compatible with various public 3DGS renderers. We follow the OpenCV coordinate convention (x right, y down, z forward), and the 3DGS scene center is roughly at (0, 0, +z). When using third-party renderers, please scale and rotate to re-center the scene accordingly.
+
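+ For example, one way to re-center a predicted splat for a renderer that expects the scene near the origin is to shift the Gaussian centers by their centroid. This untested sketch uses `plyfile` (already a dependency of this project) and assumes the common 3DGS vertex layout with `x`, `y`, `z` properties; verify the property names against your renderer's expectations:
+
+ ```python
+ import numpy as np
+ from plyfile import PlyData
+
+ ply = PlyData.read("example.ply")
+ vertices = ply["vertex"].data
+
+ # Shift the Gaussian centers so their centroid sits at the origin.
+ center = np.array([vertices["x"].mean(), vertices["y"].mean(), vertices["z"].mean()])
+ vertices["x"] -= center[0]
+ vertices["y"] -= center[1]
+ vertices["z"] -= center[2]
+
+ ply.write("example_centered.ply")
+ ```
+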
+ ### Rendering trajectories (CUDA GPU only)
+
+ Additionally, you can render videos along a camera trajectory. While Gaussian prediction works on CPU, CUDA, and MPS, rendering videos via the `--render` option currently requires a CUDA GPU. The gsplat renderer takes a while to initialize on first launch.
+
+ ```
+ sharp predict -i /path/to/input/images -o /path/to/output/gaussians --render
+
+ # Or from the intermediate gaussians:
+ sharp render -i /path/to/output/gaussians -o /path/to/output/renderings
+ ```
+
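+ Rendering can also be driven from Python. The following untested sketch mirrors `render_cli` in `src/sharp/cli/render.py` (a CUDA GPU is still required; the paths are placeholders):
+
+ ```python
+ from pathlib import Path
+
+ from sharp.cli.render import render_gaussians
+ from sharp.utils import camera
+ from sharp.utils.gaussians import load_ply
+
+ # Load a predicted splat together with its stored focal length and resolution.
+ gaussians, metadata = load_ply(Path("example.ply"))
+
+ # Render the default camera trajectory to an mp4 file.
+ render_gaussians(
+     gaussians=gaussians,
+     metadata=metadata,
+     params=camera.TrajectoryParams(),
+     output_path=Path("example.mp4"),
+ )
+ ```
+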
+ ## Evaluation
+
+ Please refer to the paper for both quantitative and qualitative evaluations.
+ Additionally, please check out this [qualitative examples page](https://apple.github.io/ml-sharp/) containing several video comparisons against related work.
+
+ ## Citation
+
+ If you find our work useful, please cite the following paper:
+
+ ```bibtex
+ @article{Sharp2025:arxiv,
+   title   = {Sharp Monocular View Synthesis in Less Than a Second},
+   author  = {Lars Mescheder and Wei Dong and Shiwei Li and Xuyang Bai and Marcel Santos and Peiyun Hu and Bruno Lecouat and Mingmin Zhen and Ama\"{e}l Delaunoy and Tian Fang and Yanghai Tsin and Stephan R. Richter and Vladlen Koltun},
+   journal = {arXiv preprint arXiv:2512.10685},
+   year    = {2025},
+   url     = {https://arxiv.org/abs/2512.10685},
+ }
+ ```
+
+ ## Acknowledgements
+
+ Our codebase is built using multiple open-source contributions; please see [ACKNOWLEDGEMENTS](ACKNOWLEDGEMENTS) for more details.
+
+ ## License
+
+ Please check out the repository [LICENSE](LICENSE) before using the provided code and
+ [LICENSE_MODEL](LICENSE_MODEL) for the released models.
pyproject.toml ADDED
@@ -0,0 +1,69 @@
+ [project]
+ name = "sharp"
+ version = "0.1"
+ description = "Inference/Network/Model code for SHARP view synthesis model."
+ readme = "README.md"
+ dependencies = [
+     "click",
+     "gsplat",
+     "imageio[ffmpeg]",
+     "matplotlib",
+     "pillow-heif",
+     "plyfile",
+     "scipy",
+     "timm",
+     "torch",
+     "torchvision",
+ ]
+
+ [project.scripts]
+ sharp = "sharp.cli:main_cli"
+
+ [project.urls]
+ Homepage = "https://github.com/apple/ml-sharp"
+ Repository = "https://github.com/apple/ml-sharp"
+
+ [build-system]
+ requires = ["setuptools", "setuptools-scm"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.pyright]
+ include = ["src"]
+ exclude = [
+     "**/node_modules",
+     "**/__pycache__",
+ ]
+ pythonVersion = "3.13"
+
+ [tool.pytest.ini_options]
+ minversion = "6.0"
+ addopts = "-ra -q"
+ testpaths = [
+     "tests"
+ ]
+ filterwarnings = [
+     "ignore::DeprecationWarning"
+ ]
+
+ [tool.ruff]
+ line-length = 100
+ lint.select = ["E", "F", "D", "I"]
+ lint.ignore = ["D100", "D105",
+     # Imperative mood of docstring.
+     "D401",
+ ]
+ extend-exclude = [
+     "*external*",
+     "third_party",
+ ]
+ src = ["sharp"]
+ target-version = "py39"
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["F401", "D100", "D104"]
+
+ [tool.ruff.lint.pydocstyle]
+ convention = "google"
requirements.in ADDED
@@ -0,0 +1 @@
+ -e .
requirements.txt ADDED
@@ -0,0 +1,172 @@
+ # This file was autogenerated by uv via the following command:
+ #    uv pip compile requirements.in -o requirements.txt --universal
+ -e .
+     # via -r requirements.in
+ certifi==2025.8.3
+     # via requests
+ charset-normalizer==3.4.3
+     # via requests
+ click==8.3.0
+     # via sharp
+ colorama==0.4.6 ; sys_platform == 'win32'
+     # via
+     #   click
+     #   tqdm
+ contourpy==1.3.3
+     # via matplotlib
+ cycler==0.12.1
+     # via matplotlib
+ filelock==3.19.1
+     # via
+     #   huggingface-hub
+     #   torch
+ fonttools==4.61.0
+     # via matplotlib
+ fsspec==2025.9.0
+     # via
+     #   huggingface-hub
+     #   torch
+ gsplat==1.5.3
+     # via sharp
+ hf-xet==1.1.10 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+     # via huggingface-hub
+ huggingface-hub==0.35.3
+     # via timm
+ idna==3.10
+     # via requests
+ imageio==2.37.0
+     # via sharp
+ imageio-ffmpeg==0.6.0
+     # via imageio
+ jaxtyping==0.3.3
+     # via gsplat
+ jinja2==3.1.6
+     # via torch
+ kiwisolver==1.4.9
+     # via matplotlib
+ markdown-it-py==4.0.0
+     # via rich
+ markupsafe==3.0.3
+     # via jinja2
+ matplotlib==3.10.6
+     # via sharp
+ mdurl==0.1.2
+     # via markdown-it-py
+ mpmath==1.3.0
+     # via sympy
+ networkx==3.5
+     # via torch
+ ninja==1.13.0
+     # via gsplat
+ numpy==2.3.3
+     # via
+     #   contourpy
+     #   gsplat
+     #   imageio
+     #   matplotlib
+     #   plyfile
+     #   scipy
+     #   torchvision
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via
+     #   nvidia-cudnn-cu12
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via
+     #   nvidia-cusolver-cu12
+     #   torch
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-nccl-cu12==2.27.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via
+     #   nvidia-cufft-cu12
+     #   nvidia-cusolver-cu12
+     #   nvidia-cusparse-cu12
+     #   torch
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ packaging==25.0
+     # via
+     #   huggingface-hub
+     #   matplotlib
+ pillow==11.3.0
+     # via
+     #   imageio
+     #   matplotlib
+     #   pillow-heif
+     #   torchvision
+ pillow-heif==1.1.1
+     # via sharp
+ plyfile==1.1.2
+     # via sharp
+ psutil==7.1.0
+     # via imageio
+ pygments==2.19.2
+     # via rich
+ pyparsing==3.2.5
+     # via matplotlib
+ python-dateutil==2.9.0.post0
+     # via matplotlib
+ pyyaml==6.0.3
+     # via
+     #   huggingface-hub
+     #   timm
+ requests==2.32.5
+     # via huggingface-hub
+ rich==14.1.0
+     # via gsplat
+ safetensors==0.6.2
+     # via timm
+ scipy==1.16.2
+     # via sharp
+ setuptools==80.9.0
+     # via
+     #   torch
+     #   triton
+ six==1.17.0
+     # via python-dateutil
+ sympy==1.14.0
+     # via torch
+ timm==1.0.20
+     # via sharp
+ torch==2.8.0
+     # via
+     #   gsplat
+     #   sharp
+     #   timm
+     #   torchvision
+ torchvision==0.23.0
+     # via
+     #   sharp
+     #   timm
+ tqdm==4.67.1
+     # via huggingface-hub
+ triton==3.4.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+     # via torch
+ typing-extensions==4.15.0
+     # via
+     #   huggingface-hub
+     #   torch
+ urllib3==2.6.0
+     # via requests
+ wadler-lindig==0.1.7
+     # via jaxtyping
src/sharp/__init__.py ADDED
@@ -0,0 +1,4 @@
+ """For licensing see accompanying LICENSE file.
+
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
src/sharp/cli/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Command-line interface to run SHARP model.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ import click
+
+ from . import predict, render
+
+
+ @click.group()
+ def main_cli():
+     """Run inference for SHARP model."""
+     pass
+
+
+ main_cli.add_command(predict.predict_cli, "predict")
+ main_cli.add_command(render.render_cli, "render")
src/sharp/cli/predict.py ADDED
@@ -0,0 +1,206 @@
+ """Contains `sharp predict` CLI implementation.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+
+ import click
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.data
+
+ from sharp.models import (
+     PredictorParams,
+     RGBGaussianPredictor,
+     create_predictor,
+ )
+ from sharp.utils import io
+ from sharp.utils import logging as logging_utils
+ from sharp.utils.gaussians import (
+     Gaussians3D,
+     SceneMetaData,
+     save_ply,
+     unproject_gaussians,
+ )
+
+ from .render import render_gaussians
+
+ LOGGER = logging.getLogger(__name__)
+
+ DEFAULT_MODEL_URL = "https://ml-site.cdn-apple.com/models/sharp/sharp_2572gikvuh.pt"
+
+
+ @click.command()
+ @click.option(
+     "-i",
+     "--input-path",
+     type=click.Path(path_type=Path, exists=True),
+     help="Path to an image or a directory containing images.",
+     required=True,
+ )
+ @click.option(
+     "-o",
+     "--output-path",
+     type=click.Path(path_type=Path, file_okay=False),
+     help="Path to save the predicted Gaussians and renderings.",
+     required=True,
+ )
+ @click.option(
+     "-c",
+     "--checkpoint-path",
+     type=click.Path(path_type=Path, dir_okay=False),
+     default=None,
+     help="Path to the .pt checkpoint. If not provided, downloads the default model automatically.",
+     required=False,
+ )
+ @click.option(
+     "--render/--no-render",
+     "with_rendering",
+     is_flag=True,
+     default=False,
+     help="Whether to render trajectory for checkpoint.",
+ )
+ @click.option(
+     "--device",
+     type=str,
+     default="default",
+     help="Device to run on. ['cpu', 'mps', 'cuda']",
+ )
+ @click.option("-v", "--verbose", is_flag=True, help="Activate debug logs.")
+ def predict_cli(
+     input_path: Path,
+     output_path: Path,
+     checkpoint_path: Path,
+     with_rendering: bool,
+     device: str,
+     verbose: bool,
+ ):
+     """Predict Gaussians from input images."""
+     logging_utils.configure(logging.DEBUG if verbose else logging.INFO)
+
+     extensions = io.get_supported_image_extensions()
+
+     image_paths = []
+     if input_path.is_file():
+         if input_path.suffix in extensions:
+             image_paths = [input_path]
+     else:
+         for ext in extensions:
+             image_paths.extend(list(input_path.glob(f"**/*{ext}")))
+
+     if len(image_paths) == 0:
+         LOGGER.info("No valid images found. Input was %s.", input_path)
+         return
+
+     LOGGER.info("Processing %d valid image files.", len(image_paths))
+
+     if device == "default":
+         if torch.cuda.is_available():
+             device = "cuda"
+         elif torch.mps.is_available():
+             device = "mps"
+         else:
+             device = "cpu"
+     LOGGER.info("Using device %s", device)
+
+     if with_rendering and device != "cuda":
+         LOGGER.warning("Can only run rendering with gsplat on CUDA. Rendering is disabled.")
+         with_rendering = False
+
+     # Load or download checkpoint
+     if checkpoint_path is None:
+         LOGGER.info("No checkpoint provided. Downloading default model from %s", DEFAULT_MODEL_URL)
+         state_dict = torch.hub.load_state_dict_from_url(DEFAULT_MODEL_URL, progress=True)
+     else:
+         LOGGER.info("Loading checkpoint from %s", checkpoint_path)
+         state_dict = torch.load(checkpoint_path, weights_only=True)
+
+     gaussian_predictor = create_predictor(PredictorParams())
+     gaussian_predictor.load_state_dict(state_dict)
+     gaussian_predictor.eval()
+     gaussian_predictor.to(device)
+
+     output_path.mkdir(exist_ok=True, parents=True)
+
+     for image_path in image_paths:
+         LOGGER.info("Processing %s", image_path)
+         image, _, f_px = io.load_rgb(image_path)
+         height, width = image.shape[:2]
+         intrinsics = torch.tensor(
+             [
+                 [f_px, 0, (width - 1) / 2.0, 0],
+                 [0, f_px, (height - 1) / 2.0, 0],
+                 [0, 0, 1, 0],
+                 [0, 0, 0, 1],
+             ],
+             device=device,
+             dtype=torch.float32,
+         )
+         gaussians = predict_image(gaussian_predictor, image, f_px, torch.device(device))
+
+         LOGGER.info("Saving 3DGS to %s", output_path)
+         save_ply(gaussians, f_px, (height, width), output_path / f"{image_path.stem}.ply")
+
+         if with_rendering:
+             output_video_path = (output_path / image_path.stem).with_suffix(".mp4")
+             LOGGER.info("Rendering trajectory to %s", output_video_path)
+
+             metadata = SceneMetaData(intrinsics[0, 0].item(), (width, height), "linearRGB")
+             render_gaussians(gaussians, metadata, output_video_path)
+
+
+ @torch.no_grad()
+ def predict_image(
+     predictor: RGBGaussianPredictor,
+     image: np.ndarray,
+     f_px: float,
+     device: torch.device,
+ ) -> Gaussians3D:
+     """Predict Gaussians from an image."""
+     internal_shape = (1536, 1536)
+
+     LOGGER.info("Running preprocessing.")
+     image_pt = torch.from_numpy(image.copy()).float().to(device).permute(2, 0, 1) / 255.0
+     _, height, width = image_pt.shape
+     disparity_factor = torch.tensor([f_px / width]).float().to(device)
+
+     image_resized_pt = F.interpolate(
+         image_pt[None],
+         size=(internal_shape[1], internal_shape[0]),
+         mode="bilinear",
+         align_corners=True,
+     )
+
+     # Predict Gaussians in the NDC space.
+     LOGGER.info("Running inference.")
+     gaussians_ndc = predictor(image_resized_pt, disparity_factor)
+
+     LOGGER.info("Running postprocessing.")
+     intrinsics = (
+         torch.tensor(
+             [
+                 [f_px, 0, width / 2, 0],
+                 [0, f_px, height / 2, 0],
+                 [0, 0, 1, 0],
+                 [0, 0, 0, 1],
+             ]
+         )
+         .float()
+         .to(device)
+     )
+     intrinsics_resized = intrinsics.clone()
+     intrinsics_resized[0] *= internal_shape[0] / width
+     intrinsics_resized[1] *= internal_shape[1] / height
+
+     # Convert Gaussians to metric space.
+     gaussians = unproject_gaussians(
+         gaussians_ndc, torch.eye(4).to(device), intrinsics_resized, internal_shape
+     )
+
+     return gaussians
src/sharp/cli/render.py ADDED
@@ -0,0 +1,120 @@
+ """Contains `sharp render` CLI implementation.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from pathlib import Path
+
+ import click
+ import torch
+ import torch.utils.data
+
+ from sharp.utils import camera, gsplat, io
+ from sharp.utils import logging as logging_utils
+ from sharp.utils.gaussians import Gaussians3D, SceneMetaData, load_ply
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.option(
+     "-i",
+     "--input-path",
+     type=click.Path(exists=True, path_type=Path),
+     help="Path to the ply or a list of plys.",
+     required=True,
+ )
+ @click.option(
+     "-o",
+     "--output-path",
+     type=click.Path(path_type=Path, file_okay=False),
+     help="Path to save the rendered videos.",
+     required=True,
+ )
+ @click.option("-v", "--verbose", is_flag=True, help="Activate debug logs.")
+ def render_cli(input_path: Path, output_path: Path, verbose: bool):
+     """Render Gaussians loaded from input PLY files."""
+     logging_utils.configure(logging.DEBUG if verbose else logging.INFO)
+
+     if not torch.cuda.is_available():
+         LOGGER.error("Rendering a checkpoint requires CUDA.")
+         exit(1)
+
+     output_path.mkdir(exist_ok=True, parents=True)
+
+     params = camera.TrajectoryParams()
+
+     if input_path.suffix == ".ply":
+         scene_paths = [input_path]
+     elif input_path.is_dir():
+         scene_paths = list(input_path.glob("*.ply"))
+     else:
+         LOGGER.error("Input path must be either directory or single PLY file.")
+         exit(1)
+
+     for scene_path in scene_paths:
+         LOGGER.info("Rendering %s", scene_path)
+         gaussians, metadata = load_ply(scene_path)
+         render_gaussians(
+             gaussians=gaussians,
+             metadata=metadata,
+             params=params,
+             output_path=(output_path / scene_path.stem).with_suffix(".mp4"),
+         )
+
+
+ def render_gaussians(
+     gaussians: Gaussians3D,
+     metadata: SceneMetaData,
+     output_path: Path,
+     params: camera.TrajectoryParams | None = None,
+ ) -> None:
+     """Render a single Gaussian checkpoint file."""
+     (width, height) = metadata.resolution_px
+     f_px = metadata.focal_length_px
+
+     if params is None:
+         params = camera.TrajectoryParams()
+
+     if not torch.cuda.is_available():
+         raise RuntimeError("Rendering a checkpoint requires CUDA.")
+
+     device = torch.device("cuda")
+
+     intrinsics = torch.tensor(
+         [
+             [f_px, 0, (width - 1) / 2.0, 0],
+             [0, f_px, (height - 1) / 2.0, 0],
+             [0, 0, 1, 0],
+             [0, 0, 0, 1],
+         ],
+         device=device,
+         dtype=torch.float32,
+     )
+     camera_model = camera.create_camera_model(
+         gaussians, intrinsics, resolution_px=metadata.resolution_px
+     )
+
+     trajectory = camera.create_eye_trajectory(
+         gaussians, params, resolution_px=metadata.resolution_px, f_px=f_px
+     )
+     renderer = gsplat.GSplatRenderer(color_space=metadata.color_space)
+     video_writer = io.VideoWriter(output_path)
+
+     for eye_position in trajectory:
+         camera_info = camera_model.compute(eye_position)
+         rendering_output = renderer(
+             gaussians.to(device),
+             extrinsics=camera_info.extrinsics[None].to(device),
+             intrinsics=camera_info.intrinsics[None].to(device),
+             image_width=camera_info.width,
+             image_height=camera_info.height,
+         )
+         color = (rendering_output.color[0].permute(1, 2, 0) * 255.0).to(dtype=torch.uint8)
+         depth = rendering_output.depth[0]
+         video_writer.add_frame(color, depth)
+     video_writer.close()
src/sharp/models/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ """Contains different Gaussian predictors.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from sharp.models.monodepth import (
10
+ create_monodepth_adaptor,
11
+ create_monodepth_dpt,
12
+ )
13
+
14
+ from .alignment import create_alignment
15
+ from .composer import GaussianComposer
16
+ from .gaussian_decoder import create_gaussian_decoder
17
+ from .heads import DirectPredictionHead
18
+ from .initializer import create_initializer
19
+ from .params import PredictorParams
20
+ from .predictor import RGBGaussianPredictor
21
+
22
+
23
+ def create_predictor(params: PredictorParams) -> RGBGaussianPredictor:
24
+ """Create gaussian predictor model specified by name."""
25
+ if params.gaussian_decoder.stride < params.initializer.stride:
26
+ raise ValueError(
27
+ "We donot expected gaussian_decoder has higher resolution than initializer."
28
+ )
29
+
30
+ scale_factor = params.gaussian_decoder.stride // params.initializer.stride
31
+ gaussian_composer = GaussianComposer(
32
+ delta_factor=params.delta_factor,
33
+ min_scale=params.min_scale,
34
+ max_scale=params.max_scale,
35
+ color_activation_type=params.color_activation_type,
36
+ opacity_activation_type=params.opacity_activation_type,
37
+ color_space=params.color_space,
38
+ scale_factor=scale_factor,
39
+ base_scale_on_predicted_mean=params.base_scale_on_predicted_mean,
40
+ )
41
+ if params.num_monodepth_layers > 1 and params.initializer.num_layers != 2:
42
+ raise KeyError("We only support num_layers = 2 when num_monodepth_layers > 1.")
43
+
44
+ monodepth_model = create_monodepth_dpt(params.monodepth)
45
+ monodepth_adaptor = create_monodepth_adaptor(
46
+ monodepth_model,
47
+ params.monodepth_adaptor,
48
+ params.num_monodepth_layers,
49
+ params.sorting_monodepth,
50
+ )
51
+
52
+ if params.num_monodepth_layers == 2:
53
+ monodepth_adaptor.replicate_head(params.num_monodepth_layers)
54
+
55
+ gaussian_decoder = create_gaussian_decoder(
56
+ params.gaussian_decoder,
57
+ dims_depth_features=monodepth_adaptor.get_feature_dims(),
58
+ )
59
+ initializer = create_initializer(
60
+ params.initializer,
61
+ )
62
+ prediction_head = DirectPredictionHead(
63
+ feature_dim=gaussian_decoder.dim_out, num_layers=initializer.num_layers
64
+ )
65
+ decoder_dim = monodepth_model.decoder.dims_decoder[-1]
66
+ return RGBGaussianPredictor(
67
+ init_model=initializer,
68
+ feature_model=gaussian_decoder,
69
+ prediction_head=prediction_head,
70
+ monodepth_model=monodepth_adaptor,
71
+ gaussian_composer=gaussian_composer,
72
+ scale_map_estimator=create_alignment(params.depth_alignment, depth_decoder_dim=decoder_dim),
73
+ )
74
+
75
+
76
+ __all__ = [
77
+ "PredictorParams",
78
+ "create_predictor",
79
+ ]
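
For orientation, a minimal sketch of driving `create_predictor` (assuming the `PredictorParams` defaults defined in params.py are self-consistent):

```python
# Sketch only: build a predictor from default parameters and count its weights.
from sharp.models import PredictorParams, create_predictor

params = PredictorParams()  # assumed to provide consistent defaults
predictor = create_predictor(params)
print(f"{sum(p.numel() for p in predictor.parameters()):,} parameters")
```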
src/sharp/models/alignment.py ADDED
@@ -0,0 +1,126 @@
1
+ """Contains modules for different types of alignment.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+
15
+ from sharp.models.decoders import UNetDecoder
16
+ from sharp.models.encoders import UNetEncoder
17
+ from sharp.utils import math as math_utils
18
+
19
+ from .params import AlignmentParams
20
+
21
+
22
+ def create_alignment(
23
+ params: AlignmentParams, depth_decoder_dim: int | None = None
24
+ ) -> nn.Module | None:
25
+ """Create depth alignment."""
26
+ if depth_decoder_dim is None:
27
+ raise ValueError("Requires depth_decoder_dim for LearnedAlignment.")
28
+ alignment = LearnedAlignment(
29
+ depth_decoder_features=params.depth_decoder_features,
30
+ depth_decoder_dim=depth_decoder_dim,
31
+ steps=params.steps,
32
+ stride=params.stride,
33
+ base_width=params.base_width,
34
+ activation_type=params.activation_type,
35
+ )
36
+
37
+ if params.frozen:
38
+ alignment.requires_grad_(False)
39
+
40
+ return alignment
41
+
42
+
43
+ class LearnedAlignment(nn.Module):
44
+ """Aligns tensors using a UNet."""
45
+
46
+ def __init__(
47
+ self,
48
+ steps: int = 4,
49
+ stride: int = 8,
50
+ base_width: int = 16,
51
+ depth_decoder_features: bool = False,
52
+ depth_decoder_dim: int = 256,
53
+ activation_type: math_utils.ActivationType = "exp",
54
+ ) -> None:
55
+ """Initialize LearnedAlignment.
56
+
57
+ Args:
58
+ steps: Number of steps in the UNet.
59
+ stride: Effective downsampling of the alignment module.
60
+ base_width: Base width of the UNet.
61
+ depth_decoder_features: Whether to use depth decoder features.
62
+ depth_decoder_dim: Dimension of the depth decoder features.
63
+ activation_type: Activation type for the alignment output.
64
+ """
65
+ super().__init__()
66
+ self.activation = math_utils.create_activation_pair(activation_type)
67
+ bias_value = self.activation.inverse(torch.tensor(1.0))
68
+
69
+ self.depth_decoder_features = depth_decoder_features
70
+ if depth_decoder_features:
71
+ dim_in = 2 + depth_decoder_dim
72
+ else:
73
+ dim_in = 2
74
+
75
+ def is_power_of_two(n: int) -> bool:
76
+ """Check if a number is a power of two."""
77
+ if n <= 0:
78
+ return False
79
+ return (n & (n - 1)) == 0
80
+
81
+ if not is_power_of_two(stride):
82
+ raise ValueError(f"Stride {stride} is not a power of two.")
83
+
84
+ steps_decoder = steps - int(math.log2(stride))
85
+ if steps_decoder < 1:
86
+ raise ValueError(f"{steps_decoder} must be greater or equal to 1.")
87
+ widths = [min(base_width << i, 1024) for i in range(steps + 1)]
88
+ self.encoder = UNetEncoder(dim_in=dim_in, width=widths, steps=steps, norm_num_groups=4)
89
+ self.decoder = UNetDecoder(
90
+ dim_out=widths[0], width=widths, steps=steps_decoder, norm_num_groups=4
91
+ )
92
+ self.conv_out = nn.Conv2d(widths[0], 1, 1, bias=True)
93
+ nn.init.zeros_(self.conv_out.weight)
94
+ nn.init.constant_(self.conv_out.bias, bias_value)
95
+
96
+ def forward(
97
+ self,
98
+ tensor_src: torch.Tensor,
99
+ tensor_tgt: torch.Tensor,
100
+ depth_decoder_features: torch.Tensor | None = None,
101
+ ) -> torch.Tensor:
102
+ """Compute alignment map."""
103
+ # Since the tensors are usually given by depth which is >= 1.0, we invert
104
+ # the tensors to have them in a reasonable range.
105
+ tensor_src = 1.0 / tensor_src.clamp(min=1e-4)
106
+ tensor_tgt = 1.0 / tensor_tgt.clamp(min=1e-4)
107
+ tensor_input = torch.cat([tensor_src, tensor_tgt], dim=1)
108
+ if self.depth_decoder_features:
109
+ height, width = tensor_src.shape[-2:]
110
+ upsampled_encodings = F.interpolate(
111
+ depth_decoder_features,
112
+ size=(height, width),
113
+ mode="bilinear",
114
+ )
115
+ tensor_input = torch.cat([tensor_input, upsampled_encodings], dim=1)
116
+ features = self.encoder(tensor_input)
117
+ output = self.conv_out(self.decoder(features))
118
+ alignment_map = self.activation.forward(output)
119
+ if alignment_map.shape[-2:] != tensor_src.shape[-2:]:
120
+ alignment_map = F.interpolate(
121
+ alignment_map,
122
+ size=tensor_src.shape[-2:],
123
+ mode="bilinear",
124
+ align_corners=False,
125
+ )
126
+ return alignment_map
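
A shape-level sketch of `LearnedAlignment` (dimensions are illustrative; the random inputs stand in for source and target depth maps):

```python
# Sketch only: align two dummy depth maps; the output is a per-pixel scale map.
import torch

alignment = LearnedAlignment(steps=4, stride=8, base_width=16)
depth_src = torch.rand(1, 1, 256, 256) * 10 + 1.0  # depths >= 1.0, as assumed above
depth_tgt = torch.rand(1, 1, 256, 256) * 10 + 1.0
scale_map = alignment(depth_src, depth_tgt)
print(scale_map.shape)  # torch.Size([1, 1, 256, 256])
```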
src/sharp/models/blocks.py ADDED
@@ -0,0 +1,210 @@
1
+ """Contains reusable network components.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Literal
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+ NormLayerName = Literal["noop", "batch_norm", "group_norm", "instance_norm"]
15
+ UpsamplingMode = Literal["transposed_conv", "nearest", "bilinear"]
16
+
17
+
18
+ def norm_layer_2d(num_features: int, norm_type: NormLayerName, num_groups: int = 8) -> nn.Module:
19
+ """Create normalization layer."""
20
+ if norm_type == "noop":
21
+ return nn.Identity()
22
+ elif norm_type == "batch_norm":
23
+ return nn.BatchNorm2d(num_features=num_features)
24
+ elif norm_type == "group_norm":
25
+ return nn.GroupNorm(num_channels=num_features, num_groups=num_groups)
26
+ elif norm_type == "instance_norm":
27
+ return nn.InstanceNorm2d(num_features=num_features)
28
+ else:
29
+ raise ValueError(f"Invalid normalization layer type: {norm_type}")
30
+
31
+
32
+ def upsampling_layer(upsampling_mode: UpsamplingMode, scale_factor: int, dim_in: int) -> nn.Module:
33
+ """Create upsampling layer."""
34
+ if upsampling_mode == "transposed_conv":
35
+ return nn.ConvTranspose2d(
36
+ in_channels=dim_in,
37
+ out_channels=dim_in,
38
+ kernel_size=scale_factor,
39
+ stride=scale_factor,
40
+ padding=0,
41
+ bias=False,
42
+ )
43
+ elif upsampling_mode in ("nearest", "bilinear"):
44
+ return nn.Upsample(scale_factor=scale_factor, mode=upsampling_mode)
45
+ else:
46
+ raise ValueError(f"Invalid upsampling mode {upsampling_mode}.")
47
+
48
+
49
+ class ResidualBlock(nn.Module):
50
+ """Generic implementation of residual blocks.
51
+
52
+ This implements a generic residual block from
53
+
54
+ He et al. - Identity Mappings in Deep Residual Networks (2016),
55
+ https://arxiv.org/abs/1603.05027
56
+
57
+ which can be further customized via factory functions.
58
+ """
59
+
60
+ def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None:
61
+ """Initialize ResidualBlock."""
62
+ super().__init__()
63
+ self.residual = residual
64
+ self.shortcut = shortcut
65
+
66
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
67
+ """Apply residual block."""
68
+ delta_x = self.residual(x)
69
+
70
+ if self.shortcut is not None:
71
+ x = self.shortcut(x)
72
+
73
+ return x + delta_x
74
+
75
+
76
+ def residual_block_2d(
77
+ dim_in: int,
78
+ dim_out: int,
79
+ dim_hidden: int | None = None,
80
+ actvn: nn.Module | None = None,
81
+ norm_type: NormLayerName = "noop",
82
+ norm_num_groups: int = 8,
83
+ dilation: int = 1,
84
+ kernel_size: int = 3,
85
+ ):
86
+ """Create a simple 2D residual block."""
87
+ if actvn is None:
88
+ actvn = nn.ReLU()
89
+
90
+ if dim_hidden is None:
91
+ dim_hidden = dim_out // 2
92
+
93
+ # Padding to maintain output size
94
+ # See https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
95
+ padding = (dilation * (kernel_size - 1)) // 2
96
+
97
+ def _create_block(dim_in: int, dim_out: int) -> list[nn.Module]:
98
+ layers = [
99
+ norm_layer_2d(dim_in, norm_type, num_groups=norm_num_groups),
100
+ actvn,
101
+ ]
102
+
103
+ layers.append(
104
+ nn.Conv2d(
105
+ dim_in,
106
+ dim_out,
107
+ kernel_size=kernel_size,
108
+ stride=1,
109
+ dilation=dilation,
110
+ padding=padding,
111
+ )
112
+ )
113
+ return layers
114
+
115
+ residual = nn.Sequential(
116
+ *_create_block(dim_in, dim_hidden),
117
+ *_create_block(dim_hidden, dim_out),
118
+ )
119
+ shortcut = None
120
+
121
+ if dim_in != dim_out:
122
+ shortcut = nn.Conv2d(dim_in, dim_out, 1)
123
+
124
+ return ResidualBlock(residual, shortcut)
125
+
126
+
127
+ class FeatureFusionBlock2d(nn.Module):
128
+ """Feature fusion for DPT."""
129
+
130
+ # We use the name "deconv" for backward compatibility. However, "deconv" can also
131
+ # refer to some other upsampling layer or a no-op.
132
+ deconv: nn.Module
133
+
134
+ def __init__(
135
+ self,
136
+ dim_in: int,
137
+ dim_out: int | None = None,
138
+ upsampling_mode: UpsamplingMode | None = None,
139
+ batch_norm: bool = False,
140
+ ):
141
+ """Initialize feature fusion block.
142
+
143
+ Args:
144
+ dim_in: Dimensions of input.
145
+ dim_out: Dimensions of output.
146
+ batch_norm: Whether to use batch normalization in resnet blocks.
147
+ upsampling_mode: What mode to use for upsampling. None if no upsampling
148
+ is required.
149
+ """
150
+ super().__init__()
151
+ if dim_out is None:
152
+ dim_out = dim_in
153
+ self.resnet1 = self._residual_block(dim_in, batch_norm)
154
+ self.resnet2 = self._residual_block(dim_in, batch_norm)
155
+
156
+ if upsampling_mode is not None:
157
+ self.deconv = upsampling_layer(upsampling_mode, scale_factor=2, dim_in=dim_in)
158
+ else:
159
+ self.deconv = nn.Sequential()
160
+
161
+ self.out_conv = nn.Conv2d(
162
+ dim_in,
163
+ dim_out,
164
+ kernel_size=1,
165
+ stride=1,
166
+ padding=0,
167
+ bias=True,
168
+ )
169
+
170
+ self.skip_add = nn.quantized.FloatFunctional()
171
+
172
+ def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor:
173
+ """Process and fuse input features."""
174
+ x = x0
175
+
176
+ if x1 is not None:
177
+ res = self.resnet1(x1)
178
+ x = self.skip_add.add(x, res)
179
+
180
+ x = self.resnet2(x)
181
+ x = self.deconv(x)
182
+ x = self.out_conv(x)
183
+
184
+ return x
185
+
186
+ @staticmethod
187
+ def _residual_block(num_features: int, batch_norm: bool):
188
+ """Create a residual block."""
189
+
190
+ def _create_block(dim: int, batch_norm: bool) -> list[nn.Module]:
191
+ layers = [
192
+ nn.ReLU(False),
193
+ nn.Conv2d(
194
+ dim,
195
+ dim,
196
+ kernel_size=3,
197
+ stride=1,
198
+ padding=1,
199
+ bias=not batch_norm,
200
+ ),
201
+ ]
202
+ if batch_norm:
203
+ layers.append(nn.BatchNorm2d(dim))
204
+ return layers
205
+
206
+ residual = nn.Sequential(
207
+ *_create_block(dim=num_features, batch_norm=batch_norm),
208
+ *_create_block(dim=num_features, batch_norm=batch_norm),
209
+ )
210
+ return ResidualBlock(residual)
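
A quick sketch of the two blocks above (dimensions are illustrative):

```python
# Sketch only: a width-changing residual block feeding a DPT fusion step.
import torch

block = residual_block_2d(dim_in=32, dim_out=64, norm_type="group_norm")
x = block(torch.randn(1, 32, 48, 48))  # -> [1, 64, 48, 48]

fusion = FeatureFusionBlock2d(dim_in=64, upsampling_mode="bilinear")
skip = torch.randn(1, 64, 48, 48)
print(fusion(x, skip).shape)  # torch.Size([1, 64, 96, 96])
```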
src/sharp/models/composer.py ADDED
@@ -0,0 +1,251 @@
1
+ """Defines module to compose final Gaussians from base values and delta values.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import torch
10
+ from torch import nn
11
+ from torch.nn import functional as F
12
+
13
+ from sharp.models.initializer import GaussianBaseValues
14
+ from sharp.utils import math as math_utils
15
+ from sharp.utils.color_space import ColorSpace, sRGB2linearRGB
16
+ from sharp.utils.gaussians import Gaussians3D
17
+
18
+ from .params import DeltaFactor
19
+
20
+
21
+ def _get_scale_activation_constant(max_scale: float, min_scale: float) -> tuple[float, float]:
22
+ """Return constants for scale activation function."""
23
+ # To ensure for delta = 0, the value of scale_factor is 1 and the gradient is 1.
24
+ constant_a = (max_scale - min_scale) / (1 - min_scale) / (max_scale - 1)
25
+ constant_b = math_utils.inverse_sigmoid(
26
+ torch.tensor((1.0 - min_scale) / (max_scale - min_scale))
27
+ ).item()
28
+ return constant_a, constant_b
29
+
30
+
31
+ class GaussianComposer(nn.Module):
32
+ """Converts base values and deltas into Gaussians."""
33
+
34
+ color_activation_type: math_utils.ActivationType
35
+ opacity_activation_type: math_utils.ActivationType
36
+
37
+ def __init__(
38
+ self,
39
+ delta_factor: DeltaFactor,
40
+ min_scale: float,
41
+ max_scale: float,
42
+ color_activation_type: math_utils.ActivationType,
43
+ opacity_activation_type: math_utils.ActivationType,
44
+ color_space: ColorSpace,
45
+ base_scale_on_predicted_mean: bool,
46
+ scale_factor: int = 1,
47
+ ) -> None:
48
+ """Initialize GaussianComposer.
49
+
50
+ Args:
51
+ delta_factor: Multiply delta offsets by this factor.
52
+ min_scale: The minimal scale factor for gaussian scale activation.
53
+ max_scale: The maximal scale factor for gaussian scale activation.
54
+ color_activation_type: Which activation function to use for colors.
55
+ opacity_activation_type: Which activation function to use for opacities.
56
+ color_space: Which color space is used in training.
57
+ scale_factor: The scale factor to upsample the delta_values before composition.
58
+ base_scale_on_predicted_mean: Whether to account z offsets for estimating base scale.
59
+ """
60
+ super().__init__()
61
+ self.delta_factor = delta_factor
62
+ self.max_scale = max_scale
63
+ self.min_scale = min_scale
64
+ self.color_activation_type = color_activation_type
65
+ self.opacity_activation_type = opacity_activation_type
66
+ self.color_space = color_space
67
+ self.scale_factor = scale_factor
68
+ self.base_scale_on_predicted_mean = base_scale_on_predicted_mean
69
+
70
+ def upsample_delta_value(self, delta: torch.Tensor, scale_factor: int = 1):
71
+ """Upsample the delta value.
72
+
73
+ Args:
74
+ delta: The delta values predicted by gaussian predictor.
75
+ scale_factor: The scale factor to upsample the delta_values.
76
+ """
77
+ (
78
+ batch_size,
79
+ num_channels,
80
+ num_layers,
81
+ image_height,
82
+ image_width,
83
+ ) = delta.shape
84
+ new_height = image_height * scale_factor
85
+ new_width = image_width * scale_factor
86
+ upsampled_delta = F.interpolate(
87
+ delta.view(batch_size, num_channels * num_layers, image_height, image_width),
88
+ scale_factor=scale_factor,
89
+ ).view(batch_size, num_channels, num_layers, new_height, new_width)
90
+ return upsampled_delta
91
+
92
+ def forward(
93
+ self,
94
+ delta: torch.Tensor,
95
+ base_values: GaussianBaseValues,
96
+ global_scale: torch.Tensor | None = None,
97
+ flatten_output: bool = True,
98
+ ) -> Gaussians3D:
99
+ """Combine predicted delta values with base gaussian values and apply activation function.
100
+
101
+ Args:
102
+ delta: The delta values predicted by gaussian predictor.
103
+ base_values: The gaussian base values.
104
+ global_scale: Global scale of Gaussians.
105
+ flatten_output: Flatten the gaussian parameters.
106
+
107
+ Returns:
108
+ The computed 3D Gaussians.
109
+ """
110
+ # Upsample the delta if delta and base_values have different strides.
111
+ scale_factor = self.scale_factor
112
+ # For triplane head, the delta has already been upsampled.
113
+ actual_scale_factor = base_values.mean_x_ndc.shape[-1] // delta.shape[-1]
114
+ if scale_factor != 1 and actual_scale_factor != 1:
115
+ delta = self.upsample_delta_value(delta, scale_factor)
116
+
117
+ mean_vectors = self._forward_mean(base_values, delta)
118
+
119
+ # Account for the change in base scale due to z offsets.
120
+ base_scales = (
121
+ (base_values.scales * base_values.mean_inverse_z_ndc * mean_vectors[:, 2:3, ...])
122
+ if self.base_scale_on_predicted_mean
123
+ else base_values.scales
124
+ )
125
+ singular_values = self._scale_activation(
126
+ base_scales,
127
+ delta[:, 3:6],
128
+ self.min_scale,
129
+ self.max_scale,
130
+ )
131
+ quaternions = self._quaternion_activation(base_values.quaternions, delta[:, 6:10])
132
+ colors = self._color_activation(base_values.colors, delta[:, 10:13])
133
+ opacities = self._opacity_activation(base_values.opacities, delta[:, 13])
134
+
135
+ if flatten_output:
136
+ # [B, C, N, H, W] -> [B, N, H, W, C].
137
+ # NOTE: opacities is [B, N, H, W] so it doesn't need to permute.
138
+ mean_vectors = mean_vectors.permute(0, 2, 3, 4, 1).flatten(1, 3)
139
+ singular_values = singular_values.permute(0, 2, 3, 4, 1).flatten(1, 3)
140
+ quaternions = quaternions.permute(0, 2, 3, 4, 1).flatten(1, 3)
141
+ colors = colors.permute(0, 2, 3, 4, 1).flatten(1, 3)
142
+ opacities = opacities.flatten(1, 3)
143
+
144
+ # Apply global scaling to convert Gaussians to metric space.
145
+ if global_scale is not None:
146
+ mean_vectors = global_scale[:, None, None] * mean_vectors
147
+ singular_values = global_scale[:, None, None] * singular_values
148
+
149
+ return Gaussians3D(
150
+ mean_vectors=mean_vectors,
151
+ singular_values=singular_values,
152
+ quaternions=quaternions,
153
+ colors=colors,
154
+ opacities=opacities,
155
+ )
156
+
157
+ def _forward_mean(self, base_values: GaussianBaseValues, delta: torch.Tensor) -> torch.Tensor:
158
+ # Concatenate base vectors and apply mean activation.
159
+ delta_factor = torch.tensor(
160
+ [self.delta_factor.xy, self.delta_factor.xy, self.delta_factor.z],
161
+ device=delta.device,
162
+ )[None, :, None, None, None]
163
+
164
+ dtype = base_values.mean_x_ndc.dtype
165
+ device = base_values.mean_x_ndc.device
166
+ target_shape = (1, 3, 1, 1, 1)
167
+ mean_x_mask = torch.tensor([1.0, 0.0, 0.0], dtype=dtype, device=device).reshape(
168
+ target_shape
169
+ )
170
+ mean_y_mask = torch.tensor([0.0, 1.0, 0.0], dtype=dtype, device=device).reshape(
171
+ target_shape
172
+ )
173
+ mean_z_mask = torch.tensor([0.0, 0.0, 1.0], dtype=dtype, device=device).reshape(
174
+ target_shape
175
+ )
176
+
177
+ mean_vectors_ndc = (
178
+ base_values.mean_x_ndc.repeat(target_shape) * mean_x_mask
179
+ + base_values.mean_y_ndc.repeat(target_shape) * mean_y_mask
180
+ + base_values.mean_inverse_z_ndc.repeat(target_shape) * mean_z_mask
181
+ )
182
+
183
+ mean_vectors = self._mean_activation(mean_vectors_ndc, delta_factor * delta[:, :3])
184
+ return mean_vectors
185
+
186
+ def _mean_activation(self, base: torch.Tensor, learned_delta: torch.Tensor) -> torch.Tensor:
187
+ """Mean activation function.
188
+
189
+ Args:
190
+ base: Tensor of shape [B, 3, H, W], where first two feature dimensions
191
+ (x,y) are in normalized device coordinates (NDC) where (-1, -1) is
192
+ the top, while the third dimension is inverse depth.
193
+ learned_delta: Tensor of shape [B, 3, H, W] with predicted delta values.
194
+
195
+ Returns:
196
+ The final mean vector after combining base and delta, with nonlinearities applied.
197
+ """
198
+ xx = base[:, 0:1] + learned_delta[:, 0:1]
199
+ yy = base[:, 1:2] + learned_delta[:, 1:2]
200
+
201
+ a = base[:, 2:3]
202
+ b = learned_delta[:, 2:3]
203
+
204
+ # Combine base and delta in (pre-softplus) inverse-depth space, then map back to depth.
205
+ inverse_zz = F.softplus(math_utils.inverse_softplus(a) + b)
206
+ zz = 1.0 / (inverse_zz + 1e-3)
207
+
208
+ mean_vectors = torch.cat([zz * xx, zz * yy, zz], dim=1)
209
+ return mean_vectors
210
+
211
+ def _scale_activation(
212
+ self,
213
+ base: torch.Tensor,
214
+ learned_delta: torch.Tensor,
215
+ min_scale: float,
216
+ max_scale: float,
217
+ ) -> torch.Tensor:
218
+ constant_a, constant_b = _get_scale_activation_constant(max_scale, min_scale)
219
+ scale_factor = (max_scale - min_scale) * torch.sigmoid(
220
+ constant_a * self.delta_factor.scale * learned_delta + constant_b
221
+ ) + min_scale
222
+ return base * scale_factor
223
+
224
+ def _quaternion_activation(
225
+ self, base: torch.Tensor, learned_delta: torch.Tensor
226
+ ) -> torch.Tensor:
227
+ # No need to normalize the quaternions, since this is also done in rendering.
228
+ return base + self.delta_factor.quaternion * learned_delta
229
+
230
+ def _color_activation(self, base: torch.Tensor, learned_delta: torch.Tensor) -> torch.Tensor:
231
+ # For certain activation functions we need to clamp the base value to
232
+ # a supported range.
233
+ if self.color_activation_type == "sigmoid":
234
+ base = torch.clamp(base, min=0.01, max=0.99)
235
+ elif self.color_activation_type in ("exp", "softplus"):
236
+ base = torch.clamp(base, min=0.01)
237
+
238
+ activation = math_utils.create_activation_pair(self.color_activation_type)
239
+ colors: torch.Tensor = activation.forward(
240
+ activation.inverse(base) + self.delta_factor.color * learned_delta
241
+ )
242
+ # Convert gaussian color to linear if linearRGB colorspace is specified.
243
+ if self.color_space == "linearRGB":
244
+ colors = sRGB2linearRGB(colors)
245
+ return colors
246
+
247
+ def _opacity_activation(self, base: torch.Tensor, learned_delta: torch.Tensor) -> torch.Tensor:
248
+ activation = math_utils.create_activation_pair(self.opacity_activation_type)
249
+ return activation.forward(
250
+ activation.inverse(base) + self.delta_factor.opacity * learned_delta
251
+ )
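
The scale activation is constructed so that a zero delta leaves the base scale untouched; a numeric sanity sketch of the constants:

```python
# Sketch only: at delta = 0 the multiplier in _scale_activation is exactly 1,
# because sigmoid(constant_b) == (1 - min_scale) / (max_scale - min_scale).
import torch

from sharp.models.composer import _get_scale_activation_constant

min_scale, max_scale = 0.5, 2.0
constant_a, constant_b = _get_scale_activation_constant(max_scale, min_scale)
factor = (max_scale - min_scale) * torch.sigmoid(torch.tensor(constant_b)) + min_scale
print(factor.item())  # 1.0
```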
src/sharp/models/decoders/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """Contains different decoders for Gaussian predictor.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .base_decoder import BaseDecoder
10
+ from .monodepth_decoder import (
11
+ create_monodepth_decoder,
12
+ )
13
+ from .multires_conv_decoder import MultiresConvDecoder, UpsamplingMode
14
+ from .unet_decoder import UNetDecoder
15
+
16
+ __all__ = [
17
+ "BaseDecoder",
18
+ "UNetDecoder",
19
+ "MultiresConvDecoder",
20
+ "UpsamplingMode",
21
+ "create_monodepth_decoder",
22
+ ]
src/sharp/models/decoders/base_decoder.py ADDED
@@ -0,0 +1,21 @@
1
+ """Contains the base class for decoders.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ import abc
8
+ from typing import List
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+
14
+ class BaseDecoder(nn.Module, abc.ABC):
15
+ """Base decoder class."""
16
+
17
+ dim_out: int
18
+
19
+ @abc.abstractmethod
20
+ def forward(self, encodings: List[torch.Tensor]) -> torch.Tensor:
21
+ """Decode (multi-resolution) encodings."""
src/sharp/models/decoders/monodepth_decoder.py ADDED
@@ -0,0 +1,37 @@
1
+ """Contains factory function for loading/creating monodepth decoder.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+
8
+ from __future__ import annotations
9
+
10
+ from sharp.models.presets import (
11
+ MONODEPTH_ENCODER_DIMS_MAP,
12
+ ViTPreset,
13
+ )
14
+
15
+ from .multires_conv_decoder import MultiresConvDecoder
16
+
17
+
18
+ def create_monodepth_decoder(
19
+ patch_encoder_preset: ViTPreset,
20
+ dims_decoder: list[int] | int | None = None,
21
+ ) -> MultiresConvDecoder:
22
+ """Create DepthDensePredictionTransformer model.
23
+
24
+ Args:
25
+ patch_encoder_preset: The preset patch encoder architecture in SPN.
26
+ dims_decoder: The decoder feature dimensions; defaults to the first encoder dimension.
27
+ """
28
+ dims_encoder = MONODEPTH_ENCODER_DIMS_MAP[patch_encoder_preset]
29
+ if dims_decoder is None:
30
+ dims_decoder = dims_encoder[0]
31
+ if isinstance(dims_decoder, int):
32
+ dims_decoder = [dims_decoder]
33
+ decoder = MultiresConvDecoder(
34
+ dims_encoder=[dims_decoder[0]] + list(dims_encoder), dims_decoder=dims_decoder
35
+ )
36
+
37
+ return decoder
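
Note that the decoder fuses one extra highest-resolution level, so `dims_decoder`, when given as a list, needs one entry per encoder level plus one. A sketch (assuming `MONODEPTH_ENCODER_DIMS_MAP` is a dict keyed by preset):

```python
# Sketch only: size dims_decoder to cover the extra top level plus each encoder level.
from sharp.models.presets import MONODEPTH_ENCODER_DIMS_MAP

preset = next(iter(MONODEPTH_ENCODER_DIMS_MAP))  # any available preset
dims_encoder = MONODEPTH_ENCODER_DIMS_MAP[preset]
decoder = create_monodepth_decoder(preset, dims_decoder=[256] * (len(dims_encoder) + 1))
print(decoder.dim_out)  # 256
```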
src/sharp/models/decoders/multires_conv_decoder.py ADDED
@@ -0,0 +1,116 @@
1
+ """Contains multi-res convolutional decoder.
2
+
3
+ Implements the decoder for Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
4
+
5
+ For licensing see accompanying LICENSE file.
6
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Iterable
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+
16
+ from sharp.models.blocks import FeatureFusionBlock2d, UpsamplingMode
17
+ from sharp.utils.training import checkpoint_wrapper
18
+
19
+ from .base_decoder import BaseDecoder
20
+
21
+
22
+ class MultiresConvDecoder(BaseDecoder):
23
+ """Decoder for multi-resolution encodings."""
24
+
25
+ def __init__(
26
+ self,
27
+ dims_encoder: Iterable[int],
28
+ dims_decoder: Iterable[int] | int,
29
+ grad_checkpointing: bool = False,
30
+ upsampling_mode: UpsamplingMode = "transposed_conv",
31
+ ):
32
+ """Initialize multiresolution convolutional decoder.
33
+
34
+ Args:
35
+ dims_encoder: Expected dims at each level from the encoder.
36
+ dims_decoder: Dim of decoder features.
37
+ grad_checkpointing: Whether to checkpoint gradient during training.
38
+ upsampling_mode: What method to use for upsampling.
39
+ """
40
+ super().__init__()
41
+ self.dims_encoder = list(dims_encoder)
42
+
43
+ if isinstance(dims_decoder, int):
44
+ self.dims_decoder = [dims_decoder] * len(self.dims_encoder)
45
+ else:
46
+ self.dims_decoder = list(dims_decoder)
47
+
48
+ if len(self.dims_decoder) != len(self.dims_encoder):
49
+ raise ValueError("Received dims_encoder and dims_decoder of different sizes.")
50
+
51
+ self.dim_out = self.dims_decoder[0]
52
+
53
+ num_encoders = len(self.dims_encoder)
54
+
55
+ # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution
56
+ # when the dimensions mismatch. Otherwise we do not do anything, which is
57
+ # the default behavior of monodepth.
58
+ conv0 = (
59
+ nn.Conv2d(self.dims_encoder[0], self.dims_decoder[0], kernel_size=1, bias=False)
60
+ if self.dims_encoder[0] != self.dims_decoder[0]
61
+ else nn.Identity()
62
+ )
63
+
64
+ convs = [conv0]
65
+ for i in range(1, num_encoders):
66
+ convs.append(
67
+ nn.Conv2d(
68
+ self.dims_encoder[i],
69
+ self.dims_decoder[i],
70
+ kernel_size=3,
71
+ stride=1,
72
+ padding=1,
73
+ bias=False,
74
+ )
75
+ )
76
+ self.convs = nn.ModuleList(convs)
77
+
78
+ fusions = []
79
+ for i in range(num_encoders):
80
+ fusions.append(
81
+ FeatureFusionBlock2d(
82
+ dim_in=self.dims_decoder[i],
83
+ dim_out=self.dims_decoder[i - 1] if i != 0 else self.dim_out,
84
+ upsampling_mode=upsampling_mode if i != 0 else None,
85
+ batch_norm=False,
86
+ )
87
+ )
88
+ self.fusions = nn.ModuleList(fusions)
89
+
90
+ self.grad_checkpointing = grad_checkpointing
91
+
92
+ @torch.jit.ignore
93
+ def set_grad_checkpointing(self, is_enabled=True):
94
+ """Enable grad checkpointing."""
95
+ self.grad_checkpointing = is_enabled
96
+
97
+ def forward(self, encodings: list[torch.Tensor]) -> torch.Tensor:
98
+ """Decode the multi-resolution encodings."""
99
+ num_levels = len(encodings)
100
+ num_encoders = len(self.dims_encoder)
101
+
102
+ if num_levels != num_encoders:
103
+ raise ValueError(
104
+ f"Encoder output levels={num_levels} at runtime "
105
+ f"mismatch with expected levels={num_encoders}."
106
+ )
107
+
108
+ # Project features of different encoder dims to the same decoder dim.
109
+ # Fuse features from the lowest resolution (num_levels-1)
110
+ # to the highest (0).
111
+ features = self.convs[-1](encodings[-1])
112
+ features = checkpoint_wrapper(self, self.fusions[-1], features)
113
+ for i in range(num_levels - 2, -1, -1):
114
+ features_i = self.convs[i](encodings[i])
115
+ features = checkpoint_wrapper(self, self.fusions[i], features, features_i)
116
+ return features
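
A minimal shape check (a sketch; dimensions chosen arbitrarily):

```python
# Sketch only: three encoder levels fused from coarse to fine.
import torch

decoder = MultiresConvDecoder(dims_encoder=[32, 64, 128], dims_decoder=32)
encodings = [
    torch.randn(1, 32, 64, 64),   # level 0, highest resolution
    torch.randn(1, 64, 32, 32),   # level 1
    torch.randn(1, 128, 16, 16),  # level 2, lowest resolution
]
print(decoder(encodings).shape)  # torch.Size([1, 32, 64, 64])
```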
src/sharp/models/decoders/unet_decoder.py ADDED
@@ -0,0 +1,113 @@
1
+ """Contains the UNet decoder.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import List
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+ from sharp.models.blocks import (
15
+ NormLayerName,
16
+ norm_layer_2d,
17
+ residual_block_2d,
18
+ )
19
+
20
+ from .base_decoder import BaseDecoder
21
+
22
+
23
+ class UNetDecoder(BaseDecoder):
24
+ """Decoder of UNet model."""
25
+
26
+ def __init__(
27
+ self,
28
+ dim_out: int,
29
+ width: List[int] | int,
30
+ steps: int = 5,
31
+ norm_type: NormLayerName = "group_norm",
32
+ norm_num_groups=8,
33
+ blocks_per_layer=2,
34
+ ) -> None:
35
+ """Initialize UNet Decoder.
36
+
37
+ Args:
38
+ dim_out: The number of output channels.
39
+ width: Width of last input feature map from encoder
40
+ or the width list of all input feature maps from encoder.
41
+ steps: The number of upsampling steps.
42
+ norm_type: Which kind of normalization layer to use.
43
+ norm_num_groups: How many groups to use for group norm (if relevant).
44
+ blocks_per_layer: How many blocks per layer to use.
45
+ """
46
+ super().__init__()
47
+
48
+ if blocks_per_layer < 1:
49
+ raise ValueError("blocks_per_layer must be greater or equal to one.")
50
+
51
+ self.dim_out = dim_out
52
+
53
+ self.convs_up = nn.ModuleList()
54
+
55
+ self.output_dims: list[int]
56
+ # If only one number is specified, we assume each decoder layer halves the channel dimension.
57
+ if isinstance(width, int):
58
+ self.input_dims = [width >> i for i in range(0, steps + 1)]
59
+ else:
60
+ self.input_dims = width[::-1][: steps + 1]
61
+
62
+ for i_step in range(steps):
63
+ input_width = self.input_dims[i_step]
64
+ current_width = self.input_dims[i_step + 1]
65
+ convs_up_i = nn.Sequential(
66
+ nn.Upsample(scale_factor=2),
67
+ residual_block_2d(
68
+ input_width * (1 if i_step == 0 else 2),
69
+ current_width,
70
+ norm_type=norm_type,
71
+ norm_num_groups=norm_num_groups,
72
+ ),
73
+ *[
74
+ residual_block_2d(
75
+ current_width,
76
+ current_width,
77
+ norm_type=norm_type,
78
+ norm_num_groups=norm_num_groups,
79
+ )
80
+ for _ in range(blocks_per_layer - 1)
81
+ ],
82
+ )
83
+ self.convs_up.append(convs_up_i)
86
+
87
+ last_width = self.input_dims[-1]
88
+ self.conv_out = nn.Sequential(
89
+ norm_layer_2d(last_width * 2, norm_type, num_groups=norm_num_groups),
90
+ nn.ReLU(),
91
+ nn.Conv2d(last_width * 2, dim_out, 1),
92
+ norm_layer_2d(dim_out, norm_type, num_groups=norm_num_groups),
93
+ nn.ReLU(),
94
+ )
95
+
96
+ def forward(self, features: list[torch.Tensor]) -> torch.Tensor:
97
+ """Apply UNet to image.
98
+
99
+ Args:
100
+ features: The input multi-level feature map from encoder.
101
+
102
+ Returns:
103
+ The output feature map.
104
+ """
105
+ i_feature_layer = len(features) - 1
106
+ out = self.convs_up[0](features[i_feature_layer])
107
+ i_feature_layer -= 1
108
+ for conv_up in self.convs_up[1:]: # type: ignore
109
+ out = conv_up(torch.cat([out, features[i_feature_layer]], dim=1))
110
+ i_feature_layer -= 1
111
+ out = self.conv_out(torch.cat([out, features[i_feature_layer]], dim=1))
112
+
113
+ return out
src/sharp/models/encoders/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """Contains different encoders for Gaussian predictor.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from sharp.models.encoders.base_encoder import BaseEncoder
8
+
9
+ from .monodepth_encoder import (
10
+ MonodepthFeatureEncoder,
11
+ create_monodepth_encoder,
12
+ )
13
+ from .spn_encoder import SlidingPyramidNetwork
14
+ from .unet_encoder import UNetEncoder
15
+ from .vit_encoder import create_vit
16
+
17
+ __all__ = [
18
+ "create_vit",
19
+ "BaseEncoder",
20
+ "UNetEncoder",
21
+ "SlidingPyramidNetwork",
22
+ "MonodepthFeatureEncoder",
23
+ "create_monodepth_encoder",
24
+ ]
src/sharp/models/encoders/base_encoder.py ADDED
@@ -0,0 +1,25 @@
1
+ """Contains the base class for encoders.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ import abc
8
+
9
+ import torch
10
+ from torch import nn
11
+
12
+
13
+ class BaseEncoder(nn.Module, abc.ABC):
14
+ """Base encoder class."""
15
+
16
+ dim_in: int
17
+ output_dims: list[int]
18
+
19
+ @abc.abstractmethod
20
+ def forward(self, image: torch.Tensor) -> list[torch.Tensor]:
21
+ """Encode input image into multi-resolution encodings."""
22
+
23
+ def internal_resolution(self) -> int:
24
+ """Internal resolution of the encoder."""
25
+ return 1536
src/sharp/models/encoders/monodepth_encoder.py ADDED
@@ -0,0 +1,123 @@
1
+ """Contains Dense Transformer Prediction architecture.
2
+
3
+ Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
4
+
5
+ For licensing see accompanying LICENSE file.
6
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from sharp.models.presets import (
15
+ MONODEPTH_ENCODER_DIMS_MAP,
16
+ MONODEPTH_HOOK_IDS_MAP,
17
+ ViTPreset,
18
+ )
19
+
20
+ from .base_encoder import BaseEncoder
21
+ from .spn_encoder import SlidingPyramidNetwork
22
+ from .vit_encoder import create_vit
23
+
24
+
25
+ def create_monodepth_encoder(
26
+ patch_encoder_preset: ViTPreset,
27
+ image_encoder_preset: ViTPreset,
28
+ use_patch_overlap: bool = True,
29
+ last_encoder: int = 256,
30
+ ) -> SlidingPyramidNetwork:
31
+ """Creates DepthDensePredictionTransformer model.
32
+
33
+ Args:
34
+ patch_encoder_preset: The preset patch encoder architecture in SPN.
35
+ image_encoder_preset: The preset image encoder architecture in SPN.
36
+ use_patch_overlap: Whether to use overlap between patches in SPN.
37
+ last_encoder: Dimension of the final (highest-resolution) encoder features.
38
+ """
39
+ dims_encoder = [last_encoder] + MONODEPTH_ENCODER_DIMS_MAP[patch_encoder_preset]
40
+ patch_encoder_block_ids = MONODEPTH_HOOK_IDS_MAP[patch_encoder_preset]
41
+
42
+ patch_encoder = create_vit(
43
+ preset=patch_encoder_preset,
44
+ intermediate_features_ids=patch_encoder_block_ids,
45
+ # We always need to output intermediate features for assembly.
46
+ )
47
+ image_encoder = create_vit(
48
+ preset=image_encoder_preset,
49
+ intermediate_features_ids=None,
50
+ )
51
+
52
+ encoder = SlidingPyramidNetwork(
53
+ dims_encoder=dims_encoder,
54
+ patch_encoder=patch_encoder,
55
+ image_encoder=image_encoder,
56
+ use_patch_overlap=use_patch_overlap,
57
+ )
58
+
59
+ return encoder
60
+
61
+
62
+ class ProjectionModule(nn.Module):
63
+ """Apply projection of features."""
64
+
65
+ def __init__(self, dims_in: list[int], dims_out: list[int]) -> None:
66
+ """Initialize projection module."""
67
+ super().__init__()
68
+ if len(dims_in) != len(dims_out):
69
+ raise ValueError("Length of dims_in must be same as length of dims_out.")
70
+ self.convs = nn.ModuleList(
71
+ [nn.Conv2d(dim_in, dim_out, 1) for dim_in, dim_out in zip(dims_in, dims_out)]
72
+ )
73
+
74
+ def forward(self, encodings: list[torch.Tensor]) -> list[torch.Tensor]:
75
+ """Apply projection module."""
76
+ if len(encodings) != len(self.convs):
77
+ raise ValueError("Number of encodings must be equal to number of projections.")
78
+ return [conv(encoding) for conv, encoding in zip(self.convs, encodings)]
79
+
80
+
81
+ class MonodepthFeatureEncoder(BaseEncoder):
82
+ """A wrapper around monodepth network to extract features."""
83
+
84
+ def __init__(
85
+ self,
86
+ monodepth_encoder: SlidingPyramidNetwork,
87
+ output_dims: list[int] | None = None,
88
+ freeze_projection: bool = False,
89
+ ) -> None:
90
+ """Initialize MonodepthFeatureExtractor."""
91
+ super().__init__()
92
+
93
+ self.encoder = monodepth_encoder
94
+
95
+ # The monodepth network returns two feature maps for the first entry in
96
+ # backbone.encoder.dims_encoder.
97
+ monodepth_dims = self.encoder.dims_encoder
99
+
100
+ if output_dims is not None:
101
+ if len(output_dims) != len(monodepth_dims):
102
+ raise ValueError(
103
+ "When set, number of output dimensions must be equal to output "
104
+ f"dimensions of monodepth model {len(monodepth_dims)}."
105
+ )
106
+
107
+ self.projection = ProjectionModule(monodepth_dims, output_dims)
108
+ self.output_dims = output_dims
109
+ else:
110
+ self.projection = nn.Identity()
111
+ self.output_dims = monodepth_dims
112
+
113
+ if freeze_projection:
114
+ self.projection.requires_grad_(False)
115
+
116
+ def forward(self, input_features: torch.Tensor) -> list[torch.Tensor]:
117
+ """Extract multi-resolution features."""
118
+ encodings = self.encoder(input_features[:, :3].contiguous())
119
+ return self.projection(encodings)
120
+
121
+ def internal_resolution(self) -> int:
122
+ """Internal resolution of the encoder."""
123
+ return self.encoder.internal_resolution()
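
A quick sketch of the projection step in isolation (dimensions are illustrative):

```python
# Sketch only: project two feature levels to a common channel width.
import torch

projection = ProjectionModule(dims_in=[64, 128], dims_out=[32, 32])
encodings = [torch.randn(1, 64, 48, 48), torch.randn(1, 128, 24, 24)]
print([tuple(e.shape) for e in projection(encodings)])
# [(1, 32, 48, 48), (1, 32, 24, 24)]
```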
src/sharp/models/encoders/spn_encoder.py ADDED
@@ -0,0 +1,369 @@
1
+ """Contains Sliding Pyramid Network architecture.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from typing import Iterable
11
+
12
+ import torch
13
+ import torch.fx
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+
17
+ from sharp.utils.training import checkpoint_wrapper
18
+
19
+ from .base_encoder import BaseEncoder
20
+ from .vit_encoder import TimmViT
21
+
22
+ # torch.fx.wrap is used here to mark functions as leaf nodes during symbolic tracing
23
+ # ensuring they are not traced but seen as atomic operation. In short, symbolic tracing
24
+ # struggles with native python functions and conditional flows.
25
+ non_traceable_ops = ("len", "int")
26
+ for op in non_traceable_ops:
27
+ torch.fx.wrap(op)
28
+
29
+
30
+ class SlidingPyramidNetwork(BaseEncoder):
31
+ """Sliding Pyramid Network.
32
+
33
+ An encoder aimed at creating multi-resolution encodings from Vision Transformers.
34
+
35
+ Reference: Bochkovskii et al. - "Depth Pro: Sharp monocular metric depth in less
36
+ than a second." (ICLR 2025)
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ dims_encoder: Iterable[int],
42
+ patch_encoder: TimmViT,
43
+ image_encoder: TimmViT,
44
+ use_patch_overlap: bool = True,
45
+ ):
46
+ """Initialize Sliding Pyramid Network.
47
+
48
+ The framework
49
+ 1. creates an image pyramid,
50
+ 2. generates overlapping patches with a sliding window at each pyramid level,
51
+ 3. creates batched encodings via vision transformer backbones,
52
+ 4. produces multi-resolution encodings.
53
+
54
+ Args:
55
+ dims_encoder: Dimensions of the encoder at different layers.
56
+ patch_encoder: Backbone used for highres part of the pyramid.
57
+ image_encoder: Backbone used for lowres part of the pyramid.
58
+ use_patch_overlap: Whether to use overlap between patches in SPN.
59
+ """
60
+ super().__init__()
61
+
62
+ self.dim_in = patch_encoder.dim_in
63
+
64
+ self.dims_encoder = list(dims_encoder)
65
+ self.patch_encoder = patch_encoder
66
+ self.image_encoder = image_encoder
67
+
68
+ base_embed_dim = patch_encoder.embed_dim
69
+ lowres_embed_dim = image_encoder.embed_dim
70
+ self.patch_size = patch_encoder.internal_resolution()
71
+
72
+ self.grad_checkpointing = False
73
+ self.use_patch_overlap = use_patch_overlap
74
+
75
+ # Retrieve intermediate feature ids registered in create_monodepth_encoder.
76
+ self.patch_intermediate_features_ids = patch_encoder.intermediate_features_ids
77
+ if (
78
+ not isinstance(self.patch_intermediate_features_ids, list)
79
+ or not len(self.patch_intermediate_features_ids) == 4
80
+ ):
81
+ raise ValueError("Patch intermediate feature ids must be a 4-item list.")
82
+
83
+ self.image_intermediate_features_ids = image_encoder.intermediate_features_ids
84
+
85
+ def _create_project_upsample_block(
86
+ dim_in: int,
87
+ dim_out: int,
88
+ upsample_layers: int,
89
+ dim_intermediate=None,
90
+ ) -> nn.Module:
91
+ if dim_intermediate is None:
92
+ dim_intermediate = dim_out
93
+ # Projection.
94
+ blocks = [
95
+ nn.Conv2d(
96
+ in_channels=dim_in,
97
+ out_channels=dim_intermediate,
98
+ kernel_size=1,
99
+ stride=1,
100
+ padding=0,
101
+ bias=False,
102
+ )
103
+ ]
104
+
105
+ # Upsampling.
106
+ blocks += [
107
+ nn.ConvTranspose2d(
108
+ in_channels=dim_intermediate if i == 0 else dim_out,
109
+ out_channels=dim_out,
110
+ kernel_size=2,
111
+ stride=2,
112
+ padding=0,
113
+ bias=False,
114
+ )
115
+ for i in range(upsample_layers)
116
+ ]
117
+
118
+ return nn.Sequential(*blocks)
119
+
120
+ self.upsample_latent0 = _create_project_upsample_block(
121
+ dim_in=base_embed_dim,
122
+ dim_out=self.dims_encoder[0],
123
+ upsample_layers=3,
124
+ dim_intermediate=self.dims_encoder[1],
125
+ )
126
+ self.upsample_latent1 = _create_project_upsample_block(
127
+ dim_in=base_embed_dim, dim_out=self.dims_encoder[1], upsample_layers=2
128
+ )
129
+
130
+ self.upsample0 = _create_project_upsample_block(
131
+ dim_in=base_embed_dim, dim_out=self.dims_encoder[2], upsample_layers=1
132
+ )
133
+ self.upsample1 = _create_project_upsample_block(
134
+ dim_in=base_embed_dim, dim_out=self.dims_encoder[3], upsample_layers=1
135
+ )
136
+ self.upsample2 = _create_project_upsample_block(
137
+ dim_in=base_embed_dim, dim_out=self.dims_encoder[4], upsample_layers=1
138
+ )
139
+
140
+ self.upsample_lowres = nn.ConvTranspose2d(
141
+ in_channels=lowres_embed_dim,
142
+ out_channels=self.dims_encoder[4],
143
+ kernel_size=2,
144
+ stride=2,
145
+ padding=0,
146
+ bias=True,
147
+ )
148
+ self.fuse_lowres = nn.Conv2d(
149
+ in_channels=(self.dims_encoder[4] + self.dims_encoder[4]),
150
+ out_channels=self.dims_encoder[4],
151
+ kernel_size=1,
152
+ stride=1,
153
+ padding=0,
154
+ bias=True,
155
+ )
156
+
157
+ def internal_resolution(self) -> int:
158
+ """Return the full image size of the SPN network."""
159
+ return self.patch_size * 4
160
+
161
+ @torch.jit.ignore
162
+ def set_grad_checkpointing(self, is_enabled=True):
163
+ """Enable grad checkpointing."""
164
+ self.grad_checkpointing = is_enabled
165
+ self.patch_encoder.set_grad_checkpointing(is_enabled)
166
+ self.image_encoder.set_grad_checkpointing(is_enabled)
167
+
168
+ @torch.jit.ignore
169
+ def set_requires_grad_(self, patch_encoder: bool, image_encoder: bool):
170
+ """Set requires grad for separate components."""
171
+ self.patch_encoder.requires_grad_(patch_encoder)
172
+ self.image_encoder.requires_grad_(image_encoder)
173
+
174
+ # Always freeze the unused TimmViT head to exclude it from the calculation of
175
+ # trainable parameters.
176
+ self.patch_encoder.head.requires_grad_(False)
177
+ self.image_encoder.head.requires_grad_(False)
178
+
179
+ # These upsamplers only affect patch encoder's feature maps.
180
+ self.upsample_latent0.requires_grad_(patch_encoder)
181
+ self.upsample_latent1.requires_grad_(patch_encoder)
182
+ self.upsample0.requires_grad_(patch_encoder)
183
+ self.upsample1.requires_grad_(patch_encoder)
184
+ self.upsample2.requires_grad_(patch_encoder)
185
+
186
+ # This upsampler affects only image encoder's feature map.
187
+ self.upsample_lowres.requires_grad_(image_encoder)
188
+
189
+ # This fuser affects both image and patch encoders.
190
+ self.fuse_lowres.requires_grad_(image_encoder or patch_encoder)
191
+
192
+ def _create_pyramid(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
193
+ """Creates a 3-level image pyramid."""
194
+ # Original resolution: 1536 by default.
195
+ x0 = x
196
+
197
+ # Middle resolution: 768 by default.
198
+ x1 = F.interpolate(x, size=None, scale_factor=0.5, mode="bilinear", align_corners=False)
199
+
200
+ # Low resolution: 384 by default, corresponding to the backbone resolution.
201
+ x2 = F.interpolate(x, size=None, scale_factor=0.25, mode="bilinear", align_corners=False)
202
+
203
+ return x0, x1, x2
204
+
205
+ def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
206
+ """Encode input at multiple resolutions."""
207
+ batch_size = x.shape[0]
208
+
209
+ # Step 0: create a 3-level image pyramid.
210
+ x0, x1, x2 = self._create_pyramid(x)
211
+
212
+ if self.use_patch_overlap:
213
+ # Step 1: split to create batched overlapped mini-images at the ViT
214
+ # resolution.
215
+ # 5x5 @ 384x384 at the highest resolution (1536x1536).
216
+ x0_patches = split(x0, overlap_ratio=0.25, patch_size=self.patch_size)
217
+ # 3x3 @ 384x384 at the middle resolution (768x768).
218
+ x1_patches = split(x1, overlap_ratio=0.5, patch_size=self.patch_size)
219
+ # 1x1 @ 384x384 at the lowest resolution (384x384).
220
+ x2_patches = x2
221
+ padding = 3
222
+ else:
223
+ # Step 1: split to create batched overlapped mini-images at the ViT
224
+ # resolution.
225
+ # 4x4 @ 384x384 at the highest resolution (1536x1536).
226
+ x0_patches = split(x0, overlap_ratio=0.0, patch_size=self.patch_size)
227
+ # 2x2 @ 384x384 at the middle resolution (768x768).
228
+ x1_patches = split(x1, overlap_ratio=0.0, patch_size=self.patch_size)
229
+ # 1x1 @ 384x384 at the lowest resolution (384x384).
230
+ x2_patches = x2
231
+ padding = 0
232
+ x0_tile_size = x0_patches.shape[0] // batch_size  # patches per image at level 0
233
+
234
+ # Concatenate all the sliding window patches and form a batch of size
235
+ # (35=5x5+3x3+1x1) or (21=4x4+2x2+1x1).
236
+ x_pyramid_patches = torch.cat(
237
+ (x0_patches, x1_patches, x2_patches),
238
+ dim=0,
239
+ )
240
+
241
+ # Run the ViT model and get the result of large batch size.
242
+ #
243
+ # For the retrieval of intermediate features forward hooks are more concise,
244
+ # but they are not well compatible with symbolic tracing because attributes
245
+ # of submodules can be lost during tracing. Therefore, forward hooks may not
246
+ # be preserved during graph transformation, leading to unexpected behavior.
247
+ # To avoid such issues it is safer not to use them because they are not
248
+ # essential here.
249
+ x_pyramid_encodings, patch_intermediate_features = self.patch_encoder(x_pyramid_patches)
250
+
251
+ # Step 3: merging.
252
+ # Merge highres latent encoding.
253
+ # NOTE: list type check has completed in init.
254
+ x_latent0_encodings = self.patch_encoder.reshape_feature(
255
+ patch_intermediate_features[self.patch_intermediate_features_ids[0]] # type:ignore[index]
256
+ )
257
+ x_latent0_features = merge(
258
+ x_latent0_encodings[: batch_size * x0_tile_size],
259
+ batch_size=batch_size,
260
+ padding=padding,
261
+ )
262
+
263
+ x_latent1_encodings = self.patch_encoder.reshape_feature(
264
+ patch_intermediate_features[self.patch_intermediate_features_ids[1]] # type:ignore[index]
265
+ )
266
+ x_latent1_features = merge(
267
+ x_latent1_encodings[: batch_size * x0_tile_size],
268
+ batch_size=batch_size,
269
+ padding=padding,
270
+ )
271
+
272
+ # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1.
273
+ x0_encodings, x1_encodings, x2_encodings = torch.split(
274
+ x_pyramid_encodings,
275
+ [len(x0_patches), len(x1_patches), len(x2_patches)],
276
+ dim=0,
277
+ )
278
+
279
+ # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps.
280
+ x0_features = merge(x0_encodings, batch_size=batch_size, padding=padding)
281
+
282
+ # 48x48 feature maps by merging 3x3 @ 24x24 patches with overlaps.
283
+ x1_features = merge(x1_encodings, batch_size=batch_size, padding=2 * padding)
284
+
285
+ # 24x24 feature maps.
286
+ x2_features = x2_encodings
287
+
288
+ # Apply the image encoder.
289
+ x_lowres_features, image_intermediate_features = self.image_encoder(x2_patches)
290
+
291
+ # Upsample feature maps.
292
+ x_latent0_features = checkpoint_wrapper(self, self.upsample_latent0, x_latent0_features)
293
+ x_latent1_features = checkpoint_wrapper(self, self.upsample_latent1, x_latent1_features)
294
+
295
+ x0_features = checkpoint_wrapper(self, self.upsample0, x0_features)
296
+ x1_features = checkpoint_wrapper(self, self.upsample1, x1_features)
297
+ x2_features = checkpoint_wrapper(self, self.upsample2, x2_features)
298
+
299
+ x_lowres_features = checkpoint_wrapper(self, self.upsample_lowres, x_lowres_features)
300
+ x_lowres_features = checkpoint_wrapper(
301
+ self, self.fuse_lowres, torch.cat((x2_features, x_lowres_features), dim=1)
302
+ )
303
+
304
+ output = [
305
+ x_latent0_features,
306
+ x_latent1_features,
307
+ x0_features,
308
+ x1_features,
309
+ x_lowres_features,
310
+ ]
311
+
312
+ return output
313
+
314
+
315
+ # It seems that torch.fx.wrap can only be applied to functions, not methods.
316
+ # Hence, split and merge were converted into functions to be marked as atomic
317
+ # operations for symbolic tracing.
318
+ @torch.fx.wrap
319
+ def split(image: torch.Tensor, overlap_ratio: float = 0.25, patch_size: int = 384) -> torch.Tensor:
320
+ """Split the input into small patches with sliding window."""
321
+ patch_stride = int(patch_size * (1 - overlap_ratio))
322
+
323
+ image_size = image.shape[-1]
324
+ steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1
325
+
326
+ x_patch_list = []
327
+ for j in range(steps):
328
+ j0 = j * patch_stride
329
+ j1 = j0 + patch_size
330
+
331
+ for i in range(steps):
332
+ i0 = i * patch_stride
333
+ i1 = i0 + patch_size
334
+ x_patch_list.append(image[..., j0:j1, i0:i1])
335
+
336
+ return torch.cat(x_patch_list, dim=0)
337
+
338
+
339
+ # Decorator marking function as an atomic operator for symbolic tracing.
340
+ @torch.fx.wrap
341
+ def merge(image_patches: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor:
342
+ """Merge the patched input into a image with sliding window."""
343
+ steps = int(math.sqrt(image_patches.shape[0] // batch_size))
344
+
345
+ idx = 0
346
+
347
+ output_list = []
348
+ for j in range(steps):
349
+ output_row_list = []
350
+ for i in range(steps):
351
+ output = image_patches[batch_size * idx : batch_size * (idx + 1)]
352
+
353
+ if padding != 0:
354
+ if j != 0:
355
+ output = output[..., padding:, :]
356
+ if i != 0:
357
+ output = output[..., :, padding:]
358
+ if j != steps - 1:
359
+ output = output[..., :-padding, :]
360
+ if i != steps - 1:
361
+ output = output[..., :, :-padding]
362
+
363
+ output_row_list.append(output)
364
+ idx += 1
365
+
366
+ output_row = torch.cat(output_row_list, dim=-1)
367
+ output_list.append(output_row)
368
+ output = torch.cat(output_list, dim=-2)
369
+ return output
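
A round-trip sketch of the two helpers: with zero overlap and zero padding, `merge` exactly inverts `split`:

```python
# Sketch only: split a 768x768 image into four 384x384 tiles and reassemble it.
import torch

image = torch.randn(1, 3, 768, 768)
patches = split(image, overlap_ratio=0.0, patch_size=384)  # [4, 3, 384, 384]
restored = merge(patches, batch_size=1, padding=0)
assert torch.equal(restored, image)
```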
src/sharp/models/encoders/unet_encoder.py ADDED
@@ -0,0 +1,117 @@
"""Contains backbone models for feature extraction from RGBD input.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

from typing import List

import torch
from torch import nn

from sharp.models.blocks import (
    NormLayerName,
    norm_layer_2d,
    residual_block_2d,
)

from .base_encoder import BaseEncoder


class UNetEncoder(BaseEncoder):
    """Encoder of UNet model."""

    def __init__(
        self,
        dim_in: int,
        width: List[int] | int,
        steps: int = 6,
        norm_type: NormLayerName = "group_norm",
        norm_num_groups: int = 8,
        blocks_per_layer: int = 2,
    ) -> None:
        """Initialize UNet Encoder.

        Args:
            dim_in: The number of input channels.
            width: Width of the first layer (doubled at each downsampling step),
                or an explicit list of widths for all layers.
            steps: The number of downsampling steps.
            norm_type: Which kind of normalization layer to use.
            norm_num_groups: How many groups to use for group norm (if relevant).
            blocks_per_layer: How many residual blocks per layer to use.
        """
        super().__init__()

        if blocks_per_layer < 1:
            raise ValueError("blocks_per_layer must be greater or equal to one.")

        self.dim_in = dim_in
        self.width = width
        self.num_steps = steps

        self.convs_down = nn.ModuleList()

        self.output_dims: list[int]
        # If only one number is specified, we assume each layer will double the channel dimension.
        if isinstance(width, int):
            self.output_dims = [width << i for i in range(0, steps + 1)]
        else:
            if len(width) != (steps + 1):
                raise ValueError("Length of width must equal steps + 1 for UNetEncoder.")
            self.output_dims = width

        self.conv_in = nn.Sequential(
            nn.Conv2d(self.dim_in, self.output_dims[0], 3, stride=1, padding=1),
            norm_layer_2d(self.output_dims[0], norm_type, num_groups=norm_num_groups),
            nn.ReLU(),
        )

        for i_step in range(steps):
            input_width = self.output_dims[i_step]
            current_width = self.output_dims[i_step + 1]
            convs_down_i = nn.Sequential(
                nn.AvgPool2d(2, stride=2),
                residual_block_2d(
                    input_width,
                    current_width,
                    norm_type=norm_type,
                    norm_num_groups=norm_num_groups,
                ),
                *[
                    residual_block_2d(
                        current_width,
                        current_width,
                        norm_type=norm_type,
                        norm_num_groups=norm_num_groups,
                    )
                    for _ in range(blocks_per_layer - 1)
                ],
            )
            self.convs_down.append(convs_down_i)

    def forward(self, input: torch.Tensor) -> list[torch.Tensor]:
        """Apply UNet Encoder to image.

        Args:
            input: The input image.

        Returns:
            The multi-level feature maps from the encoder.
        """
        features = []

        feat_i = self.conv_in(input)
        features.append(feat_i)

        for conv_down in self.convs_down:
            feat_i = conv_down(feat_i)
            features.append(feat_i)

        return features

    @property
    def out_width(self) -> int:
        """Return the output width of the last encoder level."""
        return self.output_dims[-1]
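
As a usage sketch (illustrative; it assumes the sharp package and its blocks module are importable): an integer width doubles the channel count at every downsampling step, so the encoder below yields four feature maps at strides 1, 2, 4 and 8.

    import torch
    from sharp.models.encoders.unet_encoder import UNetEncoder

    encoder = UNetEncoder(dim_in=4, width=16, steps=3)
    print(encoder.output_dims)               # [16, 32, 64, 128]
    features = encoder(torch.randn(1, 4, 64, 64))
    print([f.shape[-1] for f in features])   # [64, 32, 16, 8]
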
src/sharp/models/encoders/vit_encoder.py ADDED
@@ -0,0 +1,111 @@
"""Contains factory functions to build and load ViT.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

import logging

import timm
import torch

from sharp.models.presets.vit import VIT_CONFIG_DICT, ViTConfig, ViTPreset

LOGGER = logging.getLogger(__name__)


class TimmViT(timm.models.VisionTransformer):
    """Contains the TIMM implementation of a vanilla ViT."""

    def __init__(self, config: ViTConfig):
        """Initialize ViT from TIMM implementation."""
        # Handle MLP layers.
        mlp_layer = timm.layers.GluMlp if config.mlp_mode == "glu" else timm.layers.Mlp

        super().__init__(
            in_chans=config.in_chans,
            embed_dim=config.embed_dim,
            depth=config.depth,
            num_heads=config.num_heads,
            init_values=config.init_values,
            img_size=config.img_size,
            patch_size=config.patch_size,
            num_classes=config.num_classes,
            mlp_ratio=config.mlp_ratio,
            qkv_bias=config.qkv_bias,
            global_pool=config.global_pool,
            mlp_layer=mlp_layer,
        )

        # Required for extracting intermediate features.
        self.dim_in = config.in_chans
        self.intermediate_features_ids = config.intermediate_features_ids

    def reshape_feature(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Discard the class token and reshape the 1D token sequence to a 2D grid."""
        batch_size, seq_len, channel = embeddings.shape

        height, width = self.patch_embed.grid_size

        # Remove class token.
        if self.num_prefix_tokens:
            embeddings = embeddings[:, self.num_prefix_tokens :, :]

        # Shape: (batch, height, width, dim) -> (batch, dim, height, width)
        embeddings = embeddings.reshape(batch_size, height, width, channel).permute(0, 3, 1, 2)
        return embeddings

    def forward(self, input_tensor: torch.Tensor) -> tuple[torch.Tensor, dict[int, torch.Tensor]]:
        """Forward pass that also collects intermediate features.

        Adapted from timm ViT.

        Returns:
            Output features and a dict of features from intermediate layers (patch encoder only).
        """
        intermediate_features = {}

        x = self.patch_embed(input_tensor)
        batch_size, seq_len, _ = x.shape

        x = self._pos_embed(x)
        x = self.patch_drop(x)
        x = self.norm_pre(x)

        for idx, block in enumerate(self.blocks):
            x = block(x)
            if self.intermediate_features_ids is not None and idx in self.intermediate_features_ids:
                intermediate_features[idx] = x
        x = self.norm(x)

        x = self.reshape_feature(x)
        return x, intermediate_features

    def internal_resolution(self) -> int:
        """Return the internal image size of the network."""
        if isinstance(self.patch_embed.img_size, tuple):
            return self.patch_embed.img_size[0]
        else:
            return self.patch_embed.img_size


def create_vit(
    config: ViTConfig | None = None,
    preset: ViTPreset | None = "dinov2l16_384",
    intermediate_features_ids: list[int] | None = None,
) -> TimmViT:
    """Factory function for creating a ViT model."""
    if config is not None:
        LOGGER.info("Using user-defined config.")
    else:
        if preset is None:
            raise ValueError("config and preset cannot both be None.")
        LOGGER.info("Using preset ViT %s.", preset)
        config = VIT_CONFIG_DICT[preset]

    config.intermediate_features_ids = intermediate_features_ids
    model = TimmViT(config)
    LOGGER.debug(model)
    return model
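
A usage sketch for the factory (illustrative; the printed shape assumes the dinov2l16_384 preset is a ViT-L with 16-pixel patches at a 384-pixel internal resolution, numbers suggested by the preset tables further below):

    import torch
    from sharp.models.encoders.vit_encoder import create_vit

    vit = create_vit(preset="dinov2l16_384", intermediate_features_ids=[5, 11, 17, 23])
    features, intermediates = vit(torch.randn(1, 3, 384, 384))
    print(features.shape)         # e.g. (1, 1024, 24, 24): a 24 x 24 grid of patch tokens
    print(sorted(intermediates))  # [5, 11, 17, 23]
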
src/sharp/models/gaussian_decoder.py ADDED
@@ -0,0 +1,267 @@
"""Contains Dense Transformer Prediction architecture.

Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

from typing import NamedTuple

import torch
import torch.nn as nn

from sharp.models.blocks import (
    FeatureFusionBlock2d,
    NormLayerName,
    residual_block_2d,
)
from sharp.models.decoders import BaseDecoder, MultiresConvDecoder
from sharp.models.params import DPTImageEncoderType, GaussianDecoderParams


def create_gaussian_decoder(
    params: GaussianDecoderParams, dims_depth_features: list[int]
) -> GaussianDensePredictionTransformer:
    """Create the gaussian_decoder model specified by the given parameters."""
    decoder = MultiresConvDecoder(
        dims_depth_features,
        params.dims_decoder,
        grad_checkpointing=params.grad_checkpointing,
        upsampling_mode=params.upsampling_mode,
    )

    return GaussianDensePredictionTransformer(
        decoder=decoder,
        dim_in=params.dim_in,
        dim_out=params.dim_out,
        stride_out=params.stride,
        norm_type=params.norm_type,
        norm_num_groups=params.norm_num_groups,
        use_depth_input=params.use_depth_input,
        grad_checkpointing=params.grad_checkpointing,
        image_encoder_type=params.image_encoder_type,
        image_encoder_params=params,
    )


def _create_project_upsample_block(
    dim_in: int,
    dim_out: int,
    upsample_layers: int,
    dim_intermediate: int | None = None,
) -> nn.Module:
    if dim_intermediate is None:
        dim_intermediate = dim_out
    # Projection.
    blocks = [
        nn.Conv2d(
            in_channels=dim_in,
            out_channels=dim_intermediate,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
    ]

    # Upsampling.
    blocks += [
        nn.ConvTranspose2d(
            in_channels=dim_intermediate if i == 0 else dim_out,
            out_channels=dim_out,
            kernel_size=2,
            stride=2,
            padding=0,
            bias=False,
        )
        for i in range(upsample_layers)
    ]

    return nn.Sequential(*blocks)


class ImageFeatures(NamedTuple):
    """Image features extracted from the decoder."""

    texture_features: torch.Tensor
    geometry_features: torch.Tensor


class SkipConvBackbone(nn.Module):
    """A wrapper around a conv layer that behaves like a BaseBackbone."""

    def __init__(self, dim_in: int, dim_out: int, kernel_size: int, stride_out: int):
        """Initialize SkipConvBackbone."""
        super().__init__()
        self.stride_out = stride_out
        if stride_out == 1 and kernel_size != 1:
            raise ValueError("We only support kernel_size = 1 if stride_out is 1.")
        padding: int = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(
            dim_in, dim_out, kernel_size=kernel_size, stride=stride_out, padding=padding
        )

    def forward(
        self,
        input_features: torch.Tensor,
        encodings: list[torch.Tensor] | None = None,
    ) -> ImageFeatures:
        """Apply SkipConvBackbone to image."""
        output = self.conv(input_features)
        return ImageFeatures(
            texture_features=output,
            geometry_features=output,
        )

    @property
    def stride(self) -> int:
        """Effective downsampling stride."""
        return self.stride_out


class GaussianDensePredictionTransformer(nn.Module):
    """Dense Prediction Transformer for Gaussians.

    Reuses monodepth decoded features for processing.
    """

    norm_type: NormLayerName

    def __init__(
        self,
        decoder: BaseDecoder,
        dim_in: int,
        dim_out: int,
        stride_out: int,
        image_encoder_params: GaussianDecoderParams,
        image_encoder_type: DPTImageEncoderType = "skip_conv",
        norm_type: NormLayerName = "group_norm",
        norm_num_groups: int = 8,
        use_depth_input: bool = True,
        grad_checkpointing: bool = False,
    ):
        """Initialize Dense Prediction Transformer for Gaussians.

        Args:
            decoder: Decoder to decode features.
            dim_in: Input dimension.
            dim_out: Final output dimension.
            stride_out: Stride of output feature map.
            image_encoder_params: The backbone parameters to configure the image encoder.
            image_encoder_type: Type of image encoder to use.
            norm_type: Type of norm layers.
            norm_num_groups: Num groups for norm layers.
            use_depth_input: Whether to use depth input.
            grad_checkpointing: Whether to use gradient checkpointing.
        """
        super().__init__()

        self.decoder = decoder
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.stride_out = stride_out
        self.norm_type = norm_type
        self.norm_num_groups = norm_num_groups
        self.use_depth_input = use_depth_input
        self.grad_checkpointing = grad_checkpointing
        self.image_encoder_type = image_encoder_type

        # Adopt an image encoder to lift dimension to monodepth feature and
        # resize to be the same resolution as the decoder output.
        dim_in = self.dim_in if use_depth_input else self.dim_in - 1
        image_encoder_params.dim_in = dim_in
        image_encoder_params.dim_out = decoder.dim_out
        self.image_encoder = self._create_image_encoder(image_encoder_params, stride_out)

        self.fusion = FeatureFusionBlock2d(decoder.dim_out)

        if stride_out == 1:
            self.upsample = _create_project_upsample_block(
                decoder.dim_out,
                decoder.dim_out,
                upsample_layers=1,
            )
        elif stride_out == 2:
            self.upsample = nn.Identity()
        else:
            raise ValueError("We only support a stride of 1 or 2 for the DPT backbone.")

        self.texture_head = self._create_head(dim_decoder=decoder.dim_out, dim_out=self.dim_out)
        self.geometry_head = self._create_head(dim_decoder=decoder.dim_out, dim_out=self.dim_out)

    def _create_head(self, dim_decoder: int, dim_out: int) -> nn.Module:
        return nn.Sequential(
            residual_block_2d(
                dim_in=dim_decoder,
                dim_out=dim_decoder,
                dim_hidden=dim_decoder // 2,
                norm_type=self.norm_type,
                norm_num_groups=self.norm_num_groups,
            ),
            residual_block_2d(
                dim_in=dim_decoder,
                dim_hidden=dim_decoder // 2,
                dim_out=dim_decoder,
                norm_type=self.norm_type,
                norm_num_groups=self.norm_num_groups,
            ),
            nn.ReLU(),
            nn.Conv2d(dim_decoder, dim_out, kernel_size=1, stride=1),
            nn.ReLU(),
        )

    def _create_image_encoder(
        self, image_encoder_params: GaussianDecoderParams, stride_out: int
    ) -> nn.Module:
        """Create and return the image encoder based on parameters."""
        if self.image_encoder_type == "skip_conv":
            # Use kernel_size = 1 only if stride_out is 1.
            return SkipConvBackbone(
                image_encoder_params.dim_in,
                image_encoder_params.dim_out,
                kernel_size=3 if stride_out != 1 else 1,
                stride_out=stride_out,
            )
        elif self.image_encoder_type == "skip_conv_kernel2":
            return SkipConvBackbone(
                image_encoder_params.dim_in,
                image_encoder_params.dim_out,
                kernel_size=stride_out,
                stride_out=stride_out,
            )
        else:
            raise ValueError(f"Unsupported image encoder type: {self.image_encoder_type}")

    def forward(self, input_features: torch.Tensor, encodings: list[torch.Tensor]) -> ImageFeatures:
        """Run monodepth and fuse features with input image to predict Gaussians.

        Args:
            input_features: The input features to use.
            encodings: Feature encodings (e.g. from monodepth network).
        """
        features = self.decoder(encodings).contiguous()
        features = self.upsample(features)

        if self.use_depth_input:
            skip_features = self.image_encoder(input_features).texture_features
        else:
            # Drop the depth channel and keep only the RGB channels.
            skip_features = self.image_encoder(input_features[:, :3].contiguous()).texture_features
        features = self.fusion(features, skip_features)

        texture_features = self.texture_head(features)
        geometry_features = self.geometry_head(features)

        return ImageFeatures(
            texture_features=texture_features,  # type: ignore
            geometry_features=geometry_features,  # type: ignore
        )

    @property
    def stride(self) -> int:
        """Internal stride of GaussianDensePredictionTransformer."""
        return self.stride_out
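
A shape sketch for the skip connection (illustrative; the dimensions mirror the defaults in GaussianDecoderParams further below). With the "skip_conv_kernel2" variant the kernel size equals the stride, so a 5-channel input at full resolution is projected to the decoder width at half resolution:

    import torch
    from sharp.models.gaussian_decoder import SkipConvBackbone

    backbone = SkipConvBackbone(dim_in=5, dim_out=128, kernel_size=2, stride_out=2)
    out = backbone(torch.randn(1, 5, 64, 64))
    print(out.texture_features.shape)  # (1, 128, 32, 32)
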
src/sharp/models/heads.py ADDED
@@ -0,0 +1,53 @@
"""Contains decoder head for direct prediction of delta values.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

import torch
from torch import nn

from .gaussian_decoder import ImageFeatures


class DirectPredictionHead(nn.Module):
    """Decodes features into delta values using convolutions."""

    def __init__(self, feature_dim: int, num_layers: int) -> None:
        """Initialize DirectPredictionHead.

        Args:
            feature_dim: Number of input features.
            num_layers: The number of layers of Gaussians to predict.
        """
        super().__init__()
        self.num_layers = num_layers

        # 14 = 3 (means) + 3 (scales) + 4 (quaternions) + 3 (colors) + 1 (opacity).
        self.geometry_prediction_head = nn.Conv2d(feature_dim, 3 * num_layers, 1)
        self.geometry_prediction_head.weight.data.zero_()
        assert self.geometry_prediction_head.bias is not None
        self.geometry_prediction_head.bias.data.zero_()

        self.texture_prediction_head = nn.Conv2d(feature_dim, (14 - 3) * num_layers, 1)
        self.texture_prediction_head.weight.data.zero_()
        assert self.texture_prediction_head.bias is not None
        self.texture_prediction_head.bias.data.zero_()

    def forward(self, image_features: ImageFeatures) -> torch.Tensor:
        """Predict deltas for 3D Gaussians.

        Args:
            image_features: Image features from decoder.

        Returns:
            The predicted deltas for Gaussian attributes.
        """
        delta_values_geometry = self.geometry_prediction_head(image_features.geometry_features)
        delta_values_texture = self.texture_prediction_head(image_features.texture_features)
        delta_values_geometry = delta_values_geometry.unflatten(1, (3, self.num_layers))
        delta_values_texture = delta_values_texture.unflatten(1, (14 - 3, self.num_layers))
        delta_values = torch.cat([delta_values_geometry, delta_values_texture], dim=1)
        return delta_values
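
A quick shape check (illustrative): because both heads are zero-initialized, the predicted deltas start at zero, so the initial Gaussians equal the base values from the initializer.

    import torch
    from sharp.models.gaussian_decoder import ImageFeatures
    from sharp.models.heads import DirectPredictionHead

    head = DirectPredictionHead(feature_dim=32, num_layers=2)
    feats = ImageFeatures(
        texture_features=torch.randn(1, 32, 8, 8),
        geometry_features=torch.randn(1, 32, 8, 8),
    )
    deltas = head(feats)
    print(deltas.shape)                 # (1, 14, 2, 8, 8)
    print(torch.count_nonzero(deltas))  # 0 at initialization
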
src/sharp/models/initializer.py ADDED
@@ -0,0 +1,297 @@
"""Contains modules to initialize Gaussians from RGBD.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

from typing import NamedTuple

import torch
from torch import nn

from .params import ColorInitOption, DepthInitOption, InitializerParams


def create_initializer(params: InitializerParams) -> nn.Module:
    """Create initializer."""
    return MultiLayerInitializer(
        num_layers=params.num_layers,
        stride=params.stride,
        base_depth=params.base_depth,
        scale_factor=params.scale_factor,
        disparity_factor=params.disparity_factor,
        color_option=params.color_option,
        first_layer_depth_option=params.first_layer_depth_option,
        rest_layer_depth_option=params.rest_layer_depth_option,
        normalize_depth=params.normalize_depth,
        feature_input_stop_grad=params.feature_input_stop_grad,
    )


class GaussianBaseValues(NamedTuple):
    """Base values for gaussian predictor.

    We predict x and y in normalized device coordinates (NDC) where (-1, -1) is the top
    left corner and (1, 1) the bottom right corner. The last component of
    mean_vectors_ndc is inverse depth.
    """

    mean_x_ndc: torch.Tensor
    mean_y_ndc: torch.Tensor
    mean_inverse_z_ndc: torch.Tensor

    scales: torch.Tensor
    quaternions: torch.Tensor
    colors: torch.Tensor
    opacities: torch.Tensor


class InitializerOutput(NamedTuple):
    """Output of initializer."""

    # Gaussian base values.
    gaussian_base_values: GaussianBaseValues

    # Feature input to the Gaussian predictor.
    feature_input: torch.Tensor

    # Global scale to unscale output.
    global_scale: torch.Tensor | None = None


class MultiLayerInitializer(nn.Module):
    """Initialize Gaussians with multilayer representation.

    The returned tensors have the shape

        batch_size x dim x num_layers x height x width

    where dim indicates the dimensionality of the property.
    Some of the dimensions might be set to 1 for efficiency reasons.
    """

    def __init__(
        self,
        num_layers: int,
        stride: int,
        base_depth: float,
        scale_factor: float,
        disparity_factor: float,
        color_option: ColorInitOption = "first_layer",
        first_layer_depth_option: DepthInitOption = "surface_min",
        rest_layer_depth_option: DepthInitOption = "surface_min",
        normalize_depth: bool = True,
        feature_input_stop_grad: bool = True,
    ) -> None:
        """Initialize MultiLayerInitializer.

        Args:
            num_layers: How many layers of Gaussians to predict.
            stride: The downsample rate of output feature map.
            base_depth: The depth of the first layer (after the foreground
                layer if use_depth=True).
            scale_factor: Multiply scale of Gaussians by this factor.
            disparity_factor: Factor to convert inverse depth to disparity.
            color_option: Which color option to initialize the multi-layer gaussians.
            first_layer_depth_option: Which depth option to initialize the first layer of gaussians.
            rest_layer_depth_option: Which depth option to initialize the rest layers of gaussians.
            normalize_depth: Whether to normalize depth to [DepthTransformParam.depth_min,
                DepthTransformParam.depth_max).
            feature_input_stop_grad: Whether to stop gradients from propagating through
                the feature inputs.
        """
        super().__init__()
        self.num_layers = num_layers
        self.stride = stride
        self.base_depth = base_depth
        self.scale_factor = scale_factor
        self.disparity_factor = disparity_factor
        self.color_option = color_option
        self.first_layer_depth_option = first_layer_depth_option
        self.rest_layer_depth_option = rest_layer_depth_option
        self.normalize_depth = normalize_depth
        self.feature_input_stop_grad = feature_input_stop_grad

    def prepare_feature_input(self, image: torch.Tensor, depth: torch.Tensor) -> torch.Tensor:
        """Prepare the feature input to the Gaussian predictor."""
        if self.feature_input_stop_grad:
            image = image.detach()
            depth = depth.detach()

        normalized_disparity = self.disparity_factor / depth
        features_in = torch.cat([image, normalized_disparity], dim=1)
        features_in = 2.0 * features_in - 1.0
        return features_in

    def forward(self, image: torch.Tensor, depth: torch.Tensor) -> InitializerOutput:
        """Construct Gaussian base values and prepare feature input.

        Args:
            image: The image to process.
            depth: The corresponding depth map from the monodepth network.

        Returns:
            The base values for Gaussians.
        """
        image = image.contiguous()
        depth = depth.contiguous()
        device = depth.device
        batch_size, _, image_height, image_width = depth.shape
        base_height, base_width = (
            image_height // self.stride,
            image_width // self.stride,
        )
        # global_scale is the inverse of the depth_factor, which is used to rescale
        # the depth such that it is numerically stable for training.
        global_scale: torch.Tensor | None = None
        if self.normalize_depth:
            depth, depth_factor = _rescale_depth(depth)
            global_scale = 1.0 / depth_factor

        def _create_disparity_layers(num_layers: int = 1) -> torch.Tensor:
            """Create multiple disparity layers."""
            disparity = torch.linspace(1.0 / self.base_depth, 0.0, num_layers + 1, device=device)
            return disparity[None, None, :-1, None, None].repeat(
                batch_size, 1, 1, base_height, base_width
            )

        def _create_surface_layer(
            depth: torch.Tensor,
            depth_pooling_mode: str,
        ) -> torch.Tensor:
            """Create a surface disparity layer from the input depth."""
            disparity = 1.0 / depth
            if depth_pooling_mode == "min":
                disparity = torch.max_pool2d(disparity, self.stride, self.stride)
            elif depth_pooling_mode == "max":
                disparity = -torch.max_pool2d(-disparity, self.stride, self.stride)
            else:
                raise ValueError(f"Invalid depth pooling mode {depth_pooling_mode}.")

            return disparity[:, :, None, :, :]

        # Input disparity dimensions:
        # (batch_size, num_channels in (1, 2), height, width)

        # Output disparity dimensions:
        # (batch_size, num_channels=1, num_layers in (1, 2), height, width)
        if self.first_layer_depth_option == "surface_min":
            first_disparity = _create_surface_layer(depth[:, 0:1], "min")
        elif self.first_layer_depth_option == "surface_max":
            first_disparity = _create_surface_layer(depth[:, 0:1], "max")
        elif self.first_layer_depth_option in ("base_depth", "linear_disparity"):
            first_disparity = _create_disparity_layers()
        else:
            raise ValueError(f"Unknown depth init option: {self.first_layer_depth_option}.")

        if self.num_layers == 1:
            disparity = first_disparity
        else:  # Fill in the rest layers.
            following_depth = depth if depth.shape[1] == 1 else depth[:, 1:]
            if self.rest_layer_depth_option == "surface_min":
                following_disparity = _create_surface_layer(following_depth, "min")
            elif self.rest_layer_depth_option == "surface_max":
                following_disparity = _create_surface_layer(following_depth, "max")
            elif self.rest_layer_depth_option == "base_depth":
                following_disparity = torch.cat(
                    [_create_disparity_layers() for i in range(self.num_layers - 1)],
                    dim=2,
                )
            elif self.rest_layer_depth_option == "linear_disparity":
                following_disparity = _create_disparity_layers(self.num_layers - 1)
            else:
                raise ValueError(f"Unknown depth init option: {self.rest_layer_depth_option}.")

            disparity = torch.cat([first_disparity, following_disparity], dim=2)

        # Prepare base values.
        base_x_ndc, base_y_ndc = _create_base_xy(depth, self.stride, self.num_layers)
        disparity_scale_factor = 2 * self.scale_factor * self.stride / float(image_width)
        base_scales = _create_base_scale(disparity, disparity_scale_factor)

        base_quaternions = torch.tensor([1.0, 0.0, 0.0, 0.0], device=device)
        base_quaternions = base_quaternions[None, :, None, None, None]

        # Initializing the opacity this way ensures that the initial transmittance
        # is approximately
        #
        #     1 / e ~= (1 - 1 / self.num_layers)**self.num_layers
        #
        # and hence independent of the number of layers.
        base_opacities = torch.tensor([min(1.0 / self.num_layers, 0.5)], device=device)
        base_colors = torch.empty(
            batch_size, 3, self.num_layers, base_height, base_width, device=device
        ).fill_(0.5)
        # Dimensions: (batch_size, num_channels, num_layers, height, width)
        if self.color_option == "none":
            pass
        elif self.color_option == "first_layer":
            base_colors[:, :, 0] = torch.nn.functional.avg_pool2d(image, self.stride, self.stride)
        elif self.color_option == "all_layers":
            temp = torch.nn.functional.avg_pool2d(image, self.stride, self.stride)
            base_colors = temp[:, :, None, :, :].repeat(1, 1, self.num_layers, 1, 1)
        else:
            raise ValueError(f"Unknown color init option: {self.color_option}.")

        features_in = self.prepare_feature_input(image, depth)
        base_gaussians = GaussianBaseValues(
            mean_x_ndc=base_x_ndc,
            mean_y_ndc=base_y_ndc,
            mean_inverse_z_ndc=disparity,
            scales=base_scales,
            quaternions=base_quaternions,
            colors=base_colors,
            opacities=base_opacities,
        )

        return InitializerOutput(
            gaussian_base_values=base_gaussians,
            feature_input=features_in,
            global_scale=global_scale,
        )


def _create_base_xy(
    depth: torch.Tensor, stride: int, num_layers: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """Create base x and y coordinates for the gaussians in NDC space."""
    device = depth.device
    batch_size, _, image_height, image_width = depth.shape
    xx = torch.arange(0.5 * stride, image_width, stride, device=device)
    yy = torch.arange(0.5 * stride, image_height, stride, device=device)
    xx = 2 * xx / image_width - 1.0
    yy = 2 * yy / image_height - 1.0

    xx, yy = torch.meshgrid(xx, yy, indexing="xy")
    base_x_ndc = xx[None, None, None].repeat(batch_size, 1, num_layers, 1, 1)
    base_y_ndc = yy[None, None, None].repeat(batch_size, 1, num_layers, 1, 1)

    return base_x_ndc, base_y_ndc


def _create_base_scale(disparity: torch.Tensor, disparity_scale_factor: float) -> torch.Tensor:
    """Create base scale for the gaussians."""
    inverse_disparity = torch.ones_like(disparity) / disparity
    base_scales = inverse_disparity * disparity_scale_factor
    return base_scales


def _rescale_depth(
    depth: torch.Tensor, depth_min: float = 1.0, depth_max: float = 1e2
) -> tuple[torch.Tensor, torch.Tensor]:
    """Rescale a depth image tensor.

    Args:
        depth: The depth tensor to transform.
        depth_min: The min depth to scale depth to.
        depth_max: The max clamp depth after scaling.

    Returns:
        The rescaled depth and rescale factor.
    """
    current_depth_min = depth.flatten(depth.ndim - 3).min(dim=-1).values
    depth_factor = depth_min / (current_depth_min + 1e-6)
    depth = (depth * depth_factor[..., None, None, None]).clamp(max=depth_max)
    return depth, depth_factor
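
A small end-to-end sketch of the initializer (illustrative values; disparity_factor and scale_factor are set to 1.0 for simplicity, and depth is offset to stay positive):

    import torch
    from sharp.models.initializer import MultiLayerInitializer

    init = MultiLayerInitializer(
        num_layers=2, stride=2, base_depth=10.0, scale_factor=1.0, disparity_factor=1.0
    )
    image = torch.rand(1, 3, 64, 64)
    depth = torch.rand(1, 1, 64, 64) + 0.5
    out = init(image, depth)
    print(out.gaussian_base_values.colors.shape)  # (1, 3, 2, 32, 32)
    print(out.gaussian_base_values.opacities)     # tensor([0.5]) = min(1 / 2, 0.5)
    print(out.feature_input.shape)                # (1, 4, 64, 64), roughly in [-1, 1]
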
src/sharp/models/monodepth.py ADDED
@@ -0,0 +1,268 @@
"""Contains Dense Transformer Prediction architecture.

Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

import copy
from typing import NamedTuple, Tuple

import torch
import torch.nn as nn

from sharp.models import normalizers
from sharp.models.decoders import MultiresConvDecoder, create_monodepth_decoder
from sharp.models.encoders import (
    SlidingPyramidNetwork,
    create_monodepth_encoder,
)
from sharp.utils import module_surgery

from .params import MonodepthAdaptorParams, MonodepthParams

DimsDecoder = Tuple[int, int, int, int, int]


class MonodepthDensePredictionTransformer(nn.Module):
    """Dense Prediction Transformer for monodepth.

    Attaches the disparity prediction head for monodepth prediction.
    """

    def __init__(
        self,
        encoder: SlidingPyramidNetwork,
        decoder: MultiresConvDecoder,
        last_dims: tuple[int, int],
    ):
        """Initialize Dense Prediction Transformer.

        Args:
            encoder: The SlidingPyramidNetwork backbone.
            decoder: The MultiresConvDecoder decoder.
            last_dims: The dimensions for the last convolution layers.
        """
        super().__init__()

        self.normalizer = normalizers.AffineRangeNormalizer(
            input_range=(0, 1), output_range=(-1, 1)
        )
        self.encoder = encoder
        self.decoder = decoder

        dim_decoder = decoder.dim_out
        self.head = nn.Sequential(
            nn.Conv2d(dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1),
            nn.ConvTranspose2d(
                in_channels=dim_decoder // 2,
                out_channels=dim_decoder // 2,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
            ),
            nn.Conv2d(
                dim_decoder // 2,
                last_dims[0],
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(True),
            nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
        )

        # Set the final convolution layer's bias to be 0.
        self.head[4].bias.data.fill_(0)

        self.grad_checkpointing = False

    @torch.jit.ignore
    def set_grad_checkpointing(self, is_enabled: bool = True):
        """Enable or disable gradient checkpointing."""
        self.grad_checkpointing = is_enabled
        self.encoder.set_grad_checkpointing(self.grad_checkpointing)
        self.decoder.set_grad_checkpointing(self.grad_checkpointing)

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Decode by projection and fusion of multi-resolution encodings."""
        encodings = self.encoder(self.normalizer(image))
        num_encoder_features = len(self.encoder.dims_encoder)
        features = self.decoder(encodings[:num_encoder_features])
        disparity = self.head(features)
        return disparity

    def internal_resolution(self) -> int:
        """Return the internal image size of the network."""
        return self.encoder.internal_resolution()


def create_monodepth_dpt(
    params: MonodepthParams | None = None,
) -> MonodepthDensePredictionTransformer:
    """Create the MonodepthDensePredictionTransformer model.

    Args:
        params: Parameters of monodepth network.

    Returns:
        The configured monodepth DPT.
    """
    if params is None:
        params = MonodepthParams()
    encoder: SlidingPyramidNetwork = create_monodepth_encoder(
        params.patch_encoder_preset,
        params.image_encoder_preset,
        use_patch_overlap=params.use_patch_overlap,
        last_encoder=params.dims_decoder[0],
    )

    decoder: MultiresConvDecoder = create_monodepth_decoder(
        params.patch_encoder_preset, params.dims_decoder
    )

    monodepth_model = MonodepthDensePredictionTransformer(
        encoder=encoder, decoder=decoder, last_dims=(32, 1)
    )

    # By default, we don't train the monodepth model.
    # However, we allow to selectively unfreeze parts of the network.
    monodepth_model.requires_grad_(False)

    monodepth_model.encoder.set_requires_grad_(
        patch_encoder=params.unfreeze_patch_encoder,
        image_encoder=params.unfreeze_image_encoder,
    )
    monodepth_model.decoder.requires_grad_(params.unfreeze_decoder)
    monodepth_model.head.requires_grad_(params.unfreeze_head)

    if not params.unfreeze_norm_layers:
        module_surgery.freeze_norm_layer(monodepth_model)

    monodepth_model.set_grad_checkpointing(params.grad_checkpointing)

    return monodepth_model


class MonodepthOutput(NamedTuple):
    """Output of the monodepth model."""

    # Disparity output from the monodepth model.
    disparity: torch.Tensor
    # Multi-level features from monodepth encoder.
    encoder_features: list[torch.Tensor]
    # Single-level feature from monodepth decoder.
    decoder_features: torch.Tensor
    # List of monodepth features to be used in gaussian predictor.
    output_features: list[torch.Tensor]
    # List of intermediate encoder features to be used in distillation.
    intermediate_features: list[torch.Tensor] = []


class MonodepthWithEncodingAdaptor(nn.Module):
    """Monodepth model with feature maps."""

    def __init__(
        self,
        monodepth_predictor: MonodepthDensePredictionTransformer,
        return_encoder_features: bool,
        return_decoder_features: bool,
        num_monodepth_layers: int,
        sorting_monodepth: bool,
    ):
        """Initialize MonodepthWithEncodingAdaptor.

        Args:
            monodepth_predictor: The monodepth model.
            return_encoder_features: Whether to return encoder features from monodepth model.
            return_decoder_features: Whether to return decoder features from monodepth model.
            num_monodepth_layers: How many layers the monodepth model predicts.
            sorting_monodepth: Whether to sort the monodepth output (for two layer monodepth).
        """
        super().__init__()
        self.monodepth_predictor = monodepth_predictor
        self.return_encoder_features = return_encoder_features
        self.return_decoder_features = return_decoder_features
        self.num_monodepth_layers = num_monodepth_layers
        self.sorting_monodepth = sorting_monodepth

    def forward(self, image: torch.Tensor) -> MonodepthOutput:
        """Process image and return disparity and feature maps."""
        inputs = self.monodepth_predictor.normalizer(image)
        encoder_output = self.monodepth_predictor.encoder(inputs)

        num_encoder_features = len(self.monodepth_predictor.encoder.dims_encoder)

        # NOTE: whether intermediate features are empty has already been decided
        # in monodepth_predictor during create_monodepth_dpt.
        encoder_features = encoder_output[:num_encoder_features]
        intermediate_features = encoder_output[num_encoder_features:]
        decoder_features = self.monodepth_predictor.decoder(encoder_features)
        disparity = self.monodepth_predictor.head(decoder_features)

        # We cannot use disparity.shape[1], otherwise the tracer will fail.
        if self.num_monodepth_layers == 2 and self.sorting_monodepth:
            first_layer_disparity = disparity.max(dim=1, keepdim=True).values
            second_layer_disparity = disparity.min(dim=1, keepdim=True).values
            disparity = torch.cat([first_layer_disparity, second_layer_disparity], dim=1)

        output_features = []
        if self.return_encoder_features:
            output_features.extend(encoder_features)

        if self.return_decoder_features:
            output_features.append(decoder_features)

        return MonodepthOutput(
            disparity=disparity,
            encoder_features=encoder_features,
            decoder_features=decoder_features,
            output_features=output_features,
            intermediate_features=intermediate_features,
        )

    def get_feature_dims(self) -> list[int]:
        """Return dimensions of output feature maps."""
        dims = []
        if self.return_encoder_features:
            dims.extend(self.monodepth_predictor.encoder.dims_encoder)

        if self.return_decoder_features:
            dims.append(self.monodepth_predictor.decoder.dim_out)

        return dims

    def internal_resolution(self) -> int:
        """Return the internal image size of the network."""
        return self.monodepth_predictor.internal_resolution()

    def replicate_head(self, num_repeat: int):
        """Replicate the last convolution layer (head[4] in DPT) for multi-layer depth."""
        conv_last = copy.deepcopy(self.monodepth_predictor.head[4])
        self.monodepth_predictor.head[4].out_channels = num_repeat
        self.monodepth_predictor.head[4].weight = nn.Parameter(
            conv_last.weight.repeat(num_repeat, 1, 1, 1)
        )
        self.monodepth_predictor.head[4].bias = nn.Parameter(conv_last.bias.repeat(num_repeat))


def create_monodepth_adaptor(
    monodepth_predictor: MonodepthDensePredictionTransformer,
    params: MonodepthAdaptorParams,
    num_monodepth_layers: int,
    sorting_monodepth: bool,
) -> MonodepthWithEncodingAdaptor:
    """Create an adaptor that returns both disparity and features."""
    adaptor = MonodepthWithEncodingAdaptor(
        monodepth_predictor=monodepth_predictor,
        return_encoder_features=params.encoder_features,
        return_decoder_features=params.decoder_features,
        num_monodepth_layers=num_monodepth_layers,
        sorting_monodepth=sorting_monodepth,
    )
    return adaptor
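
To make the sorting step concrete (a tiny illustrative example): for a two-layer prediction, taking the per-pixel max and min of the disparity channels guarantees that layer 0 is always the nearer surface (larger disparity) and layer 1 the farther one.

    import torch

    disparity = torch.tensor([[[[0.2]], [[0.7]]]])      # (batch=1, layers=2, h=1, w=1)
    first = disparity.max(dim=1, keepdim=True).values   # nearer surface: 0.7
    second = disparity.min(dim=1, keepdim=True).values  # farther surface: 0.2
    print(torch.cat([first, second], dim=1).flatten())  # tensor([0.7000, 0.2000])
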
src/sharp/models/normalizers.py ADDED
@@ -0,0 +1,80 @@
"""Contains an implementation of image normalizers for perceptual loss.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

from __future__ import annotations

from typing import Sequence, Union

import torch
from torch import nn


class MeanStdNormalizer(nn.Module):
    """Normalize image input by mean and std."""

    mean: torch.Tensor
    std_inv: torch.Tensor

    def __init__(
        self,
        mean: Union[Sequence[float], torch.Tensor],
        std: Union[Sequence[float], torch.Tensor],
    ):
        """Initialize MeanStdNormalizer."""
        super().__init__()
        if not isinstance(mean, torch.Tensor):
            mean = torch.as_tensor(mean).view(-1, 1, 1)
        if not isinstance(std, torch.Tensor):
            std = torch.as_tensor(std).view(-1, 1, 1)
        self.register_buffer("mean", mean)
        # We store the inverse std so normalization uses a multiplication,
        # which is better supported by the hardware.
        self.register_buffer("std_inv", 1.0 / std)

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        """Apply mean and std normalization over input image."""
        return (image - self.mean) * self.std_inv


class AffineRangeNormalizer(nn.Module):
    """Perform a linear mapping from input_range to output_range.

    output_range defaults to (0, 1).
    """

    def __init__(
        self,
        input_range: tuple[float, float],
        output_range: tuple[float, float] = (0, 1),
    ):
        """Initialize AffineRangeNormalizer."""
        super().__init__()
        input_min, input_max = input_range
        output_min, output_max = output_range
        if input_max <= input_min:
            raise ValueError(f"Invalid input_range: {input_range}")
        if output_max <= output_min:
            raise ValueError(f"Invalid output_range: {output_range}")

        self.scale = (output_max - output_min) / (input_max - input_min)
        self.bias = output_min - input_min * self.scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply affine range normalization over input image."""
        if self.scale != 1.0:
            x = x * self.scale

        if self.bias != 0.0:
            x = x + self.bias

        return x


class MobileNetNormalizer(AffineRangeNormalizer):
    """Image normalization as used in MobileNet."""

    def __init__(self, input_range: tuple[float, float] = (0, 1)):
        """Initialize MobileNetNormalizer."""
        super().__init__(input_range=input_range, output_range=(-1, 1))
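
A two-line sanity check for the affine mapping (illustrative): mapping (0, 1) to (-1, 1) gives scale = 2 and bias = -1.

    import torch
    from sharp.models.normalizers import AffineRangeNormalizer

    norm = AffineRangeNormalizer(input_range=(0, 1), output_range=(-1, 1))
    print(norm(torch.tensor([0.0, 0.5, 1.0])))  # tensor([-1., 0., 1.])
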
src/sharp/models/params.py ADDED
@@ -0,0 +1,203 @@
"""Contains params for backbone.

For licensing see accompanying LICENSE file.
Copyright (C) 2025 Apple Inc. All Rights Reserved.
"""

import dataclasses
from typing import Literal

import sharp.utils.math as math_utils
from sharp.models.blocks import NormLayerName, UpsamplingMode
from sharp.models.presets import ViTPreset
from sharp.utils.color_space import ColorSpace

DimsDecoder = tuple[int, int, int, int, int]
DPTImageEncoderType = Literal["skip_conv", "skip_conv_kernel2"]

ColorInitOption = Literal[
    "none",  # Initialize as gray.
    "first_layer",  # Initialize the first layer with input image, other layers with gray.
    "all_layers",  # Initialize all layers with input image.
]
DepthInitOption = Literal[
    # Initialize the layer of gaussians on the surface using min pooling of input depth.
    "surface_min",
    # Initialize the layer of gaussians on the surface using max pooling of input depth.
    "surface_max",
    # Initialize the layer of gaussians on a plane at base_depth depth.
    "base_depth",
    # Initialize the layer of gaussians on a plane based on base_depth and the index of the layer.
    "linear_disparity",
]


@dataclasses.dataclass
class AlignmentParams:
    """Parameters for depth alignment."""

    kernel_size: int = 16
    stride: int = 1
    frozen: bool = False

    # The following parameters are only used for LearnedAlignment.
    # Number of steps in the UNet for LearnedAlignment.
    steps: int = 4
    # Activation type for LearnedAlignment.
    activation_type: math_utils.ActivationType = "exp"
    # Whether to use depth decoder features for LearnedAlignment.
    depth_decoder_features: bool = False
    # Base width of the UNet for LearnedAlignment.
    base_width: int = 16


@dataclasses.dataclass
class DeltaFactor:
    """Factors to multiply deltas with before activation.

    These factors effectively selectively reduce the learning rate.
    """

    xy: float = 0.001
    z: float = 0.001
    color: float = 0.1  # We recommend 0.1 for linearRGB and 1.0 for sRGB.
    opacity: float = 1.0
    scale: float = 1.0
    quaternion: float = 1.0


@dataclasses.dataclass
class InitializerParams:
    """Parameters for initializer."""

    # Common parameters.
    # Multiply scale of Gaussians by this factor.
    scale_factor: float = 1.0
    # Factor to convert inverse depth to disparity.
    disparity_factor: float = 1.0
    # Stride of the initializer.
    stride: int = 2

    # Parameters that only affect MultiLayerInitializer.
    # How many layers of Gaussians to predict (only available for MultiLayerInitializer).
    num_layers: int = 2
    # Which option to use for depth initialization.
    first_layer_depth_option: DepthInitOption = "surface_min"
    rest_layer_depth_option: DepthInitOption = "surface_min"
    # Which option to use for color initialization.
    color_option: ColorInitOption = "all_layers"
    # Which depth value to use for depth layers.
    base_depth: float = 10.0
    # Deactivate gradient for feature inputs.
    feature_input_stop_grad: bool = False
    # Whether to normalize depth to [DepthTransformParam.depth_min,
    # DepthTransformParam.depth_max).
    normalize_depth: bool = True

    # Output only the inpainted layer. In this case, num_layers = 1.
    output_inpainted_layer_only: bool = False
    # Whether to set the uninpainted region to zero opacities.
    set_uninpainted_opacity_to_zero: bool = False
    # Whether to concatenate the inpainting mask to the feature input.
    concat_inpainting_mask: bool = False


@dataclasses.dataclass
class MonodepthParams:
    """Parameters for monodepth network."""

    patch_encoder_preset: ViTPreset = "dinov2l16_384"
    image_encoder_preset: ViTPreset = "dinov2l16_384"

    checkpoint_uri: str | None = None
    unfreeze_patch_encoder: bool = False
    unfreeze_image_encoder: bool = False
    unfreeze_decoder: bool = False
    unfreeze_head: bool = False
    unfreeze_norm_layers: bool = False
    grad_checkpointing: bool = False
    use_patch_overlap: bool = True
    dims_decoder: DimsDecoder = (256, 256, 256, 256, 256)


@dataclasses.dataclass
class MonodepthAdaptorParams:
    """Parameters for monodepth network feature adaptor."""

    encoder_features: bool = True
    decoder_features: bool = False


@dataclasses.dataclass
class GaussianDecoderParams:
    """Parameters for backbone with default values."""

    dim_in: int = 5
    dim_out: int = 32
    # Which normalization to use in backbone.
    norm_type: NormLayerName = "group_norm"
    # How many groups to use for group normalization.
    norm_num_groups: int = 8
    # Stride of backbone.
    stride: int = 2

    patch_encoder_preset: ViTPreset = "dinov2l16_384"
    image_encoder_preset: ViTPreset = "dinov2l16_384"

    # Dimensionality of feature maps for DPT decoder.
    dims_decoder: DimsDecoder = (128, 128, 128, 128, 128)

    # Whether to use depth as input.
    use_depth_input: bool = True

    # Whether to enable gradient checkpointing for the backbone.
    grad_checkpointing: bool = False

    # What mode to use for upsampling in decoder.
    upsampling_mode: UpsamplingMode = "transposed_conv"

    # The type of image encoder.
    image_encoder_type: DPTImageEncoderType = "skip_conv_kernel2"


@dataclasses.dataclass
class PredictorParams:
    """Parameters for predictors with default values."""

    # Parameters for submodules.
    initializer: InitializerParams = dataclasses.field(default_factory=InitializerParams)
    monodepth: MonodepthParams = dataclasses.field(default_factory=MonodepthParams)
    monodepth_adaptor: MonodepthAdaptorParams = dataclasses.field(
        default_factory=MonodepthAdaptorParams
    )
    gaussian_decoder: GaussianDecoderParams = dataclasses.field(
        default_factory=GaussianDecoderParams
    )
    # How to align depth map (only relevant for RGBGaussianPredictor).
    depth_alignment: AlignmentParams = dataclasses.field(default_factory=AlignmentParams)

    # Selectively reduce learning rate for different properties.
    delta_factor: DeltaFactor = dataclasses.field(default_factory=DeltaFactor)
    # The maximum scale of Gaussians relative to initial scale.
    max_scale: float = 10.0
    # The minimum scale of Gaussians relative to initial scale.
    min_scale: float = 0.0
    # Which normalization to use in prediction head.
    norm_type: NormLayerName = "group_norm"
    # How many groups to use for group normalization.
    norm_num_groups: int = 8
    # Whether to use predicted mean to sample triplane features.
    use_predicted_mean: bool = False
    # Which activation function to use for colors / opacities.
    color_activation_type: math_utils.ActivationType = "sigmoid"
    opacity_activation_type: math_utils.ActivationType = "sigmoid"
    # Colorspace of the renderer ("linearRGB" or "sRGB").
    color_space: ColorSpace = "linearRGB"
    # A small value to avoid ill-conditioned splats.
    low_pass_filter_eps: float = 1e-2
    # How many layers of depth the monodepth model predicts.
    num_monodepth_layers: int = 2
    # Whether to sort the monodepth output (for two layer monodepth).
    sorting_monodepth: bool = False
    # Whether to account for the z offsets when estimating the base scale.
    base_scale_on_predicted_mean: bool = True
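
Since every sub-config is a dataclass with a default factory, overriding a single knob is straightforward (illustrative):

    from sharp.models.params import InitializerParams, PredictorParams

    params = PredictorParams(
        initializer=InitializerParams(num_layers=3, color_option="first_layer"),
        max_scale=5.0,
    )
    print(params.initializer.num_layers)         # 3
    print(params.gaussian_decoder.dims_decoder)  # (128, 128, 128, 128, 128)
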
src/sharp/models/predictor.py ADDED
@@ -0,0 +1,201 @@
1
+ """Contains definition of RGB-only gaussian predictor.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+ from sharp.models.monodepth import MonodepthWithEncodingAdaptor
15
+ from sharp.utils.gaussians import Gaussians3D
16
+
17
+ from .composer import GaussianComposer
18
+
19
+ LOGGER = logging.getLogger(__name__)
20
+
21
+
22
+ class DepthAlignment(nn.Module):
23
+ """Depth alignment in a dedicated nn.Module.
24
+
25
+ Wrap scale_map_estimator to perform the conditional logic in a separated torch
26
+ module outside the forward of RGBGaussianPredictor. This module can be then
27
+ excluded during symbolic tracing.
28
+ """
29
+
30
+ def __init__(self, scale_map_estimator: nn.Module | None):
31
+ """Initialize DepthAlignmentWrapper.
32
+
33
+ Args:
34
+ scale_map_estimator: Module to align monodepth to ground truth depth.
35
+ """
36
+ super().__init__()
37
+ self.scale_map_estimator = scale_map_estimator
38
+
39
+ def forward(
40
+ self,
41
+ monodepth: torch.Tensor,
42
+ depth: torch.Tensor,
43
+ depth_decoder_features: torch.Tensor | None = None,
44
+ ):
45
+ """Optionally align monodepth to ground truth with a local scale map.
46
+
47
+ Args:
48
+ monodepth: The monodepth model with intermediate features to use.
49
+ depth: Ground truth depth to align predicted depth to.
50
+ depth_decoder_features: The (optional) monodepth decoder features.
51
+ """
52
+ if depth is not None and self.scale_map_estimator is not None:
53
+ depth_alignment_map = self.scale_map_estimator(
54
+ monodepth[:, 0:1], depth, depth_decoder_features
55
+ )
56
+ monodepth = depth_alignment_map * monodepth
57
+ else:
58
+ # Some losses rely on the presence of an alignment map.
59
+ # We ensure that they can be computed by creating a fake alignment map.
60
+ depth_alignment_map = torch.ones_like(monodepth)
61
+ return monodepth, depth_alignment_map
62
+
63
+
64
+ class RGBGaussianPredictor(nn.Module):
65
+ """Predicts 3D Gaussians from images."""
66
+
67
+ feature_model: nn.Module
68
+
69
+ def __init__(
70
+ self,
71
+ init_model: nn.Module,
72
+ monodepth_model: MonodepthWithEncodingAdaptor,
73
+ feature_model: nn.Module,
74
+ prediction_head: nn.Module,
75
+ gaussian_composer: GaussianComposer,
76
+ scale_map_estimator: nn.Module | None,
77
+ ) -> None:
78
+ """Initialize RGBGaussianPredictor.
79
+
80
+ Args:
81
+ init_model: A model mapping image and depth to base values.
82
+ monodepth_model: The monodepth model with intermediate features to use.
83
+ feature_model: The image2image model to predict Gaussians from.
84
+ prediction_head: Head to decode image features.
85
+ gaussian_composer: Module to compose final prediction from deltas and
86
+ base values.
87
+ scale_map_estimator: Module to align monodepth to ground truth depth.
88
+
89
+ Note:
90
+ ----
91
+ when monodepth_model is trainable, using local depth alignment can
92
+ result in the monodepth model losing its ability to predict shapes. It is
93
+ hence recommend to deactivate the corresponding flag.
94
+ """
95
+ super().__init__()
96
+ self.init_model = init_model
97
+ self.feature_model = feature_model
98
+ self.monodepth_model = monodepth_model
99
+ self.prediction_head = prediction_head
100
+ self.gaussian_composer = gaussian_composer
101
+ self.depth_alignment = DepthAlignment(scale_map_estimator)
102
+
103
+ def forward(
104
+ self,
105
+ image: torch.Tensor,
106
+ disparity_factor: torch.Tensor,
107
+ depth: torch.Tensor | None = None,
108
+ ) -> Gaussians3D:
109
+ """Predict 3D Gaussians.
110
+
111
+ Args:
112
+ image: The image to process.
113
+ disparity_factor: Factor to convert depth to disparities.
114
+ depth: Ground truth depth to align predicted depth to.
115
+
116
+ Returns:
117
+ The predicted 3D Gaussians.
118
+
119
+ Note:
120
+ ----
121
+ During training, it is recommended to feed an additional ground truth depth
122
+ map to the network to align the predicted depth to. During inference, it is
123
+ recommended to use depth_gt=None and use monodepth_disparity output from the
124
+ model instead to compute depth.
125
+ """
126
+ # Estimate depth and align to ground truth (if available).
127
+ monodepth_output = self.monodepth_model(image)
128
+ monodepth_disparity = monodepth_output.disparity
129
+
130
+ disparity_factor = disparity_factor[:, None, None, None]
131
+ monodepth = disparity_factor / monodepth_disparity.clamp(min=1e-4, max=1e4)
132
+
133
+ # In the model we apply additional alignment to provided ground truth depth
134
+ # as well as additional normalization.
135
+ #
136
+ # The overall graph looks as follows:
137
+ #
138
+ # monodepth depth # Both monodepth and depth are metric here.
139
+ # | |
140
+ # +------+-------+
141
+ # |
142
+ # +-------+--------+ # Optionally align monodepth to ground truth
143
+ # |depth_alignement| # with a local scale map.
144
+ # +-------+--------+
145
+ # |
146
+ # v
147
+ # monodepth (aligned) # Monodepth is now aligned to ground truth.
148
+ # |
149
+ # +-----+----+ # Normalize depth and compute base gaussians.
150
+ # |init_model| # in these normalized coordinates.
151
+ # +-----+----+
152
+ # |
153
+ # v
154
+ # +------ init_output # Init_output consists of features, base
155
+ # | | # gaussians and a global scale.
156
+ # | +------+-----+
157
+ # | |main network| # Compute delta values to base gaussians.
158
+ # | +------+-----+
159
+ # | |
160
+ # | V
161
+ # | delta_values # The delta values are computed with normalized depth.
162
+ # | |
163
+ # | +-------+---------+
164
+ # +--> |gaussian_composer| # Add delta to base values and unscale gaussians.
165
+ # +-------+---------+
166
+ # |
167
+ # v
168
+ # gaussians # The final Gaussians are metric again.
169
+ #
170
+
171
+ # The logic to decide whether to align monodepth to the ground truth is wrapped
172
+ # in a submodule 'DepthAlignment' to facilitate the symbolic tracing of the
173
+ # predictor. This way, the depth alignment submodule containing the conditional
174
+ # logic can be excluded during the tracing and the graph of the predictors is
175
+ # static.
176
+ monodepth, _ = self.depth_alignment(
177
+ monodepth,
178
+ depth,
179
+ monodepth_output.decoder_features,
180
+ )
181
+
182
+ init_output = self.init_model(image, monodepth)
183
+ image_features = self.feature_model(
184
+ init_output.feature_input, encodings=monodepth_output.output_features
185
+ )
186
+ delta_values = self.prediction_head(image_features)
187
+ gaussians = self.gaussian_composer(
188
+ delta=delta_values,
189
+ base_values=init_output.gaussian_base_values,
190
+ global_scale=init_output.global_scale,
191
+ )
192
+ return gaussians
193
+
194
+ def internal_resolution(self) -> int:
195
+ """Internal resolution."""
196
+ return self.monodepth_model.internal_resolution()
197
+
198
+ @property
199
+ def output_resolution(self) -> int:
200
+ """Output resolution of Gaussians."""
201
+ return self.internal_resolution() // 2
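A minimal inference sketch for the predictor above. All module instances and the input resolution are assumptions for illustration; following the note in `forward`, `depth=None` is used at inference time:

import torch

# Hypothetical instances -- the concrete constructors live elsewhere in the repo.
predictor = RGBGaussianPredictor(
    init_model=init_model,
    monodepth_model=monodepth_model,
    feature_model=feature_model,
    prediction_head=prediction_head,
    gaussian_composer=gaussian_composer,
    scale_map_estimator=None,  # disable local depth alignment
)
image = torch.rand(1, 3, 1536, 1536)  # assumed input resolution
disparity_factor = torch.ones(1)      # per-image factor converting depth to disparity
with torch.no_grad():
    gaussians = predictor(image, disparity_factor, depth=None)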
src/sharp/models/presets/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """Contains presets for pretrained neural networks.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from .monodepth import (
8
+ MONODEPTH_ENCODER_DIMS_MAP,
9
+ MONODEPTH_HOOK_IDS_MAP,
10
+ )
11
+ from .vit import (
12
+ VIT_CONFIG_DICT,
13
+ ViTConfig,
14
+ ViTPreset,
15
+ )
16
+
17
+ __all__ = [
18
+ "ViTConfig",
19
+ "ViTPreset",
20
+ "VIT_CONFIG_DICT",
21
+ "MONODEPTH_ENCODER_DIMS_MAP",
22
+ "MONODEPTH_HOOK_IDS_MAP",
23
+ ]
src/sharp/models/presets/monodepth.py ADDED
@@ -0,0 +1,21 @@
1
+ """Contains preset for monodepth modules.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .vit import ViTPreset
10
+
11
+ # Map each encoder preset to the number of output channels
12
+ # of each tensor in the encoder output.
13
+ MONODEPTH_ENCODER_DIMS_MAP: dict[ViTPreset, list[int]] = {
14
+ # For publication
15
+ "dinov2l16_384": [256, 512, 1024, 1024],
16
+ }
17
+
18
+ MONODEPTH_HOOK_IDS_MAP: dict[ViTPreset, list[int]] = {
19
+ # For publication
20
+ "dinov2l16_384": [5, 11, 17, 23],
21
+ }
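For illustration, the two maps are keyed by the same preset and are intended to be used together; a small sketch:

preset: ViTPreset = "dinov2l16_384"
hook_ids = MONODEPTH_HOOK_IDS_MAP[preset]          # [5, 11, 17, 23]
encoder_dims = MONODEPTH_ENCODER_DIMS_MAP[preset]  # [256, 512, 1024, 1024]
# One output channel count per hooked transformer block.
assert len(hook_ids) == len(encoder_dims)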
src/sharp/models/presets/vit.py ADDED
@@ -0,0 +1,58 @@
1
+ """Contains preset for ViT modules.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import dataclasses
10
+ from typing import Literal
11
+
12
+ ViTPreset = Literal["dinov2l16_384",]
13
+
14
+ MLPMode = Literal["vanilla", "glu"]
15
+
16
+
17
+ @dataclasses.dataclass
18
+ class ViTConfig:
19
+ """Configuration for ViT."""
20
+
21
+ in_chans: int
22
+ embed_dim: int
23
+ depth: int
24
+ num_heads: int
25
+ init_values: float
26
+
27
+ img_size: int = 384
28
+ patch_size: int = 16
29
+
30
+ num_classes: int = 21841
31
+ mlp_ratio: float = 4.0
32
+ drop_rate: float = 0.0
33
+ attn_drop_rate: float = 0.0
34
+ drop_path_rate: float = 0.0
35
+ qkv_bias: bool = True
36
+ global_pool: str = "avg"
37
+
38
+ # Properties for timm_vit.
39
+ mlp_mode: MLPMode = "vanilla"
40
+
41
+ # Properties for SPN.
42
+ intermediate_features_ids: list[int] | None = None
43
+
44
+ def asdict(self):
45
+ """Convenience method to convert the class to a dict."""
46
+ return dataclasses.asdict(self)
47
+
48
+
49
+ VIT_CONFIG_DICT: dict[ViTPreset, ViTConfig] = {
50
+ "dinov2l16_384": ViTConfig(
51
+ in_chans=3,
52
+ embed_dim=1024,
53
+ depth=24,
54
+ num_heads=16,
55
+ init_values=1e-5,
56
+ global_pool="",
57
+ ),
58
+ }
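A sketch of how a preset might be consumed; the backbone constructor and the dropped keys are assumptions, while `asdict` flattens the dataclass into keyword arguments:

config = VIT_CONFIG_DICT["dinov2l16_384"]
num_patches = (config.img_size // config.patch_size) ** 2  # 24 * 24 = 576 tokens
vit_kwargs = config.asdict()
# Drop fields a hypothetical backbone constructor would not accept.
vit_kwargs.pop("intermediate_features_ids")
vit_kwargs.pop("mlp_mode")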
src/sharp/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Contains utils packages.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
src/sharp/utils/camera.py ADDED
@@ -0,0 +1,386 @@
1
+ """Contains utility functionality to render different modalities.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import dataclasses
10
+ from typing import Literal, NamedTuple
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+ from .gaussians import Gaussians3D
16
+ from .linalg import eyes
17
+
18
+ TrajetoryType = Literal["swipe", "shake", "rotate", "rotate_forward"]
19
+ LookAtMode = Literal["point", "ahead"]
20
+
21
+
22
+ @dataclasses.dataclass
23
+ class CameraInfo:
24
+ """Camera info for a pinhole camera."""
25
+
26
+ intrinsics: torch.Tensor
27
+ extrinsics: torch.Tensor
28
+ width: int
29
+ height: int
30
+
31
+
32
+ class FocusRange(NamedTuple):
33
+ """Parametrizes a range of depth / disparity values."""
34
+
35
+ min: float
36
+ focus: float
37
+ max: float
38
+
39
+
40
+ @dataclasses.dataclass
41
+ class TrajectoryParams:
42
+ """Parameters for trajectory."""
43
+
44
+ type: TrajetoryType = "rotate_forward"
45
+ lookat_mode: LookAtMode = "point"
46
+ max_disparity: float = 0.08
47
+ max_zoom: float = 0.15
48
+ distance_m: float = 0.0
49
+ num_steps: int = 60
50
+ num_repeats: int = 1
51
+
52
+
53
+ def compute_max_offset(
54
+ scene: Gaussians3D,
55
+ params: TrajectoryParams,
56
+ resolution_px: tuple[int, int],
57
+ f_px: float,
58
+ ) -> np.ndarray:
59
+ """Compute the maximum offset for camera along X/Y/Z axis."""
60
+ scene_points = scene.mean_vectors
61
+ extrinsics = torch.eye(4).to(scene_points.device)
62
+ min_depth, _, _ = _compute_depth_quantiles(scene_points, extrinsics)
63
+
64
+ r_px = resolution_px
65
+ diagonal = np.sqrt((r_px[0] / f_px) ** 2 + (r_px[1] / f_px) ** 2)
66
+ max_lateral_offset_m = params.max_disparity * diagonal * min_depth
67
+
68
+ max_medial_offset_m = params.max_zoom * min_depth
69
+ max_offset_xyz_m = np.array([max_lateral_offset_m, max_lateral_offset_m, max_medial_offset_m])
70
+
71
+ return max_offset_xyz_m
72
+
73
+
74
+ def create_eye_trajectory(
75
+ scene: Gaussians3D,
76
+ params: TrajectoryParams,
77
+ resolution_px: tuple[int, int],
78
+ f_px: float,
79
+ ) -> list[torch.Tensor]:
80
+ """Create eye trajectory for trajectory type."""
81
+ max_offset_xyz_m = compute_max_offset(
82
+ scene,
83
+ params,
84
+ resolution_px,
85
+ f_px,
86
+ )
87
+ # We place the eye trajectory on the z=distance_m plane (default 0),
88
+ # assuming the portal plane sits at its natural viewing distance.
89
+ if params.type == "swipe":
90
+ return create_eye_trajectory_swipe(
91
+ max_offset_xyz_m, params.distance_m, params.num_steps, params.num_repeats
92
+ )
93
+ elif params.type == "shake":
94
+ return create_eye_trajectory_shake(
95
+ max_offset_xyz_m, params.distance_m, params.num_steps, params.num_repeats
96
+ )
97
+ elif params.type == "rotate":
98
+ return create_eye_trajectory_rotate(
99
+ max_offset_xyz_m, params.distance_m, params.num_steps, params.num_repeats
100
+ )
101
+ elif params.type == "rotate_forward":
102
+ return create_eye_trajectory_rotate_forward(
103
+ max_offset_xyz_m, params.distance_m, params.num_steps, params.num_repeats
104
+ )
105
+ else:
106
+ raise ValueError(f"Invalid trajectory type {params.type}.")
107
+
108
+
109
+ def create_eye_trajectory_swipe(
110
+ offset_xyz_m: np.ndarray,
111
+ distance_m: float,
112
+ num_steps: int,
113
+ num_repeats: int,
114
+ ) -> list[torch.Tensor]:
115
+ """Create a left to right swipe trajectory."""
116
+ offset_x_m, _, _ = offset_xyz_m
117
+ eye_positions = [
118
+ torch.tensor([x, 0, distance_m], dtype=torch.float32)
119
+ for x in np.linspace(-offset_x_m, offset_x_m, num_steps)
120
+ ]
121
+ return eye_positions * num_repeats
122
+
123
+
124
+ def create_eye_trajectory_shake(
125
+ offset_xyz_m: np.ndarray,
126
+ distance_m: float,
127
+ num_steps: int,
128
+ num_repeats: int,
129
+ ) -> list[torch.Tensor]:
130
+ """Create a left right shake followed by an up down shake trajectory."""
131
+ num_steps_total = num_steps * num_repeats
132
+ num_steps_horizontal = num_steps_total // 2
133
+ num_steps_vertical = num_steps_total - num_steps_horizontal
134
+
135
+ offset_x_m, offset_y_m, _ = offset_xyz_m
136
+ eye_positions: list[torch.Tensor] = []
137
+ eye_positions.extend(
138
+ torch.tensor(
139
+ [offset_x_m * np.sin(2 * np.pi * t), 0.0, distance_m],
140
+ dtype=torch.float32,
141
+ )
142
+ for t in np.linspace(0, num_repeats, num_steps_horizontal)
143
+ )
144
+ eye_positions.extend(
145
+ torch.tensor(
146
+ [0.0, offset_y_m * np.sin(2 * np.pi * t), distance_m],
147
+ dtype=torch.float32,
148
+ )
149
+ for t in np.linspace(0, num_repeats, num_steps_vertical)
150
+ )
151
+
152
+ return eye_positions
153
+
154
+
155
+ def create_eye_trajectory_rotate(
156
+ offset_xyz_m: np.ndarray,
157
+ distance_m: float,
158
+ num_steps: int,
159
+ num_repeats: int,
160
+ ) -> list[torch.Tensor]:
161
+ """Create a rotating trajectory."""
162
+ num_steps_total = num_steps * num_repeats
163
+ offset_x_m, offset_y_m, _ = offset_xyz_m
164
+ eye_positions = [
165
+ torch.tensor(
166
+ [
167
+ offset_x_m * np.sin(2 * np.pi * t),
168
+ offset_y_m * np.cos(2 * np.pi * t),
169
+ distance_m,
170
+ ],
171
+ dtype=torch.float32,
172
+ )
173
+ for t in np.linspace(0, num_repeats, num_steps_total)
174
+ ]
175
+
176
+ return eye_positions
177
+
178
+
179
+ def create_eye_trajectory_rotate_forward(
180
+ offset_xyz_m: np.ndarray,
181
+ distance_m: float,
182
+ num_steps: int,
183
+ num_repeats: int,
184
+ ) -> list[torch.Tensor]:
185
+ """Create a rotating trajectory."""
186
+ num_steps_total = num_steps * num_repeats
187
+ offset_x_m, _, offset_z_m = offset_xyz_m
188
+ eye_positions = [
189
+ torch.tensor(
190
+ [
191
+ offset_x_m * np.sin(2 * np.pi * t),
192
+ 0.0,
193
+ distance_m + offset_z_m * (1.0 - np.cos(2 * np.pi * t)) / 2,
194
+ ],
195
+ dtype=torch.float32,
196
+ )
197
+ for t in np.linspace(0, num_repeats, num_steps_total)
198
+ ]
199
+
200
+ return eye_positions
201
+
202
+
203
+ def create_camera_model(
204
+ scene: Gaussians3D,
205
+ intrinsics: torch.Tensor,
206
+ resolution_px: tuple[int, int],
207
+ lookat_mode: LookAtMode = "point",
208
+ ) -> PinholeCameraModel:
209
+ """Create camera model to simulate general pinhole camera."""
210
+ screen_extrinsics = torch.eye(4)
211
+ screen_intrinsics = intrinsics.clone()
212
+
213
+ image_width, image_height = resolution_px
214
+ screen_resolution_px = get_screen_resolution_px_from_input(
215
+ width=image_width, height=image_height
216
+ )
217
+
218
+ screen_intrinsics[0] *= screen_resolution_px[0] / image_width
219
+ screen_intrinsics[1] *= screen_resolution_px[1] / image_height
220
+
221
+ camera_model = PinholeCameraModel(
222
+ scene,
223
+ screen_extrinsics=screen_extrinsics,
224
+ screen_intrinsics=screen_intrinsics,
225
+ screen_resolution_px=screen_resolution_px,
226
+ focus_depth_quantile=0.1,
227
+ min_depth_focus=2.0,
228
+ lookat_mode=lookat_mode,
229
+ )
230
+ return camera_model
231
+
232
+
233
+ def create_camera_matrix(
234
+ position: torch.Tensor,
235
+ look_at_position: torch.Tensor | None = None,
236
+ world_up: torch.Tensor | None = None,
237
+ inverse: bool = False,
238
+ ) -> torch.Tensor:
239
+ """Create camera matrix from vectors."""
240
+ device = position.device
241
+
242
+ if look_at_position is None:
243
+ look_at_position = torch.zeros(3, device=device)
244
+ if world_up is None:
245
+ world_up = torch.tensor([0.0, 0.0, 1.0], device=device)
246
+
247
+ position, look_at_position, world_up = torch.broadcast_tensors(
248
+ position, look_at_position, world_up
249
+ )
250
+
251
+ camera_front = look_at_position - position
252
+ camera_front = camera_front / camera_front.norm(dim=-1, keepdim=True)
253
+
254
+ camera_right = torch.cross(camera_front, world_up, dim=-1)
255
+ camera_right = camera_right / camera_right.norm(dim=-1, keepdim=True)
256
+
257
+ camera_down = torch.cross(camera_front, camera_right, dim=-1)
258
+ rotation_matrix = torch.stack([camera_right, camera_down, camera_front], dim=-1)
259
+
260
+ matrix = eyes(dim=4, shape=position.shape[:-1], device=device)
261
+ if inverse:
262
+ matrix[..., :3, :3] = rotation_matrix.transpose(-1, -2)
263
+ matrix[..., :3, 3:4] = -rotation_matrix.transpose(-1, -2) @ position[..., None]
264
+ else:
265
+ matrix[..., :3, :3] = rotation_matrix
266
+ matrix[..., :3, 3] = position
267
+
268
+ return matrix
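# Sanity-check sketch (not part of the module): with inverse=True the matrix
# maps world points into the camera frame, so the look-at target should land
# on the camera's +Z axis. world_up follows the y-down convention used below.
import torch

position = torch.tensor([0.0, 0.0, -2.0])
target = torch.zeros(3)
world_up = torch.tensor([0.0, -1.0, 0.0])
world_to_cam = create_camera_matrix(position, target, world_up, inverse=True)
point_cam = world_to_cam[:3, :3] @ target + world_to_cam[:3, 3]
assert torch.allclose(point_cam, torch.tensor([0.0, 0.0, 2.0]), atol=1e-6)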
269
+
270
+
271
+ class PinholeCameraModel:
272
+ """Camera model that focuses on point."""
273
+
274
+ def __init__(
275
+ self,
276
+ scene: Gaussians3D,
277
+ screen_extrinsics: torch.Tensor,
278
+ screen_intrinsics: torch.Tensor,
279
+ screen_resolution_px: tuple[int, int],
280
+ focus_depth_quantile: float = 0.1,
281
+ min_depth_focus: float = 2.0,
282
+ lookat_point: tuple[float, float, float] | None = None,
283
+ lookat_mode: LookAtMode = "point",
284
+ ) -> None:
285
+ """Initialize GeneralPinholeCameraModel.
286
+
287
+ Args:
288
+ scene: The scene to display.
289
+ screen_extrinsics: Extrinsics of the default position.
290
+ screen_intrinsics: Intrinsics to use for rendering.
291
+ screen_resolution_px: Width and height to render.
292
+ focus_depth_quantile: Where inside the depth range to focus on.
293
+ min_depth_focus: Minimum depth to focus at.
294
+ lookat_point: A point towards which the camera's Z axis is directed.
295
+ lookat_mode: "point" to look at a fixed point,
296
+ "ahead" to look straight ahead.
297
+ """
298
+ self.scene = scene
299
+ self.screen_extrinsics = screen_extrinsics
300
+ self.screen_intrinsics = screen_intrinsics
301
+ self.screen_resolution_px = screen_resolution_px
302
+
303
+ self.focus_depth_quantile = focus_depth_quantile
304
+ self.min_depth_focus = min_depth_focus
305
+ self.lookat_point = lookat_point
306
+ self.lookat_mode = lookat_mode
307
+
308
+ scene_points = scene.mean_vectors
309
+ if scene_points.ndim == 3:
310
+ scene_points = scene_points[0]
311
+ elif scene_points.ndim != 2:
312
+ raise ValueError("Unsupported dimensionality of scene points.")
313
+ self._scene_points = scene_points.cpu()
314
+
315
+ self.depth_quantiles = _compute_depth_quantiles(
316
+ self._scene_points,
317
+ self.screen_extrinsics,
318
+ q_focus=self.focus_depth_quantile,
319
+ )
320
+
321
+ def compute(self, eye_pos: torch.Tensor) -> CameraInfo:
322
+ """Compute camera for eye position."""
323
+ extrinsics = self.screen_extrinsics.clone()
324
+
325
+ origin = eye_pos if self.lookat_mode == "ahead" else torch.zeros(3)
326
+
327
+ if self.lookat_point is None:
328
+ depth_focus = max(self.min_depth_focus, self.depth_quantiles.focus)
329
+ look_at_position = origin + torch.tensor([0.0, 0.0, depth_focus])
330
+ else:
331
+ look_at_position = origin + torch.tensor([*self.lookat_point])
332
+
333
+ world_up = torch.tensor([0.0, -1.0, 0.0])
334
+ extrinsics_modifier = create_camera_matrix(
335
+ eye_pos, look_at_position, world_up, inverse=True
336
+ )
337
+ extrinsics = extrinsics_modifier @ self.screen_extrinsics
338
+
339
+ camera_info = CameraInfo(
340
+ intrinsics=self.screen_intrinsics,
341
+ extrinsics=extrinsics,
342
+ width=self.screen_resolution_px[0],
343
+ height=self.screen_resolution_px[1],
344
+ )
345
+ return camera_info
346
+
347
+ def set_screen_extrinsics(self, new_value: torch.Tensor) -> None:
348
+ """Modify the default extrinsics."""
349
+ self.screen_extrinsics = new_value
350
+ self.depth_quantiles = _compute_depth_quantiles(self._scene_points, self.screen_extrinsics)
351
+
352
+
353
+ def get_screen_resolution_px_from_input(width: int, height: int) -> tuple[int, int]:
354
+ """Get resolution for metadata dictionary."""
355
+ resolution_px = (width, height)
356
+ # Halve the dimensions for very large images.
357
+ if resolution_px[1] > 3000:
358
+ resolution_px = (resolution_px[0] // 2, resolution_px[1] // 2)
359
+ # For MP4 compatibility, force the dimensions to even numbers;
360
+ # otherwise the video cannot be played in a browser.
361
+ if resolution_px[0] % 2 != 0:
362
+ resolution_px = (resolution_px[0] + 1, resolution_px[1])
363
+ if resolution_px[1] % 2 != 0:
364
+ resolution_px = (resolution_px[0], resolution_px[1] + 1)
365
+ return resolution_px
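# Worked example (illustrative): a 1921 x 3001 portrait photo exceeds the
# 3000 px height limit, so both dimensions are halved to (960, 1500); both
# values are already even, so the parity fix-up leaves them unchanged.
assert get_screen_resolution_px_from_input(width=1921, height=3001) == (960, 1500)
# An odd width is bumped up by one pixel for MP4 compatibility.
assert get_screen_resolution_px_from_input(width=641, height=480) == (642, 480)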
366
+
367
+
368
+ def _compute_depth_quantiles(
369
+ points: torch.Tensor,
370
+ extrinsics: torch.Tensor,
371
+ q_near: float = 0.001,
372
+ q_focus: float = 0.1,
373
+ q_far: float = 0.999,
374
+ ) -> FocusRange:
375
+ """Compute disparity quantiles for scene and extrinsics id."""
376
+ points_local = points @ extrinsics[:3, :3].T + extrinsics[:3, 3]
377
+ depth_values = points_local[..., 2].flatten()
378
+ depth_values = depth_values[depth_values > 0]
379
+ q_values = torch.tensor([q_near, q_focus, q_far])
380
+ depth_quantiles_pt = torch.quantile(depth_values.cpu(), q_values)
381
+ depth_quantiles = FocusRange(
382
+ min=float(depth_quantiles_pt[0]),
383
+ focus=float(depth_quantiles_pt[1]),
384
+ max=float(depth_quantiles_pt[2]),
385
+ )
386
+ return depth_quantiles
src/sharp/utils/color_space.py ADDED
@@ -0,0 +1,88 @@
1
+ """Contains color space utility functions.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Literal
11
+
12
+ import torch
13
+
14
+ from sharp.utils.robust import robust_where
15
+
16
+ LOGGER = logging.getLogger(__name__)
17
+
18
+ ColorSpace = Literal["sRGB", "linearRGB"]
19
+
20
+
21
+ def encode_color_space(color_space: ColorSpace) -> int:
22
+ """Encode color space to integer."""
23
+ return 0 if color_space == "sRGB" else 1
24
+
25
+
26
+ def decode_color_space(color_space_index: int) -> ColorSpace:
27
+ """Decode color space index to color space."""
28
+ return "sRGB" if color_space_index == 0 else "linearRGB"
29
+
30
+
31
+ def sRGB2linearRGB(sRGB: torch.Tensor) -> torch.Tensor:
32
+ """SRGB to linearRGB conversion function.
33
+
34
+ Reference:
35
+ https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
36
+ Section 7.7.7
37
+
38
+ Args:
39
+ sRGB: Input image tensor in sRGB space.
40
+ """
41
+ # We need to use robust_where to clamp the second branch.
42
+ # Otherwise, torch.where will lead to NaN in the backward pass, see
43
+ # https://github.com/pytorch/pytorch/issues/68425
44
+ THRESHOLD = 0.04045
45
+
46
+ def branch_true_func(x):
47
+ return x / 12.92
48
+
49
+ def branch_false_func(x):
50
+ return ((x + 0.055) / 1.055) ** 2.4
51
+
52
+ return robust_where(
53
+ sRGB <= THRESHOLD,
54
+ sRGB,
55
+ branch_true_func,
56
+ branch_false_func,
57
+ branch_false_safe_value=THRESHOLD,
58
+ )
59
+
60
+
61
+ def linearRGB2sRGB(linearRGB: torch.Tensor) -> torch.Tensor:
62
+ """LinearRGB to sRGB conversion function.
63
+
64
+ Reference:
65
+ https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
66
+ Section 7.7.7
67
+
68
+ Args:
69
+ linearRGB: Input image tensor in linearRGB space.
70
+ """
71
+ # We need to use robust_where to clamp the second branch.
72
+ # Otherwise, torch.where will lead to NaN in the backward pass, see
73
+ # https://github.com/pytorch/pytorch/issues/68425
74
+ THRESHOLD = 0.0031308
75
+
76
+ def branch_true_func(x):
77
+ return x * 12.92
78
+
79
+ def branch_false_func(x):
80
+ return 1.055 * (x ** (1 / 2.4)) - 0.055
81
+
82
+ return robust_where(
83
+ linearRGB <= THRESHOLD,
84
+ linearRGB,
85
+ branch_true_func,
86
+ branch_false_func,
87
+ branch_false_safe_value=THRESHOLD,
88
+ )
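A round-trip sketch of the two conversions above; the behavior of `robust_where` at the branch point is assumed to match its documented purpose:

import torch

x = torch.linspace(0.0, 1.0, 5, requires_grad=True)
roundtrip = linearRGB2sRGB(sRGB2linearRGB(x))
assert torch.allclose(roundtrip, x, atol=1e-5)
# Gradients stay finite at x=0, which is the point of robust_where:
# a plain torch.where would propagate NaN/inf through the unused branch.
roundtrip.sum().backward()
assert torch.isfinite(x.grad).all()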
src/sharp/utils/gaussians.py ADDED
@@ -0,0 +1,480 @@
1
+ """Contains basic data structures and functionality for 3D Gaussians.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Any, Literal, NamedTuple
12
+
13
+ import numpy as np
14
+ import torch
15
+ from plyfile import PlyData, PlyElement
16
+
17
+ from sharp.utils import color_space as cs_utils
18
+ from sharp.utils import linalg
19
+
20
+ LOGGER = logging.getLogger(__name__)
21
+
22
+
23
+ BackgroundColor = Literal["black", "white", "random_color", "random_pixel"]
24
+
25
+
26
+ class Gaussians3D(NamedTuple):
27
+ """Represents a collection of 3D Gaussians."""
28
+
29
+ mean_vectors: torch.Tensor
30
+ singular_values: torch.Tensor
31
+ quaternions: torch.Tensor
32
+ colors: torch.Tensor
33
+ opacities: torch.Tensor
34
+
35
+ def to(self, device: torch.device) -> Gaussians3D:
36
+ """Move Gaussians to device."""
37
+ return Gaussians3D(
38
+ mean_vectors=self.mean_vectors.to(device),
39
+ singular_values=self.singular_values.to(device),
40
+ quaternions=self.quaternions.to(device),
41
+ colors=self.colors.to(device),
42
+ opacities=self.opacities.to(device),
43
+ )
44
+
45
+
46
+ class SceneMetaData(NamedTuple):
47
+ """Meta data about Gaussian scene."""
48
+
49
+ focal_length_px: float
50
+ resolution_px: tuple[int, int]
51
+ color_space: cs_utils.ColorSpace
52
+
53
+
54
+ def get_unprojection_matrix(
55
+ extrinsics: torch.Tensor,
56
+ intrinsics: torch.Tensor,
57
+ image_shape: tuple[int, int],
58
+ ) -> torch.Tensor:
59
+ """Compute unprojection matrix to transform Gaussians to Euclidean space.
60
+
61
+ Args:
62
+ extrinsics: The 4x4 extrinsics matrix of the camera view.
63
+ intrinsics: The 4x4 intrinsics matrix of the camera view.
64
+ image_shape: The (width, height) of the input image.
65
+
66
+ Returns:
67
+ A 4x4 matrix to transform Gaussians from NDC space to Euclidean space.
68
+ """
69
+ device = intrinsics.device
70
+ image_width, image_height = image_shape
71
+ # This matrix converts OpenCV pixel coordinates to NDC coordinates where
72
+ # (-1, -1) denotes the top left and (1, 1) the bottom right of the image.
73
+ #
74
+ # Note that premultiplying the intrinsics with ndc_matrix typically yields a matrix
75
+ # that simply scales the x-axis by 2 * focal_length / image_width and the y-axis by
76
+ # 2 * focal_length / image_height.
77
+ ndc_matrix = torch.tensor(
78
+ [
79
+ [2.0 / image_width, 0.0, -1.0, 0.0],
80
+ [0.0, 2.0 / image_height, -1.0, 0.0],
81
+ [0.0, 0.0, 1.0, 0.0],
82
+ [0.0, 0.0, 0.0, 1.0],
83
+ ],
84
+ device=device,
85
+ )
86
+ return torch.linalg.inv(ndc_matrix @ intrinsics @ extrinsics)
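# Verification sketch for the comment above (centered principal point assumed):
import torch

f_px, width, height = 500.0, 640, 480
intrinsics = torch.tensor(
    [
        [f_px, 0.0, width / 2, 0.0],
        [0.0, f_px, height / 2, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
)
ndc_matrix = torch.tensor(
    [
        [2.0 / width, 0.0, -1.0, 0.0],
        [0.0, 2.0 / height, -1.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
    ]
)
scaled = ndc_matrix @ intrinsics
# The product scales x by 2f/W and y by 2f/H, and the principal point cancels.
assert torch.allclose(scaled[0, 0], torch.tensor(2 * f_px / width))
assert torch.allclose(scaled[1, 1], torch.tensor(2 * f_px / height))
assert torch.allclose(scaled[:2, 2], torch.zeros(2), atol=1e-6)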
87
+
88
+
89
+ def unproject_gaussians(
90
+ gaussians_ndc: Gaussians3D,
91
+ extrinsics: torch.Tensor,
92
+ intrinsics: torch.Tensor,
93
+ image_shape: tuple[int, int],
94
+ ) -> Gaussians3D:
95
+ """Unproject Gaussians from NDC space to world coordinates."""
96
+ unprojection_matrix = get_unprojection_matrix(extrinsics, intrinsics, image_shape)
97
+ gaussians = apply_transform(gaussians_ndc, unprojection_matrix[:3])
98
+ return gaussians
99
+
100
+
101
+ def apply_transform(gaussians: Gaussians3D, transform: torch.Tensor) -> Gaussians3D:
102
+ """Apply an affine transformation to 3D Gaussians.
103
+
104
+ Args:
105
+ gaussians: The Gaussians to transform.
106
+ transform: An affine transform with shape 3x4.
107
+
108
+ Returns:
109
+ The transformed Gaussians.
110
+
111
+ Note: This operation is not differentiable.
112
+ """
113
+ transform_linear = transform[..., :3, :3]
114
+ transform_offset = transform[..., :3, 3]
115
+
116
+ mean_vectors = gaussians.mean_vectors @ transform_linear.T + transform_offset
117
+ covariance_matrices = compose_covariance_matrices(
118
+ gaussians.quaternions, gaussians.singular_values
119
+ )
120
+ covariance_matrices = (
121
+ transform_linear @ covariance_matrices @ transform_linear.transpose(-1, -2)
122
+ )
123
+ quaternions, singular_values = decompose_covariance_matrices(covariance_matrices)
124
+
125
+ return Gaussians3D(
126
+ mean_vectors=mean_vectors,
127
+ singular_values=singular_values,
128
+ quaternions=quaternions,
129
+ colors=gaussians.colors,
130
+ opacities=gaussians.opacities,
131
+ )
132
+
133
+
134
+ def decompose_covariance_matrices(
135
+ covariance_matrices: torch.Tensor,
136
+ ) -> tuple[torch.Tensor, torch.Tensor]:
137
+ """Decompose 3D covariance matrices into quaternions and singular values.
138
+
139
+ Args:
140
+ covariance_matrices: The covariance matrices to decompose.
141
+
142
+ Returns:
143
+ Quaternion and singular values corresponding to the orientation and scales of
144
+ the diagonalized matrix.
145
+
146
+ Note: This operation is not differentiable.
147
+ """
148
+ device = covariance_matrices.device
149
+ dtype = covariance_matrices.dtype
150
+
151
+ # We convert to fp64 to avoid numerical errors.
152
+ covariance_matrices = covariance_matrices.detach().cpu().to(torch.float64)
153
+ rotations, singular_values_2, _ = torch.linalg.svd(covariance_matrices)
154
+
155
+ # NOTE: in SVD, it is possible that U and VT are both reflections.
156
+ # We need to correct them.
157
+ batch_idx, gaussian_idx = torch.where(torch.linalg.det(rotations) < 0)
158
+ num_reflections = len(gaussian_idx)
159
+ if num_reflections > 0:
160
+ LOGGER.warning(
161
+ "Received %d reflection matrices from SVD. Flipping them to rotations.",
162
+ num_reflections,
163
+ )
164
+ # Flip the last column of reflection and make it a rotation.
165
+ rotations[batch_idx, gaussian_idx, :, -1] *= -1
166
+ quaternions = linalg.quaternions_from_rotation_matrices(rotations)
167
+ quaternions = quaternions.to(dtype=dtype, device=device)
168
+ singular_values = singular_values_2.sqrt().to(dtype=dtype, device=device)
169
+ return quaternions, singular_values
170
+
171
+
172
+ def compose_covariance_matrices(
173
+ quaternions: torch.Tensor, singular_values: torch.Tensor
174
+ ) -> torch.Tensor:
175
+ """Compose 3D covariance matrices into quaternions and singular values.
176
+
177
+ Args:
178
+ quaternions: The quaternions describing the principal basis.
179
+ singular_values: The scales of the diagonalized matrix.
180
+
181
+ Returns:
182
+ The 3x3 covariance matrices.
183
+ """
184
+ device = quaternions.device
185
+ rotations = linalg.rotation_matrices_from_quaternions(quaternions)
186
+ diagonal_matrix = torch.eye(3, device=device) * singular_values[..., :, None]
187
+ return rotations @ diagonal_matrix.square() @ rotations.transpose(-1, -2)
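# Round-trip sketch: composing and then decomposing recovers the singular
# values up to SVD's descending ordering (quaternions are only unique up to
# sign and axis permutation, so they are not compared directly).
import torch
import torch.nn.functional as F

quats = F.normalize(torch.randn(1, 8, 4), dim=-1)
scales = torch.rand(1, 8, 3) + 0.1
covars = compose_covariance_matrices(quats, scales)
_, scales_rec = decompose_covariance_matrices(covars)
expected = scales.sort(dim=-1, descending=True).values
assert torch.allclose(scales_rec, expected, atol=1e-4)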
188
+
189
+
190
+ def convert_spherical_harmonics_to_rgb(sh0: torch.Tensor) -> torch.Tensor:
191
+ """Convert degree-0 spherical harmonics to RGB.
192
+
193
+ Reference:
194
+ https://en.wikipedia.org/wiki/Table_of_spherical_harmonics
195
+ """
196
+ coeff_degree0 = np.sqrt(1.0 / (4.0 * np.pi))
197
+ return sh0 * coeff_degree0 + 0.5
198
+
199
+
200
+ def convert_rgb_to_spherical_harmonics(rgb: torch.Tensor) -> torch.Tensor:
201
+ """Convert RGB to degree-0 spherical harmonics.
202
+
203
+ Reference:
204
+ https://en.wikipedia.org/wiki/Table_of_spherical_harmonics
205
+ """
206
+ coeff_degree0 = np.sqrt(1.0 / (4.0 * np.pi))
207
+ return (rgb - 0.5) / coeff_degree0
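# Quick numeric check: the two conversions are exact inverses, and an all-zero
# sh0 corresponds to mid-gray (0.5) because Y_0^0 is a constant function.
import torch

rgb = torch.tensor([0.25, 0.5, 1.0])
sh0 = convert_rgb_to_spherical_harmonics(rgb)
assert torch.allclose(convert_spherical_harmonics_to_rgb(sh0), rgb)
assert torch.allclose(convert_spherical_harmonics_to_rgb(torch.zeros(3)), torch.full((3,), 0.5))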
208
+
209
+
210
+ def load_ply(path: Path) -> tuple[Gaussians3D, SceneMetaData]:
211
+ """Loads a ply from a file."""
212
+ plydata = PlyData.read(path)
213
+
214
+ vertices = next(filter(lambda x: x.name == "vertex", plydata.elements))
215
+
216
+ properties = ["x", "y", "z"]
217
+ properties.extend([f"f_dc_{i}" for i in range(3)])
218
+ properties.extend([f"scale_{i}" for i in range(3)])
219
+ properties.extend([f"rot_{i}" for i in range(3)])
220
+
221
+ for prop in properties:
222
+ if prop not in vertices:
223
+ raise KeyError(f"Incompatible ply file: property {prop} not found in ply elements.")
224
+ mean_vectors = np.stack(
225
+ (
226
+ np.asarray(vertices["x"]),
227
+ np.asarray(vertices["y"]),
228
+ np.asarray(vertices["z"]),
229
+ ),
230
+ axis=1,
231
+ )
232
+
233
+ scale_logits = np.stack(
234
+ (
235
+ np.asarray(vertices["scale_0"]),
236
+ np.asarray(vertices["scale_1"]),
237
+ np.asarray(vertices["scale_2"]),
238
+ ),
239
+ axis=1,
240
+ )
241
+
242
+ quaternions = np.stack(
243
+ (
244
+ np.asarray(vertices["rot_0"]),
245
+ np.asarray(vertices["rot_1"]),
246
+ np.asarray(vertices["rot_2"]),
247
+ np.asarray(vertices["rot_3"]),
248
+ ),
249
+ axis=1,
250
+ )
251
+
252
+ spherical_harmonics_deg0 = np.stack(
253
+ (
254
+ np.asarray(vertices["f_dc_0"]),
255
+ np.asarray(vertices["f_dc_1"]),
256
+ np.asarray(vertices["f_dc_2"]),
257
+ ),
258
+ axis=1,
259
+ )
260
+
261
+ colors = convert_spherical_harmonics_to_rgb(spherical_harmonics_deg0)
262
+
263
+ opacity_logits = np.asarray(vertices["opacity"])[..., None]
264
+
265
+ supplement_elements = [element for element in plydata.elements if element.name != "vertex"]
266
+ supplement_data: dict[str, Any] = {}
267
+ supplement_keys = ["extrinsic", "intrinsic", "color_space", "image_size"]
268
+
269
+ for element in supplement_elements:
270
+ for key in supplement_keys:
271
+ if key not in supplement_data and key in element:
272
+ supplement_data[key] = np.asarray(element[key])
273
+
274
+ # Parse intrinsics and image_size.
275
+ if "intrinsic" in supplement_data:
276
+ intrinsics_data = supplement_data["intrinsic"]
277
+
278
+ # Legacy: image_size is contained in intrinsic element.
279
+ if "image_size" not in supplement_data:
280
+ if len(intrinsics_data) != 4:
281
+ raise ValueError(
282
+ "Expect legacy intrinsics with len=4 containing image size, "
283
+ f"but received len={len(intrinsics_data)}"
284
+ )
285
+ focal_length_px = (intrinsics_data[0], intrinsics_data[1])
286
+ width = int(intrinsics_data[2])
287
+ height = int(intrinsics_data[3])
288
+
289
+ else:
290
+ if len(intrinsics_data) != 9:
291
+ raise ValueError(
292
+ "Expect 9 elements in intrinsics, " f"but received {len(intrinsics_data)}."
293
+ )
294
+ intrinsics_matrix = intrinsics_data.reshape((3, 3))
295
+ focal_length_px = (intrinsics_matrix[0, 0], intrinsics_matrix[1, 1])
296
+
297
+ image_size_data = supplement_data["image_size"]
298
+ width = image_size_data[0]
299
+ height = image_size_data[1]
300
+
301
+ # Default to VGA resolution: focal length = 512, image size = (640, 480).
302
+ else:
303
+ focal_length_px = (512, 512)
304
+ width = 640
305
+ height = 480
306
+
307
+ # Parse extrinsics.
308
+ extrinsics_data = supplement_data.get("extrinsic", np.eye(4).flatten())
309
+ extrinsics_matrix = np.eye(4)
310
+
311
+ # Legacy: extrinsics store 12 elements.
312
+ if len(extrinsics_data) == 12:
313
+ extrinsics_matrix[:3] = extrinsics_data.reshape((3, 4))
314
+ extrinsics_matrix[:3, :3] = extrinsics_matrix[:3, :3].copy().T
315
+ elif len(extrinsics_data) == 16:
316
+ extrinsics_matrix[:] = extrinsics_data.reshape((4, 4))
317
+ else:
318
+ raise ValueError(f"Unrecognized extrinsics matrix shape {len(extrinsics_data)}")
319
+
320
+ # Parse color space.
321
+ color_space_index = supplement_data.get("color_space", 1)
322
+ color_space = cs_utils.decode_color_space(color_space_index)
323
+ if color_space == "sRGB":
324
+ colors = cs_utils.sRGB2linearRGB(colors)
325
+
326
+ mean_vectors = torch.from_numpy(mean_vectors).view(1, -1, 3).float()
327
+ quaternions = torch.from_numpy(quaternions).view(1, -1, 4).float()
328
+ singular_values = torch.exp(torch.from_numpy(scale_logits).view(1, -1, 3)).float()
329
+ opacities = torch.sigmoid(torch.from_numpy(opacity_logits).view(1, -1)).float()
330
+ colors = torch.from_numpy(colors).view(1, -1, 3).float()
331
+
332
+ gaussians = Gaussians3D(
333
+ mean_vectors=mean_vectors,
334
+ quaternions=quaternions,
335
+ singular_values=singular_values,
336
+ opacities=opacities,
337
+ colors=colors,
338
+ )
339
+ metadata = SceneMetaData(focal_length_px[0], (width, height), color_space)
340
+ return gaussians, metadata
341
+
342
+
343
+ @torch.no_grad()
344
+ def save_ply(
345
+ gaussians: Gaussians3D, f_px: float, image_shape: tuple[int, int], path: Path
346
+ ) -> PlyData:
347
+ """Save a predicted Gaussian3D to a ply file."""
348
+
349
+ def _inverse_sigmoid(tensor: torch.Tensor) -> torch.Tensor:
350
+ return torch.log(tensor / (1.0 - tensor))
351
+
352
+ xyz = gaussians.mean_vectors.flatten(0, 1)
353
+ scale_logits = torch.log(gaussians.singular_values).flatten(0, 1)
354
+ quaternions = gaussians.quaternions.flatten(0, 1)
355
+
356
+ # SHARP takes an image, converts it to sRGB color space as input,
357
+ # and predicts linearRGB Gaussians as output.
358
+ # The SHARP renderer would blend linearRGB Gaussians and convert rendered images and videos
359
+ # back to sRGB for the best display quality.
360
+ #
361
+ # However, public renderers do not have such linear2sRGB conversions after rendering.
362
+ # If they render linearRGB Gaussians as-is, the output would be dark without Gamma correction.
363
+ #
364
+ # To make them compatible with public renderers, we force-convert linearRGB to sRGB during export.
365
+ # - The SHARP renderer will still handle conversions properly.
366
+ # - Public renderers will mostly work fine when treating the sRGB values as linearRGB,
367
+ # although for the best performance, it is recommended to apply the conversions.
368
+ colors = convert_rgb_to_spherical_harmonics(
369
+ cs_utils.linearRGB2sRGB(gaussians.colors.flatten(0, 1))
370
+ )
371
+ color_space_index = cs_utils.encode_color_space("sRGB")
372
+
373
+ # Store opacity logits.
374
+ opacity_logits = _inverse_sigmoid(gaussians.opacities).flatten(0, 1).unsqueeze(-1)
375
+
376
+ attributes = torch.cat(
377
+ (
378
+ xyz,
379
+ colors,
380
+ opacity_logits,
381
+ scale_logits,
382
+ quaternions,
383
+ ),
384
+ dim=1,
385
+ )
386
+
387
+ dtype_full = [
388
+ (attribute, "f4")
389
+ for attribute in ["x", "y", "z"]
390
+ + [f"f_dc_{i}" for i in range(3)]
391
+ + ["opacity"]
392
+ + [f"scale_{i}" for i in range(3)]
393
+ + [f"rot_{i}" for i in range(4)]
394
+ ]
395
+
396
+ num_gaussians = len(xyz)
397
+ elements = np.empty(num_gaussians, dtype=dtype_full)
398
+ elements[:] = list(map(tuple, attributes.detach().cpu().numpy()))
399
+ vertex_elements = PlyElement.describe(elements, "vertex")
400
+
401
+ # Load image-wise metadata.
402
+ image_height, image_width = image_shape
403
+
404
+ # Export image size.
405
+ dtype_image_size = [("image_size", "u4")]
406
+ image_size_array = np.empty(2, dtype=dtype_image_size)
407
+ image_size_array[:] = np.array([image_width, image_height])
408
+ image_size_element = PlyElement.describe(image_size_array, "image_size")
409
+
410
+ # Export intrinsics.
411
+ dtype_intrinsic = [("intrinsic", "f4")]
412
+ intrinsic_array = np.empty(9, dtype=dtype_intrinsic)
413
+ intrinsic = np.array(
414
+ [
415
+ f_px,
416
+ 0,
417
+ image_width * 0.5,
418
+ 0,
419
+ f_px,
420
+ image_height * 0.5,
421
+ 0,
422
+ 0,
423
+ 1,
424
+ ]
425
+ )
426
+ intrinsic_array[:] = intrinsic.flatten()
427
+ intrinsic_element = PlyElement.describe(intrinsic_array, "intrinsic")
428
+
429
+ # Export dummy extrinsics.
430
+ dtype_extrinsic = [("extrinsic", "f4")]
431
+ extrinsic_array = np.empty(16, dtype=dtype_extrinsic)
432
+ extrinsic_array[:] = np.eye(4).flatten()
433
+ extrinsic_element = PlyElement.describe(extrinsic_array, "extrinsic")
434
+
435
+ # Export number of frames and particles per frame.
436
+ dtype_frames = [("frame", "i4")]
437
+ frame_array = np.empty(2, dtype=dtype_frames)
438
+ frame_array[:] = np.array([1, num_gaussians], dtype=np.int32)
439
+ frame_element = PlyElement.describe(frame_array, "frame")
440
+
441
+ # Export disparity ranges for transform.
442
+ dtype_disparity = [("disparity", "f4")]
443
+ disparity_array = np.empty(2, dtype=dtype_disparity)
444
+
445
+ disparity = 1.0 / gaussians.mean_vectors[0, ..., -1]
446
+ quantiles = (
447
+ torch.quantile(disparity, q=torch.tensor([0.1, 0.9], device=disparity.device))
448
+ .float()
449
+ .cpu()
450
+ .numpy()
451
+ )
452
+ disparity_array[:] = quantiles
453
+ disparity_element = PlyElement.describe(disparity_array, "disparity")
454
+
455
+ # Export colorspace.
456
+ dtype_color_space = [("color_space", "u1")]
457
+ color_space_array = np.empty(1, dtype=dtype_color_space)
458
+ color_space_array[:] = np.array([color_space_index]).flatten()
459
+ color_space_element = PlyElement.describe(color_space_array, "color_space")
460
+
461
+ dtype_version = [("version", "u1")]
462
+ version_array = np.empty(3, dtype=dtype_version)
463
+ version_array[:] = np.array([1, 5, 0], dtype=np.uint8).flatten()
464
+ version_element = PlyElement.describe(version_array, "version")
465
+
466
+ plydata = PlyData(
467
+ [
468
+ vertex_elements,
469
+ extrinsic_element,
470
+ intrinsic_element,
471
+ image_size_element,
472
+ frame_element,
473
+ disparity_element,
474
+ color_space_element,
475
+ version_element,
476
+ ]
477
+ )
478
+
479
+ plydata.write(path)
480
+ return plydata
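A hypothetical round trip through the two functions above; `gaussians` is assumed to be a predicted `Gaussians3D`. Note that `save_ply` takes `image_shape` as (height, width), while the loader reports `resolution_px` as (width, height):

from pathlib import Path

path = Path("/tmp/scene.ply")
save_ply(gaussians, f_px=500.0, image_shape=(480, 640), path=path)
loaded, metadata = load_ply(path)
print(metadata.focal_length_px, metadata.resolution_px, metadata.color_space)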
src/sharp/utils/gsplat.py ADDED
@@ -0,0 +1,191 @@
1
+ """Contains utility code for gsplat renderer.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import NamedTuple
11
+
12
+ import gsplat
13
+ import torch
14
+ from torch import nn
15
+
16
+ from sharp.utils import color_space as cs_utils
17
+ from sharp.utils import io, vis
18
+ from sharp.utils.gaussians import BackgroundColor, Gaussians3D
19
+
20
+
21
+ class RenderingOutputs(NamedTuple):
22
+ """Outputs of 3D Gaussians renderer."""
23
+
24
+ color: torch.Tensor
25
+ depth: torch.Tensor
26
+ alpha: torch.Tensor
27
+
28
+
29
+ def write_renderings(rendering: RenderingOutputs, output_folder: Path, filename: str):
30
+ """Write rendered color/depth/alpha to files."""
31
+ batch_size = len(rendering.color)
32
+ if batch_size != 1:
33
+ raise RuntimeError("We only support saving rendering of batch size = 1")
34
+
35
+ def _save_image_tensor(tensor: torch.Tensor, suffix: str):
36
+ np_array = tensor.permute(1, 2, 0).numpy()
37
+ io.save_image(np_array, (output_folder / filename).with_suffix(suffix))
38
+
39
+ color = (rendering.color[0].cpu() * 255.0).to(dtype=torch.uint8)
40
+ colorized_depth = vis.colorize_depth(rendering.depth[0], val_max=100.0)
41
+ colorized_alpha = vis.colorize_alpha(rendering.alpha[0])
42
+
43
+ _save_image_tensor(color, ".color.png")
44
+ _save_image_tensor(colorized_depth, ".depth.png")
45
+ _save_image_tensor(colorized_alpha, ".alpha.png")
46
+
47
+
48
+ class GSplatRenderer(nn.Module):
49
+ """Module to render 3D Gaussians to images using gsplat."""
50
+
51
+ color_space: cs_utils.ColorSpace
52
+ background_color: BackgroundColor
53
+
54
+ def __init__(
55
+ self,
56
+ color_space: cs_utils.ColorSpace = "sRGB",
57
+ background_color: BackgroundColor = "black",
58
+ low_pass_filter_eps: float = 0.0,
59
+ ) -> None:
60
+ """Initialize gsplat renderer.
61
+
62
+ Args:
63
+ color_space: The color space to use for rendering.
64
+ background_color: The background color to use for rendering.
65
+ low_pass_filter_eps: The epsilon value for the low pass filter.
66
+ """
67
+ super().__init__()
68
+ self.color_space = color_space
69
+ self.background_color = background_color
70
+ self.low_pass_filter_eps = low_pass_filter_eps
71
+
72
+ def forward(
73
+ self,
74
+ gaussians: Gaussians3D,
75
+ extrinsics: torch.Tensor,
76
+ intrinsics: torch.Tensor,
77
+ image_width: int,
78
+ image_height: int,
79
+ ) -> RenderingOutputs:
80
+ """Predict images from gaussians.
81
+
82
+ Args:
83
+ gaussians: The Gaussians to render.
84
+ extrinsics: The extrinsics of the camera to render to in OpenCV format.
85
+ intrinsics: The intrinsics of the camera to render to in OpenCV format.
86
+ image_width: The desired output image width.
87
+ image_height: The desired output image height.
88
+ """
89
+ batch_size = len(gaussians.mean_vectors)
90
+ outputs_list: list[RenderingOutputs] = []
91
+
92
+ for ib in range(batch_size):
93
+ colors, alphas, meta = gsplat.rendering.rasterization(
94
+ means=gaussians.mean_vectors[ib],
95
+ quats=gaussians.quaternions[ib],
96
+ scales=gaussians.singular_values[ib],
97
+ opacities=gaussians.opacities[ib],
98
+ colors=gaussians.colors[ib],
99
+ viewmats=extrinsics[ib : ib + 1],
100
+ Ks=intrinsics[ib : ib + 1, :3, :3],
101
+ width=image_width,
102
+ height=image_height,
103
+ render_mode="RGB+D",
104
+ rasterize_mode="classic",
105
+ absgrad=False,
106
+ packed=False,
107
+ eps2d=self.low_pass_filter_eps,
108
+ )
109
+
110
+ rendered_color = colors[..., 0:3].permute([0, 3, 1, 2])
111
+ rendered_depth_unnormalized = colors[..., 3:4].permute([0, 3, 1, 2])
112
+ rendered_alpha = alphas.permute([0, 3, 1, 2])
113
+
114
+ # Compose with background color.
115
+ rendered_color = self.compose_with_background(
116
+ rendered_color, rendered_alpha, self.background_color
117
+ )
118
+
119
+ # Colorspace conversion.
120
+ if self.color_space == "sRGB":
121
+ pass
122
+ elif self.color_space == "linearRGB":
123
+ rendered_color = cs_utils.linearRGB2sRGB(rendered_color)
124
+ else:
125
+ ValueError("Unsupported ColorSpace type.")
126
+
127
+ # splats: (B, N, 10)
128
+ cov2d = self._conics_to_covars2d(meta["conics"])
129
+ # Set the cov2d of invisible splats to identity to avoid NaNs in the
+ # condition number calculation. Chained boolean indexing writes into a
+ # copy, so index the matrix entries directly.
130
+ splats_visible_mask = meta["depths"] > 1e-2
131
+ cov2d[~splats_visible_mask, 0, 0] = 1
132
+ cov2d[~splats_visible_mask, 1, 1] = 1
133
+ cov2d[~splats_visible_mask, 0, 1] = 0
134
+
135
+ # Normalize the depth by alpha.
136
+ rendered_depth = rendered_depth_unnormalized / torch.clip(rendered_alpha, min=1e-8)
137
+
138
+ outputs = RenderingOutputs(
139
+ color=rendered_color,
140
+ depth=rendered_depth,
141
+ alpha=rendered_alpha,
142
+ )
143
+ outputs_list.append(outputs)
144
+
145
+ return RenderingOutputs(
146
+ color=torch.cat([item.color for item in outputs_list], dim=0).contiguous(),
147
+ depth=torch.cat([item.depth for item in outputs_list], dim=0).contiguous(),
148
+ alpha=torch.cat([item.alpha for item in outputs_list], dim=0).contiguous(),
149
+ )
150
+
151
+ @staticmethod
152
+ def compose_with_background(
153
+ rendered_rgb: torch.Tensor,
154
+ rendered_alpha: torch.Tensor,
155
+ background_color: BackgroundColor,
156
+ ) -> torch.Tensor:
157
+ """Compose rendered RGB with background color."""
158
+ if background_color == "black":
159
+ return rendered_rgb
160
+ elif background_color == "white":
161
+ return rendered_rgb + (1.0 - rendered_alpha)
162
+ elif background_color == "random_color":
163
+ return (
164
+ rendered_rgb
165
+ + (1.0 - rendered_alpha)
166
+ * torch.rand(3, dtype=rendered_rgb.dtype, device=rendered_rgb.device)[
167
+ None, :, None, None
168
+ ]
169
+ )
170
+ elif background_color == "random_pixel":
171
+ return rendered_rgb + (1.0 - rendered_alpha) * torch.rand_like(rendered_rgb)
172
+ else:
173
+ raise ValueError("Unsupported BackgroundColor type.")
174
+
175
+ @staticmethod
176
+ def _conics_to_covars2d(conics: torch.Tensor, eps=1e-8) -> torch.Tensor:
177
+ """Convert conics to covariance matrices."""
178
+ a = conics[..., 0]
179
+ b = conics[..., 1]
180
+ c = conics[..., 2]
181
+ # Reconstruct determinant.
182
+ det = 1 / (a * c - b**2 + eps)
183
+ det = det.clamp(min=eps)
184
+ # Reconstruct covars2d.
185
+ covars2d = torch.zeros(*conics.shape[:-1], 2, 2, device=conics.device)
186
+ covars2d[..., 1, 1] = a * det
187
+ covars2d[..., 0, 0] = c * det
188
+ covars2d[..., 0, 1] = -b * det
189
+ covars2d[..., 1, 0] = -b * det
190
+ covars2d = torch.nan_to_num(covars2d, nan=0.0, posinf=0.0, neginf=0.0)
191
+ return covars2d
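A small verification sketch for `_conics_to_covars2d`: a conic is the inverse of a 2D covariance, so packing the upper triangle of an inverted covariance and converting back should recover the original matrix (the values here are arbitrary):

import torch

covar = torch.tensor([[2.0, 0.5], [0.5, 1.0]])
conic = torch.linalg.inv(covar)
conics = torch.stack([conic[0, 0], conic[0, 1], conic[1, 1]])[None]  # shape (1, 3)
recovered = GSplatRenderer._conics_to_covars2d(conics)[0]
assert torch.allclose(recovered, covar, atol=1e-5)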
src/sharp/utils/io.py ADDED
@@ -0,0 +1,213 @@
1
+ """Contains image IO.
2
+
3
+ For licensing see accompanying LICENSE file.
4
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import io
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import IO, Any, Protocol
13
+
14
+ import imageio.v2 as iio
15
+ import numpy as np
16
+ import pillow_heif
17
+ import torch
18
+ from PIL import ExifTags, Image, TiffTags
19
+
20
+ from .vis import METRIC_DEPTH_MAX_CLAMP_METER, colorize_depth
21
+
22
+ LOGGER = logging.getLogger(__name__)
23
+
24
+
25
+ # NOTE: unused, kept for reference.
26
+ Image.MAX_IMAGE_PIXELS = 200000000
27
+
28
+
29
+ def load_rgb(
30
+ path: Path, auto_rotate: bool = True, remove_alpha: bool = True
31
+ ) -> tuple[np.ndarray, list[bytes] | None, float]:
32
+ """Load an RGB image."""
33
+ LOGGER.debug(f"Loading image {path} ...")
34
+
35
+ if path.suffix.lower() in [".heic"]:
36
+ heif_file = pillow_heif.open_heif(path, convert_hdr_to_8bit=True)
37
+ img_pil = heif_file.to_pillow()
38
+ else:
39
+ img_pil = Image.open(path)
40
+
41
+ img_exif = extract_exif(img_pil)
42
+ icc_profile = img_pil.info.get("icc_profile", None)
43
+
44
+ # Rotate the image.
45
+ if auto_rotate:
46
+ exif_orientation = img_exif.get("Orientation", 1)
47
+ if exif_orientation == 3:
48
+ img_pil = img_pil.transpose(Image.ROTATE_180)
49
+ elif exif_orientation == 6:
50
+ img_pil = img_pil.transpose(Image.ROTATE_270)
51
+ elif exif_orientation == 8:
52
+ img_pil = img_pil.transpose(Image.ROTATE_90)
53
+ elif exif_orientation != 1:
54
+ LOGGER.warning(f"Ignoring image orientation {exif_orientation}.")
55
+
56
+ # Extract the focal length.
57
+ f_35mm = img_exif.get("FocalLengthIn35mmFilm", img_exif.get("FocalLenIn35mmFilm", None))
58
+ if f_35mm is None or f_35mm < 1:
59
+ f_35mm = img_exif.get("FocalLength", None)
60
+ if f_35mm is None:
61
+ LOGGER.warn(f"Did not find focallength in exif data of {path} - Setting to 30mm.")
62
+ f_35mm = 30.0
63
+ if f_35mm < 10.0:
64
+ LOGGER.info("Found focal length below 10mm, assuming it's not for 35mm.")
65
+ # This is a very crude approximation.
66
+ f_35mm *= 8.4
67
+
68
+ img = np.asarray(img_pil)
69
+ # Convert to RGB if single channel.
70
+ if img.ndim < 3 or img.shape[2] == 1:
71
+ img = np.dstack((img, img, img))
72
+
73
+ if remove_alpha:
74
+ img = img[:, :, :3]
75
+
76
+ LOGGER.debug(f"\tHxW: {img.shape[0]}x{img.shape[1]}")
77
+ LOGGER.debug(f"\tfocal length @ 35mm film: {f_35mm}mm")
78
+ f_px = convert_focallength(img.shape[1], img.shape[0], f_35mm)
79
+ LOGGER.debug(f"\tfocal length: {f_px:.2f}px")
80
+
81
+ return img, icc_profile, f_px
82
+
83
+
84
+ def extract_exif(img_pil: Image.Image) -> dict[str, Any]:
85
+ """Return exif information as a dictionary."""
86
+ # Get full exif description from get_ifd(0x8769):
87
+ # cf https://pillow.readthedocs.io/en/stable/releasenotes/8.2.0.html#image-getexif-exif-and-gps-ifd # noqa
88
+ img_exif = img_pil.getexif().get_ifd(0x8769)
89
+ exif_dict = {ExifTags.TAGS[k]: v for k, v in img_exif.items() if k in ExifTags.TAGS}
90
+
91
+ # https://pillow.readthedocs.io/en/stable/_modules/PIL/TiffTags.html# # noqa
92
+ tiff_tags = img_pil.getexif()
93
+ tiff_dict = {TiffTags.TAGS_V2[k].name: v for k, v in tiff_tags.items() if k in TiffTags.TAGS_V2}
94
+ return {**exif_dict, **tiff_dict}
95
+
96
+
97
+ def convert_focallength(width: float, height: float, f_mm: float = 30) -> float:
98
+ """Converts a focal length given in mm to pixels."""
99
+ return f_mm * np.sqrt(width**2.0 + height**2.0) / np.sqrt(36**2 + 24**2)
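# Worked example (illustrative): a 4032 x 3024 photo at a 30 mm equivalent
# focal length. The image diagonal is sqrt(4032**2 + 3024**2) = 5040 px and
# the full-frame diagonal is sqrt(36**2 + 24**2) ~= 43.27 mm, so
# f_px = 30 * 5040 / 43.27 ~= 3494.6 px.
assert abs(convert_focallength(4032, 3024, f_mm=30) - 3494.6) < 1.0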
100
+
101
+
102
+ def save_image(
103
+ image: np.ndarray,
104
+ output_path: Path,
105
+ icc_profile: list[bytes] | None = None,
106
+ jpeg_quality: int = 92,
107
+ ) -> None:
108
+ """Save image to given path."""
109
+ output_path.parent.mkdir(parents=True, exist_ok=True)
110
+
111
+ extensions_to_format = Image.registered_extensions()
112
+ try:
113
+ format = extensions_to_format[output_path.suffix.lower()]
114
+ except KeyError:
115
+ raise ValueError(f"Unsupported output format {output_path.suffix}.")
116
+
117
+ with output_path.open("wb") as file_handle:
118
+ write_image(
119
+ image,
120
+ file_handle,
121
+ format,
122
+ icc_profile=icc_profile,
123
+ jpeg_quality=jpeg_quality,
124
+ )
125
+
126
+
127
+ def write_image(
128
+ image: np.ndarray,
129
+ output_io: IO[bytes],
130
+ format="jpg",
131
+ icc_profile: list[bytes] | None = None,
132
+ jpeg_quality: int = 92,
133
+ ):
134
+ """Write image to binary stream."""
135
+ pil_config = {}
136
+ if format == "JPEG":
137
+ pil_config["quality"] = jpeg_quality
138
+
139
+ image_pil = Image.fromarray(image)
140
+
141
+ # Workaround to error [io.UnsupportedOperation: seek].
142
+ if format == "TIFF":
143
+ bytes_io = io.BytesIO()
144
+ image_pil.save(bytes_io, format="TIFF")
145
+ bytes_io.seek(0)
146
+ output_io.write(bytes_io.read())
147
+ return
148
+
149
+ image_pil.save(output_io, format, icc_profile=icc_profile, **pil_config)
150
+
151
+
152
+ def get_supported_image_extensions(with_heic: bool = True) -> list[str]:
153
+ """Return supported image extensions."""
154
+ exts = Image.registered_extensions()
155
+ supported_extensions = {ex for ex, f in exts.items() if f in Image.OPEN}
156
+ if with_heic:
157
+ supported_extensions.add(".heic")
158
+
159
+ supported_extensions_upper = {ex.upper() for ex in supported_extensions}
160
+ return list(supported_extensions | supported_extensions_upper)
161
+
162
+
163
+ def get_supported_video_extensions():
164
+ """Return supported video extensions."""
165
+ supported_extensions = {".mp4", ".mov"}
166
+ supported_extensions_upper = {ext.upper() for ext in supported_extensions}
167
+ return list(supported_extensions | supported_extensions_upper)
168
+
169
+
170
+ class OutputWriter(Protocol):
171
+ """Protocol for writing output to disk."""
172
+
173
+ def add_frame(self, image: torch.Tensor, depth: torch.Tensor) -> None:
174
+ """Add a single frame to output."""
175
+ ...
176
+
177
+ def close(self) -> None:
178
+ """Finish writing."""
179
+ ...
180
+
181
+
182
+ class VideoWriter(OutputWriter):
183
+ """Output writer for video output."""
184
+
185
+ def __init__(self, output_path: Path, fps: float = 30.0, render_depth: bool = True) -> None:
186
+ """Initialize VideoWriter."""
187
+ output_path.parent.mkdir(exist_ok=True, parents=True)
188
+ self.output_path = output_path
189
+ self.image_writer = iio.get_writer(output_path, fps=fps)
190
+
191
+ self.max_depth_estimate = None
+ # Default to None so add_frame and close can test for the depth writer.
+ self.depth_writer = None
192
+ if render_depth:
193
+ self.depth_writer = iio.get_writer(output_path.with_suffix(".depth.mp4"), fps=fps)
194
+
195
+ def add_frame(self, image: torch.Tensor, depth: torch.Tensor) -> None:
196
+ """Add a single frame to output."""
197
+ image_np = image.detach().cpu().numpy()
198
+ self.image_writer.append_data(image_np)
199
+
200
+ if self.depth_writer is not None:
201
+ if self.max_depth_estimate is None:
202
+ self.max_depth_estimate = depth.max().item()
203
+
204
+ colored_depth_pt = colorize_depth(
205
+ depth,
206
+ min(self.max_depth_estimate, METRIC_DEPTH_MAX_CLAMP_METER), # type: ignore[call-overload]
207
+ )
208
+ colored_depth_np = colored_depth_pt.squeeze(0).permute(1, 2, 0).cpu().numpy()
209
+ self.depth_writer.append_data(colored_depth_np)
210
+
211
+ def close(self):
212
+ """Finish writing."""
213
+ self.image_writer.close()
+ if self.depth_writer is not None:
+ self.depth_writer.close()
src/sharp/utils/linalg.py ADDED
@@ -0,0 +1,104 @@
+ """Contains linear algebra related utility functions.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ from __future__ import annotations
+
+ import torch
+ import torch.nn.functional as F
+ from scipy.spatial.transform import Rotation
+
+
+ def rotation_matrices_from_quaternions(quaternions: torch.Tensor) -> torch.Tensor:
+     """Convert a batch of quaternions into rotation matrices.
+
+     Args:
+         quaternions: The quaternions to convert to matrices, in scalar-first (w, x, y, z) order.
+
+     Returns:
+         The rotation matrices corresponding to the (normalized) quaternions.
+     """
+     device = quaternions.device
+     shape = quaternions.shape[:-1]
+
+     quaternions = quaternions / torch.linalg.norm(quaternions, dim=-1, keepdim=True)
+     real_part = quaternions[..., 0]
+     vector_part = quaternions[..., 1:]
+
+     vector_cross = get_cross_product_matrix(vector_part)
+     real_part = real_part[..., None, None]
+
+     matrix_outer = vector_part[..., :, None] * vector_part[..., None, :]
+     matrix_diag = real_part.square() * eyes(3, shape=shape, device=device)
+     matrix_cross_1 = 2 * real_part * vector_cross
+     matrix_cross_2 = vector_cross @ vector_cross
+
+     return matrix_outer + matrix_diag + matrix_cross_1 + matrix_cross_2
+
+
+ def quaternions_from_rotation_matrices(matrices: torch.Tensor) -> torch.Tensor:
+     """Convert a batch of rotation matrices to quaternions.
+
+     Args:
+         matrices: The matrices to convert to quaternions.
+
+     Returns:
+         The quaternions corresponding to the rotation matrices.
+
+     Note: this operation is not differentiable and will be performed on the CPU.
+     """
+     if not matrices.shape[-2:] == (3, 3):
+         raise ValueError(f"matrices have invalid shape {matrices.shape}")
+     matrices_np = matrices.detach().cpu().numpy()
+     quaternions_np = Rotation.from_matrix(matrices_np.reshape(-1, 3, 3)).as_quat()
+     # We use a convention where the w component is at the start of the quaternion.
+     quaternions_np = quaternions_np[:, [3, 0, 1, 2]]
+     quaternions_np = quaternions_np.reshape(matrices_np.shape[:-2] + (4,))
+     return torch.as_tensor(quaternions_np, device=matrices.device, dtype=matrices.dtype)
+
+
+ def get_cross_product_matrix(vectors: torch.Tensor) -> torch.Tensor:
+     """Generate the skew-symmetric cross-product matrix for each vector."""
+     if not vectors.shape[-1] == 3:
+         raise ValueError("Only 3-dimensional vectors are supported")
+     device = vectors.device
+     shape = vectors.shape[:-1]
+     unit_basis = eyes(3, shape=shape, device=device)
+     # Cross the vector with each column of the identity: column j equals
+     # v x e_j, which yields the cross-product matrix [v]x.
+     return torch.cross(vectors[..., :, None], unit_basis, dim=-2)
+
+
+ def eyes(
+     dim: int, shape: tuple[int, ...], device: torch.device | str | None = None
+ ) -> torch.Tensor:
+     """Create a batch of identity matrices."""
+     return torch.eye(dim, device=device).broadcast_to(shape + (dim, dim)).clone()
+
+
+ def quaternion_product(q1: torch.Tensor, q2: torch.Tensor) -> torch.Tensor:
+     """Compute the Hamilton product of two quaternions."""
+     real_1 = q1[..., :1]
+     real_2 = q2[..., :1]
+     vector_1 = q1[..., 1:]
+     vector_2 = q2[..., 1:]
+
+     real_out = real_1 * real_2 - (vector_1 * vector_2).sum(dim=-1, keepdim=True)
+     vector_out = real_1 * vector_2 + real_2 * vector_1 + torch.cross(vector_1, vector_2, dim=-1)
+     return torch.concatenate([real_out, vector_out], dim=-1)
+
+
+ def quaternion_conj(q: torch.Tensor) -> torch.Tensor:
+     """Get the conjugate of a quaternion."""
+     real = q[..., :1]
+     vector = q[..., 1:]
+     return torch.concatenate([real, -vector], dim=-1)
+
+
+ def project(u: torch.Tensor, basis: torch.Tensor) -> torch.Tensor:
+     """Project tensor u onto the given unit basis."""
+     unit_u = F.normalize(u, dim=-1)
+     inner_prod = (unit_u * basis).sum(dim=-1, keepdim=True)
+     return inner_prod * u
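The file keeps the scalar-first (w, x, y, z) quaternion convention throughout, so one way to sanity-check these helpers is to rotate a vector both via the rotation matrix and via the quaternion sandwich product q * (0, v) * conj(q). A minimal sketch, assuming the package layout makes the helpers importable as sharp.utils.linalg:

    import torch
    import torch.nn.functional as F

    from sharp.utils.linalg import (
        quaternion_conj,
        quaternion_product,
        quaternions_from_rotation_matrices,
        rotation_matrices_from_quaternions,
    )

    q = F.normalize(torch.randn(4), dim=-1)  # random unit quaternion, (w, x, y, z)
    v = torch.randn(3)

    # Path 1: rotate with the 3x3 matrix.
    rotation = rotation_matrices_from_quaternions(q)
    v_mat = rotation @ v

    # Path 2: rotate with the sandwich product q * (0, v) * conj(q).
    v_quat = torch.cat([torch.zeros(1), v])
    rotated = quaternion_product(quaternion_product(q, v_quat), quaternion_conj(q))
    assert torch.allclose(v_mat, rotated[..., 1:], atol=1e-5)

    # Matrix -> quaternion roundtrip (up to sign: q and -q encode the same rotation).
    q_back = quaternions_from_rotation_matrices(rotation)
    assert torch.allclose(q_back.abs(), q.abs(), atol=1e-5)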
src/sharp/utils/logging.py ADDED
@@ -0,0 +1,45 @@
+ """Contains logging related utility functions.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from pathlib import Path
+
+
+ def configure(log_level: int, log_path: Path | None = None, prefix: str | None = None) -> None:
+     """Configure logger globally.
+
+     Args:
+         log_level: The desired verbosity level.
+         log_path: The path to write logs to.
+         prefix: The prefix of the logger.
+     """
+     logger = logging.getLogger(prefix)
+
+     # Reset logger to initial state; iterate over copies so removal does not skip entries.
+     for handler in list(logger.handlers):
+         logger.removeHandler(handler)
+
+     for log_filter in list(logger.filters):
+         logger.removeFilter(log_filter)
+
+     # Set level.
+     logger.setLevel(log_level)
+
+     formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
+
+     # Set up console handler.
+     stdout_handler = logging.StreamHandler(sys.stdout)
+     stdout_handler.setFormatter(formatter)
+     logger.addHandler(stdout_handler)
+
+     # Set up file handler.
+     if log_path is not None:
+         file_handler = logging.FileHandler(log_path, mode="w")
+         file_handler.setFormatter(formatter)
+         logger.addHandler(file_handler)
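A minimal usage sketch (not part of the commit): configure the root of a logger hierarchy once, then log through any child logger, which propagates to the configured handlers.

    import logging

    # Assumes the package is installed so the module resolves as sharp.utils.logging.
    from sharp.utils import logging as sharp_logging

    sharp_logging.configure(logging.INFO, log_path=None, prefix="sharp")

    # Child loggers under the "sharp" prefix propagate to the handlers set above.
    logging.getLogger("sharp.demo").info("Logger configured.")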
src/sharp/utils/math.py ADDED
@@ -0,0 +1,183 @@
+ """Contains utility math functions.
+
+ For licensing see accompanying LICENSE file.
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Callable, Literal, NamedTuple, Tuple, Union
+
+ import torch
+ from torch import autograd
+
+ ActivationType = Literal[
+     "linear",
+     "exp",
+     "sigmoid",
+     "softplus",
+     "relu_with_pushback",
+     "hard_sigmoid_with_pushback",
+ ]
+ ActivationFunction = Callable[[torch.Tensor], torch.Tensor]
+
+
+ class ActivationPair(NamedTuple):
+     """A pair of forward and inverse activation functions."""
+
+     forward: ActivationFunction
+     inverse: ActivationFunction
+
+
+ def create_activation_pair(activation_type: ActivationType) -> ActivationPair:
+     """Create an activation function and its corresponding inverse.
+
+     Args:
+         activation_type: The activation type to create.
+
+     Returns:
+         The activation function paired with its inverse.
+     """
+     if activation_type == "linear":
+         return ActivationPair(lambda x: x, lambda x: x)
+     elif activation_type == "exp":
+         return ActivationPair(torch.exp, torch.log)
+     elif activation_type == "sigmoid":
+         return ActivationPair(torch.sigmoid, inverse_sigmoid)
+     elif activation_type == "softplus":
+         return ActivationPair(torch.nn.functional.softplus, inverse_softplus)
+     elif activation_type == "relu_with_pushback":
+         return ActivationPair(relu_with_pushback, lambda x: x)
+     elif activation_type == "hard_sigmoid_with_pushback":
+         return ActivationPair(hard_sigmoid_with_pushback, lambda x: 6.0 * x - 3.0)
+     else:
+         raise ValueError(f"Unsupported activation function: {activation_type}.")
+
+
+ def inverse_sigmoid(tensor: torch.Tensor) -> torch.Tensor:
+     """Compute the inverse sigmoid (logit)."""
+     return torch.log(tensor / (1.0 - tensor))
+
+
+ def inverse_softplus(tensor: torch.Tensor, eps: float = 1e-06) -> torch.Tensor:
+     """Compute the inverse softplus, i.e. log(exp(x) - 1)."""
+     tensor = tensor.clamp_min(eps)
+     sigmoid = torch.sigmoid(-tensor)
+     exp = sigmoid / (1.0 - sigmoid)
+     return tensor + torch.log(-exp + 1.0)
+
+
+ # The first value describes the threshold from where clamping will be applied, while
+ # the second value describes the value to clamp with.
+ SoftClampRange = Tuple[Union[torch.Tensor, float], Union[torch.Tensor, float]]
+
+
+ def softclamp(
+     tensor: torch.Tensor,
+     min: SoftClampRange | None = None,
+     max: SoftClampRange | None = None,
+ ) -> torch.Tensor:
+     """Clamp a tensor to min/max in a differentiable way.
+
+     Args:
+         tensor: The tensor to clamp.
+         min: Pair of threshold to start clamping and value to clamp to.
+             The first value should be larger than the second.
+         max: Pair of threshold to start clamping and value to clamp to.
+             The first value should be smaller than the second.
+
+     Returns:
+         The clamped tensor.
+     """
+
+     def soft_limit(clamp_range: SoftClampRange) -> torch.Tensor:
+         value0, value1 = clamp_range
+         return value0 + (value1 - value0) * torch.tanh((tensor - value0) / (value1 - value0))
+
+     tensor_clamped = tensor
+     if min is not None:
+         tensor_clamped = torch.maximum(tensor_clamped, soft_limit(min))
+     if max is not None:
+         tensor_clamped = torch.minimum(tensor_clamped, soft_limit(max))
+
+     return tensor_clamped
+
+
+ class ClampWithPushback(autograd.Function):
+     """Implementation of the clamp_with_pushback function."""
+
+     @staticmethod
+     def forward(
+         ctx: Any,
+         tensor: torch.Tensor,
+         min: float | None,
+         max: float | None,
+         pushback: float,
+     ) -> torch.Tensor:
+         """Apply clamp."""
+         if min is not None and max is not None and min >= max:
+             raise ValueError("Only min < max is supported.")
+
+         ctx.save_for_backward(tensor)
+         ctx.min = min
+         ctx.max = max
+         ctx.pushback = pushback
+         return torch.clamp(tensor, min=min, max=max)
+
+     @staticmethod
+     def backward(  # type: ignore[override] # Deal with buggy torch annotations.
+         ctx: Any, grad_in: torch.Tensor
+     ) -> tuple[torch.Tensor, None, None, None]:
+         """Compute the gradient of clamp with pushback."""
+         grad_out = grad_in.clone()
+         (tensor,) = ctx.saved_tensors
+
+         if ctx.min is not None:
+             mask_min = tensor < ctx.min
+             grad_out[mask_min] = -ctx.pushback
+
+         if ctx.max is not None:
+             mask_max = tensor > ctx.max
+             grad_out[mask_max] = ctx.pushback
+
+         return grad_out, None, None, None
+
+
+ def clamp_with_pushback(
+     tensor: torch.Tensor,
+     min: float | None = None,
+     max: float | None = None,
+     pushback: float = 1e-2,
+ ) -> torch.Tensor:
+     """Variant of the clamp function which avoids the vanishing-gradient problem.
+
+     This function is equivalent to adding a regularizer of the form
+
+         pushback * sum_i (
+             relu(min - preactivation_i) + relu(preactivation_i - max)
+         )
+
+     to the full loss function, which pushes clamped values back.
+
+     When used in minimization problems, pushback should be greater than
+     zero. In maximization problems, pushback should be smaller than zero.
+     """
+     output = ClampWithPushback.apply(tensor, min, max, pushback)
+     assert isinstance(output, torch.Tensor)
+     return output
+
+
+ def hard_sigmoid_with_pushback(x: torch.Tensor, slope: float = 1.0 / 6.0) -> torch.Tensor:
+     """Apply hard sigmoid with pushback.
+
+     For compatibility reasons, we follow the default PyTorch implementation with a
+     default slope of 1/6:
+
+     https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html
+     """
+     return clamp_with_pushback(slope * x + 0.5, min=0.0, max=1.0)
+
+
+ def relu_with_pushback(x: torch.Tensor) -> torch.Tensor:
+     """Compute relu with pushback."""
+     return clamp_with_pushback(x, min=0.0)
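Two behaviors worth seeing concretely: each ActivationPair really is a forward/inverse pair, and clamp_with_pushback swaps the zero gradient of a saturated clamp for a constant restoring gradient. A minimal sketch, assuming the module is importable as sharp.utils.math:

    import torch

    # Assumes the package is installed so the module resolves as sharp.utils.math.
    from sharp.utils.math import clamp_with_pushback, create_activation_pair

    # inverse(forward(x)) recovers x for in-range inputs.
    pair = create_activation_pair("softplus")
    x = torch.linspace(-2.0, 2.0, 5)
    assert torch.allclose(pair.inverse(pair.forward(x)), x, atol=1e-4)

    # Saturated entries receive a constant pushback gradient instead of zero:
    # below min the gradient becomes -pushback, above max it becomes +pushback.
    t = torch.tensor([-1.0, 0.5, 2.0], requires_grad=True)
    clamp_with_pushback(t, min=0.0, max=1.0, pushback=1e-2).sum().backward()
    print(t.grad)  # tensor([-0.0100, 1.0000, 0.0100])

Under gradient descent, the -pushback gradient on an entry below min increases that entry, so clamped preactivations drift back toward the valid range rather than getting stuck.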