Spaces:

theairlabcmu
/

Any4D

Running on Zero

App Files Files Community

Jay Karhade commited on Dec 12, 2025

Commit

0343ccd

1 Parent(s): 1e88e3f

Initial Space release

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
.gitignore +2 -0
MoGe/.gitignore +423 -0
MoGe/CHANGELOG.md +28 -0
MoGe/CODE_OF_CONDUCT.md +9 -0
MoGe/LICENSE +224 -0
MoGe/README.md +220 -0
MoGe/SECURITY.md +41 -0
MoGe/SUPPORT.md +25 -0
MoGe/baselines/da_v2.py +88 -0
MoGe/baselines/da_v2_metric.py +99 -0
MoGe/baselines/metric3d_v2.py +117 -0
MoGe/baselines/moge.py +83 -0
MoGe/configs/eval/all_benchmarks.json +78 -0
MoGe/configs/eval/benchmarks/ddad.json +9 -0
MoGe/configs/eval/benchmarks/diode.json +9 -0
MoGe/configs/eval/benchmarks/eth3d.json +10 -0
MoGe/configs/eval/benchmarks/gso.json +8 -0
MoGe/configs/eval/benchmarks/hammer.json +10 -0
MoGe/configs/eval/benchmarks/ibims-1.json +10 -0
MoGe/configs/eval/benchmarks/kitti.json +9 -0
MoGe/configs/eval/benchmarks/nyu.json +8 -0
MoGe/configs/eval/benchmarks/sintel.json +10 -0
MoGe/configs/eval/benchmarks/spring.json +9 -0
MoGe/configs/train/v1.json +77 -0
MoGe/docs/eval.md +77 -0
MoGe/docs/train.md +181 -0
MoGe/moge/__init__.py +0 -0
MoGe/moge/model/__init__.py +17 -0
MoGe/moge/model/dinov2/__init__.py +6 -0
MoGe/moge/model/dinov2/hub/__init__.py +4 -0
MoGe/moge/model/dinov2/hub/backbones.py +156 -0
MoGe/moge/model/dinov2/hub/utils.py +39 -0
MoGe/moge/model/dinov2/layers/__init__.py +11 -0
MoGe/moge/model/dinov2/layers/attention.py +89 -0
MoGe/moge/model/dinov2/layers/block.py +259 -0
MoGe/moge/model/dinov2/layers/dino_head.py +58 -0
MoGe/moge/model/dinov2/layers/drop_path.py +34 -0
MoGe/moge/model/dinov2/layers/layer_scale.py +27 -0
MoGe/moge/model/dinov2/layers/mlp.py +40 -0
MoGe/moge/model/dinov2/layers/patch_embed.py +88 -0
MoGe/moge/model/dinov2/layers/swiglu_ffn.py +72 -0
MoGe/moge/model/dinov2/models/__init__.py +43 -0
MoGe/moge/model/dinov2/models/vision_transformer.py +396 -0
MoGe/moge/model/dinov2/utils/__init__.py +4 -0
MoGe/moge/model/dinov2/utils/cluster.py +95 -0
MoGe/moge/model/dinov2/utils/config.py +72 -0
MoGe/moge/model/dinov2/utils/dtype.py +37 -0
MoGe/moge/model/dinov2/utils/param_groups.py +103 -0
MoGe/moge/model/dinov2/utils/utils.py +95 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ checkpoints/
2	+ .gradio/

MoGe/.gitignore ADDED Viewed

	@@ -0,0 +1,423 @@

+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+# Mono auto generated files
+mono_crash.*
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+[Ll]ogs/
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+# StyleCop
+StyleCopReport.xml
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.ilk
+*.meta
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.log
+*.tlog
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+# Chutzpah Test files
+_Chutzpah*
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+# Visual Studio Trace Files
+*.e2e
+# TFS 2012 Local Workspace
+$tf/
+# Guidance Automation Toolkit
+*.gpState
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+# TeamCity is a build add-in
+_TeamCity*
+# DotCover is a Code Coverage Tool
+*.dotCover
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*.json
+coverage*.xml
+coverage*.info
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+# Web workbench (sass)
+.sass-cache/
+# Installshield output folder
+[Ee]xpress/
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+# Click-Once directory
+publish/
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+# Microsoft Azure Emulator
+ecf/
+rcf/
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+# RIA/Silverlight projects
+Generated_Code/
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+# Microsoft Fakes
+FakesAssemblies/
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+# Visual Studio 6 build log
+*.plg
+# Visual Studio 6 workspace options file
+*.opt
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+# Visual Studio 6 auto-generated project file (contains which files were open etc.)
+*.vbp
+# Visual Studio 6 workspace and project file (working project files containing files to include in project)
+*.dsw
+*.dsp
+# Visual Studio 6 technical files
+*.ncb
+*.aps
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+# FAKE - F# Make
+.fake/
+# CodeRush personal settings
+.cr/personal
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+# Tabs Studio
+*.tss
+# Telerik's JustMock configuration file
+*.jmconfig
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+# OpenCover UI analysis results
+OpenCover/
+# Azure Stream Analytics local run output
+ASALocalRun/
+# MSBuild Binary and Structured Log
+*.binlog
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+# Local History for Visual Studio
+.localhistory/
+# Visual Studio History (VSHistory) files
+.vshistory/
+# BeatPulse healthcheck temp database
+healthchecksdb
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+# Fody - auto-generated XML schema
+FodyWeavers.xsd
+# VS Code files for those working on multiple tools
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+# Local History for Visual Studio Code
+.history/
+# Windows Installer files from build outputs
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+# JetBrains Rider
+*.sln.iml
+# Python
+*.egg-info/
+/build
+# MoGe
+/data*
+/download
+/extract
+/debug
+/workspace
+/mlruns
+/infer_output
+/video_output
+/eval_output
+/.blobcache
+/test_images
+/test_videos
+/vis
+/videos
+/blobmnt
+/eval_dump
+/pretrained
+/.gradio
+/tmp

MoGe/CHANGELOG.md ADDED Viewed

	@@ -0,0 +1,28 @@

+## 2024-11-28
+### Added
+- Supported user-provided camera FOV. See [scripts/infer.py](scripts/infer.py) --fov_x.
+  - Related issues: [#25](https://github.com/microsoft/MoGe/issues/25) and [#24](https://github.com/microsoft/MoGe/issues/24).
+- Added inference scripts for panorama images. See [scripts/infer_panorama.py](scripts/infer_panorama.py).
+  - Related issue: [#19](https://github.com/microsoft/MoGe/issues/19).
+### Fixed
+- Suppressed unnecessary numpy runtime warnings.
+- Specified recommended versions of requirements.
+  - Related issue: [#21](https://github.com/microsoft/MoGe/issues/21).
+### Changed
+- Moved `app.py` and `infer.py` to [scripts/](scripts/)
+- Improved edge removal.
+## 2025-03-18
+### Added
+- Training and evaluation code. See [docs/train.md](docs/train.md) and [docs/eval.md](docs/eval.md).
+- Supported installation via pip. Thanks to @fabiencastan and @jgoueslard
+ for commits in the [#47](https://github.com/microsoft/MoGe/pull/47)
+- Supported command-line usage when installed.
+### Changed
+- Moved `scripts/` into `moge/` for package installation and command-line usage.
+- Renamed `moge.model.moge_model` to `moge.model.v1` for version management.
+  Now you can import the model class through `from moge.model.v1 import MoGeModel` or `from moge.model import import_model_class_by_version; MoGeModel = import_model_class_by_version('v1')`.
+- Exposed `num_tokens` parameter in MoGe model.

MoGe/CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# Microsoft Open Source Code of Conduct
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+Resources:
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

MoGe/LICENSE ADDED Viewed

	@@ -0,0 +1,224 @@

+    MIT License
+    Copyright (c) Microsoft Corporation.
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

MoGe/README.md ADDED Viewed

	@@ -0,0 +1,220 @@

+<div align="center">
+# MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision
+<a href="https://arxiv.org/abs/2410.19115"><img src='https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white' alt='arXiv'></a>
+<a href='https://wangrc.site/MoGePage/'><img src='https://img.shields.io/badge/Project_Page-Website-green?logo=googlechrome&logoColor=white' alt='Project Page'></a>
+<a href='https://huggingface.co/spaces/Ruicheng/MoGe'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Live_Demo-blue'></a>
+</div>
+<img src="./assets/overview_simplified.png" width="100%" alt="Method overview" align="center">
+MoGe is a powerful model for recovering 3D geometry from monocular open-domain images. The model consists of a ViT encoder and a convolutional decoder. It directly predicts an affine-invariant point map as well as a mask that excludes regions with undefined geometry (e.g., sky), from which the camera shift, camera focal length and depth map can be further derived.
+***Check our [website](https://wangrc.site/MoGePage) for videos and interactive results!***
+## Features
+* **Accurate 3D geometry estimation**: Estimate point maps from single images with high precision. Capable of capturing depth variations up to 1000×, ensuring a comprehensive scene representation.
+* **Optional ground-truth FOV input**: Enhance model accuracy further by providing the true field of view.
+* **Flexible resolution support**: Works seamlessly with various resolutions and aspect ratios, from 2:1 to 1:2.
+* **Optimized for speed**: Achieves <0.1s latency per image on an A100 / RTX 3090 GPU with fp16, and 0.2s with fp32.
+## TODO List
+- [x] Release inference code & ViT-Large model.
+- [x] Release evaluation and training code.
+- [ ] Release ViT-Base and ViT-Giant models.
+🌟*Updated on 2025/03/18* [CHANGELOG](CHANGELOG.md)
+  - **Training and evaluation code released!**
+  - Installation via pip and CLI usage supported.
+## Installation
+### Install via pip
+```bash
+pip install git+https://github.com/microsoft/MoGe.git
+```
+### Or clone this repository
+```bash
+git clone https://github.com/microsoft/MoGe.git
+cd MoGe
+```
+and install the requirements
+```bash
+pip install -r requirements.txt
+```
+MoGe should be compatible with most requirements versions. Please check the `requirements.txt` for more details if you have concerns.
+## Usage
+### Pretrained model
+The ViT-Large model has been uploaded to Hugging Face hub at [Ruicheng/moge-vitl](https://huggingface.co/Ruicheng/moge-vitl).
+You may load the model via `MoGeModel.from_pretrained("Ruicheng/moge-vitl")` without manually downloading.
+If loading the model from a local file is preferred, you may manually download the model from the huggingface hub and load it via `MoGeModel.from_pretrained("PATH_TO_LOCAL_MODEL.pt")`.
+### Minimal code example
+Here is a minimal example for loading the model and inferring on a single image.
+```python
+import cv2
+import torch
+from moge.model.v1 import MoGeModel
+device = torch.device("cuda")
+# Load the model from huggingface hub (or load from local).
+model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(device)
+# Read the input image and convert to tensor (3, H, W) and normalize to [0, 1]
+input_image = cv2.cvtColor(cv2.imread("PATH_TO_IMAGE.jpg"), cv2.COLOR_BGR2RGB)
+input_image = torch.tensor(input_image / 255, dtype=torch.float32, device=device).permute(2, 0, 1)
+# Infer
+output = model.infer(input_image)
+# `output` has keys "points", "depth", "mask" and "intrinsics",
+# The maps are in the same size as the input image.
+# {
+#     "points": (H, W, 3),    # scale-invariant point map in OpenCV camera coordinate system (x right, y down, z forward)
+#     "depth": (H, W),        # scale-invariant depth map
+#     "mask": (H, W),         # a binary mask for valid pixels.
+#     "intrinsics": (3, 3),   # normalized camera intrinsics
+# }
+# For more usage details, see the `MoGeModel.infer` docstring.
+```
+### Gradio demo | `moge app`
+The demo is also available at our [Hugging Face space](https://huggingface.co/spaces/Ruicheng/MoGe).
+```bash
+# Using the command line tool
+moge app
+# In this repo
+python moge/scripts/app.py   # --share for Gradio public sharing
+```
+See also [`moge/scripts/app.py`](moge/scripts/app.py)
+### Inference | `moge infer`
+Run the script `moge/scripts/infer.py` via the following command:
+```bash
+# Save the output [maps], [glb] and [ply] files
+moge infer -i IMAGES_FOLDER_OR_IMAGE_PATH --o OUTPUT_FOLDER --maps --glb --ply
+# Show the result in a window (requires pyglet < 2.0, e.g. pip install pyglet==1.5.29)
+moge infer -i IMAGES_FOLDER_OR_IMAGE_PATH --o OUTPUT_FOLDER --show
+```
+For detailed options, run `moge infer --help`:
+```
+Usage: moge infer [OPTIONS]
+  Inference script for the MoGe model.
+Options:
+  -i, --input PATH            Input image or folder path. "jpg" and "png" are
+                              supported.
+  --fov_x FLOAT               If camera parameters are known, set the
+                              horizontal field of view in degrees. Otherwise,
+                              MoGe will estimate it.
+  -o, --output PATH           Output folder path
+  --pretrained TEXT           Pretrained model name or path. Defaults to
+                              "Ruicheng/moge-vitl"
+  --device TEXT               Device name (e.g. "cuda", "cuda:0", "cpu").
+                              Defaults to "cuda"
+  --fp16                      Use fp16 precision for 2x faster inference.
+  --resize INTEGER            Resize the image(s) & output maps to a specific
+                              size. Defaults to None (no resizing).
+  --resolution_level INTEGER  An integer [0-9] for the resolution level for
+                              inference. Higher value means more tokens and
+                              the finer details will be captured, but
+                              inference can be slower. Defaults to 9. Note
+                              that it is irrelevant to the output size, which
+                              is always the same as the input size.
+                              `resolution_level` actually controls
+                              `num_tokens`. See `num_tokens` for more details.
+  --num_tokens INTEGER        number of tokens used for inference. A integer
+                              in the (suggested) range of `[1200, 2500]`.
+                              `resolution_level` will be ignored if
+                              `num_tokens` is provided. Default: None
+  --threshold FLOAT           Threshold for removing edges. Defaults to 0.03.
+                              Smaller value removes more edges. "inf" means no
+                              thresholding.
+  --maps                      Whether to save the output maps and fov(image,
+                              depth, mask, points, fov).
+  --glb                       Whether to save the output as a.glb file. The
+                              color will be saved as a texture.
+  --ply                       Whether to save the output as a.ply file. The
+                              color will be saved as vertex colors.
+  --show                      Whether show the output in a window. Note that
+                              this requires pyglet<2 installed as required by
+                              trimesh.
+  --help                      Show this message and exit.
+```
+See also [`moge/scripts/infer.py`](moge/scripts/infer.py)
+### 360° panorama images | `moge infer_panorama`
+> *NOTE: This is an experimental extension of MoGe.*
+The script will split the 360-degree panorama image into multiple perspective views and infer on each view separately.
+The output maps will be combined to produce a panorama depth map and point map.
+Note that the panorama image must have spherical parameterization (e.g., environment maps or equirectangular images). Other formats must be converted to spherical format before using this script. Run `moge infer_panorama --help` for detailed options.
+<div align="center">
+  <img src="./assets/panorama_pipeline.png" width="80%">
+The photo is from [this URL](https://commons.wikimedia.org/wiki/Category:360%C2%B0_panoramas_with_equirectangular_projection#/media/File:Braunschweig_Sankt-%C3%84gidien_Panorama_02.jpg)
+</div>
+See also [`moge/scripts/infer_panorama.py`](moge/scripts/infer_panorama.py)
+## Training & Finetuning
+See [docs/train.md](docs/train.md)
+## Evaluation
+See [docs/eval.md](docs/eval.md)
+## License
+MoGe code is released under the MIT license, except for DINOv2 code in `moge/model/dinov2` which is released by Meta AI under the Apache 2.0 license.
+See [LICENSE](LICENSE) for more details.
+## Citation
+If you find our work useful in your research, we gratefully request that you consider citing our paper:
+```
+@misc{wang2024moge,
+    title={MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision},
+    author={Wang, Ruicheng and Xu, Sicheng and Dai, Cassie and Xiang, Jianfeng and Deng, Yu and Tong, Xin and Yang, Jiaolong},
+    year={2024},
+    eprint={2410.19115},
+    archivePrefix={arXiv},
+    primaryClass={cs.CV},
+    url={https://arxiv.org/abs/2410.19115},
+}
+```

MoGe/SECURITY.md ADDED Viewed

	@@ -0,0 +1,41 @@

+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+## Security
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+## Reporting Security Issues
+**Please do not report security vulnerabilities through public GitHub issues.**
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+This information will help us triage your report more quickly.
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+## Preferred Languages
+We prefer all communications to be in English.
+## Policy
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+<!-- END MICROSOFT SECURITY.MD BLOCK -->

MoGe/SUPPORT.md ADDED Viewed

	@@ -0,0 +1,25 @@

+# TODO: The maintainer of this repo has not yet edited this file
+**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
+- **No CSS support:** Fill out this template with information about how to file issues and get help.
+- **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
+- **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
+*Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
+# Support
+## How to file issues and get help
+This project uses GitHub Issues to track bugs and feature requests. Please search the existing
+issues before filing new issues to avoid duplicates.  For new issues, file your bug or
+feature request as a new Issue.
+For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
+FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
+CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
+## Microsoft Support Policy
+Support for this **PROJECT or PRODUCT** is limited to the resources listed above.

MoGe/baselines/da_v2.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Reference: https://github.com/DepthAnything/Depth-Anything-V2
+import os
+import sys
+from typing import *
+from pathlib import Path
+import click
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+from moge.test.baseline import MGEBaselineInterface
+class Baseline(MGEBaselineInterface):
+    def __init__(self, repo_path: str, backbone: str, num_tokens: int, device: Union[torch.device, str]):
+        # Create from repo
+        repo_path = os.path.abspath(repo_path)
+        if repo_path not in sys.path:
+            sys.path.append(repo_path)
+        if not Path(repo_path).exists():
+            raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
+        from depth_anything_v2.dpt import DepthAnythingV2
+        device = torch.device(device)
+        # Instantiate model
+        model = DepthAnythingV2(encoder=backbone, features=256, out_channels=[256, 512, 1024, 1024])
+        # Load checkpoint
+        checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_{backbone}.pth')
+        if not os.path.exists(checkpoint_path):
+            raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
+        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
+        model.load_state_dict(checkpoint)
+        model.to(device).eval()
+        self.model = model
+        self.num_tokens = num_tokens
+        self.device = device
+    @click.command()
+    @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
+    @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Encoder architecture.')
+    @click.option('--num_tokens', type=int, default=None, help='Number of tokens to use for the input image.')
+    @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
+    @staticmethod
+    def load(repo_path: str, backbone, num_tokens: int, device: torch.device = 'cuda'):
+        return Baseline(repo_path, backbone, num_tokens, device)
+    @torch.inference_mode()
+    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        original_height, original_width = image.shape[-2:]
+        assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+            omit_batch_dim = True
+        else:
+            omit_batch_dim = False
+        if self.num_tokens is None:
+            resize_factor = 518 / min(original_height, original_width)
+            expected_width = round(original_width * resize_factor / 14) * 14
+            expected_height = round(original_height * resize_factor / 14) * 14
+        else:
+            aspect_ratio = original_width / original_height
+            tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
+            tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
+            expected_width = tokens_cols * 14
+            expected_height = tokens_rows * 14
+        image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
+        image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        disparity = self.model(image)
+        disparity = F.interpolate(disparity[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
+        if omit_batch_dim:
+            disparity = disparity.squeeze(0)
+        return {
+            'disparity_affine_invariant': disparity
+        }

MoGe/baselines/da_v2_metric.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# Reference https://github.com/DepthAnything/Depth-Anything-V2/metric_depth
+import os
+import sys
+from typing import *
+from pathlib import Path
+import click
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+import cv2
+from moge.test.baseline import MGEBaselineInterface
+class Baseline(MGEBaselineInterface):
+    def __init__(self, repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
+        device = torch.device(device)
+        repo_path = os.path.abspath(repo_path)
+        if not Path(repo_path).exists():
+            raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
+        sys.path.append(os.path.join(repo_path, 'metric_depth'))
+        from depth_anything_v2.dpt import DepthAnythingV2
+        model_configs = {
+            'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
+            'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
+            'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
+        }
+        if domain == 'indoor':
+            dataset = 'hypersim'
+            max_depth = 20
+        elif domain == 'outdoor':
+            dataset = 'vkitti'
+            max_depth = 80
+        else:
+            raise ValueError(f"Invalid domain: {domain}")
+        model = DepthAnythingV2(**model_configs[backbone], max_depth=max_depth)
+        checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_metric_{dataset}_{backbone}.pth')
+        if not os.path.exists(checkpoint_path):
+            raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
+        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu', weights_only=True))
+        model.eval().to(device)
+        self.model = model
+        self.num_tokens = num_tokens
+        self.device = device
+    @click.command()
+    @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
+    @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Backbone architecture.')
+    @click.option('--domain', type=click.Choice(['indoor', 'outdoor']), help='Domain of the dataset.')
+    @click.option('--num_tokens', type=int, default=None, help='Number of tokens for the ViT model')
+    @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
+    @staticmethod
+    def load(repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
+        return Baseline(repo_path, backbone, domain, num_tokens, device)
+    @torch.inference_mode()
+    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        original_height, original_width = image.shape[-2:]
+        assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
+        if image.ndim == 3:
+            image = image.unsqueeze(0)
+            omit_batch_dim = True
+        else:
+            omit_batch_dim = False
+        if self.num_tokens is None:
+            resize_factor = 518 / min(original_height, original_width)
+            expected_width = round(original_width * resize_factor / 14) * 14
+            expected_height = round(original_height * resize_factor / 14) * 14
+        else:
+            aspect_ratio = original_width / original_height
+            tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
+            tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
+            expected_width = tokens_cols * 14
+            expected_height = tokens_rows * 14
+        image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
+        image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        depth = self.model(image)
+        depth = F.interpolate(depth[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
+        if omit_batch_dim:
+            depth = depth.squeeze(0)
+        return {
+            'depth_metric': depth
+        }

MoGe/baselines/metric3d_v2.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Reference: https://github.com/YvanYin/Metric3D
+import os
+import sys
+from typing import *
+import click
+import torch
+import torch.nn.functional as F
+import cv2
+from moge.test.baseline import MGEBaselineInterface
+class Baseline(MGEBaselineInterface):
+    def __init__(self, backbone: Literal['vits', 'vitl', 'vitg'], device):
+        backbone_map = {
+            'vits': 'metric3d_vit_small',
+            'vitl': 'metric3d_vit_large',
+            'vitg': 'metric3d_vit_giant2'
+        }
+        device = torch.device(device)
+        model = torch.hub.load('yvanyin/metric3d', backbone_map[backbone], pretrain=True)
+        model.to(device).eval()
+        self.model = model
+        self.device = device
+    @click.command()
+    @click.option('--backbone', type=click.Choice(['vits', 'vitl', 'vitg']), default='vitl', help='Encoder architecture.')
+    @click.option('--device', type=str, default='cuda', help='Device to use.')
+    @staticmethod
+    def load(backbone: str = 'vitl', device: torch.device = 'cuda'):
+        return Baseline(backbone, device)
+    @torch.inference_mode()
+    def inference_one_image(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
+        # Reference: https://github.com/YvanYin/Metric3D/blob/main/mono/utils/do_test.py
+        # rgb_origin: RGB, 0-255, uint8
+        rgb_origin = image.cpu().numpy().transpose((1, 2, 0)) * 255
+        # keep ratio resize
+        input_size = (616, 1064) # for vit model
+        h, w = rgb_origin.shape[:2]
+        scale = min(input_size[0] / h, input_size[1] / w)
+        rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
+        if intrinsics is not None:
+            focal = intrinsics[0, 0] * int(w * scale)
+        # padding to input_size
+        padding = [123.675, 116.28, 103.53]
+        h, w = rgb.shape[:2]
+        pad_h = input_size[0] - h
+        pad_w = input_size[1] - w
+        pad_h_half = pad_h // 2
+        pad_w_half = pad_w // 2
+        rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
+        pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
+        # normalize rgb
+        mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
+        std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
+        rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
+        rgb = torch.div((rgb - mean), std)
+        rgb = rgb[None, :, :, :].cuda()
+        # inference
+        pred_depth, confidence, output_dict = self.model.inference({'input': rgb})
+        # un pad
+        pred_depth = pred_depth.squeeze()
+        pred_depth = pred_depth[pad_info[0] : pred_depth.shape[0] - pad_info[1], pad_info[2] : pred_depth.shape[1] - pad_info[3]]
+        pred_depth = pred_depth.clamp_min(0.5)  # clamp to 0.5m, since metric3d could yield very small depth values, resulting in crashed the scale shift alignment.
+        # upsample to original size
+        pred_depth = F.interpolate(pred_depth[None, None, :, :], image.shape[-2:], mode='bilinear').squeeze()
+        if intrinsics is not None:
+            # de-canonical transform
+            canonical_to_real_scale = focal / 1000.0 # 1000.0 is the focal length of canonical camera
+            pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric
+            pred_depth = torch.clamp(pred_depth, 0, 300)
+        pred_normal, normal_confidence = output_dict['prediction_normal'].split([3, 1], dim=1) # see https://arxiv.org/abs/2109.09881 for details
+        # un pad and resize to some size if needed
+        pred_normal = pred_normal.squeeze(0)
+        pred_normal = pred_normal[:, pad_info[0] : pred_normal.shape[1] - pad_info[1], pad_info[2] : pred_normal.shape[2] - pad_info[3]]
+        # you can now do anything with the normal
+        pred_normal = F.interpolate(pred_normal[None, :, :, :], image.shape[-2:], mode='bilinear').squeeze(0)
+        pred_normal = F.normalize(pred_normal, p=2, dim=0)
+        return pred_depth, pred_normal.permute(1, 2, 0)
+    @torch.inference_mode()
+    def infer(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
+        # image: (B, H, W, 3) or (H, W, 3)
+        if image.ndim == 3:
+            pred_depth, pred_normal = self.inference_one_image(image, intrinsics)
+        else:
+            for i in range(image.shape[0]):
+                pred_depth_i, pred_normal_i = self.inference_one_image(image[i], intrinsics[i] if intrinsics is not None else None)
+                pred_depth.append(pred_depth_i)
+                pred_normal.append(pred_normal_i)
+            pred_depth = torch.stack(pred_depth, dim=0)
+            pred_normal = torch.stack(pred_normal, dim=0)
+        if intrinsics is not None:
+            return {
+                "depth_metric": pred_depth,
+            }
+        else:
+            return {
+                "depth_scale_invariant": pred_depth,
+            }

MoGe/baselines/moge.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import sys
+from typing import *
+import importlib
+import click
+import torch
+import utils3d
+from moge.test.baseline import MGEBaselineInterface
+class Baseline(MGEBaselineInterface):
+    def __init__(self, num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
+        super().__init__()
+        from moge.model import import_model_class_by_version
+        MoGeModel = import_model_class_by_version(version)
+        self.version = version
+        self.model = MoGeModel.from_pretrained(pretrained_model_name_or_path).to(device).eval()
+        self.device = torch.device(device)
+        self.num_tokens = num_tokens
+        self.resolution_level = resolution_level
+        self.use_fp16 = use_fp16
+    @click.command()
+    @click.option('--num_tokens', type=int, default=None)
+    @click.option('--resolution_level', type=int, default=9)
+    @click.option('--pretrained', 'pretrained_model_name_or_path', type=str, default='Ruicheng/moge-vitl')
+    @click.option('--fp16', 'use_fp16', is_flag=True)
+    @click.option('--device', type=str, default='cuda:0')
+    @click.option('--version', type=str, default='v1')
+    @staticmethod
+    def load(num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
+        return Baseline(num_tokens, resolution_level, pretrained_model_name_or_path, use_fp16, device, version)
+    # Implementation for inference
+    @torch.inference_mode()
+    def infer(self, image: torch.FloatTensor, intrinsics: Optional[torch.FloatTensor] = None):
+        if intrinsics is not None:
+            fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
+            fov_x = torch.rad2deg(fov_x)
+        else:
+            fov_x = None
+        output = self.model.infer(image, fov_x=fov_x, apply_mask=True, num_tokens=self.num_tokens)
+        if self.version == 'v1':
+            return {
+                'points_scale_invariant': output['points'],
+                'depth_scale_invariant': output['depth'],
+                'intrinsics': output['intrinsics'],
+            }
+        else:
+            return {
+                'points_metric': output['points'],
+                'depth_metric': output['depth'],
+                'intrinsics': output['intrinsics'],
+            }
+    @torch.inference_mode()
+    def infer_for_evaluation(self, image: torch.FloatTensor, intrinsics: torch.FloatTensor = None):
+        if intrinsics is not None:
+            fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
+            fov_x = torch.rad2deg(fov_x)
+        else:
+            fov_x = None
+        output = self.model.infer(image, fov_x=fov_x, apply_mask=False, num_tokens=self.num_tokens, use_fp16=self.use_fp16)
+        if self.version == 'v1':
+            return {
+                'points_scale_invariant': output['points'],
+                'depth_scale_invariant': output['depth'],
+                'intrinsics': output['intrinsics'],
+            }
+        else:
+            return {
+                'points_metric': output['points'],
+                'depth_metric': output['depth'],
+                'intrinsics': output['intrinsics'],
+            }

MoGe/configs/eval/all_benchmarks.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+    "NYUv2": {
+        "path": "data/eval/NYUv2",
+        "width": 640,
+        "height": 480,
+        "split": ".index.txt",
+        "depth_unit": 1.0
+    },
+    "KITTI": {
+        "path": "data/eval/KITTI",
+        "width": 750,
+        "height": 375,
+        "split": ".index.txt",
+        "depth_unit": 1
+    },
+    "ETH3D": {
+        "path": "data/eval/ETH3D",
+        "width": 2048,
+        "height": 1365,
+        "split": ".index.txt",
+        "include_segmentation": true,
+        "depth_unit": 1
+    },
+    "iBims-1": {
+        "path": "data/eval/iBims-1",
+        "width": 640,
+        "height": 480,
+        "split": ".index.txt",
+        "has_sharp_boundary": true,
+        "include_segmentation": true,
+        "depth_unit": 1.0
+    },
+    "GSO": {
+        "path": "data/eval/GSO",
+        "width": 512,
+        "height": 512,
+        "split": ".index.txt"
+    },
+    "Sintel": {
+        "path": "data/eval/Sintel",
+        "width": 872,
+        "height": 436,
+        "split": ".index.txt",
+        "has_sharp_boundary": true,
+        "include_segmentation": true
+    },
+    "DDAD": {
+        "path": "data/eval/DDAD",
+        "width": 1400,
+        "height": 700,
+        "include_segmentation": true,
+        "split": ".index.txt",
+        "depth_unit": 1.0
+    },
+    "DIODE": {
+        "path": "data/eval/DIODE",
+        "width": 1024,
+        "height": 768,
+        "split": ".index.txt",
+        "include_segmentation": true,
+        "depth_unit": 1.0
+    },
+    "Spring": {
+        "path": "data/eval/Spring",
+        "width": 1920,
+        "height": 1080,
+        "split": ".index.txt",
+        "has_sharp_boundary": true
+    },
+    "HAMMER": {
+        "path": "data/eval/HAMMER",
+        "width": 1664,
+        "height": 832,
+        "split": ".index.txt",
+        "depth_unit": 1,
+        "has_sharp_boundary": true
+    }
+}

MoGe/configs/eval/benchmarks/ddad.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "DDAD": {
+        "path": "data/eval/DDAD",
+        "width": 1400,
+        "height": 700,
+        "include_segmentation": true,
+        "split": ".index.txt"
+    }
+}

MoGe/configs/eval/benchmarks/diode.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "DIODE": {
+        "path": "data/eval/DIODE",
+        "width": 1024,
+        "height": 768,
+        "split": ".index.txt",
+        "include_segmentation": true
+    }
+}

MoGe/configs/eval/benchmarks/eth3d.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "ETH3D": {
+        "path": "data/eval/ETH3D",
+        "width": 2048,
+        "height": 1365,
+        "split": ".index.txt",
+        "include_segmentation": true,
+        "depth_unit": 1
+    }
+}

MoGe/configs/eval/benchmarks/gso.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "GSO": {
+        "path": "data/eval/GSO",
+        "width": 512,
+        "height": 512,
+        "split": ".index.txt"
+    }
+}

MoGe/configs/eval/benchmarks/hammer.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "HAMMER": {
+        "path": "data/eval/HAMMER",
+        "width": 1664,
+        "height": 832,
+        "split": ".index.txt",
+        "depth_unit": 1,
+        "has_sharp_boundary": true
+    }
+}

MoGe/configs/eval/benchmarks/ibims-1.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "iBims-1": {
+        "path": "data/eval/iBims-1",
+        "width": 640,
+        "height": 480,
+        "split": ".index.txt",
+        "include_segmentation": true,
+        "has_sharp_boundary": true
+    }
+}

MoGe/configs/eval/benchmarks/kitti.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "KITTI": {
+        "path": "data/eval/KITTI",
+        "width": 750,
+        "height": 375,
+        "split": ".index.txt",
+        "depth_unit": 1
+    }
+}

MoGe/configs/eval/benchmarks/nyu.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "NYUv2": {
+        "path": "data/eval/NYUv2",
+        "width": 640,
+        "height": 480,
+        "split": ".test.txt"
+    }
+}

MoGe/configs/eval/benchmarks/sintel.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "Sintel": {
+        "path": "data/eval/Sintel",
+        "width": 872,
+        "height": 436,
+        "split": ".index.txt",
+        "include_segmentation": true,
+        "has_sharp_boundary": true
+    }
+}

MoGe/configs/eval/benchmarks/spring.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "Spring": {
+        "path": "data/eval/Spring",
+        "width": 1920,
+        "height": 1080,
+        "split": ".test.txt",
+        "has_sharp_boundary": true
+    }
+}

MoGe/configs/train/v1.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+    "data": {
+        "aspect_ratio_range": [0.5, 2.0],
+        "area_range": [250000, 1000000],
+        "clamp_max_depth": 1000.0,
+        "center_augmentation": 0.5,
+        "fov_range_absolute": [1, 179],
+        "fov_range_relative": [0.01, 1.0],
+        "image_augmentation": ["jittering", "jpeg_loss", "blurring"],
+        "datasets": [
+            {
+                "name": "TartanAir",
+                "path": "blobmnt/data_v3/TartanAir",
+                "label_type": "synthetic",
+                "index": ".index.txt",
+                "depth": "depth.png",
+                "weight": 4.8,
+                "center_augmentation": 0.25,
+                "fov_range_absolute": [30, 150],
+                "fov_range_relative": [0.5, 1.0],
+                "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
+            }
+        ]
+    },
+    "model_version": "v1",
+    "model": {
+        "encoder": "dinov2_vitl14",
+        "remap_output": "exp",
+        "intermediate_layers": 4,
+        "dim_upsample": [256, 128, 64],
+        "dim_times_res_block_hidden": 2,
+        "num_res_blocks": 2,
+        "num_tokens_range": [1200, 2500],
+        "last_conv_channels": 32,
+        "last_conv_size": 1
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": [
+            {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
+            {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
+        ]
+    },
+    "lr_scheduler": {
+        "type": "SequentialLR",
+        "params": {
+            "schedulers": [
+                {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
+                {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
+            ],
+            "milestones": [2000]
+        }
+    },
+    "low_resolution_training_steps": 50000,
+    "loss": {
+        "invalid": {},
+        "synthetic": {
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
+            "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
+            "normal": {"function": "normal_loss", "weight": 1.0},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        },
+        "sfm": {
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        },
+        "lidar": {
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        }
+    }
+}

MoGe/docs/eval.md ADDED Viewed

	@@ -0,0 +1,77 @@

+# Evaluation
+We provide a unified evaluation script that runs baselines on multiple benchmarks. It takes a baseline model and evaluation configurations, evaluates on-the-fly, and reports results instantly in a JSON file.
+## Benchmarks
+Donwload the processed datasets from [Huggingface Datasets](https://huggingface.co/datasets/Ruicheng/monocular-geometry-evaluation) and put them in the `data/eval` directory, using `huggingface-cli`:
+```bash
+mkdir -p data/eval
+huggingface-cli download Ruicheng/monocular-geometry-evaluation --repo-type dataset --local-dir data/eval --local-dir-use-symlinks False
+```
+Then unzip the downloaded files:
+```bash
+cd data/eval
+unzip '*.zip'
+# rm *.zip # if you don't keep the zip files
+```
+## Configuration
+See [`configs/eval/all_benchmarks.json`](../configs/eval/all_benchmarks.json) for an example of evaluation configurations on all benchmarks. You can modify this file to evaluate on different benchmarks or different baselines.
+## Baseline
+Some examples of baselines are provided in [`baselines/`](../baselines/). Pass the path to the baseline model python code to the `--baseline` argument of the evaluation script.
+## Run Evaluation
+Run the script [`moge/scripts/eval_baseline.py`](../moge/scripts/eval_baseline.py).
+For example,
+```bash
+# Evaluate MoGe on the 10 benchmarks
+python moge/scripts/eval_baseline.py --baseline baselines/moge.py --config configs/eval/all_benchmarks.json --output eval_output/moge.json --pretrained Ruicheng/moge-vitl --resolution_level 9
+# Evaluate Depth Anything V2 on the 10 benchmarks. (NOTE: affine disparity)
+python moge/scripts/eval_baseline.py --baseline baselines/da_v2.py --config configs/eval/all_benchmarks.json --output eval_output/da_v2.json
+```
+The `--baselies` `--input` `--output` arguments are for the inference script. The rest arguments, e.g. `--pretrained` `--resolution_level`, are custormized for loading the baseline model.
+Details of the arguments:
+```
+Usage: eval_baseline.py [OPTIONS]
+  Evaluation script.
+Options:
+  --baseline PATH  Path to the baseline model python code.
+  --config PATH    Path to the evaluation configurations. Defaults to
+                   "configs/eval/all_benchmarks.json".
+  --output PATH    Path to the output json file.
+  --oracle         Use oracle mode for evaluation, i.e., use the GT intrinsics
+                   input.
+  --dump_pred      Dump predition results.
+  --dump_gt        Dump ground truth.
+  --help           Show this message and exit.
+```
+## Wrap a Customized Baseline
+Wrap any baseline method with [`moge.test.baseline.MGEBaselineInterface`](../moge/test/baseline.py).
+See [`baselines/`](../baselines/) for more examples.
+It is a good idea to check the correctness of the baseline implementation by running inference on a small set of images via [`moge/scripts/infer_baselines.py`](../moge/scripts/infer_baselines.py):
+```base
+python moge/scripts/infer_baselines.py --baseline baselines/moge.py --input example_images/ --output infer_outupt/moge --pretrained Ruicheng/moge-vitl --maps --ply
+```

MoGe/docs/train.md ADDED Viewed

	@@ -0,0 +1,181 @@

+# Training
+This document provides instructions for training and finetuning the MoGe model.
+## Additional Requirements
+The following packages other than those listed in [`pyproject.toml`](../pyproject.toml) are required for training and finetuning the MoGe model:
+```
+accelerate
+sympy
+mlflow
+```
+## Data preparation
+### Dataset format
+Each dataset should be organized as follows:
+```
+somedataset
+├── .index.txt          # A list of instance paths
+├── folder1
+│   ├── instance1       # Each instance is in a folder
+│   │   ├── image.jpg   # RGB image.
+│   │   ├── depth.png   # 16-bit depth. See moge/utils/io.py for details
+│   │   ├── meta.json   # Stores "intrinsics" as a 3x3 matrix
+│   │   └── ...         # Other componests such as segmentation mask, normal map etc.
+...
+```
+* `.index.txt` is placed at top directory to store a list of instance paths in this dataset. The dataloader will look for instances in this list. You may also use a custom split, e.g. `.train.txt`, `.val.txt` and specify it in the configuration file.
+* For depth images, it is recommended to use `read_depth()` and `write_depth()` in [`moge/utils/io.py`](../moge/utils/io.py) to read and write depth images. The depth is stored in logarithmic scale in 16-bit PNG format, offering a balanced precision, dynamic range and compression ratio compared to 16-bit and 32-bit EXR and linear depth formats. It also encodes `NaN` and `Inf` values for invalid depth values.
+* The `meta.json` should be a dictionary containing the key `intrinsics`, which are **normalized** camera parameters. You may put more metadata.
+* We also support reading and storing segementation masks for evaluation data (see paper evaluation of local points), which are saved in PNG format with semantic labels stored in png metadata as JSON strings. See `read_segmentation()` and `write_segmentation()` in [`moge/utils/io.py`](../moge/utils/io.py) for details.
+### Visual inspection
+We provide a script to visualize the data and check the data quality. It will export the instance as a PLY file for visualization of point cloud.
+```bash
+python moge/scripts/vis_data.py PATH_TO_INSTANCE --ply [-o SOMEWHERE_ELSE_TO_SAVE_VIS]
+```
+### DataLoader
+Our training dataloaders is customized to handle loading data, performing perspective crop, and augmentation in a multithreading pipeline. Please refer to [`moge/train/dataloader.py`](../moge/train/dataloader.py) if you have any concern.
+## Configuration
+See [`configs/train/v1.json`](../configs/train/v1.json) for an example configuration file. The configuration file defines the hyperparameters for training the MoGe model.
+Here is a commented configuration for reference:
+```json
+{
+    "data": {
+        "aspect_ratio_range": [0.5, 2.0],               # Range of aspect ratio of sampled images
+        "area_range": [250000, 1000000],                # Range of sampled image area in pixels
+        "clamp_max_depth": 1000.0,                      # Maximum far/near
+        "center_augmentation": 0.5,                     # Ratio of center crop augmentation
+        "fov_range_absolute": [1, 179],                 # Absolute range of FOV in degrees
+        "fov_range_relative": [0.01, 1.0],              # Relative range of FOV to the original FOV
+        "image_augmentation": ["jittering", "jpeg_loss", "blurring"],       # List of image augmentation techniques
+        "datasets": [
+            {
+                "name": "TartanAir",                    # Name of the dataset. Name it as you like.
+                "path": "data/TartanAir",               # Path to the dataset
+                "label_type": "synthetic",              # Label type for this dataset. Losses will be applied accordingly. see "loss" config
+                "weight": 4.8,                          # Probability of sampling this dataset
+                "index": ".index.txt",                  # File name of the index file.  Defaults to .index.txt
+                "depth": "depth.png",                   # File name of depth images. Defaults to depth.png
+                "center_augmentation": 0.25,            # Below are dataset-specific hyperparameters. Overriding the global ones above.
+                "fov_range_absolute": [30, 150],
+                "fov_range_relative": [0.5, 1.0],
+                "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
+            }
+        ]
+    },
+    "model_version": "v1",                 # Model version. If you have multiple model variants, you can use this to switch between them.
+    "model": {                             # Model hyperparameters. Will be passed to Model __init__() as kwargs.
+        "encoder": "dinov2_vitl14",
+        "remap_output": "exp",
+        "intermediate_layers": 4,
+        "dim_upsample": [256, 128, 64],
+        "dim_times_res_block_hidden": 2,
+        "num_res_blocks": 2,
+        "num_tokens_range": [1200, 2500],
+        "last_conv_channels": 32,
+        "last_conv_size": 1
+    },
+    "optimizer": {                          # Reflection-like optimizer configurations. See moge.train.utils.py build_optimizer() for details.
+        "type": "AdamW",
+        "params": [
+            {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
+            {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
+        ]
+    },
+    "lr_scheduler": {                       # Reflection-like lr_scheduler configurations. See moge.train.utils.py build_lr_scheduler() for details.
+        "type": "SequentialLR",
+        "params": {
+            "schedulers": [
+                {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
+                {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
+            ],
+            "milestones": [2000]
+        }
+    },
+    "low_resolution_training_steps": 50000, # Total number of low-resolution training steps. It makes the early stage training faster. Later stage training on varying size images will be slower.
+    "loss": {
+        "invalid": {},                      # invalid instance due to runtime error when loading data
+        "synthetic": {                      # Below are loss hyperparameters
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
+            "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
+            "normal": {"function": "normal_loss", "weight": 1.0},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        },
+        "sfm": {
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        },
+        "lidar": {
+            "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
+            "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
+            "mask": {"function": "mask_l2_loss", "weight": 1.0}
+        }
+    }
+}
+```
+## Run Training
+Launch the training script [`moge/scripts/train.py`](../moge/scripts/train.py). Note that we use [`accelerate`](https://github.com/huggingface/accelerate) for distributed training.
+```bash
+accelerate launch \
+    --num_processes 8 \
+    moge/scripts/train.py \
+    --config configs/train/v1.json \
+    --workspace workspace/debug \
+    --gradient_accumulation_steps 2 \
+    --batch_size_forward 2 \
+    --checkpoint latest \
+    --enable_gradient_checkpointing True \
+    --vis_every 1000 \
+    --enable_mlflow True
+```
+## Finetuning
+To finetune the pre-trained MoGe model, download the model checkpoint and put it in a local directory, e.g. `pretrained/moge-vitl.pt`.
+> NOTE: when finetuning pretrained MoGe model, a much lower learning rate is required.
+The suggested learning rate for finetuning is not greater than 1e-5 for the head and 1e-6 for the backbone.
+And the batch size is recommended to be 32 at least.
+The settings in default configuration are not optimal for specific datasets and may require further tuning.
+```bash
+accelerate launch \
+    --num_processes 8 \
+    moge/scripts/train.py \
+    --config configs/train/v1.json \
+    --workspace workspace/debug \
+    --gradient_accumulation_steps 2 \
+    --batch_size_forward 2 \
+    --checkpoint pretrained/moge-vitl.pt \
+    --enable_gradient_checkpointing True \
+    --vis_every 1000 \
+    --enable_mlflow True
+```

MoGe/moge/__init__.py ADDED Viewed

File without changes

MoGe/moge/model/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import importlib
+from typing import *
+if TYPE_CHECKING:
+    from .v1 import MoGeModel as MoGeModelV1
+def import_model_class_by_version(version: str) -> Type[Union['MoGeModelV1']]:
+    assert version in ['v1'], f'Unsupported model version: {version}'
+    try:
+        module = importlib.import_module(f'.{version}', __package__)
+    except ModuleNotFoundError:
+        raise ValueError(f'Model version "{version}" not found.')
+    cls = getattr(module, 'MoGeModel')
+    return cls

MoGe/moge/model/dinov2/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+__version__ = "0.0.1"

MoGe/moge/model/dinov2/hub/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.

MoGe/moge/model/dinov2/hub/backbones.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from enum import Enum
+from typing import Union
+import torch
+from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
+class Weights(Enum):
+    LVD142M = "LVD142M"
+def _make_dinov2_model(
+    *,
+    arch_name: str = "vit_large",
+    img_size: int = 518,
+    patch_size: int = 14,
+    init_values: float = 1.0,
+    ffn_layer: str = "mlp",
+    block_chunks: int = 0,
+    num_register_tokens: int = 0,
+    interpolate_antialias: bool = False,
+    interpolate_offset: float = 0.1,
+    pretrained: bool = True,
+    weights: Union[Weights, str] = Weights.LVD142M,
+    **kwargs,
+):
+    from ..models import vision_transformer as vits
+    if isinstance(weights, str):
+        try:
+            weights = Weights[weights]
+        except KeyError:
+            raise AssertionError(f"Unsupported weights: {weights}")
+    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
+    vit_kwargs = dict(
+        img_size=img_size,
+        patch_size=patch_size,
+        init_values=init_values,
+        ffn_layer=ffn_layer,
+        block_chunks=block_chunks,
+        num_register_tokens=num_register_tokens,
+        interpolate_antialias=interpolate_antialias,
+        interpolate_offset=interpolate_offset,
+    )
+    vit_kwargs.update(**kwargs)
+    model = vits.__dict__[arch_name](**vit_kwargs)
+    if pretrained:
+        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
+        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
+        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
+        model.load_state_dict(state_dict, strict=True)
+    return model
+def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
+def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
+def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
+def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(
+        arch_name="vit_giant2",
+        ffn_layer="swiglufused",
+        weights=weights,
+        pretrained=pretrained,
+        **kwargs,
+    )
+def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(
+        arch_name="vit_small",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )
+def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(
+        arch_name="vit_base",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )
+def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(
+        arch_name="vit_large",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )
+def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(
+        arch_name="vit_giant2",
+        ffn_layer="swiglufused",
+        weights=weights,
+        pretrained=pretrained,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )

MoGe/moge/model/dinov2/hub/utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import itertools
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
+    compact_arch_name = arch_name.replace("_", "")[:4]
+    registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
+    return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
+class CenterPadding(nn.Module):
+    def __init__(self, multiple):
+        super().__init__()
+        self.multiple = multiple
+    def _get_pad(self, size):
+        new_size = math.ceil(size / self.multiple) * self.multiple
+        pad_size = new_size - size
+        pad_size_left = pad_size // 2
+        pad_size_right = pad_size - pad_size_left
+        return pad_size_left, pad_size_right
+    @torch.inference_mode()
+    def forward(self, x):
+        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
+        output = F.pad(x, pads)
+        return output

MoGe/moge/model/dinov2/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from .dino_head import DINOHead
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .block import NestedTensorBlock
+from .attention import MemEffAttention

MoGe/moge/model/dinov2/layers/attention.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import logging
+import os
+import warnings
+from torch import Tensor
+from torch import nn
+logger = logging.getLogger("dinov2")
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import memory_efficient_attention, unbind
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (Attention)")
+    else:
+        # warnings.warn("xFormers is disabled (Attention)")
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (Attention)")
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class MemEffAttention(Attention):
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = unbind(qkv, 2)
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

MoGe/moge/model/dinov2/layers/block.py ADDED Viewed

	@@ -0,0 +1,259 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import logging
+import os
+from typing import Callable, List, Any, Tuple, Dict
+import warnings
+import torch
+from torch import nn, Tensor
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+logger = logging.getLogger("dinov2")
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import fmha, scaled_index_add, index_select_cat
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (Block)")
+    else:
+        # warnings.warn("xFormers is disabled (Block)")
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (Block)")
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+    ) -> None:
+        super().__init__()
+        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+            bias=ffn_bias,
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.sample_drop_ratio = drop_path
+    def forward(self, x: Tensor) -> Tensor:
+        def attn_residual_func(x: Tensor) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x)))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x))
+            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
+        else:
+            x = x + attn_residual_func(x)
+            x = x + ffn_residual_func(x)
+        return x
+def drop_add_residual_stochastic_depth(
+    x: Tensor,
+    residual_func: Callable[[Tensor], Tensor],
+    sample_drop_ratio: float = 0.0,
+) -> Tensor:
+    # 1) extract subset using permutation
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    x_subset = x[brange]
+    # 2) apply residual_func to get residual
+    residual = residual_func(x_subset)
+    x_flat = x.flatten(1)
+    residual = residual.flatten(1)
+    residual_scale_factor = b / sample_subset_size
+    # 3) add the residual
+    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    return x_plus_residual.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    residual_scale_factor = b / sample_subset_size
+    return brange, residual_scale_factor
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    if scaling_vector is None:
+        x_flat = x.flatten(1)
+        residual = residual.flatten(1)
+        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    else:
+        x_plus_residual = scaled_index_add(
+            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+        )
+    return x_plus_residual
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    this will perform the index select, cat the tensors, and provide the attn_bias from cache
+    """
+    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    if all_shapes not in attn_bias_cache.keys():
+        seqlens = []
+        for b, x in zip(batch_sizes, x_list):
+            for _ in range(b):
+                seqlens.append(x.shape[1])
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        attn_bias._batch_sizes = batch_sizes
+        attn_bias_cache[all_shapes] = attn_bias
+    if branges is not None:
+        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+    else:
+        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+        cat_tensors = torch.cat(tensors_bs1, dim=1)
+    return attn_bias_cache[all_shapes], cat_tensors
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> Tensor:
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    outputs = []
+    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+    return outputs
+class NestedTensorBlock(Block):
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list):
+        if isinstance(x_or_x_list, Tensor):
+            return super().forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            if not XFORMERS_AVAILABLE:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError

MoGe/moge/model/dinov2/layers/dino_head.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils import weight_norm
+class DINOHead(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        use_bn=False,
+        nlayers=3,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        mlp_bias=True,
+    ):
+        super().__init__()
+        nlayers = max(nlayers, 1)
+        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
+        self.apply(self._init_weights)
+        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
+        self.last_layer.weight_g.data.fill_(1)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    def forward(self, x):
+        x = self.mlp(x)
+        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
+        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
+        x = self.last_layer(x)
+        return x
+def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
+    if nlayers == 1:
+        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+    else:
+        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
+        if use_bn:
+            layers.append(nn.BatchNorm1d(hidden_dim))
+        layers.append(nn.GELU())
+        for _ in range(nlayers - 2):
+            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
+            if use_bn:
+                layers.append(nn.BatchNorm1d(hidden_dim))
+            layers.append(nn.GELU())
+        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+        return nn.Sequential(*layers)

MoGe/moge/model/dinov2/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        random_tensor.div_(keep_prob)
+    output = x * random_tensor
+    return output
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)

MoGe/moge/model/dinov2/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import Tensor
+from torch import nn
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: Union[float, Tensor] = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: Tensor) -> Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma

MoGe/moge/model/dinov2/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, Optional
+from torch import Tensor, nn
+class Mlp(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x

MoGe/moge/model/dinov2/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, Optional, Tuple, Union
+from torch import Tensor
+import torch.nn as nn
+def make_2tuple(x):
+    if isinstance(x, tuple):
+        assert len(x) == 2
+        return x
+    assert isinstance(x, int)
+    return (x, x)
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    Args:
+        img_size: Image size.
+        patch_size: Patch token size.
+        in_chans: Number of input image channels.
+        embed_dim: Number of linear projection output channels.
+        norm_layer: Normalization layer.
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (
+            image_HW[0] // patch_HW[0],
+            image_HW[1] // patch_HW[1],
+        )
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        _, _, H, W = x.shape
+        patch_H, patch_W = self.patch_size
+        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+    def flops(self) -> float:
+        Ho, Wo = self.patches_resolution
+        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+        if self.norm is not None:
+            flops += Ho * Wo * self.embed_dim
+        return flops

MoGe/moge/model/dinov2/layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import os
+from typing import Callable, Optional
+import warnings
+from torch import Tensor, nn
+import torch.nn.functional as F
+class SwiGLUFFN(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+    def forward(self, x: Tensor) -> Tensor:
+        x12 = self.w12(x)
+        x1, x2 = x12.chunk(2, dim=-1)
+        hidden = F.silu(x1) * x2
+        return self.w3(hidden)
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import SwiGLU
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (SwiGLU)")
+    else:
+        # warnings.warn("xFormers is disabled (SwiGLU)")
+        raise ImportError
+except ImportError:
+    SwiGLU = SwiGLUFFN
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (SwiGLU)")
+class SwiGLUFFNFused(SwiGLU):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+        super().__init__(
+            in_features=in_features,
+            hidden_features=hidden_features,
+            out_features=out_features,
+            bias=bias,
+        )

MoGe/moge/model/dinov2/models/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import logging
+from . import vision_transformer as vits
+logger = logging.getLogger("dinov2")
+def build_model(args, only_teacher=False, img_size=224):
+    args.arch = args.arch.removesuffix("_memeff")
+    if "vit" in args.arch:
+        vit_kwargs = dict(
+            img_size=img_size,
+            patch_size=args.patch_size,
+            init_values=args.layerscale,
+            ffn_layer=args.ffn_layer,
+            block_chunks=args.block_chunks,
+            qkv_bias=args.qkv_bias,
+            proj_bias=args.proj_bias,
+            ffn_bias=args.ffn_bias,
+            num_register_tokens=args.num_register_tokens,
+            interpolate_offset=args.interpolate_offset,
+            interpolate_antialias=args.interpolate_antialias,
+        )
+        teacher = vits.__dict__[args.arch](**vit_kwargs)
+        if only_teacher:
+            return teacher, teacher.embed_dim
+        student = vits.__dict__[args.arch](
+            **vit_kwargs,
+            drop_path_rate=args.drop_path_rate,
+            drop_path_uniform=args.drop_path_uniform,
+        )
+        embed_dim = student.embed_dim
+    return student, teacher, embed_dim
+def build_model_from_cfg(cfg, only_teacher=False):
+    return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)

MoGe/moge/model/dinov2/models/vision_transformer.py ADDED Viewed

	@@ -0,0 +1,396 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from torch.nn.init import trunc_normal_
+from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+logger = logging.getLogger("dinov2")
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+class BlockChunk(nn.ModuleList):
+    def forward(self, x):
+        for b in self:
+            x = b(x)
+        return x
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        ffn_bias=True,
+        proj_bias=True,
+        drop_path_rate=0.0,
+        drop_path_uniform=False,
+        init_values=None,  # for layerscale: None or 0 => no layerscale
+        embed_layer=PatchEmbed,
+        act_layer=nn.GELU,
+        block_fn=Block,
+        ffn_layer="mlp",
+        block_chunks=1,
+        num_register_tokens=0,
+        interpolate_antialias=False,
+        interpolate_offset=0.1,
+    ):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            proj_bias (bool): enable bias for proj in attn if True
+            ffn_bias (bool): enable bias for ffn if True
+            drop_path_rate (float): stochastic depth rate
+            drop_path_uniform (bool): apply uniform drop rate across blocks
+            weight_init (str): weight init scheme
+            init_values (float): layer-scale init values
+            embed_layer (nn.Module): patch embedding layer
+            act_layer (nn.Module): MLP activation layer
+            block_fn (nn.Module): transformer block class
+            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
+            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+        """
+        super().__init__()
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 1
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        self.interpolate_antialias = interpolate_antialias
+        self.interpolate_offset = interpolate_offset
+        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        assert num_register_tokens >= 0
+        self.register_tokens = (
+            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+        )
+        if drop_path_uniform is True:
+            dpr = [drop_path_rate] * depth
+        else:
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        if ffn_layer == "mlp":
+            logger.info("using MLP layer as FFN")
+            ffn_layer = Mlp
+        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+            logger.info("using SwiGLU layer as FFN")
+            ffn_layer = SwiGLUFFNFused
+        elif ffn_layer == "identity":
+            logger.info("using Identity layer as FFN")
+            def f(*args, **kwargs):
+                return nn.Identity()
+            ffn_layer = f
+        else:
+            raise NotImplementedError
+        blocks_list = [
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                ffn_layer=ffn_layer,
+                init_values=init_values,
+            )
+            for i in range(depth)
+        ]
+        if block_chunks > 0:
+            self.chunked_blocks = True
+            chunked_blocks = []
+            chunksize = depth // block_chunks
+            for i in range(0, depth, chunksize):
+                # this is to keep the block index consistent if we chunk the block list
+                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+        else:
+            self.chunked_blocks = False
+            self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Identity()
+        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+        self.init_weights()
+    def init_weights(self):
+        trunc_normal_(self.pos_embed, std=0.02)
+        nn.init.normal_(self.cls_token, std=1e-6)
+        if self.register_tokens is not None:
+            nn.init.normal_(self.register_tokens, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+    def interpolate_pos_encoding(self, x, w, h):
+        previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.pos_embed
+        pos_embed = self.pos_embed.float()
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+        dim = x.shape[-1]
+        w0 = w // self.patch_size
+        h0 = h // self.patch_size
+        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
+        assert N == M * M
+        kwargs = {}
+        if self.interpolate_offset:
+            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
+            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
+            sx = float(w0 + self.interpolate_offset) / M
+            sy = float(h0 + self.interpolate_offset) / M
+            kwargs["scale_factor"] = (sx, sy)
+        else:
+            # Simply specify an output size instead of a scale factor
+            kwargs["size"] = (w0, h0)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
+            mode="bicubic",
+            antialias=self.interpolate_antialias,
+            **kwargs,
+        )
+        assert (w0, h0) == patch_pos_embed.shape[-2:]
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+    def prepare_tokens_with_masks(self, x, masks=None):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, w, h)
+        if self.register_tokens is not None:
+            x = torch.cat(
+                (
+                    x[:, :1],
+                    self.register_tokens.expand(x.shape[0], -1, -1),
+                    x[:, 1:],
+                ),
+                dim=1,
+            )
+        return x
+    def forward_features_list(self, x_list, masks_list):
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        for blk in self.blocks:
+            x = blk(x)
+        all_x = x
+        output = []
+        for x, masks in zip(all_x, masks_list):
+            x_norm = self.norm(x)
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm[:, 0],
+                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+    def forward_features(self, x, masks=None):
+        if isinstance(x, list):
+            return self.forward_features_list(x, masks)
+        x = self.prepare_tokens_with_masks(x, masks)
+        for blk in self.blocks:
+            x = blk(x)
+        x_norm = self.norm(x)
+        return {
+            "x_norm_clstoken": x_norm[:, 0],
+            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+            "x_prenorm": x,
+            "masks": masks,
+        }
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks. If it's a list, take them
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        if self.chunked_blocks:
+            outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        class_tokens = [out[:, 0] for out in outputs]
+        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+    def forward(self, *args, is_training=False, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+    """
+    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+    """
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1536,
+        depth=40,
+        num_heads=24,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model

MoGe/moge/model/dinov2/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.

MoGe/moge/model/dinov2/utils/cluster.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from enum import Enum
+import os
+from pathlib import Path
+from typing import Any, Dict, Optional
+class ClusterType(Enum):
+    AWS = "aws"
+    FAIR = "fair"
+    RSC = "rsc"
+def _guess_cluster_type() -> ClusterType:
+    uname = os.uname()
+    if uname.sysname == "Linux":
+        if uname.release.endswith("-aws"):
+            # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
+            return ClusterType.AWS
+        elif uname.nodename.startswith("rsc"):
+            # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
+            return ClusterType.RSC
+    return ClusterType.FAIR
+def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
+    if cluster_type is None:
+        return _guess_cluster_type()
+    return cluster_type
+def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type is None:
+        return None
+    CHECKPOINT_DIRNAMES = {
+        ClusterType.AWS: "checkpoints",
+        ClusterType.FAIR: "checkpoint",
+        ClusterType.RSC: "checkpoint/dino",
+    }
+    return Path("/") / CHECKPOINT_DIRNAMES[cluster_type]
+def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
+    checkpoint_path = get_checkpoint_path(cluster_type)
+    if checkpoint_path is None:
+        return None
+    username = os.environ.get("USER")
+    assert username is not None
+    return checkpoint_path / username
+def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type is None:
+        return None
+    SLURM_PARTITIONS = {
+        ClusterType.AWS: "learnlab",
+        ClusterType.FAIR: "learnlab",
+        ClusterType.RSC: "learn",
+    }
+    return SLURM_PARTITIONS[cluster_type]
+def get_slurm_executor_parameters(
+    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
+) -> Dict[str, Any]:
+    # create default parameters
+    params = {
+        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
+        "gpus_per_node": num_gpus_per_node,
+        "tasks_per_node": num_gpus_per_node,  # one task per GPU
+        "cpus_per_task": 10,
+        "nodes": nodes,
+        "slurm_partition": get_slurm_partition(cluster_type),
+    }
+    # apply cluster-specific adjustments
+    cluster_type = get_cluster_type(cluster_type)
+    if cluster_type == ClusterType.AWS:
+        params["cpus_per_task"] = 12
+        del params["mem_gb"]
+    elif cluster_type == ClusterType.RSC:
+        params["cpus_per_task"] = 12
+    # set additional parameters / apply overrides
+    params.update(kwargs)
+    return params

MoGe/moge/model/dinov2/utils/config.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import math
+import logging
+import os
+from omegaconf import OmegaConf
+import dinov2.distributed as distributed
+from dinov2.logging import setup_logging
+from dinov2.utils import utils
+from dinov2.configs import dinov2_default_config
+logger = logging.getLogger("dinov2")
+def apply_scaling_rules_to_cfg(cfg):  # to fix
+    if cfg.optim.scaling_rule == "sqrt_wrt_1024":
+        base_lr = cfg.optim.base_lr
+        cfg.optim.lr = base_lr
+        cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0)
+        logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
+    else:
+        raise NotImplementedError
+    return cfg
+def write_config(cfg, output_dir, name="config.yaml"):
+    logger.info(OmegaConf.to_yaml(cfg))
+    saved_cfg_path = os.path.join(output_dir, name)
+    with open(saved_cfg_path, "w") as f:
+        OmegaConf.save(config=cfg, f=f)
+    return saved_cfg_path
+def get_cfg_from_args(args):
+    args.output_dir = os.path.abspath(args.output_dir)
+    args.opts += [f"train.output_dir={args.output_dir}"]
+    default_cfg = OmegaConf.create(dinov2_default_config)
+    cfg = OmegaConf.load(args.config_file)
+    cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts))
+    return cfg
+def default_setup(args):
+    distributed.enable(overwrite=True)
+    seed = getattr(args, "seed", 0)
+    rank = distributed.get_global_rank()
+    global logger
+    setup_logging(output=args.output_dir, level=logging.INFO)
+    logger = logging.getLogger("dinov2")
+    utils.fix_random_seeds(seed + rank)
+    logger.info("git:\n  {}\n".format(utils.get_sha()))
+    logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg_from_args(args)
+    os.makedirs(args.output_dir, exist_ok=True)
+    default_setup(args)
+    apply_scaling_rules_to_cfg(cfg)
+    write_config(cfg, args.output_dir)
+    return cfg

MoGe/moge/model/dinov2/utils/dtype.py ADDED Viewed

	@@ -0,0 +1,37 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from typing import Dict, Union
+import numpy as np
+import torch
+TypeSpec = Union[str, np.dtype, torch.dtype]
+_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
+    np.dtype("bool"): torch.bool,
+    np.dtype("uint8"): torch.uint8,
+    np.dtype("int8"): torch.int8,
+    np.dtype("int16"): torch.int16,
+    np.dtype("int32"): torch.int32,
+    np.dtype("int64"): torch.int64,
+    np.dtype("float16"): torch.float16,
+    np.dtype("float32"): torch.float32,
+    np.dtype("float64"): torch.float64,
+    np.dtype("complex64"): torch.complex64,
+    np.dtype("complex128"): torch.complex128,
+}
+def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
+    if isinstance(dtype, torch.dtype):
+        return dtype
+    if isinstance(dtype, str):
+        dtype = np.dtype(dtype)
+    assert isinstance(dtype, np.dtype), f"Expected an instance of nunpy dtype, got {type(dtype)}"
+    return _NUMPY_TO_TORCH_DTYPE[dtype]

MoGe/moge/model/dinov2/utils/param_groups.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from collections import defaultdict
+import logging
+logger = logging.getLogger("dinov2")
+def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
+    """
+    Calculate lr decay rate for different ViT blocks.
+    Args:
+        name (string): parameter name.
+        lr_decay_rate (float): base lr decay rate.
+        num_layers (int): number of ViT blocks.
+    Returns:
+        lr decay rate for the given parameter.
+    """
+    layer_id = num_layers + 1
+    if name.startswith("backbone") or force_is_backbone:
+        if (
+            ".pos_embed" in name
+            or ".patch_embed" in name
+            or ".mask_token" in name
+            or ".cls_token" in name
+            or ".register_tokens" in name
+        ):
+            layer_id = 0
+        elif force_is_backbone and (
+            "pos_embed" in name
+            or "patch_embed" in name
+            or "mask_token" in name
+            or "cls_token" in name
+            or "register_tokens" in name
+        ):
+            layer_id = 0
+        elif ".blocks." in name and ".residual." not in name:
+            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
+        elif chunked_blocks and "blocks." in name and "residual." not in name:
+            layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1
+        elif "blocks." in name and "residual." not in name:
+            layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1
+    return lr_decay_rate ** (num_layers + 1 - layer_id)
+def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
+    chunked_blocks = False
+    if hasattr(model, "n_blocks"):
+        logger.info("chunked fsdp")
+        n_blocks = model.n_blocks
+        chunked_blocks = model.chunked_blocks
+    elif hasattr(model, "blocks"):
+        logger.info("first code branch")
+        n_blocks = len(model.blocks)
+    elif hasattr(model, "backbone"):
+        logger.info("second code branch")
+        n_blocks = len(model.backbone.blocks)
+    else:
+        logger.info("else code branch")
+        n_blocks = 0
+    all_param_groups = []
+    for name, param in model.named_parameters():
+        name = name.replace("_fsdp_wrapped_module.", "")
+        if not param.requires_grad:
+            continue
+        decay_rate = get_vit_lr_decay_rate(
+            name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
+        )
+        d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name}
+        if "last_layer" in name:
+            d.update({"is_last_layer": True})
+        if name.endswith(".bias") or "norm" in name or "gamma" in name:
+            d.update({"wd_multiplier": 0.0})
+        if "patch_embed" in name:
+            d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult})
+        all_param_groups.append(d)
+        logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""")
+    return all_param_groups
+def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
+    fused_params_groups = defaultdict(lambda: {"params": []})
+    for d in all_params_groups:
+        identifier = ""
+        for k in keys:
+            identifier += k + str(d[k]) + "_"
+        for k in keys:
+            fused_params_groups[identifier][k] = d[k]
+        fused_params_groups[identifier]["params"].append(d["params"])
+    return fused_params_groups.values()

MoGe/moge/model/dinov2/utils/utils.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import logging
+import os
+import random
+import subprocess
+from urllib.parse import urlparse
+import numpy as np
+import torch
+from torch import nn
+logger = logging.getLogger("dinov2")
+def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
+    if urlparse(pretrained_weights).scheme:  # If it looks like an URL
+        state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
+    else:
+        state_dict = torch.load(pretrained_weights, map_location="cpu")
+    if checkpoint_key is not None and checkpoint_key in state_dict:
+        logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
+        state_dict = state_dict[checkpoint_key]
+    # remove `module.` prefix
+    state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
+    # remove `backbone.` prefix induced by multicrop wrapper
+    state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
+    msg = model.load_state_dict(state_dict, strict=False)
+    logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
+def fix_random_seeds(seed=31):
+    """
+    Fix random seeds.
+    """
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+def get_sha():
+    cwd = os.path.dirname(os.path.abspath(__file__))
+    def _run(command):
+        return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
+    sha = "N/A"
+    diff = "clean"
+    branch = "N/A"
+    try:
+        sha = _run(["git", "rev-parse", "HEAD"])
+        subprocess.check_output(["git", "diff"], cwd=cwd)
+        diff = _run(["git", "diff-index", "HEAD"])
+        diff = "has uncommitted changes" if diff else "clean"
+        branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
+    except Exception:
+        pass
+    message = f"sha: {sha}, status: {diff}, branch: {branch}"
+    return message
+class CosineScheduler(object):
+    def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
+        super().__init__()
+        self.final_value = final_value
+        self.total_iters = total_iters
+        freeze_schedule = np.zeros((freeze_iters))
+        warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
+        iters = np.arange(total_iters - warmup_iters - freeze_iters)
+        schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))
+        self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule))
+        assert len(self.schedule) == self.total_iters
+    def __getitem__(self, it):
+        if it >= self.total_iters:
+            return self.final_value
+        else:
+            return self.schedule[it]
+def has_batchnorms(model):
+    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
+    for name, module in model.named_modules():
+        if isinstance(module, bn_types):
+            return True
+    return False