Jay Karhade commited on
Commit
0343ccd
·
1 Parent(s): 1e88e3f

Initial Space release

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +2 -0
  3. MoGe/.gitignore +423 -0
  4. MoGe/CHANGELOG.md +28 -0
  5. MoGe/CODE_OF_CONDUCT.md +9 -0
  6. MoGe/LICENSE +224 -0
  7. MoGe/README.md +220 -0
  8. MoGe/SECURITY.md +41 -0
  9. MoGe/SUPPORT.md +25 -0
  10. MoGe/baselines/da_v2.py +88 -0
  11. MoGe/baselines/da_v2_metric.py +99 -0
  12. MoGe/baselines/metric3d_v2.py +117 -0
  13. MoGe/baselines/moge.py +83 -0
  14. MoGe/configs/eval/all_benchmarks.json +78 -0
  15. MoGe/configs/eval/benchmarks/ddad.json +9 -0
  16. MoGe/configs/eval/benchmarks/diode.json +9 -0
  17. MoGe/configs/eval/benchmarks/eth3d.json +10 -0
  18. MoGe/configs/eval/benchmarks/gso.json +8 -0
  19. MoGe/configs/eval/benchmarks/hammer.json +10 -0
  20. MoGe/configs/eval/benchmarks/ibims-1.json +10 -0
  21. MoGe/configs/eval/benchmarks/kitti.json +9 -0
  22. MoGe/configs/eval/benchmarks/nyu.json +8 -0
  23. MoGe/configs/eval/benchmarks/sintel.json +10 -0
  24. MoGe/configs/eval/benchmarks/spring.json +9 -0
  25. MoGe/configs/train/v1.json +77 -0
  26. MoGe/docs/eval.md +77 -0
  27. MoGe/docs/train.md +181 -0
  28. MoGe/moge/__init__.py +0 -0
  29. MoGe/moge/model/__init__.py +17 -0
  30. MoGe/moge/model/dinov2/__init__.py +6 -0
  31. MoGe/moge/model/dinov2/hub/__init__.py +4 -0
  32. MoGe/moge/model/dinov2/hub/backbones.py +156 -0
  33. MoGe/moge/model/dinov2/hub/utils.py +39 -0
  34. MoGe/moge/model/dinov2/layers/__init__.py +11 -0
  35. MoGe/moge/model/dinov2/layers/attention.py +89 -0
  36. MoGe/moge/model/dinov2/layers/block.py +259 -0
  37. MoGe/moge/model/dinov2/layers/dino_head.py +58 -0
  38. MoGe/moge/model/dinov2/layers/drop_path.py +34 -0
  39. MoGe/moge/model/dinov2/layers/layer_scale.py +27 -0
  40. MoGe/moge/model/dinov2/layers/mlp.py +40 -0
  41. MoGe/moge/model/dinov2/layers/patch_embed.py +88 -0
  42. MoGe/moge/model/dinov2/layers/swiglu_ffn.py +72 -0
  43. MoGe/moge/model/dinov2/models/__init__.py +43 -0
  44. MoGe/moge/model/dinov2/models/vision_transformer.py +396 -0
  45. MoGe/moge/model/dinov2/utils/__init__.py +4 -0
  46. MoGe/moge/model/dinov2/utils/cluster.py +95 -0
  47. MoGe/moge/model/dinov2/utils/config.py +72 -0
  48. MoGe/moge/model/dinov2/utils/dtype.py +37 -0
  49. MoGe/moge/model/dinov2/utils/param_groups.py +103 -0
  50. MoGe/moge/model/dinov2/utils/utils.py +95 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ checkpoints/
2
+ .gradio/
MoGe/.gitignore ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Ll]og/
33
+ [Ll]ogs/
34
+
35
+ # Visual Studio 2015/2017 cache/options directory
36
+ .vs/
37
+ # Uncomment if you have tasks that create the project's static files in wwwroot
38
+ #wwwroot/
39
+
40
+ # Visual Studio 2017 auto generated files
41
+ Generated\ Files/
42
+
43
+ # MSTest test Results
44
+ [Tt]est[Rr]esult*/
45
+ [Bb]uild[Ll]og.*
46
+
47
+ # NUnit
48
+ *.VisualState.xml
49
+ TestResult.xml
50
+ nunit-*.xml
51
+
52
+ # Build Results of an ATL Project
53
+ [Dd]ebugPS/
54
+ [Rr]eleasePS/
55
+ dlldata.c
56
+
57
+ # Benchmark Results
58
+ BenchmarkDotNet.Artifacts/
59
+
60
+ # .NET Core
61
+ project.lock.json
62
+ project.fragment.lock.json
63
+ artifacts/
64
+
65
+ # ASP.NET Scaffolding
66
+ ScaffoldingReadMe.txt
67
+
68
+ # StyleCop
69
+ StyleCopReport.xml
70
+
71
+ # Files built by Visual Studio
72
+ *_i.c
73
+ *_p.c
74
+ *_h.h
75
+ *.ilk
76
+ *.meta
77
+ *.obj
78
+ *.iobj
79
+ *.pch
80
+ *.pdb
81
+ *.ipdb
82
+ *.pgc
83
+ *.pgd
84
+ *.rsp
85
+ *.sbr
86
+ *.tlb
87
+ *.tli
88
+ *.tlh
89
+ *.tmp
90
+ *.tmp_proj
91
+ *_wpftmp.csproj
92
+ *.log
93
+ *.tlog
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
298
+ *.vbp
299
+
300
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
301
+ *.dsw
302
+ *.dsp
303
+
304
+ # Visual Studio 6 technical files
305
+ *.ncb
306
+ *.aps
307
+
308
+ # Visual Studio LightSwitch build output
309
+ **/*.HTMLClient/GeneratedArtifacts
310
+ **/*.DesktopClient/GeneratedArtifacts
311
+ **/*.DesktopClient/ModelManifest.xml
312
+ **/*.Server/GeneratedArtifacts
313
+ **/*.Server/ModelManifest.xml
314
+ _Pvt_Extensions
315
+
316
+ # Paket dependency manager
317
+ .paket/paket.exe
318
+ paket-files/
319
+
320
+ # FAKE - F# Make
321
+ .fake/
322
+
323
+ # CodeRush personal settings
324
+ .cr/personal
325
+
326
+ # Python Tools for Visual Studio (PTVS)
327
+ __pycache__/
328
+ *.pyc
329
+
330
+ # Cake - Uncomment if you are using it
331
+ # tools/**
332
+ # !tools/packages.config
333
+
334
+ # Tabs Studio
335
+ *.tss
336
+
337
+ # Telerik's JustMock configuration file
338
+ *.jmconfig
339
+
340
+ # BizTalk build output
341
+ *.btp.cs
342
+ *.btm.cs
343
+ *.odx.cs
344
+ *.xsd.cs
345
+
346
+ # OpenCover UI analysis results
347
+ OpenCover/
348
+
349
+ # Azure Stream Analytics local run output
350
+ ASALocalRun/
351
+
352
+ # MSBuild Binary and Structured Log
353
+ *.binlog
354
+
355
+ # NVidia Nsight GPU debugger configuration file
356
+ *.nvuser
357
+
358
+ # MFractors (Xamarin productivity tool) working folder
359
+ .mfractor/
360
+
361
+ # Local History for Visual Studio
362
+ .localhistory/
363
+
364
+ # Visual Studio History (VSHistory) files
365
+ .vshistory/
366
+
367
+ # BeatPulse healthcheck temp database
368
+ healthchecksdb
369
+
370
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
371
+ MigrationBackup/
372
+
373
+ # Ionide (cross platform F# VS Code tools) working folder
374
+ .ionide/
375
+
376
+ # Fody - auto-generated XML schema
377
+ FodyWeavers.xsd
378
+
379
+ # VS Code files for those working on multiple tools
380
+ .vscode/*
381
+ !.vscode/settings.json
382
+ !.vscode/tasks.json
383
+ !.vscode/launch.json
384
+ !.vscode/extensions.json
385
+ *.code-workspace
386
+
387
+ # Local History for Visual Studio Code
388
+ .history/
389
+
390
+ # Windows Installer files from build outputs
391
+ *.cab
392
+ *.msi
393
+ *.msix
394
+ *.msm
395
+ *.msp
396
+
397
+ # JetBrains Rider
398
+ *.sln.iml
399
+
400
+ # Python
401
+ *.egg-info/
402
+ /build
403
+
404
+ # MoGe
405
+ /data*
406
+ /download
407
+ /extract
408
+ /debug
409
+ /workspace
410
+ /mlruns
411
+ /infer_output
412
+ /video_output
413
+ /eval_output
414
+ /.blobcache
415
+ /test_images
416
+ /test_videos
417
+ /vis
418
+ /videos
419
+ /blobmnt
420
+ /eval_dump
421
+ /pretrained
422
+ /.gradio
423
+ /tmp
MoGe/CHANGELOG.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 2024-11-28
2
+ ### Added
3
+ - Supported user-provided camera FOV. See [scripts/infer.py](scripts/infer.py) --fov_x.
4
+ - Related issues: [#25](https://github.com/microsoft/MoGe/issues/25) and [#24](https://github.com/microsoft/MoGe/issues/24).
5
+ - Added inference scripts for panorama images. See [scripts/infer_panorama.py](scripts/infer_panorama.py).
6
+ - Related issue: [#19](https://github.com/microsoft/MoGe/issues/19).
7
+
8
+ ### Fixed
9
+ - Suppressed unnecessary numpy runtime warnings.
10
+ - Specified recommended versions of requirements.
11
+ - Related issue: [#21](https://github.com/microsoft/MoGe/issues/21).
12
+
13
+ ### Changed
14
+ - Moved `app.py` and `infer.py` to [scripts/](scripts/)
15
+ - Improved edge removal.
16
+
17
+ ## 2025-03-18
18
+ ### Added
19
+ - Training and evaluation code. See [docs/train.md](docs/train.md) and [docs/eval.md](docs/eval.md).
20
+ - Supported installation via pip. Thanks to @fabiencastan and @jgoueslard
21
+ for commits in the [#47](https://github.com/microsoft/MoGe/pull/47)
22
+ - Supported command-line usage when installed.
23
+
24
+ ### Changed
25
+ - Moved `scripts/` into `moge/` for package installation and command-line usage.
26
+ - Renamed `moge.model.moge_model` to `moge.model.v1` for version management.
27
+ Now you can import the model class through `from moge.model.v1 import MoGeModel` or `from moge.model import import_model_class_by_version; MoGeModel = import_model_class_by_version('v1')`.
28
+ - Exposed `num_tokens` parameter in MoGe model.
MoGe/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
MoGe/LICENSE ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
22
+
23
+
24
+ Apache License
25
+ Version 2.0, January 2004
26
+ http://www.apache.org/licenses/
27
+
28
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
29
+
30
+ 1. Definitions.
31
+
32
+ "License" shall mean the terms and conditions for use, reproduction,
33
+ and distribution as defined by Sections 1 through 9 of this document.
34
+
35
+ "Licensor" shall mean the copyright owner or entity authorized by
36
+ the copyright owner that is granting the License.
37
+
38
+ "Legal Entity" shall mean the union of the acting entity and all
39
+ other entities that control, are controlled by, or are under common
40
+ control with that entity. For the purposes of this definition,
41
+ "control" means (i) the power, direct or indirect, to cause the
42
+ direction or management of such entity, whether by contract or
43
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
44
+ outstanding shares, or (iii) beneficial ownership of such entity.
45
+
46
+ "You" (or "Your") shall mean an individual or Legal Entity
47
+ exercising permissions granted by this License.
48
+
49
+ "Source" form shall mean the preferred form for making modifications,
50
+ including but not limited to software source code, documentation
51
+ source, and configuration files.
52
+
53
+ "Object" form shall mean any form resulting from mechanical
54
+ transformation or translation of a Source form, including but
55
+ not limited to compiled object code, generated documentation,
56
+ and conversions to other media types.
57
+
58
+ "Work" shall mean the work of authorship, whether in Source or
59
+ Object form, made available under the License, as indicated by a
60
+ copyright notice that is included in or attached to the work
61
+ (an example is provided in the Appendix below).
62
+
63
+ "Derivative Works" shall mean any work, whether in Source or Object
64
+ form, that is based on (or derived from) the Work and for which the
65
+ editorial revisions, annotations, elaborations, or other modifications
66
+ represent, as a whole, an original work of authorship. For the purposes
67
+ of this License, Derivative Works shall not include works that remain
68
+ separable from, or merely link (or bind by name) to the interfaces of,
69
+ the Work and Derivative Works thereof.
70
+
71
+ "Contribution" shall mean any work of authorship, including
72
+ the original version of the Work and any modifications or additions
73
+ to that Work or Derivative Works thereof, that is intentionally
74
+ submitted to Licensor for inclusion in the Work by the copyright owner
75
+ or by an individual or Legal Entity authorized to submit on behalf of
76
+ the copyright owner. For the purposes of this definition, "submitted"
77
+ means any form of electronic, verbal, or written communication sent
78
+ to the Licensor or its representatives, including but not limited to
79
+ communication on electronic mailing lists, source code control systems,
80
+ and issue tracking systems that are managed by, or on behalf of, the
81
+ Licensor for the purpose of discussing and improving the Work, but
82
+ excluding communication that is conspicuously marked or otherwise
83
+ designated in writing by the copyright owner as "Not a Contribution."
84
+
85
+ "Contributor" shall mean Licensor and any individual or Legal Entity
86
+ on behalf of whom a Contribution has been received by Licensor and
87
+ subsequently incorporated within the Work.
88
+
89
+ 2. Grant of Copyright License. Subject to the terms and conditions of
90
+ this License, each Contributor hereby grants to You a perpetual,
91
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
92
+ copyright license to reproduce, prepare Derivative Works of,
93
+ publicly display, publicly perform, sublicense, and distribute the
94
+ Work and such Derivative Works in Source or Object form.
95
+
96
+ 3. Grant of Patent License. Subject to the terms and conditions of
97
+ this License, each Contributor hereby grants to You a perpetual,
98
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
99
+ (except as stated in this section) patent license to make, have made,
100
+ use, offer to sell, sell, import, and otherwise transfer the Work,
101
+ where such license applies only to those patent claims licensable
102
+ by such Contributor that are necessarily infringed by their
103
+ Contribution(s) alone or by combination of their Contribution(s)
104
+ with the Work to which such Contribution(s) was submitted. If You
105
+ institute patent litigation against any entity (including a
106
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
107
+ or a Contribution incorporated within the Work constitutes direct
108
+ or contributory patent infringement, then any patent licenses
109
+ granted to You under this License for that Work shall terminate
110
+ as of the date such litigation is filed.
111
+
112
+ 4. Redistribution. You may reproduce and distribute copies of the
113
+ Work or Derivative Works thereof in any medium, with or without
114
+ modifications, and in Source or Object form, provided that You
115
+ meet the following conditions:
116
+
117
+ (a) You must give any other recipients of the Work or
118
+ Derivative Works a copy of this License; and
119
+
120
+ (b) You must cause any modified files to carry prominent notices
121
+ stating that You changed the files; and
122
+
123
+ (c) You must retain, in the Source form of any Derivative Works
124
+ that You distribute, all copyright, patent, trademark, and
125
+ attribution notices from the Source form of the Work,
126
+ excluding those notices that do not pertain to any part of
127
+ the Derivative Works; and
128
+
129
+ (d) If the Work includes a "NOTICE" text file as part of its
130
+ distribution, then any Derivative Works that You distribute must
131
+ include a readable copy of the attribution notices contained
132
+ within such NOTICE file, excluding those notices that do not
133
+ pertain to any part of the Derivative Works, in at least one
134
+ of the following places: within a NOTICE text file distributed
135
+ as part of the Derivative Works; within the Source form or
136
+ documentation, if provided along with the Derivative Works; or,
137
+ within a display generated by the Derivative Works, if and
138
+ wherever such third-party notices normally appear. The contents
139
+ of the NOTICE file are for informational purposes only and
140
+ do not modify the License. You may add Your own attribution
141
+ notices within Derivative Works that You distribute, alongside
142
+ or as an addendum to the NOTICE text from the Work, provided
143
+ that such additional attribution notices cannot be construed
144
+ as modifying the License.
145
+
146
+ You may add Your own copyright statement to Your modifications and
147
+ may provide additional or different license terms and conditions
148
+ for use, reproduction, or distribution of Your modifications, or
149
+ for any such Derivative Works as a whole, provided Your use,
150
+ reproduction, and distribution of the Work otherwise complies with
151
+ the conditions stated in this License.
152
+
153
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
154
+ any Contribution intentionally submitted for inclusion in the Work
155
+ by You to the Licensor shall be under the terms and conditions of
156
+ this License, without any additional terms or conditions.
157
+ Notwithstanding the above, nothing herein shall supersede or modify
158
+ the terms of any separate license agreement you may have executed
159
+ with Licensor regarding such Contributions.
160
+
161
+ 6. Trademarks. This License does not grant permission to use the trade
162
+ names, trademarks, service marks, or product names of the Licensor,
163
+ except as required for reasonable and customary use in describing the
164
+ origin of the Work and reproducing the content of the NOTICE file.
165
+
166
+ 7. Disclaimer of Warranty. Unless required by applicable law or
167
+ agreed to in writing, Licensor provides the Work (and each
168
+ Contributor provides its Contributions) on an "AS IS" BASIS,
169
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
170
+ implied, including, without limitation, any warranties or conditions
171
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
172
+ PARTICULAR PURPOSE. You are solely responsible for determining the
173
+ appropriateness of using or redistributing the Work and assume any
174
+ risks associated with Your exercise of permissions under this License.
175
+
176
+ 8. Limitation of Liability. In no event and under no legal theory,
177
+ whether in tort (including negligence), contract, or otherwise,
178
+ unless required by applicable law (such as deliberate and grossly
179
+ negligent acts) or agreed to in writing, shall any Contributor be
180
+ liable to You for damages, including any direct, indirect, special,
181
+ incidental, or consequential damages of any character arising as a
182
+ result of this License or out of the use or inability to use the
183
+ Work (including but not limited to damages for loss of goodwill,
184
+ work stoppage, computer failure or malfunction, or any and all
185
+ other commercial damages or losses), even if such Contributor
186
+ has been advised of the possibility of such damages.
187
+
188
+ 9. Accepting Warranty or Additional Liability. While redistributing
189
+ the Work or Derivative Works thereof, You may choose to offer,
190
+ and charge a fee for, acceptance of support, warranty, indemnity,
191
+ or other liability obligations and/or rights consistent with this
192
+ License. However, in accepting such obligations, You may act only
193
+ on Your own behalf and on Your sole responsibility, not on behalf
194
+ of any other Contributor, and only if You agree to indemnify,
195
+ defend, and hold each Contributor harmless for any liability
196
+ incurred by, or claims asserted against, such Contributor by reason
197
+ of your accepting any such warranty or additional liability.
198
+
199
+ END OF TERMS AND CONDITIONS
200
+
201
+ APPENDIX: How to apply the Apache License to your work.
202
+
203
+ To apply the Apache License to your work, attach the following
204
+ boilerplate notice, with the fields enclosed by brackets "[]"
205
+ replaced with your own identifying information. (Don't include
206
+ the brackets!) The text should be enclosed in the appropriate
207
+ comment syntax for the file format. We also recommend that a
208
+ file or class name and description of purpose be included on the
209
+ same "printed page" as the copyright notice for easier
210
+ identification within third-party archives.
211
+
212
+ Copyright [yyyy] [name of copyright owner]
213
+
214
+ Licensed under the Apache License, Version 2.0 (the "License");
215
+ you may not use this file except in compliance with the License.
216
+ You may obtain a copy of the License at
217
+
218
+ http://www.apache.org/licenses/LICENSE-2.0
219
+
220
+ Unless required by applicable law or agreed to in writing, software
221
+ distributed under the License is distributed on an "AS IS" BASIS,
222
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
223
+ See the License for the specific language governing permissions and
224
+ limitations under the License.
MoGe/README.md ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision
4
+
5
+ <a href="https://arxiv.org/abs/2410.19115"><img src='https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white' alt='arXiv'></a>
6
+ <a href='https://wangrc.site/MoGePage/'><img src='https://img.shields.io/badge/Project_Page-Website-green?logo=googlechrome&logoColor=white' alt='Project Page'></a>
7
+ <a href='https://huggingface.co/spaces/Ruicheng/MoGe'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Live_Demo-blue'></a>
8
+
9
+ </div>
10
+
11
+ <img src="./assets/overview_simplified.png" width="100%" alt="Method overview" align="center">
12
+
13
+ MoGe is a powerful model for recovering 3D geometry from monocular open-domain images. The model consists of a ViT encoder and a convolutional decoder. It directly predicts an affine-invariant point map as well as a mask that excludes regions with undefined geometry (e.g., sky), from which the camera shift, camera focal length and depth map can be further derived.
14
+
15
+ ***Check our [website](https://wangrc.site/MoGePage) for videos and interactive results!***
16
+
17
+ ## Features
18
+
19
+ * **Accurate 3D geometry estimation**: Estimate point maps from single images with high precision. Capable of capturing depth variations up to 1000×, ensuring a comprehensive scene representation.
20
+ * **Optional ground-truth FOV input**: Enhance model accuracy further by providing the true field of view.
21
+ * **Flexible resolution support**: Works seamlessly with various resolutions and aspect ratios, from 2:1 to 1:2.
22
+ * **Optimized for speed**: Achieves <0.1s latency per image on an A100 / RTX 3090 GPU with fp16, and 0.2s with fp32.
23
+
24
+ ## TODO List
25
+
26
+ - [x] Release inference code & ViT-Large model.
27
+ - [x] Release evaluation and training code.
28
+ - [ ] Release ViT-Base and ViT-Giant models.
29
+
30
+ 🌟*Updated on 2025/03/18* [CHANGELOG](CHANGELOG.md)
31
+ - **Training and evaluation code released!**
32
+ - Installation via pip and CLI usage supported.
33
+
34
+ ## Installation
35
+
36
+ ### Install via pip
37
+
38
+ ```bash
39
+ pip install git+https://github.com/microsoft/MoGe.git
40
+ ```
41
+
42
+ ### Or clone this repository
43
+
44
+ ```bash
45
+ git clone https://github.com/microsoft/MoGe.git
46
+ cd MoGe
47
+ ```
48
+
49
+ and install the requirements
50
+
51
+ ```bash
52
+ pip install -r requirements.txt
53
+ ```
54
+
55
+ MoGe should be compatible with most requirements versions. Please check the `requirements.txt` for more details if you have concerns.
56
+
57
+ ## Usage
58
+
59
+ ### Pretrained model
60
+
61
+ The ViT-Large model has been uploaded to Hugging Face hub at [Ruicheng/moge-vitl](https://huggingface.co/Ruicheng/moge-vitl).
62
+ You may load the model via `MoGeModel.from_pretrained("Ruicheng/moge-vitl")` without manually downloading.
63
+
64
+ If loading the model from a local file is preferred, you may manually download the model from the huggingface hub and load it via `MoGeModel.from_pretrained("PATH_TO_LOCAL_MODEL.pt")`.
65
+
66
+ ### Minimal code example
67
+
68
+ Here is a minimal example for loading the model and inferring on a single image.
69
+
70
+ ```python
71
+ import cv2
72
+ import torch
73
+ from moge.model.v1 import MoGeModel
74
+
75
+ device = torch.device("cuda")
76
+
77
+ # Load the model from huggingface hub (or load from local).
78
+ model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(device)
79
+
80
+ # Read the input image and convert to tensor (3, H, W) and normalize to [0, 1]
81
+ input_image = cv2.cvtColor(cv2.imread("PATH_TO_IMAGE.jpg"), cv2.COLOR_BGR2RGB)
82
+ input_image = torch.tensor(input_image / 255, dtype=torch.float32, device=device).permute(2, 0, 1)
83
+
84
+ # Infer
85
+ output = model.infer(input_image)
86
+ # `output` has keys "points", "depth", "mask" and "intrinsics",
87
+ # The maps are in the same size as the input image.
88
+ # {
89
+ # "points": (H, W, 3), # scale-invariant point map in OpenCV camera coordinate system (x right, y down, z forward)
90
+ # "depth": (H, W), # scale-invariant depth map
91
+ # "mask": (H, W), # a binary mask for valid pixels.
92
+ # "intrinsics": (3, 3), # normalized camera intrinsics
93
+ # }
94
+ # For more usage details, see the `MoGeModel.infer` docstring.
95
+ ```
96
+
97
+ ### Gradio demo | `moge app`
98
+
99
+ The demo is also available at our [Hugging Face space](https://huggingface.co/spaces/Ruicheng/MoGe).
100
+
101
+ ```bash
102
+ # Using the command line tool
103
+ moge app
104
+
105
+ # In this repo
106
+ python moge/scripts/app.py # --share for Gradio public sharing
107
+ ```
108
+
109
+ See also [`moge/scripts/app.py`](moge/scripts/app.py)
110
+
111
+
112
+ ### Inference | `moge infer`
113
+
114
+ Run the script `moge/scripts/infer.py` via the following command:
115
+
116
+ ```bash
117
+ # Save the output [maps], [glb] and [ply] files
118
+ moge infer -i IMAGES_FOLDER_OR_IMAGE_PATH --o OUTPUT_FOLDER --maps --glb --ply
119
+
120
+ # Show the result in a window (requires pyglet < 2.0, e.g. pip install pyglet==1.5.29)
121
+ moge infer -i IMAGES_FOLDER_OR_IMAGE_PATH --o OUTPUT_FOLDER --show
122
+ ```
123
+
124
+ For detailed options, run `moge infer --help`:
125
+
126
+ ```
127
+ Usage: moge infer [OPTIONS]
128
+
129
+ Inference script for the MoGe model.
130
+
131
+ Options:
132
+ -i, --input PATH Input image or folder path. "jpg" and "png" are
133
+ supported.
134
+ --fov_x FLOAT If camera parameters are known, set the
135
+ horizontal field of view in degrees. Otherwise,
136
+ MoGe will estimate it.
137
+ -o, --output PATH Output folder path
138
+ --pretrained TEXT Pretrained model name or path. Defaults to
139
+ "Ruicheng/moge-vitl"
140
+ --device TEXT Device name (e.g. "cuda", "cuda:0", "cpu").
141
+ Defaults to "cuda"
142
+ --fp16 Use fp16 precision for 2x faster inference.
143
+ --resize INTEGER Resize the image(s) & output maps to a specific
144
+ size. Defaults to None (no resizing).
145
+ --resolution_level INTEGER An integer [0-9] for the resolution level for
146
+ inference. Higher value means more tokens and
147
+ the finer details will be captured, but
148
+ inference can be slower. Defaults to 9. Note
149
+ that it is irrelevant to the output size, which
150
+ is always the same as the input size.
151
+ `resolution_level` actually controls
152
+ `num_tokens`. See `num_tokens` for more details.
153
+ --num_tokens INTEGER number of tokens used for inference. A integer
154
+ in the (suggested) range of `[1200, 2500]`.
155
+ `resolution_level` will be ignored if
156
+ `num_tokens` is provided. Default: None
157
+ --threshold FLOAT Threshold for removing edges. Defaults to 0.03.
158
+ Smaller value removes more edges. "inf" means no
159
+ thresholding.
160
+ --maps Whether to save the output maps and fov(image,
161
+ depth, mask, points, fov).
162
+ --glb Whether to save the output as a.glb file. The
163
+ color will be saved as a texture.
164
+ --ply Whether to save the output as a.ply file. The
165
+ color will be saved as vertex colors.
166
+ --show Whether show the output in a window. Note that
167
+ this requires pyglet<2 installed as required by
168
+ trimesh.
169
+ --help Show this message and exit.
170
+ ```
171
+
172
+ See also [`moge/scripts/infer.py`](moge/scripts/infer.py)
173
+
174
+ ### 360° panorama images | `moge infer_panorama`
175
+
176
+ > *NOTE: This is an experimental extension of MoGe.*
177
+
178
+ The script will split the 360-degree panorama image into multiple perspective views and infer on each view separately.
179
+ The output maps will be combined to produce a panorama depth map and point map.
180
+
181
+ Note that the panorama image must have spherical parameterization (e.g., environment maps or equirectangular images). Other formats must be converted to spherical format before using this script. Run `moge infer_panorama --help` for detailed options.
182
+
183
+
184
+ <div align="center">
185
+ <img src="./assets/panorama_pipeline.png" width="80%">
186
+
187
+ The photo is from [this URL](https://commons.wikimedia.org/wiki/Category:360%C2%B0_panoramas_with_equirectangular_projection#/media/File:Braunschweig_Sankt-%C3%84gidien_Panorama_02.jpg)
188
+ </div>
189
+
190
+ See also [`moge/scripts/infer_panorama.py`](moge/scripts/infer_panorama.py)
191
+
192
+ ## Training & Finetuning
193
+
194
+ See [docs/train.md](docs/train.md)
195
+
196
+ ## Evaluation
197
+
198
+ See [docs/eval.md](docs/eval.md)
199
+
200
+ ## License
201
+
202
+ MoGe code is released under the MIT license, except for DINOv2 code in `moge/model/dinov2` which is released by Meta AI under the Apache 2.0 license.
203
+ See [LICENSE](LICENSE) for more details.
204
+
205
+
206
+ ## Citation
207
+
208
+ If you find our work useful in your research, we gratefully request that you consider citing our paper:
209
+
210
+ ```
211
+ @misc{wang2024moge,
212
+ title={MoGe: Unlocking Accurate Monocular Geometry Estimation for Open-Domain Images with Optimal Training Supervision},
213
+ author={Wang, Ruicheng and Xu, Sicheng and Dai, Cassie and Xiang, Jianfeng and Deng, Yu and Tong, Xin and Yang, Jiaolong},
214
+ year={2024},
215
+ eprint={2410.19115},
216
+ archivePrefix={arXiv},
217
+ primaryClass={cs.CV},
218
+ url={https://arxiv.org/abs/2410.19115},
219
+ }
220
+ ```
MoGe/SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
MoGe/SUPPORT.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TODO: The maintainer of this repo has not yet edited this file
2
+
3
+ **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4
+
5
+ - **No CSS support:** Fill out this template with information about how to file issues and get help.
6
+ - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7
+ - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8
+
9
+ *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10
+
11
+ # Support
12
+
13
+ ## How to file issues and get help
14
+
15
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
17
+ feature request as a new Issue.
18
+
19
+ For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20
+ FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21
+ CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22
+
23
+ ## Microsoft Support Policy
24
+
25
+ Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
MoGe/baselines/da_v2.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reference: https://github.com/DepthAnything/Depth-Anything-V2
2
+ import os
3
+ import sys
4
+ from typing import *
5
+ from pathlib import Path
6
+
7
+ import click
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torchvision.transforms as T
11
+ import torchvision.transforms.functional as TF
12
+
13
+ from moge.test.baseline import MGEBaselineInterface
14
+
15
+
16
+ class Baseline(MGEBaselineInterface):
17
+ def __init__(self, repo_path: str, backbone: str, num_tokens: int, device: Union[torch.device, str]):
18
+ # Create from repo
19
+ repo_path = os.path.abspath(repo_path)
20
+ if repo_path not in sys.path:
21
+ sys.path.append(repo_path)
22
+ if not Path(repo_path).exists():
23
+ raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
24
+ from depth_anything_v2.dpt import DepthAnythingV2
25
+
26
+ device = torch.device(device)
27
+
28
+ # Instantiate model
29
+ model = DepthAnythingV2(encoder=backbone, features=256, out_channels=[256, 512, 1024, 1024])
30
+
31
+ # Load checkpoint
32
+ checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_{backbone}.pth')
33
+ if not os.path.exists(checkpoint_path):
34
+ raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
35
+ checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
36
+ model.load_state_dict(checkpoint)
37
+
38
+ model.to(device).eval()
39
+ self.model = model
40
+ self.num_tokens = num_tokens
41
+ self.device = device
42
+
43
+ @click.command()
44
+ @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
45
+ @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Encoder architecture.')
46
+ @click.option('--num_tokens', type=int, default=None, help='Number of tokens to use for the input image.')
47
+ @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
48
+ @staticmethod
49
+ def load(repo_path: str, backbone, num_tokens: int, device: torch.device = 'cuda'):
50
+ return Baseline(repo_path, backbone, num_tokens, device)
51
+
52
+ @torch.inference_mode()
53
+ def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
54
+ original_height, original_width = image.shape[-2:]
55
+
56
+ assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
57
+
58
+ if image.ndim == 3:
59
+ image = image.unsqueeze(0)
60
+ omit_batch_dim = True
61
+ else:
62
+ omit_batch_dim = False
63
+
64
+ if self.num_tokens is None:
65
+ resize_factor = 518 / min(original_height, original_width)
66
+ expected_width = round(original_width * resize_factor / 14) * 14
67
+ expected_height = round(original_height * resize_factor / 14) * 14
68
+ else:
69
+ aspect_ratio = original_width / original_height
70
+ tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
71
+ tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
72
+ expected_width = tokens_cols * 14
73
+ expected_height = tokens_rows * 14
74
+ image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
75
+
76
+ image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
77
+
78
+ disparity = self.model(image)
79
+
80
+ disparity = F.interpolate(disparity[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
81
+
82
+ if omit_batch_dim:
83
+ disparity = disparity.squeeze(0)
84
+
85
+ return {
86
+ 'disparity_affine_invariant': disparity
87
+ }
88
+
MoGe/baselines/da_v2_metric.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reference https://github.com/DepthAnything/Depth-Anything-V2/metric_depth
2
+ import os
3
+ import sys
4
+ from typing import *
5
+ from pathlib import Path
6
+
7
+ import click
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import torchvision.transforms as T
11
+ import torchvision.transforms.functional as TF
12
+ import cv2
13
+
14
+ from moge.test.baseline import MGEBaselineInterface
15
+
16
+
17
+ class Baseline(MGEBaselineInterface):
18
+
19
+ def __init__(self, repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
20
+ device = torch.device(device)
21
+ repo_path = os.path.abspath(repo_path)
22
+ if not Path(repo_path).exists():
23
+ raise FileNotFoundError(f'Cannot find the Depth-Anything repository at {repo_path}. Please clone the repository and provide the path to it using the --repo option.')
24
+ sys.path.append(os.path.join(repo_path, 'metric_depth'))
25
+ from depth_anything_v2.dpt import DepthAnythingV2
26
+
27
+ model_configs = {
28
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
29
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
30
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]}
31
+ }
32
+
33
+ if domain == 'indoor':
34
+ dataset = 'hypersim'
35
+ max_depth = 20
36
+ elif domain == 'outdoor':
37
+ dataset = 'vkitti'
38
+ max_depth = 80
39
+ else:
40
+ raise ValueError(f"Invalid domain: {domain}")
41
+
42
+ model = DepthAnythingV2(**model_configs[backbone], max_depth=max_depth)
43
+ checkpoint_path = os.path.join(repo_path, f'checkpoints/depth_anything_v2_metric_{dataset}_{backbone}.pth')
44
+ if not os.path.exists(checkpoint_path):
45
+ raise FileNotFoundError(f'Cannot find the checkpoint file at {checkpoint_path}. Please download the checkpoint file and place it in the checkpoints directory.')
46
+ model.load_state_dict(torch.load(checkpoint_path, map_location='cpu', weights_only=True))
47
+ model.eval().to(device)
48
+
49
+ self.model = model
50
+ self.num_tokens = num_tokens
51
+ self.device = device
52
+
53
+ @click.command()
54
+ @click.option('--repo', 'repo_path', type=click.Path(), default='../Depth-Anything-V2', help='Path to the Depth-Anything repository.')
55
+ @click.option('--backbone', type=click.Choice(['vits', 'vitb', 'vitl']), default='vitl', help='Backbone architecture.')
56
+ @click.option('--domain', type=click.Choice(['indoor', 'outdoor']), help='Domain of the dataset.')
57
+ @click.option('--num_tokens', type=int, default=None, help='Number of tokens for the ViT model')
58
+ @click.option('--device', type=str, default='cuda', help='Device to use for inference.')
59
+ @staticmethod
60
+ def load(repo_path: str, backbone: str, domain: str, num_tokens: int, device: str):
61
+ return Baseline(repo_path, backbone, domain, num_tokens, device)
62
+
63
+ @torch.inference_mode()
64
+ def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
65
+ original_height, original_width = image.shape[-2:]
66
+
67
+ assert intrinsics is None, "Depth-Anything-V2 does not support camera intrinsics input"
68
+
69
+ if image.ndim == 3:
70
+ image = image.unsqueeze(0)
71
+ omit_batch_dim = True
72
+ else:
73
+ omit_batch_dim = False
74
+
75
+ if self.num_tokens is None:
76
+ resize_factor = 518 / min(original_height, original_width)
77
+ expected_width = round(original_width * resize_factor / 14) * 14
78
+ expected_height = round(original_height * resize_factor / 14) * 14
79
+ else:
80
+ aspect_ratio = original_width / original_height
81
+ tokens_rows = round((self.num_tokens * aspect_ratio) ** 0.5)
82
+ tokens_cols = round((self.num_tokens / aspect_ratio) ** 0.5)
83
+ expected_width = tokens_cols * 14
84
+ expected_height = tokens_rows * 14
85
+ image = TF.resize(image, (expected_height, expected_width), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
86
+
87
+ image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
88
+
89
+ depth = self.model(image)
90
+
91
+ depth = F.interpolate(depth[:, None], size=(original_height, original_width), mode='bilinear', align_corners=False, antialias=False)[:, 0]
92
+
93
+ if omit_batch_dim:
94
+ depth = depth.squeeze(0)
95
+
96
+ return {
97
+ 'depth_metric': depth
98
+ }
99
+
MoGe/baselines/metric3d_v2.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reference: https://github.com/YvanYin/Metric3D
2
+ import os
3
+ import sys
4
+ from typing import *
5
+
6
+ import click
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import cv2
10
+
11
+ from moge.test.baseline import MGEBaselineInterface
12
+
13
+
14
+ class Baseline(MGEBaselineInterface):
15
+ def __init__(self, backbone: Literal['vits', 'vitl', 'vitg'], device):
16
+ backbone_map = {
17
+ 'vits': 'metric3d_vit_small',
18
+ 'vitl': 'metric3d_vit_large',
19
+ 'vitg': 'metric3d_vit_giant2'
20
+ }
21
+
22
+ device = torch.device(device)
23
+ model = torch.hub.load('yvanyin/metric3d', backbone_map[backbone], pretrain=True)
24
+ model.to(device).eval()
25
+
26
+ self.model = model
27
+ self.device = device
28
+
29
+ @click.command()
30
+ @click.option('--backbone', type=click.Choice(['vits', 'vitl', 'vitg']), default='vitl', help='Encoder architecture.')
31
+ @click.option('--device', type=str, default='cuda', help='Device to use.')
32
+ @staticmethod
33
+ def load(backbone: str = 'vitl', device: torch.device = 'cuda'):
34
+ return Baseline(backbone, device)
35
+
36
+ @torch.inference_mode()
37
+ def inference_one_image(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
38
+ # Reference: https://github.com/YvanYin/Metric3D/blob/main/mono/utils/do_test.py
39
+
40
+ # rgb_origin: RGB, 0-255, uint8
41
+ rgb_origin = image.cpu().numpy().transpose((1, 2, 0)) * 255
42
+
43
+ # keep ratio resize
44
+ input_size = (616, 1064) # for vit model
45
+ h, w = rgb_origin.shape[:2]
46
+ scale = min(input_size[0] / h, input_size[1] / w)
47
+ rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
48
+ if intrinsics is not None:
49
+ focal = intrinsics[0, 0] * int(w * scale)
50
+
51
+ # padding to input_size
52
+ padding = [123.675, 116.28, 103.53]
53
+ h, w = rgb.shape[:2]
54
+ pad_h = input_size[0] - h
55
+ pad_w = input_size[1] - w
56
+ pad_h_half = pad_h // 2
57
+ pad_w_half = pad_w // 2
58
+ rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
59
+ pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]
60
+
61
+ # normalize rgb
62
+ mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
63
+ std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
64
+ rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
65
+ rgb = torch.div((rgb - mean), std)
66
+ rgb = rgb[None, :, :, :].cuda()
67
+
68
+ # inference
69
+ pred_depth, confidence, output_dict = self.model.inference({'input': rgb})
70
+
71
+ # un pad
72
+ pred_depth = pred_depth.squeeze()
73
+ pred_depth = pred_depth[pad_info[0] : pred_depth.shape[0] - pad_info[1], pad_info[2] : pred_depth.shape[1] - pad_info[3]]
74
+ pred_depth = pred_depth.clamp_min(0.5) # clamp to 0.5m, since metric3d could yield very small depth values, resulting in crashed the scale shift alignment.
75
+
76
+ # upsample to original size
77
+ pred_depth = F.interpolate(pred_depth[None, None, :, :], image.shape[-2:], mode='bilinear').squeeze()
78
+
79
+ if intrinsics is not None:
80
+ # de-canonical transform
81
+ canonical_to_real_scale = focal / 1000.0 # 1000.0 is the focal length of canonical camera
82
+ pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric
83
+ pred_depth = torch.clamp(pred_depth, 0, 300)
84
+
85
+ pred_normal, normal_confidence = output_dict['prediction_normal'].split([3, 1], dim=1) # see https://arxiv.org/abs/2109.09881 for details
86
+
87
+ # un pad and resize to some size if needed
88
+ pred_normal = pred_normal.squeeze(0)
89
+ pred_normal = pred_normal[:, pad_info[0] : pred_normal.shape[1] - pad_info[1], pad_info[2] : pred_normal.shape[2] - pad_info[3]]
90
+
91
+ # you can now do anything with the normal
92
+ pred_normal = F.interpolate(pred_normal[None, :, :, :], image.shape[-2:], mode='bilinear').squeeze(0)
93
+ pred_normal = F.normalize(pred_normal, p=2, dim=0)
94
+
95
+ return pred_depth, pred_normal.permute(1, 2, 0)
96
+
97
+ @torch.inference_mode()
98
+ def infer(self, image: torch.Tensor, intrinsics: torch.Tensor = None):
99
+ # image: (B, H, W, 3) or (H, W, 3)
100
+ if image.ndim == 3:
101
+ pred_depth, pred_normal = self.inference_one_image(image, intrinsics)
102
+ else:
103
+ for i in range(image.shape[0]):
104
+ pred_depth_i, pred_normal_i = self.inference_one_image(image[i], intrinsics[i] if intrinsics is not None else None)
105
+ pred_depth.append(pred_depth_i)
106
+ pred_normal.append(pred_normal_i)
107
+ pred_depth = torch.stack(pred_depth, dim=0)
108
+ pred_normal = torch.stack(pred_normal, dim=0)
109
+
110
+ if intrinsics is not None:
111
+ return {
112
+ "depth_metric": pred_depth,
113
+ }
114
+ else:
115
+ return {
116
+ "depth_scale_invariant": pred_depth,
117
+ }
MoGe/baselines/moge.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from typing import *
4
+ import importlib
5
+
6
+ import click
7
+ import torch
8
+ import utils3d
9
+
10
+ from moge.test.baseline import MGEBaselineInterface
11
+
12
+
13
+ class Baseline(MGEBaselineInterface):
14
+
15
+ def __init__(self, num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
16
+ super().__init__()
17
+ from moge.model import import_model_class_by_version
18
+ MoGeModel = import_model_class_by_version(version)
19
+ self.version = version
20
+
21
+ self.model = MoGeModel.from_pretrained(pretrained_model_name_or_path).to(device).eval()
22
+
23
+ self.device = torch.device(device)
24
+ self.num_tokens = num_tokens
25
+ self.resolution_level = resolution_level
26
+ self.use_fp16 = use_fp16
27
+
28
+ @click.command()
29
+ @click.option('--num_tokens', type=int, default=None)
30
+ @click.option('--resolution_level', type=int, default=9)
31
+ @click.option('--pretrained', 'pretrained_model_name_or_path', type=str, default='Ruicheng/moge-vitl')
32
+ @click.option('--fp16', 'use_fp16', is_flag=True)
33
+ @click.option('--device', type=str, default='cuda:0')
34
+ @click.option('--version', type=str, default='v1')
35
+ @staticmethod
36
+ def load(num_tokens: int, resolution_level: int, pretrained_model_name_or_path: str, use_fp16: bool, device: str = 'cuda:0', version: str = 'v1'):
37
+ return Baseline(num_tokens, resolution_level, pretrained_model_name_or_path, use_fp16, device, version)
38
+
39
+ # Implementation for inference
40
+ @torch.inference_mode()
41
+ def infer(self, image: torch.FloatTensor, intrinsics: Optional[torch.FloatTensor] = None):
42
+ if intrinsics is not None:
43
+ fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
44
+ fov_x = torch.rad2deg(fov_x)
45
+ else:
46
+ fov_x = None
47
+ output = self.model.infer(image, fov_x=fov_x, apply_mask=True, num_tokens=self.num_tokens)
48
+
49
+ if self.version == 'v1':
50
+ return {
51
+ 'points_scale_invariant': output['points'],
52
+ 'depth_scale_invariant': output['depth'],
53
+ 'intrinsics': output['intrinsics'],
54
+ }
55
+ else:
56
+ return {
57
+ 'points_metric': output['points'],
58
+ 'depth_metric': output['depth'],
59
+ 'intrinsics': output['intrinsics'],
60
+ }
61
+
62
+ @torch.inference_mode()
63
+ def infer_for_evaluation(self, image: torch.FloatTensor, intrinsics: torch.FloatTensor = None):
64
+ if intrinsics is not None:
65
+ fov_x, _ = utils3d.torch.intrinsics_to_fov(intrinsics)
66
+ fov_x = torch.rad2deg(fov_x)
67
+ else:
68
+ fov_x = None
69
+ output = self.model.infer(image, fov_x=fov_x, apply_mask=False, num_tokens=self.num_tokens, use_fp16=self.use_fp16)
70
+
71
+ if self.version == 'v1':
72
+ return {
73
+ 'points_scale_invariant': output['points'],
74
+ 'depth_scale_invariant': output['depth'],
75
+ 'intrinsics': output['intrinsics'],
76
+ }
77
+ else:
78
+ return {
79
+ 'points_metric': output['points'],
80
+ 'depth_metric': output['depth'],
81
+ 'intrinsics': output['intrinsics'],
82
+ }
83
+
MoGe/configs/eval/all_benchmarks.json ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "NYUv2": {
3
+ "path": "data/eval/NYUv2",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1.0
8
+ },
9
+ "KITTI": {
10
+ "path": "data/eval/KITTI",
11
+ "width": 750,
12
+ "height": 375,
13
+ "split": ".index.txt",
14
+ "depth_unit": 1
15
+ },
16
+ "ETH3D": {
17
+ "path": "data/eval/ETH3D",
18
+ "width": 2048,
19
+ "height": 1365,
20
+ "split": ".index.txt",
21
+ "include_segmentation": true,
22
+ "depth_unit": 1
23
+ },
24
+ "iBims-1": {
25
+ "path": "data/eval/iBims-1",
26
+ "width": 640,
27
+ "height": 480,
28
+ "split": ".index.txt",
29
+ "has_sharp_boundary": true,
30
+ "include_segmentation": true,
31
+ "depth_unit": 1.0
32
+ },
33
+ "GSO": {
34
+ "path": "data/eval/GSO",
35
+ "width": 512,
36
+ "height": 512,
37
+ "split": ".index.txt"
38
+ },
39
+ "Sintel": {
40
+ "path": "data/eval/Sintel",
41
+ "width": 872,
42
+ "height": 436,
43
+ "split": ".index.txt",
44
+ "has_sharp_boundary": true,
45
+ "include_segmentation": true
46
+ },
47
+ "DDAD": {
48
+ "path": "data/eval/DDAD",
49
+ "width": 1400,
50
+ "height": 700,
51
+ "include_segmentation": true,
52
+ "split": ".index.txt",
53
+ "depth_unit": 1.0
54
+ },
55
+ "DIODE": {
56
+ "path": "data/eval/DIODE",
57
+ "width": 1024,
58
+ "height": 768,
59
+ "split": ".index.txt",
60
+ "include_segmentation": true,
61
+ "depth_unit": 1.0
62
+ },
63
+ "Spring": {
64
+ "path": "data/eval/Spring",
65
+ "width": 1920,
66
+ "height": 1080,
67
+ "split": ".index.txt",
68
+ "has_sharp_boundary": true
69
+ },
70
+ "HAMMER": {
71
+ "path": "data/eval/HAMMER",
72
+ "width": 1664,
73
+ "height": 832,
74
+ "split": ".index.txt",
75
+ "depth_unit": 1,
76
+ "has_sharp_boundary": true
77
+ }
78
+ }
MoGe/configs/eval/benchmarks/ddad.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "DDAD": {
3
+ "path": "data/eval/DDAD",
4
+ "width": 1400,
5
+ "height": 700,
6
+ "include_segmentation": true,
7
+ "split": ".index.txt"
8
+ }
9
+ }
MoGe/configs/eval/benchmarks/diode.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "DIODE": {
3
+ "path": "data/eval/DIODE",
4
+ "width": 1024,
5
+ "height": 768,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true
8
+ }
9
+ }
MoGe/configs/eval/benchmarks/eth3d.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ETH3D": {
3
+ "path": "data/eval/ETH3D",
4
+ "width": 2048,
5
+ "height": 1365,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "depth_unit": 1
9
+ }
10
+ }
MoGe/configs/eval/benchmarks/gso.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "GSO": {
3
+ "path": "data/eval/GSO",
4
+ "width": 512,
5
+ "height": 512,
6
+ "split": ".index.txt"
7
+ }
8
+ }
MoGe/configs/eval/benchmarks/hammer.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "HAMMER": {
3
+ "path": "data/eval/HAMMER",
4
+ "width": 1664,
5
+ "height": 832,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
MoGe/configs/eval/benchmarks/ibims-1.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "iBims-1": {
3
+ "path": "data/eval/iBims-1",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
MoGe/configs/eval/benchmarks/kitti.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "KITTI": {
3
+ "path": "data/eval/KITTI",
4
+ "width": 750,
5
+ "height": 375,
6
+ "split": ".index.txt",
7
+ "depth_unit": 1
8
+ }
9
+ }
MoGe/configs/eval/benchmarks/nyu.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "NYUv2": {
3
+ "path": "data/eval/NYUv2",
4
+ "width": 640,
5
+ "height": 480,
6
+ "split": ".test.txt"
7
+ }
8
+ }
MoGe/configs/eval/benchmarks/sintel.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Sintel": {
3
+ "path": "data/eval/Sintel",
4
+ "width": 872,
5
+ "height": 436,
6
+ "split": ".index.txt",
7
+ "include_segmentation": true,
8
+ "has_sharp_boundary": true
9
+ }
10
+ }
MoGe/configs/eval/benchmarks/spring.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Spring": {
3
+ "path": "data/eval/Spring",
4
+ "width": 1920,
5
+ "height": 1080,
6
+ "split": ".test.txt",
7
+ "has_sharp_boundary": true
8
+ }
9
+ }
MoGe/configs/train/v1.json ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "aspect_ratio_range": [0.5, 2.0],
4
+ "area_range": [250000, 1000000],
5
+ "clamp_max_depth": 1000.0,
6
+ "center_augmentation": 0.5,
7
+ "fov_range_absolute": [1, 179],
8
+ "fov_range_relative": [0.01, 1.0],
9
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring"],
10
+ "datasets": [
11
+ {
12
+ "name": "TartanAir",
13
+ "path": "blobmnt/data_v3/TartanAir",
14
+ "label_type": "synthetic",
15
+ "index": ".index.txt",
16
+ "depth": "depth.png",
17
+ "weight": 4.8,
18
+ "center_augmentation": 0.25,
19
+ "fov_range_absolute": [30, 150],
20
+ "fov_range_relative": [0.5, 1.0],
21
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
22
+ }
23
+ ]
24
+ },
25
+ "model_version": "v1",
26
+ "model": {
27
+ "encoder": "dinov2_vitl14",
28
+ "remap_output": "exp",
29
+ "intermediate_layers": 4,
30
+ "dim_upsample": [256, 128, 64],
31
+ "dim_times_res_block_hidden": 2,
32
+ "num_res_blocks": 2,
33
+ "num_tokens_range": [1200, 2500],
34
+ "last_conv_channels": 32,
35
+ "last_conv_size": 1
36
+ },
37
+ "optimizer": {
38
+ "type": "AdamW",
39
+ "params": [
40
+ {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
41
+ {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
42
+ ]
43
+ },
44
+ "lr_scheduler": {
45
+ "type": "SequentialLR",
46
+ "params": {
47
+ "schedulers": [
48
+ {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
49
+ {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
50
+ ],
51
+ "milestones": [2000]
52
+ }
53
+ },
54
+ "low_resolution_training_steps": 50000,
55
+ "loss": {
56
+ "invalid": {},
57
+ "synthetic": {
58
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
59
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
60
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
61
+ "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
62
+ "normal": {"function": "normal_loss", "weight": 1.0},
63
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
64
+ },
65
+ "sfm": {
66
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
67
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
68
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
69
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
70
+ },
71
+ "lidar": {
72
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
73
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
74
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
75
+ }
76
+ }
77
+ }
MoGe/docs/eval.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation
2
+
3
+ We provide a unified evaluation script that runs baselines on multiple benchmarks. It takes a baseline model and evaluation configurations, evaluates on-the-fly, and reports results instantly in a JSON file.
4
+
5
+ ## Benchmarks
6
+
7
+ Donwload the processed datasets from [Huggingface Datasets](https://huggingface.co/datasets/Ruicheng/monocular-geometry-evaluation) and put them in the `data/eval` directory, using `huggingface-cli`:
8
+
9
+ ```bash
10
+ mkdir -p data/eval
11
+ huggingface-cli download Ruicheng/monocular-geometry-evaluation --repo-type dataset --local-dir data/eval --local-dir-use-symlinks False
12
+ ```
13
+
14
+ Then unzip the downloaded files:
15
+
16
+ ```bash
17
+ cd data/eval
18
+ unzip '*.zip'
19
+ # rm *.zip # if you don't keep the zip files
20
+ ```
21
+
22
+ ## Configuration
23
+
24
+ See [`configs/eval/all_benchmarks.json`](../configs/eval/all_benchmarks.json) for an example of evaluation configurations on all benchmarks. You can modify this file to evaluate on different benchmarks or different baselines.
25
+
26
+ ## Baseline
27
+
28
+ Some examples of baselines are provided in [`baselines/`](../baselines/). Pass the path to the baseline model python code to the `--baseline` argument of the evaluation script.
29
+
30
+ ## Run Evaluation
31
+
32
+ Run the script [`moge/scripts/eval_baseline.py`](../moge/scripts/eval_baseline.py).
33
+ For example,
34
+
35
+ ```bash
36
+ # Evaluate MoGe on the 10 benchmarks
37
+ python moge/scripts/eval_baseline.py --baseline baselines/moge.py --config configs/eval/all_benchmarks.json --output eval_output/moge.json --pretrained Ruicheng/moge-vitl --resolution_level 9
38
+
39
+ # Evaluate Depth Anything V2 on the 10 benchmarks. (NOTE: affine disparity)
40
+ python moge/scripts/eval_baseline.py --baseline baselines/da_v2.py --config configs/eval/all_benchmarks.json --output eval_output/da_v2.json
41
+ ```
42
+
43
+ The `--baselies` `--input` `--output` arguments are for the inference script. The rest arguments, e.g. `--pretrained` `--resolution_level`, are custormized for loading the baseline model.
44
+
45
+ Details of the arguments:
46
+
47
+ ```
48
+ Usage: eval_baseline.py [OPTIONS]
49
+
50
+ Evaluation script.
51
+
52
+ Options:
53
+ --baseline PATH Path to the baseline model python code.
54
+ --config PATH Path to the evaluation configurations. Defaults to
55
+ "configs/eval/all_benchmarks.json".
56
+ --output PATH Path to the output json file.
57
+ --oracle Use oracle mode for evaluation, i.e., use the GT intrinsics
58
+ input.
59
+ --dump_pred Dump predition results.
60
+ --dump_gt Dump ground truth.
61
+ --help Show this message and exit.
62
+ ```
63
+
64
+
65
+
66
+ ## Wrap a Customized Baseline
67
+
68
+ Wrap any baseline method with [`moge.test.baseline.MGEBaselineInterface`](../moge/test/baseline.py).
69
+ See [`baselines/`](../baselines/) for more examples.
70
+
71
+ It is a good idea to check the correctness of the baseline implementation by running inference on a small set of images via [`moge/scripts/infer_baselines.py`](../moge/scripts/infer_baselines.py):
72
+
73
+ ```base
74
+ python moge/scripts/infer_baselines.py --baseline baselines/moge.py --input example_images/ --output infer_outupt/moge --pretrained Ruicheng/moge-vitl --maps --ply
75
+ ```
76
+
77
+
MoGe/docs/train.md ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Training
3
+
4
+ This document provides instructions for training and finetuning the MoGe model.
5
+
6
+ ## Additional Requirements
7
+
8
+ The following packages other than those listed in [`pyproject.toml`](../pyproject.toml) are required for training and finetuning the MoGe model:
9
+
10
+ ```
11
+ accelerate
12
+ sympy
13
+ mlflow
14
+ ```
15
+
16
+ ## Data preparation
17
+
18
+ ### Dataset format
19
+
20
+ Each dataset should be organized as follows:
21
+
22
+ ```
23
+ somedataset
24
+ ├── .index.txt # A list of instance paths
25
+ ├── folder1
26
+ │ ├── instance1 # Each instance is in a folder
27
+ │ │ ├── image.jpg # RGB image.
28
+ │ │ ├── depth.png # 16-bit depth. See moge/utils/io.py for details
29
+ │ │ ├── meta.json # Stores "intrinsics" as a 3x3 matrix
30
+ │ │ └── ... # Other componests such as segmentation mask, normal map etc.
31
+ ...
32
+ ```
33
+
34
+ * `.index.txt` is placed at top directory to store a list of instance paths in this dataset. The dataloader will look for instances in this list. You may also use a custom split, e.g. `.train.txt`, `.val.txt` and specify it in the configuration file.
35
+
36
+ * For depth images, it is recommended to use `read_depth()` and `write_depth()` in [`moge/utils/io.py`](../moge/utils/io.py) to read and write depth images. The depth is stored in logarithmic scale in 16-bit PNG format, offering a balanced precision, dynamic range and compression ratio compared to 16-bit and 32-bit EXR and linear depth formats. It also encodes `NaN` and `Inf` values for invalid depth values.
37
+
38
+ * The `meta.json` should be a dictionary containing the key `intrinsics`, which are **normalized** camera parameters. You may put more metadata.
39
+
40
+ * We also support reading and storing segementation masks for evaluation data (see paper evaluation of local points), which are saved in PNG format with semantic labels stored in png metadata as JSON strings. See `read_segmentation()` and `write_segmentation()` in [`moge/utils/io.py`](../moge/utils/io.py) for details.
41
+
42
+
43
+ ### Visual inspection
44
+
45
+ We provide a script to visualize the data and check the data quality. It will export the instance as a PLY file for visualization of point cloud.
46
+
47
+ ```bash
48
+ python moge/scripts/vis_data.py PATH_TO_INSTANCE --ply [-o SOMEWHERE_ELSE_TO_SAVE_VIS]
49
+ ```
50
+
51
+ ### DataLoader
52
+
53
+ Our training dataloaders is customized to handle loading data, performing perspective crop, and augmentation in a multithreading pipeline. Please refer to [`moge/train/dataloader.py`](../moge/train/dataloader.py) if you have any concern.
54
+
55
+
56
+ ## Configuration
57
+
58
+ See [`configs/train/v1.json`](../configs/train/v1.json) for an example configuration file. The configuration file defines the hyperparameters for training the MoGe model.
59
+ Here is a commented configuration for reference:
60
+
61
+ ```json
62
+ {
63
+ "data": {
64
+ "aspect_ratio_range": [0.5, 2.0], # Range of aspect ratio of sampled images
65
+ "area_range": [250000, 1000000], # Range of sampled image area in pixels
66
+ "clamp_max_depth": 1000.0, # Maximum far/near
67
+ "center_augmentation": 0.5, # Ratio of center crop augmentation
68
+ "fov_range_absolute": [1, 179], # Absolute range of FOV in degrees
69
+ "fov_range_relative": [0.01, 1.0], # Relative range of FOV to the original FOV
70
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring"], # List of image augmentation techniques
71
+ "datasets": [
72
+ {
73
+ "name": "TartanAir", # Name of the dataset. Name it as you like.
74
+ "path": "data/TartanAir", # Path to the dataset
75
+ "label_type": "synthetic", # Label type for this dataset. Losses will be applied accordingly. see "loss" config
76
+ "weight": 4.8, # Probability of sampling this dataset
77
+ "index": ".index.txt", # File name of the index file. Defaults to .index.txt
78
+ "depth": "depth.png", # File name of depth images. Defaults to depth.png
79
+ "center_augmentation": 0.25, # Below are dataset-specific hyperparameters. Overriding the global ones above.
80
+ "fov_range_absolute": [30, 150],
81
+ "fov_range_relative": [0.5, 1.0],
82
+ "image_augmentation": ["jittering", "jpeg_loss", "blurring", "shot_noise"]
83
+ }
84
+ ]
85
+ },
86
+ "model_version": "v1", # Model version. If you have multiple model variants, you can use this to switch between them.
87
+ "model": { # Model hyperparameters. Will be passed to Model __init__() as kwargs.
88
+ "encoder": "dinov2_vitl14",
89
+ "remap_output": "exp",
90
+ "intermediate_layers": 4,
91
+ "dim_upsample": [256, 128, 64],
92
+ "dim_times_res_block_hidden": 2,
93
+ "num_res_blocks": 2,
94
+ "num_tokens_range": [1200, 2500],
95
+ "last_conv_channels": 32,
96
+ "last_conv_size": 1
97
+ },
98
+ "optimizer": { # Reflection-like optimizer configurations. See moge.train.utils.py build_optimizer() for details.
99
+ "type": "AdamW",
100
+ "params": [
101
+ {"params": {"include": ["*"], "exclude": ["*backbone.*"]}, "lr": 1e-4},
102
+ {"params": {"include": ["*backbone.*"]}, "lr": 1e-5}
103
+ ]
104
+ },
105
+ "lr_scheduler": { # Reflection-like lr_scheduler configurations. See moge.train.utils.py build_lr_scheduler() for details.
106
+ "type": "SequentialLR",
107
+ "params": {
108
+ "schedulers": [
109
+ {"type": "LambdaLR", "params": {"lr_lambda": ["1.0", "max(0.0, min(1.0, (epoch - 1000) / 1000))"]}},
110
+ {"type": "StepLR", "params": {"step_size": 25000, "gamma": 0.5}}
111
+ ],
112
+ "milestones": [2000]
113
+ }
114
+ },
115
+ "low_resolution_training_steps": 50000, # Total number of low-resolution training steps. It makes the early stage training faster. Later stage training on varying size images will be slower.
116
+ "loss": {
117
+ "invalid": {}, # invalid instance due to runtime error when loading data
118
+ "synthetic": { # Below are loss hyperparameters
119
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
120
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
121
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
122
+ "patch_64": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 64, "align_resolution": 4, "num_patches": 4096}},
123
+ "normal": {"function": "normal_loss", "weight": 1.0},
124
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
125
+ },
126
+ "sfm": {
127
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
128
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
129
+ "patch_16": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 16, "align_resolution": 8, "num_patches": 256}},
130
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
131
+ },
132
+ "lidar": {
133
+ "global": {"function": "affine_invariant_global_loss", "weight": 1.0, "params": {"align_resolution": 32}},
134
+ "patch_4": {"function": "affine_invariant_local_loss", "weight": 1.0, "params": {"level": 4, "align_resolution": 16, "num_patches": 16}},
135
+ "mask": {"function": "mask_l2_loss", "weight": 1.0}
136
+ }
137
+ }
138
+ }
139
+ ```
140
+
141
+ ## Run Training
142
+
143
+ Launch the training script [`moge/scripts/train.py`](../moge/scripts/train.py). Note that we use [`accelerate`](https://github.com/huggingface/accelerate) for distributed training.
144
+
145
+ ```bash
146
+ accelerate launch \
147
+ --num_processes 8 \
148
+ moge/scripts/train.py \
149
+ --config configs/train/v1.json \
150
+ --workspace workspace/debug \
151
+ --gradient_accumulation_steps 2 \
152
+ --batch_size_forward 2 \
153
+ --checkpoint latest \
154
+ --enable_gradient_checkpointing True \
155
+ --vis_every 1000 \
156
+ --enable_mlflow True
157
+ ```
158
+
159
+
160
+ ## Finetuning
161
+
162
+ To finetune the pre-trained MoGe model, download the model checkpoint and put it in a local directory, e.g. `pretrained/moge-vitl.pt`.
163
+
164
+ > NOTE: when finetuning pretrained MoGe model, a much lower learning rate is required.
165
+ The suggested learning rate for finetuning is not greater than 1e-5 for the head and 1e-6 for the backbone.
166
+ And the batch size is recommended to be 32 at least.
167
+ The settings in default configuration are not optimal for specific datasets and may require further tuning.
168
+
169
+ ```bash
170
+ accelerate launch \
171
+ --num_processes 8 \
172
+ moge/scripts/train.py \
173
+ --config configs/train/v1.json \
174
+ --workspace workspace/debug \
175
+ --gradient_accumulation_steps 2 \
176
+ --batch_size_forward 2 \
177
+ --checkpoint pretrained/moge-vitl.pt \
178
+ --enable_gradient_checkpointing True \
179
+ --vis_every 1000 \
180
+ --enable_mlflow True
181
+ ```
MoGe/moge/__init__.py ADDED
File without changes
MoGe/moge/model/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ from typing import *
3
+
4
+ if TYPE_CHECKING:
5
+ from .v1 import MoGeModel as MoGeModelV1
6
+
7
+
8
+ def import_model_class_by_version(version: str) -> Type[Union['MoGeModelV1']]:
9
+ assert version in ['v1'], f'Unsupported model version: {version}'
10
+
11
+ try:
12
+ module = importlib.import_module(f'.{version}', __package__)
13
+ except ModuleNotFoundError:
14
+ raise ValueError(f'Model version "{version}" not found.')
15
+
16
+ cls = getattr(module, 'MoGeModel')
17
+ return cls
MoGe/moge/model/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
MoGe/moge/model/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
MoGe/moge/model/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+
11
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
12
+
13
+
14
+ class Weights(Enum):
15
+ LVD142M = "LVD142M"
16
+
17
+
18
+ def _make_dinov2_model(
19
+ *,
20
+ arch_name: str = "vit_large",
21
+ img_size: int = 518,
22
+ patch_size: int = 14,
23
+ init_values: float = 1.0,
24
+ ffn_layer: str = "mlp",
25
+ block_chunks: int = 0,
26
+ num_register_tokens: int = 0,
27
+ interpolate_antialias: bool = False,
28
+ interpolate_offset: float = 0.1,
29
+ pretrained: bool = True,
30
+ weights: Union[Weights, str] = Weights.LVD142M,
31
+ **kwargs,
32
+ ):
33
+ from ..models import vision_transformer as vits
34
+
35
+ if isinstance(weights, str):
36
+ try:
37
+ weights = Weights[weights]
38
+ except KeyError:
39
+ raise AssertionError(f"Unsupported weights: {weights}")
40
+
41
+ model_base_name = _make_dinov2_model_name(arch_name, patch_size)
42
+ vit_kwargs = dict(
43
+ img_size=img_size,
44
+ patch_size=patch_size,
45
+ init_values=init_values,
46
+ ffn_layer=ffn_layer,
47
+ block_chunks=block_chunks,
48
+ num_register_tokens=num_register_tokens,
49
+ interpolate_antialias=interpolate_antialias,
50
+ interpolate_offset=interpolate_offset,
51
+ )
52
+ vit_kwargs.update(**kwargs)
53
+ model = vits.__dict__[arch_name](**vit_kwargs)
54
+
55
+ if pretrained:
56
+ model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
57
+ url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
58
+ state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
59
+ model.load_state_dict(state_dict, strict=True)
60
+
61
+ return model
62
+
63
+
64
+ def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
65
+ """
66
+ DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
67
+ """
68
+ return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
69
+
70
+
71
+ def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
72
+ """
73
+ DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
74
+ """
75
+ return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
76
+
77
+
78
+ def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
79
+ """
80
+ DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
81
+ """
82
+ return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
83
+
84
+
85
+ def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
86
+ """
87
+ DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
88
+ """
89
+ return _make_dinov2_model(
90
+ arch_name="vit_giant2",
91
+ ffn_layer="swiglufused",
92
+ weights=weights,
93
+ pretrained=pretrained,
94
+ **kwargs,
95
+ )
96
+
97
+
98
+ def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
99
+ """
100
+ DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
101
+ """
102
+ return _make_dinov2_model(
103
+ arch_name="vit_small",
104
+ pretrained=pretrained,
105
+ weights=weights,
106
+ num_register_tokens=4,
107
+ interpolate_antialias=True,
108
+ interpolate_offset=0.0,
109
+ **kwargs,
110
+ )
111
+
112
+
113
+ def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
114
+ """
115
+ DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
116
+ """
117
+ return _make_dinov2_model(
118
+ arch_name="vit_base",
119
+ pretrained=pretrained,
120
+ weights=weights,
121
+ num_register_tokens=4,
122
+ interpolate_antialias=True,
123
+ interpolate_offset=0.0,
124
+ **kwargs,
125
+ )
126
+
127
+
128
+ def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
129
+ """
130
+ DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
131
+ """
132
+ return _make_dinov2_model(
133
+ arch_name="vit_large",
134
+ pretrained=pretrained,
135
+ weights=weights,
136
+ num_register_tokens=4,
137
+ interpolate_antialias=True,
138
+ interpolate_offset=0.0,
139
+ **kwargs,
140
+ )
141
+
142
+
143
+ def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
144
+ """
145
+ DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
146
+ """
147
+ return _make_dinov2_model(
148
+ arch_name="vit_giant2",
149
+ ffn_layer="swiglufused",
150
+ weights=weights,
151
+ pretrained=pretrained,
152
+ num_register_tokens=4,
153
+ interpolate_antialias=True,
154
+ interpolate_offset=0.0,
155
+ **kwargs,
156
+ )
MoGe/moge/model/dinov2/hub/utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
15
+
16
+
17
+ def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
18
+ compact_arch_name = arch_name.replace("_", "")[:4]
19
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
20
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
21
+
22
+
23
+ class CenterPadding(nn.Module):
24
+ def __init__(self, multiple):
25
+ super().__init__()
26
+ self.multiple = multiple
27
+
28
+ def _get_pad(self, size):
29
+ new_size = math.ceil(size / self.multiple) * self.multiple
30
+ pad_size = new_size - size
31
+ pad_size_left = pad_size // 2
32
+ pad_size_right = pad_size - pad_size_left
33
+ return pad_size_left, pad_size_right
34
+
35
+ @torch.inference_mode()
36
+ def forward(self, x):
37
+ pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
38
+ output = F.pad(x, pads)
39
+ return output
MoGe/moge/model/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
MoGe/moge/model/dinov2/layers/attention.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ from torch import Tensor
15
+ from torch import nn
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
22
+ try:
23
+ if XFORMERS_ENABLED:
24
+ from xformers.ops import memory_efficient_attention, unbind
25
+
26
+ XFORMERS_AVAILABLE = True
27
+ # warnings.warn("xFormers is available (Attention)")
28
+ else:
29
+ # warnings.warn("xFormers is disabled (Attention)")
30
+ raise ImportError
31
+ except ImportError:
32
+ XFORMERS_AVAILABLE = False
33
+ # warnings.warn("xFormers is not available (Attention)")
34
+
35
+
36
+ class Attention(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int = 8,
41
+ qkv_bias: bool = False,
42
+ proj_bias: bool = True,
43
+ attn_drop: float = 0.0,
44
+ proj_drop: float = 0.0,
45
+ ) -> None:
46
+ super().__init__()
47
+ self.num_heads = num_heads
48
+ head_dim = dim // num_heads
49
+ self.scale = head_dim**-0.5
50
+
51
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
52
+ self.attn_drop = nn.Dropout(attn_drop)
53
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
54
+ self.proj_drop = nn.Dropout(proj_drop)
55
+
56
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
57
+ B, N, C = x.shape
58
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
59
+
60
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
61
+ attn = q @ k.transpose(-2, -1)
62
+
63
+ attn = attn.softmax(dim=-1)
64
+ attn = self.attn_drop(attn)
65
+
66
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
67
+ x = self.proj(x)
68
+ x = self.proj_drop(x)
69
+ return x
70
+
71
+
72
+ class MemEffAttention(Attention):
73
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
74
+ if not XFORMERS_AVAILABLE:
75
+ if attn_bias is not None:
76
+ raise AssertionError("xFormers is required for using nested tensors")
77
+ return super().forward(x)
78
+
79
+ B, N, C = x.shape
80
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
81
+
82
+ q, k, v = unbind(qkv, 2)
83
+
84
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
85
+ x = x.reshape([B, N, C])
86
+
87
+ x = self.proj(x)
88
+ x = self.proj_drop(x)
89
+ return x
MoGe/moge/model/dinov2/layers/block.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ import logging
11
+ import os
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+ import warnings
14
+
15
+ import torch
16
+ from torch import nn, Tensor
17
+
18
+ from .attention import Attention, MemEffAttention
19
+ from .drop_path import DropPath
20
+ from .layer_scale import LayerScale
21
+ from .mlp import Mlp
22
+
23
+
24
+ logger = logging.getLogger("dinov2")
25
+
26
+
27
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
28
+ try:
29
+ if XFORMERS_ENABLED:
30
+ from xformers.ops import fmha, scaled_index_add, index_select_cat
31
+
32
+ XFORMERS_AVAILABLE = True
33
+ # warnings.warn("xFormers is available (Block)")
34
+ else:
35
+ # warnings.warn("xFormers is disabled (Block)")
36
+ raise ImportError
37
+ except ImportError:
38
+ XFORMERS_AVAILABLE = False
39
+ # warnings.warn("xFormers is not available (Block)")
40
+
41
+
42
+ class Block(nn.Module):
43
+ def __init__(
44
+ self,
45
+ dim: int,
46
+ num_heads: int,
47
+ mlp_ratio: float = 4.0,
48
+ qkv_bias: bool = False,
49
+ proj_bias: bool = True,
50
+ ffn_bias: bool = True,
51
+ drop: float = 0.0,
52
+ attn_drop: float = 0.0,
53
+ init_values=None,
54
+ drop_path: float = 0.0,
55
+ act_layer: Callable[..., nn.Module] = nn.GELU,
56
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
57
+ attn_class: Callable[..., nn.Module] = Attention,
58
+ ffn_layer: Callable[..., nn.Module] = Mlp,
59
+ ) -> None:
60
+ super().__init__()
61
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
62
+ self.norm1 = norm_layer(dim)
63
+ self.attn = attn_class(
64
+ dim,
65
+ num_heads=num_heads,
66
+ qkv_bias=qkv_bias,
67
+ proj_bias=proj_bias,
68
+ attn_drop=attn_drop,
69
+ proj_drop=drop,
70
+ )
71
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
72
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
73
+
74
+ self.norm2 = norm_layer(dim)
75
+ mlp_hidden_dim = int(dim * mlp_ratio)
76
+ self.mlp = ffn_layer(
77
+ in_features=dim,
78
+ hidden_features=mlp_hidden_dim,
79
+ act_layer=act_layer,
80
+ drop=drop,
81
+ bias=ffn_bias,
82
+ )
83
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
84
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
85
+
86
+ self.sample_drop_ratio = drop_path
87
+
88
+ def forward(self, x: Tensor) -> Tensor:
89
+ def attn_residual_func(x: Tensor) -> Tensor:
90
+ return self.ls1(self.attn(self.norm1(x)))
91
+
92
+ def ffn_residual_func(x: Tensor) -> Tensor:
93
+ return self.ls2(self.mlp(self.norm2(x)))
94
+
95
+ if self.training and self.sample_drop_ratio > 0.1:
96
+ # the overhead is compensated only for a drop path rate larger than 0.1
97
+ x = drop_add_residual_stochastic_depth(
98
+ x,
99
+ residual_func=attn_residual_func,
100
+ sample_drop_ratio=self.sample_drop_ratio,
101
+ )
102
+ x = drop_add_residual_stochastic_depth(
103
+ x,
104
+ residual_func=ffn_residual_func,
105
+ sample_drop_ratio=self.sample_drop_ratio,
106
+ )
107
+ elif self.training and self.sample_drop_ratio > 0.0:
108
+ x = x + self.drop_path1(attn_residual_func(x))
109
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
110
+ else:
111
+ x = x + attn_residual_func(x)
112
+ x = x + ffn_residual_func(x)
113
+ return x
114
+
115
+
116
+ def drop_add_residual_stochastic_depth(
117
+ x: Tensor,
118
+ residual_func: Callable[[Tensor], Tensor],
119
+ sample_drop_ratio: float = 0.0,
120
+ ) -> Tensor:
121
+ # 1) extract subset using permutation
122
+ b, n, d = x.shape
123
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
124
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
125
+ x_subset = x[brange]
126
+
127
+ # 2) apply residual_func to get residual
128
+ residual = residual_func(x_subset)
129
+
130
+ x_flat = x.flatten(1)
131
+ residual = residual.flatten(1)
132
+
133
+ residual_scale_factor = b / sample_subset_size
134
+
135
+ # 3) add the residual
136
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
137
+ return x_plus_residual.view_as(x)
138
+
139
+
140
+ def get_branges_scales(x, sample_drop_ratio=0.0):
141
+ b, n, d = x.shape
142
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
143
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
144
+ residual_scale_factor = b / sample_subset_size
145
+ return brange, residual_scale_factor
146
+
147
+
148
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
149
+ if scaling_vector is None:
150
+ x_flat = x.flatten(1)
151
+ residual = residual.flatten(1)
152
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
153
+ else:
154
+ x_plus_residual = scaled_index_add(
155
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
156
+ )
157
+ return x_plus_residual
158
+
159
+
160
+ attn_bias_cache: Dict[Tuple, Any] = {}
161
+
162
+
163
+ def get_attn_bias_and_cat(x_list, branges=None):
164
+ """
165
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
166
+ """
167
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
168
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
169
+ if all_shapes not in attn_bias_cache.keys():
170
+ seqlens = []
171
+ for b, x in zip(batch_sizes, x_list):
172
+ for _ in range(b):
173
+ seqlens.append(x.shape[1])
174
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
175
+ attn_bias._batch_sizes = batch_sizes
176
+ attn_bias_cache[all_shapes] = attn_bias
177
+
178
+ if branges is not None:
179
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
180
+ else:
181
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
182
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
183
+
184
+ return attn_bias_cache[all_shapes], cat_tensors
185
+
186
+
187
+ def drop_add_residual_stochastic_depth_list(
188
+ x_list: List[Tensor],
189
+ residual_func: Callable[[Tensor, Any], Tensor],
190
+ sample_drop_ratio: float = 0.0,
191
+ scaling_vector=None,
192
+ ) -> Tensor:
193
+ # 1) generate random set of indices for dropping samples in the batch
194
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
195
+ branges = [s[0] for s in branges_scales]
196
+ residual_scale_factors = [s[1] for s in branges_scales]
197
+
198
+ # 2) get attention bias and index+concat the tensors
199
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
200
+
201
+ # 3) apply residual_func to get residual, and split the result
202
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
203
+
204
+ outputs = []
205
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
206
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
207
+ return outputs
208
+
209
+
210
+ class NestedTensorBlock(Block):
211
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
212
+ """
213
+ x_list contains a list of tensors to nest together and run
214
+ """
215
+ assert isinstance(self.attn, MemEffAttention)
216
+
217
+ if self.training and self.sample_drop_ratio > 0.0:
218
+
219
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
220
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
221
+
222
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
223
+ return self.mlp(self.norm2(x))
224
+
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=attn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ x_list = drop_add_residual_stochastic_depth_list(
232
+ x_list,
233
+ residual_func=ffn_residual_func,
234
+ sample_drop_ratio=self.sample_drop_ratio,
235
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
236
+ )
237
+ return x_list
238
+ else:
239
+
240
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
241
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
242
+
243
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
244
+ return self.ls2(self.mlp(self.norm2(x)))
245
+
246
+ attn_bias, x = get_attn_bias_and_cat(x_list)
247
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
248
+ x = x + ffn_residual_func(x)
249
+ return attn_bias.split(x)
250
+
251
+ def forward(self, x_or_x_list):
252
+ if isinstance(x_or_x_list, Tensor):
253
+ return super().forward(x_or_x_list)
254
+ elif isinstance(x_or_x_list, list):
255
+ if not XFORMERS_AVAILABLE:
256
+ raise AssertionError("xFormers is required for using nested tensors")
257
+ return self.forward_nested(x_or_x_list)
258
+ else:
259
+ raise AssertionError
MoGe/moge/model/dinov2/layers/dino_head.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.init import trunc_normal_
9
+ from torch.nn.utils import weight_norm
10
+
11
+
12
+ class DINOHead(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_dim,
16
+ out_dim,
17
+ use_bn=False,
18
+ nlayers=3,
19
+ hidden_dim=2048,
20
+ bottleneck_dim=256,
21
+ mlp_bias=True,
22
+ ):
23
+ super().__init__()
24
+ nlayers = max(nlayers, 1)
25
+ self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
26
+ self.apply(self._init_weights)
27
+ self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
28
+ self.last_layer.weight_g.data.fill_(1)
29
+
30
+ def _init_weights(self, m):
31
+ if isinstance(m, nn.Linear):
32
+ trunc_normal_(m.weight, std=0.02)
33
+ if isinstance(m, nn.Linear) and m.bias is not None:
34
+ nn.init.constant_(m.bias, 0)
35
+
36
+ def forward(self, x):
37
+ x = self.mlp(x)
38
+ eps = 1e-6 if x.dtype == torch.float16 else 1e-12
39
+ x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
40
+ x = self.last_layer(x)
41
+ return x
42
+
43
+
44
+ def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45
+ if nlayers == 1:
46
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47
+ else:
48
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49
+ if use_bn:
50
+ layers.append(nn.BatchNorm1d(hidden_dim))
51
+ layers.append(nn.GELU())
52
+ for _ in range(nlayers - 2):
53
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54
+ if use_bn:
55
+ layers.append(nn.BatchNorm1d(hidden_dim))
56
+ layers.append(nn.GELU())
57
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58
+ return nn.Sequential(*layers)
MoGe/moge/model/dinov2/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
9
+
10
+
11
+ from torch import nn
12
+
13
+
14
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15
+ if drop_prob == 0.0 or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0:
21
+ random_tensor.div_(keep_prob)
22
+ output = x * random_tensor
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28
+
29
+ def __init__(self, drop_prob=None):
30
+ super(DropPath, self).__init__()
31
+ self.drop_prob = drop_prob
32
+
33
+ def forward(self, x):
34
+ return drop_path(x, self.drop_prob, self.training)
MoGe/moge/model/dinov2/layers/layer_scale.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
7
+
8
+ from typing import Union
9
+
10
+ import torch
11
+ from torch import Tensor
12
+ from torch import nn
13
+
14
+
15
+ class LayerScale(nn.Module):
16
+ def __init__(
17
+ self,
18
+ dim: int,
19
+ init_values: Union[float, Tensor] = 1e-5,
20
+ inplace: bool = False,
21
+ ) -> None:
22
+ super().__init__()
23
+ self.inplace = inplace
24
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
25
+
26
+ def forward(self, x: Tensor) -> Tensor:
27
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
MoGe/moge/model/dinov2/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9
+
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from torch import Tensor, nn
14
+
15
+
16
+ class Mlp(nn.Module):
17
+ def __init__(
18
+ self,
19
+ in_features: int,
20
+ hidden_features: Optional[int] = None,
21
+ out_features: Optional[int] = None,
22
+ act_layer: Callable[..., nn.Module] = nn.GELU,
23
+ drop: float = 0.0,
24
+ bias: bool = True,
25
+ ) -> None:
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x: Tensor) -> Tensor:
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
MoGe/moge/model/dinov2/layers/patch_embed.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ from torch import Tensor
13
+ import torch.nn as nn
14
+
15
+
16
+ def make_2tuple(x):
17
+ if isinstance(x, tuple):
18
+ assert len(x) == 2
19
+ return x
20
+
21
+ assert isinstance(x, int)
22
+ return (x, x)
23
+
24
+
25
+ class PatchEmbed(nn.Module):
26
+ """
27
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
28
+
29
+ Args:
30
+ img_size: Image size.
31
+ patch_size: Patch token size.
32
+ in_chans: Number of input image channels.
33
+ embed_dim: Number of linear projection output channels.
34
+ norm_layer: Normalization layer.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ img_size: Union[int, Tuple[int, int]] = 224,
40
+ patch_size: Union[int, Tuple[int, int]] = 16,
41
+ in_chans: int = 3,
42
+ embed_dim: int = 768,
43
+ norm_layer: Optional[Callable] = None,
44
+ flatten_embedding: bool = True,
45
+ ) -> None:
46
+ super().__init__()
47
+
48
+ image_HW = make_2tuple(img_size)
49
+ patch_HW = make_2tuple(patch_size)
50
+ patch_grid_size = (
51
+ image_HW[0] // patch_HW[0],
52
+ image_HW[1] // patch_HW[1],
53
+ )
54
+
55
+ self.img_size = image_HW
56
+ self.patch_size = patch_HW
57
+ self.patches_resolution = patch_grid_size
58
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
59
+
60
+ self.in_chans = in_chans
61
+ self.embed_dim = embed_dim
62
+
63
+ self.flatten_embedding = flatten_embedding
64
+
65
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
66
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
67
+
68
+ def forward(self, x: Tensor) -> Tensor:
69
+ _, _, H, W = x.shape
70
+ patch_H, patch_W = self.patch_size
71
+
72
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
73
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
74
+
75
+ x = self.proj(x) # B C H W
76
+ H, W = x.size(2), x.size(3)
77
+ x = x.flatten(2).transpose(1, 2) # B HW C
78
+ x = self.norm(x)
79
+ if not self.flatten_embedding:
80
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
81
+ return x
82
+
83
+ def flops(self) -> float:
84
+ Ho, Wo = self.patches_resolution
85
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
86
+ if self.norm is not None:
87
+ flops += Ho * Wo * self.embed_dim
88
+ return flops
MoGe/moge/model/dinov2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+ import warnings
9
+
10
+ from torch import Tensor, nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class SwiGLUFFN(nn.Module):
15
+ def __init__(
16
+ self,
17
+ in_features: int,
18
+ hidden_features: Optional[int] = None,
19
+ out_features: Optional[int] = None,
20
+ act_layer: Callable[..., nn.Module] = None,
21
+ drop: float = 0.0,
22
+ bias: bool = True,
23
+ ) -> None:
24
+ super().__init__()
25
+ out_features = out_features or in_features
26
+ hidden_features = hidden_features or in_features
27
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
28
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
29
+
30
+ def forward(self, x: Tensor) -> Tensor:
31
+ x12 = self.w12(x)
32
+ x1, x2 = x12.chunk(2, dim=-1)
33
+ hidden = F.silu(x1) * x2
34
+ return self.w3(hidden)
35
+
36
+
37
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
38
+ try:
39
+ if XFORMERS_ENABLED:
40
+ from xformers.ops import SwiGLU
41
+
42
+ XFORMERS_AVAILABLE = True
43
+ # warnings.warn("xFormers is available (SwiGLU)")
44
+ else:
45
+ # warnings.warn("xFormers is disabled (SwiGLU)")
46
+ raise ImportError
47
+ except ImportError:
48
+ SwiGLU = SwiGLUFFN
49
+ XFORMERS_AVAILABLE = False
50
+
51
+ # warnings.warn("xFormers is not available (SwiGLU)")
52
+
53
+
54
+ class SwiGLUFFNFused(SwiGLU):
55
+ def __init__(
56
+ self,
57
+ in_features: int,
58
+ hidden_features: Optional[int] = None,
59
+ out_features: Optional[int] = None,
60
+ act_layer: Callable[..., nn.Module] = None,
61
+ drop: float = 0.0,
62
+ bias: bool = True,
63
+ ) -> None:
64
+ out_features = out_features or in_features
65
+ hidden_features = hidden_features or in_features
66
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
67
+ super().__init__(
68
+ in_features=in_features,
69
+ hidden_features=hidden_features,
70
+ out_features=out_features,
71
+ bias=bias,
72
+ )
MoGe/moge/model/dinov2/models/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+
8
+ from . import vision_transformer as vits
9
+
10
+
11
+ logger = logging.getLogger("dinov2")
12
+
13
+
14
+ def build_model(args, only_teacher=False, img_size=224):
15
+ args.arch = args.arch.removesuffix("_memeff")
16
+ if "vit" in args.arch:
17
+ vit_kwargs = dict(
18
+ img_size=img_size,
19
+ patch_size=args.patch_size,
20
+ init_values=args.layerscale,
21
+ ffn_layer=args.ffn_layer,
22
+ block_chunks=args.block_chunks,
23
+ qkv_bias=args.qkv_bias,
24
+ proj_bias=args.proj_bias,
25
+ ffn_bias=args.ffn_bias,
26
+ num_register_tokens=args.num_register_tokens,
27
+ interpolate_offset=args.interpolate_offset,
28
+ interpolate_antialias=args.interpolate_antialias,
29
+ )
30
+ teacher = vits.__dict__[args.arch](**vit_kwargs)
31
+ if only_teacher:
32
+ return teacher, teacher.embed_dim
33
+ student = vits.__dict__[args.arch](
34
+ **vit_kwargs,
35
+ drop_path_rate=args.drop_path_rate,
36
+ drop_path_uniform=args.drop_path_uniform,
37
+ )
38
+ embed_dim = student.embed_dim
39
+ return student, teacher, embed_dim
40
+
41
+
42
+ def build_model_from_cfg(cfg, only_teacher=False):
43
+ return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)
MoGe/moge/model/dinov2/models/vision_transformer.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
192
+ assert N == M * M
193
+ kwargs = {}
194
+ if self.interpolate_offset:
195
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
196
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
197
+ sx = float(w0 + self.interpolate_offset) / M
198
+ sy = float(h0 + self.interpolate_offset) / M
199
+ kwargs["scale_factor"] = (sx, sy)
200
+ else:
201
+ # Simply specify an output size instead of a scale factor
202
+ kwargs["size"] = (w0, h0)
203
+ patch_pos_embed = nn.functional.interpolate(
204
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
205
+ mode="bicubic",
206
+ antialias=self.interpolate_antialias,
207
+ **kwargs,
208
+ )
209
+ assert (w0, h0) == patch_pos_embed.shape[-2:]
210
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
211
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
212
+
213
+ def prepare_tokens_with_masks(self, x, masks=None):
214
+ B, nc, w, h = x.shape
215
+ x = self.patch_embed(x)
216
+ if masks is not None:
217
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
218
+
219
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
220
+ x = x + self.interpolate_pos_encoding(x, w, h)
221
+
222
+ if self.register_tokens is not None:
223
+ x = torch.cat(
224
+ (
225
+ x[:, :1],
226
+ self.register_tokens.expand(x.shape[0], -1, -1),
227
+ x[:, 1:],
228
+ ),
229
+ dim=1,
230
+ )
231
+
232
+ return x
233
+
234
+ def forward_features_list(self, x_list, masks_list):
235
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
236
+ for blk in self.blocks:
237
+ x = blk(x)
238
+
239
+ all_x = x
240
+ output = []
241
+ for x, masks in zip(all_x, masks_list):
242
+ x_norm = self.norm(x)
243
+ output.append(
244
+ {
245
+ "x_norm_clstoken": x_norm[:, 0],
246
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
247
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
248
+ "x_prenorm": x,
249
+ "masks": masks,
250
+ }
251
+ )
252
+ return output
253
+
254
+ def forward_features(self, x, masks=None):
255
+ if isinstance(x, list):
256
+ return self.forward_features_list(x, masks)
257
+
258
+ x = self.prepare_tokens_with_masks(x, masks)
259
+
260
+ for blk in self.blocks:
261
+ x = blk(x)
262
+
263
+ x_norm = self.norm(x)
264
+ return {
265
+ "x_norm_clstoken": x_norm[:, 0],
266
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
267
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
268
+ "x_prenorm": x,
269
+ "masks": masks,
270
+ }
271
+
272
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
273
+ x = self.prepare_tokens_with_masks(x)
274
+ # If n is an int, take the n last blocks. If it's a list, take them
275
+ output, total_block_len = [], len(self.blocks)
276
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
277
+ for i, blk in enumerate(self.blocks):
278
+ x = blk(x)
279
+ if i in blocks_to_take:
280
+ output.append(x)
281
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
282
+ return output
283
+
284
+ def _get_intermediate_layers_chunked(self, x, n=1):
285
+ x = self.prepare_tokens_with_masks(x)
286
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
287
+ # If n is an int, take the n last blocks. If it's a list, take them
288
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
289
+ for block_chunk in self.blocks:
290
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
291
+ x = blk(x)
292
+ if i in blocks_to_take:
293
+ output.append(x)
294
+ i += 1
295
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
296
+ return output
297
+
298
+ def get_intermediate_layers(
299
+ self,
300
+ x: torch.Tensor,
301
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
302
+ reshape: bool = False,
303
+ return_class_token: bool = False,
304
+ norm=True,
305
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
306
+ if self.chunked_blocks:
307
+ outputs = self._get_intermediate_layers_chunked(x, n)
308
+ else:
309
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
310
+ if norm:
311
+ outputs = [self.norm(out) for out in outputs]
312
+ class_tokens = [out[:, 0] for out in outputs]
313
+ outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
314
+ if reshape:
315
+ B, _, w, h = x.shape
316
+ outputs = [
317
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
318
+ for out in outputs
319
+ ]
320
+ if return_class_token:
321
+ return tuple(zip(outputs, class_tokens))
322
+ return tuple(outputs)
323
+
324
+ def forward(self, *args, is_training=False, **kwargs):
325
+ ret = self.forward_features(*args, **kwargs)
326
+ if is_training:
327
+ return ret
328
+ else:
329
+ return self.head(ret["x_norm_clstoken"])
330
+
331
+
332
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
333
+ """ViT weight initialization, original timm impl (for reproducibility)"""
334
+ if isinstance(module, nn.Linear):
335
+ trunc_normal_(module.weight, std=0.02)
336
+ if module.bias is not None:
337
+ nn.init.zeros_(module.bias)
338
+
339
+
340
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
341
+ model = DinoVisionTransformer(
342
+ patch_size=patch_size,
343
+ embed_dim=384,
344
+ depth=12,
345
+ num_heads=6,
346
+ mlp_ratio=4,
347
+ block_fn=partial(Block, attn_class=MemEffAttention),
348
+ num_register_tokens=num_register_tokens,
349
+ **kwargs,
350
+ )
351
+ return model
352
+
353
+
354
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
355
+ model = DinoVisionTransformer(
356
+ patch_size=patch_size,
357
+ embed_dim=768,
358
+ depth=12,
359
+ num_heads=12,
360
+ mlp_ratio=4,
361
+ block_fn=partial(Block, attn_class=MemEffAttention),
362
+ num_register_tokens=num_register_tokens,
363
+ **kwargs,
364
+ )
365
+ return model
366
+
367
+
368
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
369
+ model = DinoVisionTransformer(
370
+ patch_size=patch_size,
371
+ embed_dim=1024,
372
+ depth=24,
373
+ num_heads=16,
374
+ mlp_ratio=4,
375
+ block_fn=partial(Block, attn_class=MemEffAttention),
376
+ num_register_tokens=num_register_tokens,
377
+ **kwargs,
378
+ )
379
+ return model
380
+
381
+
382
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
383
+ """
384
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
385
+ """
386
+ model = DinoVisionTransformer(
387
+ patch_size=patch_size,
388
+ embed_dim=1536,
389
+ depth=40,
390
+ num_heads=24,
391
+ mlp_ratio=4,
392
+ block_fn=partial(Block, attn_class=MemEffAttention),
393
+ num_register_tokens=num_register_tokens,
394
+ **kwargs,
395
+ )
396
+ return model
MoGe/moge/model/dinov2/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
MoGe/moge/model/dinov2/utils/cluster.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+
12
+ class ClusterType(Enum):
13
+ AWS = "aws"
14
+ FAIR = "fair"
15
+ RSC = "rsc"
16
+
17
+
18
+ def _guess_cluster_type() -> ClusterType:
19
+ uname = os.uname()
20
+ if uname.sysname == "Linux":
21
+ if uname.release.endswith("-aws"):
22
+ # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
23
+ return ClusterType.AWS
24
+ elif uname.nodename.startswith("rsc"):
25
+ # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
26
+ return ClusterType.RSC
27
+
28
+ return ClusterType.FAIR
29
+
30
+
31
+ def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
32
+ if cluster_type is None:
33
+ return _guess_cluster_type()
34
+
35
+ return cluster_type
36
+
37
+
38
+ def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
39
+ cluster_type = get_cluster_type(cluster_type)
40
+ if cluster_type is None:
41
+ return None
42
+
43
+ CHECKPOINT_DIRNAMES = {
44
+ ClusterType.AWS: "checkpoints",
45
+ ClusterType.FAIR: "checkpoint",
46
+ ClusterType.RSC: "checkpoint/dino",
47
+ }
48
+ return Path("/") / CHECKPOINT_DIRNAMES[cluster_type]
49
+
50
+
51
+ def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
52
+ checkpoint_path = get_checkpoint_path(cluster_type)
53
+ if checkpoint_path is None:
54
+ return None
55
+
56
+ username = os.environ.get("USER")
57
+ assert username is not None
58
+ return checkpoint_path / username
59
+
60
+
61
+ def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
62
+ cluster_type = get_cluster_type(cluster_type)
63
+ if cluster_type is None:
64
+ return None
65
+
66
+ SLURM_PARTITIONS = {
67
+ ClusterType.AWS: "learnlab",
68
+ ClusterType.FAIR: "learnlab",
69
+ ClusterType.RSC: "learn",
70
+ }
71
+ return SLURM_PARTITIONS[cluster_type]
72
+
73
+
74
+ def get_slurm_executor_parameters(
75
+ nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
76
+ ) -> Dict[str, Any]:
77
+ # create default parameters
78
+ params = {
79
+ "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
80
+ "gpus_per_node": num_gpus_per_node,
81
+ "tasks_per_node": num_gpus_per_node, # one task per GPU
82
+ "cpus_per_task": 10,
83
+ "nodes": nodes,
84
+ "slurm_partition": get_slurm_partition(cluster_type),
85
+ }
86
+ # apply cluster-specific adjustments
87
+ cluster_type = get_cluster_type(cluster_type)
88
+ if cluster_type == ClusterType.AWS:
89
+ params["cpus_per_task"] = 12
90
+ del params["mem_gb"]
91
+ elif cluster_type == ClusterType.RSC:
92
+ params["cpus_per_task"] = 12
93
+ # set additional parameters / apply overrides
94
+ params.update(kwargs)
95
+ return params
MoGe/moge/model/dinov2/utils/config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import logging
8
+ import os
9
+
10
+ from omegaconf import OmegaConf
11
+
12
+ import dinov2.distributed as distributed
13
+ from dinov2.logging import setup_logging
14
+ from dinov2.utils import utils
15
+ from dinov2.configs import dinov2_default_config
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
+ def apply_scaling_rules_to_cfg(cfg): # to fix
22
+ if cfg.optim.scaling_rule == "sqrt_wrt_1024":
23
+ base_lr = cfg.optim.base_lr
24
+ cfg.optim.lr = base_lr
25
+ cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0)
26
+ logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
27
+ else:
28
+ raise NotImplementedError
29
+ return cfg
30
+
31
+
32
+ def write_config(cfg, output_dir, name="config.yaml"):
33
+ logger.info(OmegaConf.to_yaml(cfg))
34
+ saved_cfg_path = os.path.join(output_dir, name)
35
+ with open(saved_cfg_path, "w") as f:
36
+ OmegaConf.save(config=cfg, f=f)
37
+ return saved_cfg_path
38
+
39
+
40
+ def get_cfg_from_args(args):
41
+ args.output_dir = os.path.abspath(args.output_dir)
42
+ args.opts += [f"train.output_dir={args.output_dir}"]
43
+ default_cfg = OmegaConf.create(dinov2_default_config)
44
+ cfg = OmegaConf.load(args.config_file)
45
+ cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts))
46
+ return cfg
47
+
48
+
49
+ def default_setup(args):
50
+ distributed.enable(overwrite=True)
51
+ seed = getattr(args, "seed", 0)
52
+ rank = distributed.get_global_rank()
53
+
54
+ global logger
55
+ setup_logging(output=args.output_dir, level=logging.INFO)
56
+ logger = logging.getLogger("dinov2")
57
+
58
+ utils.fix_random_seeds(seed + rank)
59
+ logger.info("git:\n {}\n".format(utils.get_sha()))
60
+ logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
61
+
62
+
63
+ def setup(args):
64
+ """
65
+ Create configs and perform basic setups.
66
+ """
67
+ cfg = get_cfg_from_args(args)
68
+ os.makedirs(args.output_dir, exist_ok=True)
69
+ default_setup(args)
70
+ apply_scaling_rules_to_cfg(cfg)
71
+ write_config(cfg, args.output_dir)
72
+ return cfg
MoGe/moge/model/dinov2/utils/dtype.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from typing import Dict, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
+ TypeSpec = Union[str, np.dtype, torch.dtype]
14
+
15
+
16
+ _NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
17
+ np.dtype("bool"): torch.bool,
18
+ np.dtype("uint8"): torch.uint8,
19
+ np.dtype("int8"): torch.int8,
20
+ np.dtype("int16"): torch.int16,
21
+ np.dtype("int32"): torch.int32,
22
+ np.dtype("int64"): torch.int64,
23
+ np.dtype("float16"): torch.float16,
24
+ np.dtype("float32"): torch.float32,
25
+ np.dtype("float64"): torch.float64,
26
+ np.dtype("complex64"): torch.complex64,
27
+ np.dtype("complex128"): torch.complex128,
28
+ }
29
+
30
+
31
+ def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
32
+ if isinstance(dtype, torch.dtype):
33
+ return dtype
34
+ if isinstance(dtype, str):
35
+ dtype = np.dtype(dtype)
36
+ assert isinstance(dtype, np.dtype), f"Expected an instance of nunpy dtype, got {type(dtype)}"
37
+ return _NUMPY_TO_TORCH_DTYPE[dtype]
MoGe/moge/model/dinov2/utils/param_groups.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import defaultdict
7
+ import logging
8
+
9
+
10
+ logger = logging.getLogger("dinov2")
11
+
12
+
13
+ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
14
+ """
15
+ Calculate lr decay rate for different ViT blocks.
16
+ Args:
17
+ name (string): parameter name.
18
+ lr_decay_rate (float): base lr decay rate.
19
+ num_layers (int): number of ViT blocks.
20
+ Returns:
21
+ lr decay rate for the given parameter.
22
+ """
23
+ layer_id = num_layers + 1
24
+ if name.startswith("backbone") or force_is_backbone:
25
+ if (
26
+ ".pos_embed" in name
27
+ or ".patch_embed" in name
28
+ or ".mask_token" in name
29
+ or ".cls_token" in name
30
+ or ".register_tokens" in name
31
+ ):
32
+ layer_id = 0
33
+ elif force_is_backbone and (
34
+ "pos_embed" in name
35
+ or "patch_embed" in name
36
+ or "mask_token" in name
37
+ or "cls_token" in name
38
+ or "register_tokens" in name
39
+ ):
40
+ layer_id = 0
41
+ elif ".blocks." in name and ".residual." not in name:
42
+ layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
43
+ elif chunked_blocks and "blocks." in name and "residual." not in name:
44
+ layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1
45
+ elif "blocks." in name and "residual." not in name:
46
+ layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1
47
+
48
+ return lr_decay_rate ** (num_layers + 1 - layer_id)
49
+
50
+
51
+ def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
52
+ chunked_blocks = False
53
+ if hasattr(model, "n_blocks"):
54
+ logger.info("chunked fsdp")
55
+ n_blocks = model.n_blocks
56
+ chunked_blocks = model.chunked_blocks
57
+ elif hasattr(model, "blocks"):
58
+ logger.info("first code branch")
59
+ n_blocks = len(model.blocks)
60
+ elif hasattr(model, "backbone"):
61
+ logger.info("second code branch")
62
+ n_blocks = len(model.backbone.blocks)
63
+ else:
64
+ logger.info("else code branch")
65
+ n_blocks = 0
66
+ all_param_groups = []
67
+
68
+ for name, param in model.named_parameters():
69
+ name = name.replace("_fsdp_wrapped_module.", "")
70
+ if not param.requires_grad:
71
+ continue
72
+ decay_rate = get_vit_lr_decay_rate(
73
+ name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
74
+ )
75
+ d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name}
76
+
77
+ if "last_layer" in name:
78
+ d.update({"is_last_layer": True})
79
+
80
+ if name.endswith(".bias") or "norm" in name or "gamma" in name:
81
+ d.update({"wd_multiplier": 0.0})
82
+
83
+ if "patch_embed" in name:
84
+ d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult})
85
+
86
+ all_param_groups.append(d)
87
+ logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""")
88
+
89
+ return all_param_groups
90
+
91
+
92
+ def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
93
+ fused_params_groups = defaultdict(lambda: {"params": []})
94
+ for d in all_params_groups:
95
+ identifier = ""
96
+ for k in keys:
97
+ identifier += k + str(d[k]) + "_"
98
+
99
+ for k in keys:
100
+ fused_params_groups[identifier][k] = d[k]
101
+ fused_params_groups[identifier]["params"].append(d["params"])
102
+
103
+ return fused_params_groups.values()
MoGe/moge/model/dinov2/utils/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ import random
9
+ import subprocess
10
+ from urllib.parse import urlparse
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
21
+ if urlparse(pretrained_weights).scheme: # If it looks like an URL
22
+ state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
23
+ else:
24
+ state_dict = torch.load(pretrained_weights, map_location="cpu")
25
+ if checkpoint_key is not None and checkpoint_key in state_dict:
26
+ logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
27
+ state_dict = state_dict[checkpoint_key]
28
+ # remove `module.` prefix
29
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
30
+ # remove `backbone.` prefix induced by multicrop wrapper
31
+ state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
32
+ msg = model.load_state_dict(state_dict, strict=False)
33
+ logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
34
+
35
+
36
+ def fix_random_seeds(seed=31):
37
+ """
38
+ Fix random seeds.
39
+ """
40
+ torch.manual_seed(seed)
41
+ torch.cuda.manual_seed_all(seed)
42
+ np.random.seed(seed)
43
+ random.seed(seed)
44
+
45
+
46
+ def get_sha():
47
+ cwd = os.path.dirname(os.path.abspath(__file__))
48
+
49
+ def _run(command):
50
+ return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()
51
+
52
+ sha = "N/A"
53
+ diff = "clean"
54
+ branch = "N/A"
55
+ try:
56
+ sha = _run(["git", "rev-parse", "HEAD"])
57
+ subprocess.check_output(["git", "diff"], cwd=cwd)
58
+ diff = _run(["git", "diff-index", "HEAD"])
59
+ diff = "has uncommitted changes" if diff else "clean"
60
+ branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
61
+ except Exception:
62
+ pass
63
+ message = f"sha: {sha}, status: {diff}, branch: {branch}"
64
+ return message
65
+
66
+
67
+ class CosineScheduler(object):
68
+ def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
69
+ super().__init__()
70
+ self.final_value = final_value
71
+ self.total_iters = total_iters
72
+
73
+ freeze_schedule = np.zeros((freeze_iters))
74
+
75
+ warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
76
+
77
+ iters = np.arange(total_iters - warmup_iters - freeze_iters)
78
+ schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))
79
+ self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule))
80
+
81
+ assert len(self.schedule) == self.total_iters
82
+
83
+ def __getitem__(self, it):
84
+ if it >= self.total_iters:
85
+ return self.final_value
86
+ else:
87
+ return self.schedule[it]
88
+
89
+
90
+ def has_batchnorms(model):
91
+ bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
92
+ for name, module in model.named_modules():
93
+ if isinstance(module, bn_types):
94
+ return True
95
+ return False