Delanoe Pirard committed on
Commit 18b382b · 0 Parent(s):

Deploy to HuggingFace Spaces

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .flake8 +3 -0
  2. .gitattributes +6 -0
  3. .github/ISSUE_TEMPLATE/bug_report.yml +81 -0
  4. .github/ISSUE_TEMPLATE/config.yml +8 -0
  5. .github/ISSUE_TEMPLATE/feature_request.yml +61 -0
  6. .github/workflows/ci.yml +71 -0
  7. .github/workflows/publish.yml +34 -0
  8. .gitignore +63 -0
  9. .pre-commit-config.yaml +59 -0
  10. BENCHMARKS.md +217 -0
  11. CHANGELOG.md +36 -0
  12. CONTRIBUTING.md +114 -0
  13. LICENSE +201 -0
  14. README.md +418 -0
  15. README.md.original +405 -0
  16. app.py +59 -0
  17. assets/examples/SOH/000.png +3 -0
  18. assets/examples/SOH/010.png +3 -0
  19. assets/examples/robot_unitree.mp4 +3 -0
  20. assets/images/da3_radar.png +3 -0
  21. assets/images/demo320-2.gif +3 -0
  22. benchmarks/__init__.py +3 -0
  23. benchmarks/comparative_benchmark.py +436 -0
  24. benchmarks/flash_attention_benchmark.py +488 -0
  25. benchmarks/full_benchmark.py +696 -0
  26. benchmarks/gpu_preprocessing_benchmark.py +363 -0
  27. benchmarks/results/temp_images/test_image_0000.jpg +3 -0
  28. benchmarks/results/temp_images/test_image_0001.jpg +3 -0
  29. benchmarks/results/temp_images/test_image_0002.jpg +3 -0
  30. benchmarks/results/temp_images/test_image_0003.jpg +3 -0
  31. docs/API.md +465 -0
  32. docs/CLI.md +654 -0
  33. docs/funcs/ref_view_strategy.md +183 -0
  34. notebooks/da3.ipynb +0 -0
  35. notebooks/da3_tutorial.ipynb +667 -0
  36. pyproject.toml +144 -0
  37. requirements.txt +38 -0
  38. scripts/deploy_hf.sh +97 -0
  39. src/depth_anything_3/api.py +718 -0
  40. src/depth_anything_3/app/css_and_html.py +623 -0
  41. src/depth_anything_3/app/gradio_app.py +743 -0
  42. src/depth_anything_3/app/modules/__init__.py +43 -0
  43. src/depth_anything_3/app/modules/event_handlers.py +624 -0
  44. src/depth_anything_3/app/modules/file_handlers.py +327 -0
  45. src/depth_anything_3/app/modules/model_inference.py +454 -0
  46. src/depth_anything_3/app/modules/ui_components.py +497 -0
  47. src/depth_anything_3/app/modules/utils.py +269 -0
  48. src/depth_anything_3/app/modules/visualization.py +435 -0
  49. src/depth_anything_3/cache.py +190 -0
  50. src/depth_anything_3/cfg.py +145 -0
.flake8 ADDED
@@ -0,0 +1,3 @@
+ [flake8]
+ max-line-length = 100
+ ignore = E203 E741 W503 E731
.gitattributes ADDED
@@ -0,0 +1,6 @@
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.webm filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Delanoe Pirard / Aedelon
+ # Licensed under the Apache License, Version 2.0
+
+ name: Bug Report
+ description: Report a bug in awesome-depth-anything-3
+ labels: ["bug"]
+ body:
+   - type: markdown
+     attributes:
+       value: |
+         Thanks for reporting! Please fill out the form below.
+
+         **Note**: For issues with the model itself (accuracy, artifacts), please report to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3/issues).
+
+   - type: textarea
+     id: description
+     attributes:
+       label: Bug Description
+       description: What happened? What did you expect to happen?
+       placeholder: Describe the bug...
+     validations:
+       required: true
+
+   - type: textarea
+     id: reproduction
+     attributes:
+       label: Steps to Reproduce
+       description: Minimal code or steps to reproduce the issue
+       placeholder: |
+         ```python
+         from depth_anything_3.api import DepthAnything3
+         model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
+         # ...
+         ```
+     validations:
+       required: true
+
+   - type: textarea
+     id: traceback
+     attributes:
+       label: Error Traceback
+       description: Full error message and traceback
+       render: shell
+       placeholder: Paste the full traceback here...
+
+   - type: input
+     id: version
+     attributes:
+       label: Package Version
+       placeholder: "0.1.0"
+     validations:
+       required: true
+
+   - type: dropdown
+     id: device
+     attributes:
+       label: Device
+       options:
+         - CUDA (NVIDIA GPU)
+         - MPS (Apple Silicon)
+         - CPU
+     validations:
+       required: true
+
+   - type: input
+     id: pytorch
+     attributes:
+       label: PyTorch Version
+       placeholder: "2.9.0"
+
+   - type: input
+     id: python
+     attributes:
+       label: Python Version
+       placeholder: "3.11"
+
+   - type: input
+     id: os
+     attributes:
+       label: Operating System
+       placeholder: "macOS 14.0 / Ubuntu 22.04 / Windows 11"
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,8 @@
+ blank_issues_enabled: false
+ contact_links:
+   - name: Upstream Repository
+     url: https://github.com/ByteDance-Seed/Depth-Anything-3/issues
+     about: For issues with the model architecture, accuracy, or training
+   - name: Discussions
+     url: https://github.com/Aedelon/awesome-depth-anything-3/discussions
+     about: Ask questions or share ideas
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,61 @@
+ # Copyright (c) Delanoe Pirard / Aedelon
+ # Licensed under the Apache License, Version 2.0
+
+ name: Feature Request
+ description: Suggest a new feature or improvement
+ labels: ["enhancement"]
+ body:
+   - type: markdown
+     attributes:
+       value: |
+         Thanks for your suggestion!
+
+         **Note**: For model/architecture changes, please suggest them to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3/issues).
+         This fork focuses on optimization, deployment, and developer experience.
+
+   - type: textarea
+     id: problem
+     attributes:
+       label: Problem / Use Case
+       description: What problem does this solve? What are you trying to do?
+       placeholder: I'm trying to...
+     validations:
+       required: true
+
+   - type: textarea
+     id: solution
+     attributes:
+       label: Proposed Solution
+       description: How would you like it to work?
+       placeholder: It would be great if...
+     validations:
+       required: true
+
+   - type: textarea
+     id: alternatives
+     attributes:
+       label: Alternatives Considered
+       description: Any other approaches you've considered?
+       placeholder: I also thought about...
+
+   - type: dropdown
+     id: category
+     attributes:
+       label: Category
+       options:
+         - Performance optimization
+         - CLI improvement
+         - API enhancement
+         - Documentation
+         - Testing
+         - CI/CD
+         - Other
+     validations:
+       required: true
+
+   - type: checkboxes
+     id: contribution
+     attributes:
+       label: Contribution
+       options:
+         - label: I would be willing to submit a PR for this feature
.github/workflows/ci.yml ADDED
@@ -0,0 +1,71 @@
+ # Copyright (c) Delanoe Pirard / Aedelon
+ # Licensed under the Apache License, Version 2.0
+
+ name: CI
+
+ on:
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+
+ jobs:
+   lint:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+
+       - name: Set up Python
+         run: uv python install 3.11
+
+       - name: Install dependencies
+         run: uv sync --extra dev
+
+       - name: Lint with ruff
+         run: uv run ruff check src/
+
+   test:
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         python-version: ["3.10", "3.11", "3.12"]
+
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+
+       - name: Set up Python ${{ matrix.python-version }}
+         run: uv python install ${{ matrix.python-version }}
+
+       - name: Install dependencies
+         run: uv sync --extra dev
+
+       - name: Run tests
+         run: uv run pytest tests/ -v --tb=short -x
+         env:
+           PYTORCH_ENABLE_MPS_FALLBACK: "1"
+
+   build:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0  # Required for hatch-vcs
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+
+       - name: Set up Python
+         run: uv python install 3.11
+
+       - name: Build package
+         run: uv build
+
+       - name: Check package
+         run: uvx twine check dist/*
.github/workflows/publish.yml ADDED
@@ -0,0 +1,34 @@
+ # Copyright (c) Delanoe Pirard / Aedelon
+ # Licensed under the Apache License, Version 2.0
+
+ name: Publish to PyPI
+
+ on:
+   release:
+     types: [published]
+
+ jobs:
+   publish:
+     runs-on: ubuntu-latest
+     environment: pypi
+     permissions:
+       id-token: write  # Required for trusted publishing
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0  # Required for hatch-vcs version
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v4
+
+       - name: Set up Python
+         run: uv python install 3.11
+
+       - name: Build package
+         run: uv build
+
+       - name: Publish to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         # Uses trusted publishing (OIDC) - no API token needed
+         # Configure at: https://pypi.org/manage/project/awesome-depth-anything-3/settings/publishing/
.gitignore ADDED
@@ -0,0 +1,63 @@
+ # Python cache
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+
+ # Distribution / packaging
+ workspace/
+ build/
+ dist/
+ *.egg-info/
+ .eggs/
+ *.egg
+
+ # Virtual environments
+ .venv/
+ venv/
+ ENV/
+
+ # Test/coverage
+ .coverage
+ .pytest_cache/
+ htmlcov/
+ .tox/
+ .nox/
+ coverage.xml
+ *.cover
+
+ # Jupyter notebooks
+ .ipynb_checkpoints/
+
+ # IDE
+ .vscode/
+ .idea/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
+
+ # Project-specific
+ gallery*/
+ debug*/
+ DA3HF*/
+ gradio_workspace/
+ eval_workspace/
+ FILTER*/
+ input_images*/
+ *.gradio/
+ .gradio/
+ src/debug_main.py
+ temp*.png
+ /outputs
+
+ # Model weights and large files
+ *.pt
+ *.pth
+ *.ckpt
+ *.safetensors
+ !assets/**/*.pt
+
+ # Logs
+ *.log
+ logs/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,59 @@
+ repos:
+   - repo: 'https://github.com/pre-commit/pre-commit-hooks'
+     rev: v4.5.0
+     hooks:
+       - id: check-added-large-files
+         args:
+           - '--maxkb=125'
+       - id: check-ast
+       - id: check-executables-have-shebangs
+       - id: check-merge-conflict
+       - id: check-symlinks
+       - id: check-toml
+       - id: check-yaml
+       - id: debug-statements
+       - id: detect-private-key
+       - id: end-of-file-fixer
+       - id: no-commit-to-branch
+         args:
+           - '--branch'
+           - 'master'
+       - id: pretty-format-json
+         exclude: '.*\.ipynb$'
+         args:
+           - '--autofix'
+           - '--indent'
+           - '4'
+       - id: trailing-whitespace
+         args:
+           - '--markdown-linebreak-ext=md'
+   - repo: 'https://github.com/pycqa/isort'
+     rev: 5.13.2
+     hooks:
+       - id: isort
+         args:
+           - '--settings-file'
+           - 'pyproject.toml'
+           - '--filter-files'
+   - repo: 'https://github.com/asottile/pyupgrade'
+     rev: v3.15.2
+     hooks:
+       - id: pyupgrade
+         args: [--py38-plus, --keep-runtime-typing]
+   - repo: 'https://github.com/psf/black.git'
+     rev: 24.3.0
+     hooks:
+       - id: black
+         args:
+           - '--config=pyproject.toml'
+   - repo: 'https://github.com/PyCQA/flake8'
+     rev: 7.0.0
+     hooks:
+       - id: flake8
+         args:
+           - '--config=.flake8'
+   - repo: 'https://github.com/myint/autoflake'
+     rev: v1.4
+     hooks:
+       - id: autoflake
+         args: ['--remove-all-unused-imports', '--recursive', '--remove-unused-variables', '--in-place']
BENCHMARKS.md ADDED
@@ -0,0 +1,217 @@
+ # Benchmark Results
+
+ Performance benchmarks comparing **awesome-depth-anything-3** (optimized fork) against the vanilla upstream implementation.
+
+ > **Test Environment**: Apple Silicon (M-series), PyTorch 2.9.0
+ > **Models**: da3-small, da3-base, da3-large, da3-giant
+
+ ---
+
+ ## Quick Summary
+
+ | Feature | Improvement |
+ |---------|-------------|
+ | Model Loading (cached) | **200x faster** (0.8s → 0.005s) |
+ | Inference (MPS, batch 4) | **1.14x faster** |
+ | Cold Load Time | **1.7x faster** |
+ | Memory Efficiency | Adaptive batching prevents OOM |
+
+ ---
+
+ ## 1. Awesome vs Upstream Comparison
+
+ Direct comparison between this optimized fork and the original upstream repository.
+
+ ### MPS (Apple Silicon GPU)
+
+ | Batch Size | Upstream | Awesome | Speedup | Notes |
+ |------------|----------|---------|---------|-------|
+ | 1 | 3.47 img/s | 3.50 img/s | 1.01x | Minimal overhead |
+ | 2 | 3.64 img/s | 3.83 img/s | 1.05x | Batching benefits |
+ | **4** | 3.32 img/s | 3.78 img/s | **1.14x** | Best improvement |
+
+ #### Model Loading Performance
+
+ | Metric | Upstream | Awesome | Speedup |
+ |--------|----------|---------|---------|
+ | Cold Load | 1.28s | 0.77s | **1.7x** |
+ | Cached Load | N/A | 0.005s | **~200x** |
+
+ The model caching system is the standout feature: after the first load, subsequent loads are essentially instant.
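+
+ A minimal sketch of the idea behind it (the actual `ModelCache` in `src/depth_anything_3/cache.py` may differ in detail; the helper name here is illustrative):
+
+ ```python
+ # Hypothetical simplification: memoize the expensive from_pretrained
+ # call in a process-wide dict, so repeated loads return the same instance.
+ from depth_anything_3.api import DepthAnything3
+
+ _CACHE: dict[str, DepthAnything3] = {}
+
+ def load_cached(model_id: str) -> DepthAnything3:
+     if model_id not in _CACHE:
+         _CACHE[model_id] = DepthAnything3.from_pretrained(model_id)
+     return _CACHE[model_id]
+ ```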
+
+ ### CPU
+
+ | Batch Size | Upstream | Awesome | Speedup |
+ |------------|----------|---------|---------|
+ | 1 | 0.27 img/s | 0.31 img/s | 1.13x |
+ | 2 | 0.24 img/s | 0.24 img/s | 1.00x |
+ | 4 | 0.17 img/s | 0.16 img/s | 0.95x |
+
+ > **Note**: CPU performance is similar between versions since GPU-specific optimizations don't apply. The slight regression at batch 4 is within measurement noise.
+
+ ---
+
+ ## 2. Model Performance by Size
+
+ Throughput benchmarks on MPS (Apple Silicon) with 1280x720 input images.
+
+ | Model | Parameters | Batch 1 | Batch 4 | Best Config |
+ |-------|------------|---------|---------|-------------|
+ | **da3-small** | ~25M | 22.2 img/s | 27.2 img/s | B=4 SDPA |
+ | **da3-base** | ~100M | 10.7 img/s | 11.6 img/s | B=4 SDPA |
+ | **da3-large** | ~335M | 3.8 img/s | 3.8 img/s | B=1-2 |
+ | **da3-giant** | ~1.1B | 1.6 img/s | 1.2 img/s | B=1 |
+
+ ### Latency (single image)
+
+ | Model | MPS | CPU | MPS Speedup |
+ |-------|-----|-----|-------------|
+ | da3-small | 45 ms | ~3,500 ms | ~78x |
+ | da3-base | 94 ms | ~7,000 ms | ~74x |
+ | da3-large | 265 ms | ~3,900 ms | ~15x |
+ | da3-giant | 618 ms | N/A | - |
+
+ ---
+
+ ## 3. Preprocessing Pipeline
+
+ ### Strategy: Hybrid CPU/GPU
+
+ On Apple Silicon, **CPU preprocessing is faster** than GPU (Kornia) due to optimized OpenCV/Accelerate routines. The overhead of MPS kernel launches exceeds the benefit for image transforms.
+
+ | Resolution | CPU Time | GPU Time | Winner |
+ |------------|----------|----------|--------|
+ | 640x480 | 6.0 ms | N/A | CPU |
+ | 1920x1080 | 18.7 ms | N/A | CPU |
+ | 3840x2160 | 57.0 ms | N/A | CPU |
+
+ > **Design Decision**: GPU preprocessing is automatically disabled on MPS. The GPU is reserved for model inference where it provides 15-78x speedup.
+
+ ### CUDA (NVIDIA)
+
+ On CUDA, GPU preprocessing with NVJPEG provides significant benefits for JPEG decoding directly to GPU memory, eliminating CPU→GPU transfer overhead.
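+
+ In code, the hybrid policy reduces to a small device check; an illustrative sketch (the helper name is ours, not the fork's actual API):
+
+ ```python
+ import torch
+
+ def use_gpu_preprocessing(device: torch.device) -> bool:
+     # GPU preprocessing only pays off on CUDA (e.g., NVJPEG decode);
+     # on MPS, kernel-launch overhead makes CPU transforms faster.
+     return device.type == "cuda"
+ ```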
+
+ ---
+
+ ## 4. Attention Mechanisms
+
+ Comparison between SDPA (Scaled Dot-Product Attention / Flash Attention) and a manual attention implementation.
+
+ ### Per-Layer Performance
+
+ | Config | SDPA | Manual | Speedup |
+ |--------|------|--------|---------|
+ | ViT-L 518px (MPS) | 2.21 ms | 1.86 ms | 0.8x |
+ | ViT-L 1024px (MPS) | 9.91 ms | 5.87 ms | 0.6x |
+ | ViT-L 518px (CPU) | 3.75 ms | 4.96 ms | 1.3x |
+ | ViT-L 1024px (CPU) | 11.73 ms | 16.85 ms | 1.4x |
+
+ > **Insight**: On MPS, manual attention is faster for ViT due to MPS's SDPA implementation overhead. On CPU, SDPA benefits from optimized BLAS operations.
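+
+ For reference, the two code paths being compared look roughly like this (a generic sketch, not the fork's exact module):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def sdpa_attention(q, k, v):
+     # Fused kernel; can dispatch to Flash Attention on supported backends.
+     return F.scaled_dot_product_attention(q, k, v)
+
+ def manual_attention(q, k, v):
+     # Explicit matmul + softmax; wins on MPS for ViT-sized inputs.
+     scale = q.shape[-1] ** -0.5
+     attn = ((q @ k.transpose(-2, -1)) * scale).softmax(dim=-1)
+     return attn @ v
+ ```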
+
+ ### End-to-End Impact
+
+ | Model | SDPA | Manual | Best |
+ |-------|------|--------|------|
+ | da3-small | 21.8 img/s | 22.2 img/s | Manual |
+ | da3-base | 9.8 img/s | 10.7 img/s | Manual |
+ | da3-large | 3.8 img/s | 3.7 img/s | SDPA |
+ | da3-giant | 1.6 img/s | 1.6 img/s | Tie |
+
+ ---
+
+ ## 5. Adaptive Batching
+
+ The adaptive batching system dynamically adjusts batch size based on available GPU memory; a usage sketch follows the results below.
+
+ ### Test: 20 images with da3-large on MPS
+
+ | Strategy | Total Time | Throughput | Batches Used |
+ |----------|------------|------------|--------------|
+ | Fixed B=1 | 5,612 ms | 3.6 img/s | [1,1,1...] |
+ | Fixed B=2 | 5,514 ms | **3.6 img/s** | [2,2,2...] |
+ | Fixed B=4 | 8,305 ms | 2.4 img/s | [4,4,4,4,4] |
+ | Adaptive 85% | 5,637 ms | 3.5 img/s | [4,4,4...] |
+
+ > **Recommendation**: For MPS with da3-large, fixed batch size of 2 provides optimal throughput. Adaptive batching is more valuable for:
+ > - Variable input sizes
+ > - Unknown GPU memory constraints
+ > - Preventing OOM errors on smaller GPUs
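+
+ In practice this is exposed through the fork's documented `batch_inference` API (the image paths below are illustrative):
+
+ ```python
+ from depth_anything_3.api import DepthAnything3
+
+ model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
+
+ # Let the library pick a batch size that fits the memory budget.
+ results = model.batch_inference(
+     images=["img_000.png", "img_001.png"],  # illustrative paths
+     batch_size="auto",
+     target_memory_utilization=0.85,
+ )
+ ```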
+
+ ---
+
+ ## 6. Cross-Device Comparison
+
+ ### Inference Throughput (da3-large, batch=1)
+
+ ```
+ MPS (Apple Silicon) █████████████████████████████████████ 3.7 img/s
+ CPU                 ███ 0.3 img/s
+ ```
+
+ **MPS provides ~12x speedup over CPU** for da3-large inference.
+
+ ### Attention Layer (ViT-L 518px, SDPA)
+
+ ```
+ MPS ████████████████████████ 2.40 ms
+ CPU ███████████████████████████████████████ 3.75 ms
+ ```
+
+ ---
+
+ ## 7. Optimization Recommendations
+
+ ### For Apple Silicon (MPS)
+
+ 1. **Use model caching** - 200x faster subsequent loads
+ 2. **Batch size 2-4** for da3-small/base, **batch 1-2** for da3-large/giant
+ 3. **Let CPU handle preprocessing** - it's faster than MPS for image transforms
+ 4. **SDPA vs Manual**: Both are similar; SDPA slightly better for larger models
+
+ ### For NVIDIA CUDA
+
+ 1. **Enable GPU preprocessing** with NVJPEG for JPEG inputs
+ 2. **Use SDPA** (Flash Attention) - significant speedup
+ 3. **Larger batch sizes** benefit more from GPU parallelism
+ 4. **Adaptive batching** to maximize VRAM utilization
+
+ ### For CPU-only
+
+ 1. **Use smallest viable model** (da3-small: 22x faster than da3-giant)
+ 2. **Batch size 1** is optimal (memory bandwidth limited)
+ 3. **SDPA provides 1.3-1.4x speedup** on CPU
+
+ ---
+
+ ## Running Benchmarks
+
+ ```bash
+ # Quick benchmark (fewer iterations)
+ uv run python benchmarks/full_benchmark.py --quick
+
+ # Full benchmark on specific device
+ uv run python benchmarks/full_benchmark.py --device mps
+ uv run python benchmarks/full_benchmark.py --device cuda
+ uv run python benchmarks/full_benchmark.py --device cpu
+
+ # Compare against upstream (requires upstream repo)
+ uv run python benchmarks/comparative_benchmark.py --device all
+
+ # Skip specific tests
+ uv run python benchmarks/full_benchmark.py --skip-batching
+ ```
+
+ ---
+
+ ## Methodology
+
+ - **Warmup**: 2 inference passes before timing
+ - **Runs**: 3-5 iterations per configuration
+ - **Synchronization**: `torch.mps.synchronize()` / `torch.cuda.synchronize()` for accurate GPU timing
+ - **Memory cleanup**: `gc.collect()` + cache clearing between tests
+ - **Input**: Synthetic 1280x720 RGB images (consistent across tests)
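+
+ A minimal version of that timing loop, matching the methodology above (the real scripts in `benchmarks/` are more elaborate):
+
+ ```python
+ import gc
+ import time
+ import torch
+
+ def _sync(device: str) -> None:
+     # Flush pending GPU work so wall-clock times are meaningful.
+     if device == "cuda":
+         torch.cuda.synchronize()
+     elif device == "mps":
+         torch.mps.synchronize()
+
+ def time_inference(run_once, device: str, warmup: int = 2, runs: int = 5) -> float:
+     for _ in range(warmup):
+         run_once()
+     _sync(device)
+     times = []
+     for _ in range(runs):
+         start = time.perf_counter()
+         run_once()
+         _sync(device)
+         times.append(time.perf_counter() - start)
+     gc.collect()
+     return sorted(times)[len(times) // 2]  # median over the runs
+ ```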
+
+ ---
+
+ *Benchmarks last updated: December 2024*
+ *Hardware: Apple Silicon (M-series) | Software: PyTorch 2.9.0*
CHANGELOG.md ADDED
@@ -0,0 +1,36 @@
+ # Changelog
+
+ All notable changes to this project will be documented in this file.
+
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+ ## [Unreleased]
+
+ ## [0.1.0] - 2024-12-03
+
+ ### Added
+
+ - **Model Caching**: ~200x faster model loading after first use via `ModelCache` singleton
+ - **Adaptive Batching**: Automatic batch size optimization based on available GPU memory
+   - `batch_inference()` method with `batch_size="auto"` option
+   - `get_optimal_batch_size()` for memory-aware batch sizing
+ - **CLI Batching Options**: `--batch-size`, `--max-batch-size`, `--target-memory-utilization`
+ - **Apple Silicon Optimizations**: Smart CPU/GPU preprocessing selection for MPS
+ - **GPU Preprocessing**: Kornia-based GPU preprocessing with NVJPEG support on CUDA
+ - **Comprehensive Benchmarks**: Performance comparison scripts and documentation
+ - **PyPI Package**: Published as `awesome-depth-anything-3`
+ - **CI/CD**: GitHub Actions for testing, linting, and PyPI publishing
+ - **HF Spaces Demo**: Interactive Gradio demo on Hugging Face
+ - **Colab Tutorial**: Interactive notebook with examples
+
+ ### Changed
+
+ - Package renamed from `depth-anything-3` to `awesome-depth-anything-3`
+ - Improved error handling in CLI commands
+ - Better logging with configurable levels
+
+ ### Credits
+
+ This package is an optimized fork of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3)
+ by ByteDance. All model architecture and weights are their work. See README for full attribution.
CONTRIBUTING.md ADDED
@@ -0,0 +1,114 @@
+ # Contributing to awesome-depth-anything-3
+
+ Thank you for your interest in contributing! This document provides guidelines for contributing to this project.
+
+ ## Important Note
+
+ This is an **optimized fork** of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3) by ByteDance.
+
+ - **Model/architecture changes** should be proposed to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3)
+ - **Optimization/deployment improvements** are welcome here
+
+ ## Development Setup
+
+ ```bash
+ # Clone the repository
+ git clone https://github.com/Aedelon/awesome-depth-anything-3.git
+ cd awesome-depth-anything-3
+
+ # Install with development dependencies (using uv)
+ uv sync --extra dev
+
+ # Or with pip
+ pip install -e ".[dev]"
+ ```
+
+ ## Running Tests
+
+ ```bash
+ # Run all tests
+ uv run pytest tests/ -v
+
+ # Run specific test file
+ uv run pytest tests/test_adaptive_batching.py -v
+
+ # Run with coverage
+ uv run pytest tests/ --cov=src/depth_anything_3
+ ```
+
+ ## Code Style
+
+ We use `ruff` for linting and formatting:
+
+ ```bash
+ # Check for issues
+ uv run ruff check src/
+
+ # Auto-fix issues
+ uv run ruff check src/ --fix
+
+ # Format code
+ uv run ruff format src/
+ ```
+
+ ## Pre-commit Hooks
+
+ We recommend using pre-commit hooks:
+
+ ```bash
+ uv run pre-commit install
+ uv run pre-commit run --all-files
+ ```
+
+ ## Pull Request Process
+
+ 1. **Fork** the repository
+ 2. **Create a branch** for your feature (`git checkout -b feature/amazing-feature`)
+ 3. **Make your changes** with clear, descriptive commits
+ 4. **Run tests** and linting
+ 5. **Update documentation** if needed
+ 6. **Push** to your fork and **open a Pull Request**
+
+ ### PR Guidelines
+
+ - Keep PRs focused on a single change
+ - Include tests for new functionality
+ - Update CHANGELOG.md for user-facing changes
+ - Ensure CI passes before requesting review
+
+ ## Types of Contributions Welcome
+
+ ### Highly Welcome
+
+ - Performance optimizations
+ - Bug fixes
+ - Documentation improvements
+ - Test coverage improvements
+ - CI/CD improvements
+ - Device compatibility (CUDA, MPS, CPU)
+
+ ### Discuss First
+
+ - New CLI commands
+ - API changes
+ - New dependencies
+
+ ### Redirect to Upstream
+
+ - Model architecture changes
+ - Training code changes
+ - New model variants
+
+ ## Reporting Issues
+
+ When reporting bugs, please include:
+
+ - Python version
+ - PyTorch version
+ - Device type (CUDA/MPS/CPU)
+ - Minimal reproduction code
+ - Full error traceback
+
+ ## License
+
+ By contributing, you agree that your contributions will be licensed under the Apache License 2.0.
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on
+ the same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2025 The Depth Anything 3 Team
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,418 @@
+ ---
+ title: Awesome Depth Anything 3
+ emoji: 🌊
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.50.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: Metric 3D reconstruction from images/video
+ ---
+
+ <div align="center">
+
+ # Awesome Depth Anything 3
+
+ **Optimized fork of Depth Anything 3 with production-ready features**
+
+ [![PyPI](https://img.shields.io/pypi/v/awesome-depth-anything-3)](https://pypi.org/project/awesome-depth-anything-3/)
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/)
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
+ [![Tests](https://github.com/Aedelon/awesome-depth-anything-3/actions/workflows/ci.yml/badge.svg)](https://github.com/Aedelon/awesome-depth-anything-3/actions)
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Aedelon/awesome-depth-anything-3/blob/main/notebooks/da3_tutorial.ipynb)
+ [![HF Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3)
+
+ [Demo](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3) · [Tutorial](notebooks/da3_tutorial.ipynb) · [Benchmarks](BENCHMARKS.md) · [Original Paper](https://arxiv.org/abs/2511.10647)
+
+ </div>
+
+ ---
+
+ > **This is an optimized fork** of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3) by ByteDance.
+ > All credit for the model architecture, training, and research goes to the original authors (see [Credits](#-credits) below).
+ > This fork focuses on **production optimization, developer experience, and ease of deployment**.
+
+ ## 🚀 What's New in This Fork
+
+ | Feature | Description |
+ |---------|-------------|
+ | **Model Caching** | ~200x faster model loading after first use |
+ | **Adaptive Batching** | Automatic batch size optimization based on GPU memory |
+ | **PyPI Package** | `pip install awesome-depth-anything-3` |
+ | **CLI Improvements** | Batch processing options, better error handling |
+ | **Apple Silicon Optimized** | Smart CPU/GPU preprocessing for best MPS performance |
+ | **Comprehensive Benchmarks** | Detailed performance analysis across devices |
+
+ ### Performance Improvements
+
+ | Metric | Upstream | This Fork | Improvement |
+ |--------|----------|-----------|-------------|
+ | Cached model load | ~1s | ~5ms | **200x faster** |
+ | Batch 4 inference (MPS) | 3.32 img/s | 3.78 img/s | **1.14x faster** |
+ | Cold model load | 1.28s | 0.77s | **1.7x faster** |
+
+ ---
+
+ <div align="center">
+
+ ## Original Depth Anything 3
+
+ <h3>Recovering the Visual Space from Any Views</h3>
+
+ [**Haotong Lin**](https://haotongl.github.io/)<sup>&ast;</sup> · [**Sili Chen**](https://github.com/SiliChen321)<sup>&ast;</sup> · [**Jun Hao Liew**](https://liewjunhao.github.io/)<sup>&ast;</sup> · [**Donny Y. Chen**](https://donydchen.github.io)<sup>&ast;</sup> · [**Zhenyu Li**](https://zhyever.github.io/) · [**Guang Shi**](https://scholar.google.com/citations?user=MjXxWbUAAAAJ&hl=en) · [**Jiashi Feng**](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en)
+ <br>
+ [**Bingyi Kang**](https://bingykang.github.io/)<sup>&ast;&dagger;</sup>
+
+ &dagger;project lead&emsp;&ast;Equal Contribution
+
+ <a href="https://arxiv.org/abs/2511.10647"><img src='https://img.shields.io/badge/arXiv-Depth Anything 3-red' alt='Paper PDF'></a>
+ <a href='https://depth-anything-3.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything 3-green' alt='Project Page'></a>
+ <a href='https://huggingface.co/spaces/depth-anything/Depth-Anything-3'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Official Demo-blue'></a>
+
+ </div>
+
+ This work presents **Depth Anything 3 (DA3)**, a model that predicts spatially consistent geometry from
+ arbitrary visual inputs, with or without known camera poses.
+ In pursuit of minimal modeling, DA3 yields two key insights:
+ - 💎 A **single plain transformer** (e.g., vanilla DINO encoder) is sufficient as a backbone without architectural specialization,
+ - ✨ A singular **depth-ray representation** obviates the need for complex multi-task learning.
+
+ 🏆 DA3 significantly outperforms
+ [DA2](https://github.com/DepthAnything/Depth-Anything-V2) for monocular depth estimation,
+ and [VGGT](https://github.com/facebookresearch/vggt) for multi-view depth estimation and pose estimation.
+ All models are trained exclusively on **public academic datasets**.
+
+ <!-- <p align="center">
+ <img src="assets/images/da3_teaser.png" alt="Depth Anything 3" width="100%">
+ </p> -->
+ <p align="center">
+ <img src="assets/images/demo320-2.gif" alt="Depth Anything 3 - Left" width="70%">
+ </p>
+ <p align="center">
+ <img src="assets/images/da3_radar.png" alt="Depth Anything 3" width="100%">
+ </p>
+
+
+ ## 📰 News
+ - **30-11-2025:** Add [`use_ray_pose`](#use-ray-pose) and [`ref_view_strategy`](docs/funcs/ref_view_strategy.md) (reference view selection for multi-view inputs).
+ - **25-11-2025:** Add [Awesome DA3 Projects](#-awesome-da3-projects), a community-driven section featuring DA3-based applications.
+ - **14-11-2025:** Paper, project page, code and models are all released.
+
+ ## ✨ Highlights
+
+ ### 🏆 Model Zoo
+ We release three series of models, each tailored for specific use cases in visual geometry.
+
+ - 🌟 **DA3 Main Series** (`DA3-Giant`, `DA3-Large`, `DA3-Base`, `DA3-Small`): These are our flagship foundation models, trained with a unified depth-ray representation. By varying the input configuration, a single model can perform a wide range of tasks:
+   + 🌊 **Monocular Depth Estimation**: Predicts a depth map from a single RGB image.
+   + 🌊 **Multi-View Depth Estimation**: Generates consistent depth maps from multiple images for high-quality fusion.
+   + 🎯 **Pose-Conditioned Depth Estimation**: Achieves superior depth consistency when camera poses are provided as input.
+   + 📷 **Camera Pose Estimation**: Estimates camera extrinsics and intrinsics from one or more images.
+   + 🟡 **3D Gaussian Estimation**: Directly predicts 3D Gaussians, enabling high-fidelity novel view synthesis.
+
+ - 📐 **DA3 Metric Series** (`DA3Metric-Large`): A specialized model fine-tuned for metric depth estimation in monocular settings, ideal for applications requiring real-world scale.
+
+ - 🔍 **DA3 Monocular Series** (`DA3Mono-Large`): A dedicated model for high-quality relative monocular depth estimation. Unlike disparity-based models (e.g., [Depth Anything 2](https://github.com/DepthAnything/Depth-Anything-V2)), it directly predicts depth, resulting in superior geometric accuracy.
+
+ 🔗 Leveraging these available models, we developed a **nested series** (`DA3Nested-Giant-Large`). This series combines an any-view giant model with a metric model to reconstruct visual geometry at a real-world metric scale.
+
+ ### 🛠️ Codebase Features
+ Our repository is designed to be a powerful and user-friendly toolkit for both practical application and future research.
+ - 🎨 **Interactive Web UI & Gallery**: Visualize model outputs and compare results with an easy-to-use Gradio-based web interface.
+ - ⚡ **Flexible Command-Line Interface (CLI)**: Powerful and scriptable CLI for batch processing and integration into custom workflows.
+ - 💾 **Multiple Export Formats**: Save your results in various formats, including `glb`, `npz`, depth images, `ply`, 3DGS videos, etc., to seamlessly connect with other tools.
+ - 🔧 **Extensible and Modular Design**: The codebase is structured to facilitate future research and the integration of new models or functionalities.
+
+
+ <!-- ### 🎯 Visual Geometry Benchmark
+ We introduce a new benchmark to rigorously evaluate geometry prediction models on three key tasks: pose estimation, 3D reconstruction, and visual rendering (novel view synthesis) quality.
+
+ - 🔄 **Broad Model Compatibility**: Our benchmark is designed to be versatile, supporting the evaluation of various models, including both monocular and multi-view depth estimation approaches.
+ - 🔬 **Robust Evaluation Pipeline**: We provide a standardized pipeline featuring RANSAC-based pose alignment, TSDF fusion for dense reconstruction, and a principled view selection strategy for novel view synthesis.
+ - 📊 **Standardized Metrics**: Performance is measured using established metrics: AUC for pose accuracy, F1-score and Chamfer Distance for reconstruction, and PSNR/SSIM/LPIPS for rendering quality.
+ - 🌍 **Diverse and Challenging Datasets**: The benchmark spans a wide range of scenes from datasets like HiRoom, ETH3D, DTU, 7Scenes, ScanNet++, DL3DV, Tanks and Temples, and MegaDepth. -->
+
+
+ ## 🚀 Quick Start
+
+ ### 📦 Installation
+
+ ```bash
+ # From PyPI (recommended)
+ pip install awesome-depth-anything-3
+
+ # With Gradio web UI
+ pip install awesome-depth-anything-3[app]
+
+ # With CUDA optimizations (xformers + gsplat)
+ pip install awesome-depth-anything-3[cuda]
+
+ # Everything
+ pip install awesome-depth-anything-3[all]
+ ```
+
+ <details>
+ <summary><b>Development installation</b></summary>
+
+ ```bash
+ git clone https://github.com/Aedelon/awesome-depth-anything-3.git
+ cd awesome-depth-anything-3
+ pip install -e ".[dev]"
+
+ # Optional: 3D Gaussian Splatting head
+ pip install --no-build-isolation git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf
+ ```
+ </details>
+
+ For detailed model information, please refer to the [Model Cards](#-model-cards) section below.
+
+ ### 💻 Basic Usage
+
+ ```python
+ import glob, os, torch
+ from depth_anything_3.api import DepthAnything3
+ device = torch.device("cuda")
+ model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
+ model = model.to(device=device)
+ example_path = "assets/examples/SOH"
+ images = sorted(glob.glob(os.path.join(example_path, "*.png")))
+ prediction = model.inference(
+     images,
+ )
+ # prediction.processed_images : [N, H, W, 3] uint8 array
+ print(prediction.processed_images.shape)
+ # prediction.depth : [N, H, W] float32 array
+ print(prediction.depth.shape)
+ # prediction.conf : [N, H, W] float32 array
+ print(prediction.conf.shape)
+ # prediction.extrinsics : [N, 3, 4] float32 array # opencv w2c or colmap format
+ print(prediction.extrinsics.shape)
+ # prediction.intrinsics : [N, 3, 3] float32 array
+ print(prediction.intrinsics.shape)
+ ```
+
+ ```bash
+
+ export MODEL_DIR=depth-anything/DA3NESTED-GIANT-LARGE
+ # This can be a Hugging Face repository or a local directory
+ # If you encounter network issues, consider using the following mirror: export HF_ENDPOINT=https://hf-mirror.com
+ # Alternatively, you can download the model directly from Hugging Face
+ export GALLERY_DIR=workspace/gallery
+ mkdir -p $GALLERY_DIR
+
+ # CLI auto mode with backend reuse
+ da3 backend --model-dir ${MODEL_DIR} --gallery-dir ${GALLERY_DIR}  # Cache the model on the GPU
+ da3 auto assets/examples/SOH \
+     --export-format glb \
+     --export-dir ${GALLERY_DIR}/TEST_BACKEND/SOH \
+     --use-backend
+
+ # CLI video processing with feature visualization
+ da3 video assets/examples/robot_unitree.mp4 \
+     --fps 15 \
+     --use-backend \
+     --export-dir ${GALLERY_DIR}/TEST_BACKEND/robo \
+     --export-format glb-feat_vis \
+     --feat-vis-fps 15 \
+     --process-res-method lower_bound_resize \
+     --export-feat "11,21,31"
+
+ # CLI auto mode without backend reuse
+ da3 auto assets/examples/SOH \
+     --export-format glb \
+     --export-dir ${GALLERY_DIR}/TEST_CLI/SOH \
+     --model-dir ${MODEL_DIR}
+
+ ```
+
+ The model architecture is defined in [`DepthAnything3Net`](src/depth_anything_3/model/da3.py), and specified with a YAML config file located at [`src/depth_anything_3/configs`](src/depth_anything_3/configs). The input and output processing are handled by [`DepthAnything3`](src/depth_anything_3/api.py). To customize the model architecture, simply create a new config file (*e.g.*, `path/to/new/config`) as:
+
+ ```yaml
+ __object__:
+   path: depth_anything_3.model.da3
+   name: DepthAnything3Net
+   args: as_params
+
+ net:
+   __object__:
+     path: depth_anything_3.model.dinov2.dinov2
+     name: DinoV2
+     args: as_params
+
+   name: vitb
+   out_layers: [5, 7, 9, 11]
+   alt_start: 4
+   qknorm_start: 4
+   rope_start: 4
+   cat_token: True
+
+ head:
+   __object__:
+     path: depth_anything_3.model.dualdpt
+     name: DualDPT
+     args: as_params
+
+   dim_in: &head_dim_in 1536
+   output_dim: 2
+   features: &head_features 128
+   out_channels: &head_out_channels [96, 192, 384, 768]
+ ```
+
+ Then, the model can be created with the following code snippet.
+ ```python
+ from depth_anything_3.cfg import create_object, load_config
+
+ Model = create_object(load_config("path/to/new/config"))
+ ```
+
+
+ ## 📚 Useful Documentation
+
+ - 🖥️ [Command Line Interface](docs/CLI.md)
+ - 📑 [Python API](docs/API.md)
+ <!-- - 🏁 [Visual Geometry Benchmark](docs/BENCHMARK.md) -->
+
+ ## 🗂️ Model Cards
+
+ Generally, you should observe that DA3-LARGE achieves comparable results to VGGT.
+
+ The Nested series uses an Any-view model to estimate pose and depth, and a monocular metric depth estimator for scaling.
+
+ | 🗃️ Model Name | 📏 Params | 📊 Rel. Depth | 📷 Pose Est. | 🧭 Pose Cond. | 🎨 GS | 📐 Met. Depth | ☁️ Sky Seg | 📄 License |
+ |-------------------------------|-----------|---------------|--------------|---------------|-------|---------------|-----------|----------------|
+ | **Nested** | | | | | | | | |
+ | [DA3NESTED-GIANT-LARGE](https://huggingface.co/depth-anything/DA3NESTED-GIANT-LARGE) | 1.40B | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | CC BY-NC 4.0 |
+ | **Any-view Model** | | | | | | | | |
+ | [DA3-GIANT](https://huggingface.co/depth-anything/DA3-GIANT) | 1.15B | ✅ | ✅ | ✅ | ✅ | | | CC BY-NC 4.0 |
+ | [DA3-LARGE](https://huggingface.co/depth-anything/DA3-LARGE) | 0.35B | ✅ | ✅ | ✅ | | | | CC BY-NC 4.0 |
+ | [DA3-BASE](https://huggingface.co/depth-anything/DA3-BASE) | 0.12B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
+ | [DA3-SMALL](https://huggingface.co/depth-anything/DA3-SMALL) | 0.08B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
+ | | | | | | | | | |
+ | **Monocular Metric Depth** | | | | | | | | |
+ | [DA3METRIC-LARGE](https://huggingface.co/depth-anything/DA3METRIC-LARGE) | 0.35B | ✅ | | | | ✅ | ✅ | Apache 2.0 |
+ | | | | | | | | | |
+ | **Monocular Depth** | | | | | | | | |
+ | [DA3MONO-LARGE](https://huggingface.co/depth-anything/DA3MONO-LARGE) | 0.35B | ✅ | | | | | ✅ | Apache 2.0 |
+
+
+ ## ⚡ Performance Benchmarks
+
+ Inference throughput measured on Apple Silicon (MPS) with PyTorch 2.9.0. For detailed benchmarks, see [BENCHMARKS.md](BENCHMARKS.md).
+
+ ### Apple Silicon (MPS) - Batch Size 1
+
+ | Model | Latency | Throughput |
+ |-------|---------|------------|
+ | DA3-Small | 46 ms | **22 img/s** |
+ | DA3-Base | 93 ms | **11 img/s** |
+ | DA3-Large | 265 ms | **3.8 img/s** |
+ | DA3-Giant | 618 ms | **1.6 img/s** |
+
+ ### Cross-Device Comparison (DA3-Large)
+
+ | Device | Throughput | vs CPU |
+ |--------|------------|--------|
+ | CPU | 0.3 img/s | 1.0x |
+ | Apple Silicon (MPS) | 3.8 img/s | **13x** |
+ | NVIDIA L4 (CUDA) | 10.3 img/s | **34x** |
+
+ ### Batch Processing
+
+ ```python
+ from depth_anything_3.api import DepthAnything3
+
+ model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
+
+ # Adaptive batching (recommended for large image sets)
+ results = model.batch_inference(
+     images=image_paths,
+     batch_size="auto",  # Automatically selects optimal batch size
+     target_memory_utilization=0.85,
+ )
+
+ # Fixed batch size
+ results = model.batch_inference(
+     images=image_paths,
+     batch_size=4,
+ )
+ ```
+
+ > See [BENCHMARKS.md](BENCHMARKS.md) for comprehensive benchmarks including preprocessing, attention mechanisms, and adaptive batching strategies.
+
+
+ ## ❓ FAQ
+
+ - **Monocular Metric Depth**: To obtain metric depth in meters from `DA3METRIC-LARGE`, use `metric_depth = focal * net_output / 300.`, where `focal` is the focal length in pixels (typically the average of fx and fy from the camera intrinsic matrix K). Note that the output from `DA3NESTED-GIANT-LARGE` is already in meters (see the sketch below).
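+
+   For example (a short sketch; `K` here is your camera's 3x3 intrinsic matrix):
+
+   ```python
+   # Convert DA3METRIC-LARGE network output to metric depth in meters.
+   focal = (K[0, 0] + K[1, 1]) / 2.0  # average of fx and fy, in pixels
+   metric_depth = focal * net_output / 300.0
+   ```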
349
+
350
+ - <a id="use-ray-pose"></a>**Ray Head (`use_ray_pose`)**: Our API and CLI support `use_ray_pose` arg, which means that the model will derive camera pose from ray head, which is generally slightly slower, but more accurate. Note that the default is `False` for faster inference speed.
351
+ <details>
352
+ <summary>AUC3 Results for DA3NESTED-GIANT-LARGE</summary>
353
+
354
+ | Model | HiRoom | ETH3D | DTU | 7Scenes | ScanNet++ |
355
+ |-------|------|-------|-----|---------|-----------|
356
+ | `ray_head` | 84.4 | 52.6 | 93.9 | 29.5 | 89.4 |
357
+ | `cam_head` | 80.3 | 48.4 | 94.1 | 28.5 | 85.0 |
358
+
359
+ </details>
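+
+ A minimal usage sketch (assuming `use_ray_pose` is accepted as an `inference()` keyword, per the API support noted above):
+
+ ```python
+ from depth_anything_3.api import DepthAnything3
+
+ model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
+ images = ["assets/examples/SOH/000.png", "assets/examples/SOH/010.png"]
+ # Derive camera poses from the ray head: slightly slower, generally more accurate.
+ prediction = model.inference(images, use_ray_pose=True)
+ ```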
360
+
361
+
362
+
363
+
364
+ - **Older GPUs without XFormers support**: See [Issue #11](https://github.com/ByteDance-Seed/Depth-Anything-3/issues/11). Thanks to [@S-Mahoney](https://github.com/S-Mahoney) for the solution!
365
+
366
+
367
+ ## 🏢 Awesome DA3 Projects
368
+
369
+ A community-curated list of Depth Anything 3 integrations across 3D tools, creative pipelines, robotics, and web/VR viewers. You are welcome to submit your DA3-based project via PR; we will review and feature it where applicable.
370
+
371
+ - [DA3-blender](https://github.com/xy-gao/DA3-blender): Blender addon for DA3-based 3D reconstruction from a set of images.
372
+
373
+ - [ComfyUI-DepthAnythingV3](https://github.com/PozzettiAndrea/ComfyUI-DepthAnythingV3): ComfyUI nodes for Depth Anything 3, supporting single/multi-view and video-consistent depth with optional point‑cloud export.
374
+
375
+ - [DA3-ROS2-Wrapper](https://github.com/GerdsenAI/GerdsenAI-Depth-Anything-3-ROS2-Wrapper): Real-time DA3 depth in ROS2 with multi-camera support.
376
+
377
+ - [VideoDepthViewer3D](https://github.com/amariichi/VideoDepthViewer3D): Streaming videos with DA3 metric depth to a Three.js/WebXR 3D viewer for VR/stereo playback.
378
+
379
+
380
+ ## 📝 Credits
381
+
382
+ ### Original Authors
383
+
384
+ This package is built on top of **Depth Anything 3**, created by the ByteDance Seed team:
385
+
386
+ - [Haotong Lin](https://haotongl.github.io/), [Sili Chen](https://github.com/SiliChen321), [Jun Hao Liew](https://liewjunhao.github.io/), [Donny Y. Chen](https://donydchen.github.io), [Zhenyu Li](https://zhyever.github.io/), [Guang Shi](https://scholar.google.com/citations?user=MjXxWbUAAAAJ), [Jiashi Feng](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ), [Bingyi Kang](https://bingykang.github.io/)
387
+
388
+ All model weights, architecture, and core algorithms are their work. This fork only adds production optimizations and deployment tooling.
389
+
390
+ ### Fork Maintainer
391
+
392
+ This optimized fork is maintained by [Delanoe Pirard (Aedelon)](https://github.com/Aedelon).
393
+
394
+ Contributions:
395
+ - Model caching system
396
+ - Adaptive batching
397
+ - Apple Silicon (MPS) optimizations
398
+ - PyPI packaging and CI/CD
399
+ - Comprehensive benchmarking
400
+
401
+ ### Citation
402
+
403
+ If you use Depth Anything 3 in your research, please cite the original paper:
404
+
405
+ ```bibtex
406
+ @article{depthanything3,
407
+ title={Depth Anything 3: Recovering the visual space from any views},
408
+ author={Haotong Lin and Sili Chen and Jun Hao Liew and Donny Y. Chen and Zhenyu Li and Guang Shi and Jiashi Feng and Bingyi Kang},
409
+ journal={arXiv preprint arXiv:2511.10647},
410
+ year={2025}
411
+ }
412
+ ```
413
+
414
+ If you specifically use features from this fork (caching, batching, MPS optimizations), you may additionally reference:
415
+
416
+ ```
417
+ awesome-depth-anything-3: https://github.com/Aedelon/awesome-depth-anything-3
418
+ ```
README.md.original ADDED
@@ -0,0 +1,405 @@
1
+ <div align="center">
2
+
3
+ # Awesome Depth Anything 3
4
+
5
+ **Optimized fork of Depth Anything 3 with production-ready features**
6
+
7
+ [![PyPI](https://img.shields.io/pypi/v/awesome-depth-anything-3)](https://pypi.org/project/awesome-depth-anything-3/)
8
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/)
9
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
10
+ [![Tests](https://github.com/Aedelon/awesome-depth-anything-3/actions/workflows/ci.yml/badge.svg)](https://github.com/Aedelon/awesome-depth-anything-3/actions)
11
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Aedelon/awesome-depth-anything-3/blob/main/notebooks/da3_tutorial.ipynb)
12
+ [![HF Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3)
13
+
14
+ [Demo](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3) · [Tutorial](notebooks/da3_tutorial.ipynb) · [Benchmarks](BENCHMARKS.md) · [Original Paper](https://arxiv.org/abs/2511.10647)
15
+
16
+ </div>
17
+
18
+ ---
19
+
20
+ > **This is an optimized fork** of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3) by ByteDance.
21
+ > All credit for the model architecture, training, and research goes to the original authors (see [Credits](#-credits) below).
22
+ > This fork focuses on **production optimization, developer experience, and ease of deployment**.
23
+
24
+ ## 🚀 What's New in This Fork
25
+
26
+ | Feature | Description |
27
+ |---------|-------------|
28
+ | **Model Caching** | ~200x faster model loading after first use |
29
+ | **Adaptive Batching** | Automatic batch size optimization based on GPU memory |
30
+ | **PyPI Package** | `pip install awesome-depth-anything-3` |
31
+ | **CLI Improvements** | Batch processing options, better error handling |
32
+ | **Apple Silicon Optimized** | Smart CPU/GPU preprocessing for best MPS performance |
33
+ | **Comprehensive Benchmarks** | Detailed performance analysis across devices |
34
+
35
+ ### Performance Improvements
36
+
37
+ | Metric | Upstream | This Fork | Improvement |
38
+ |--------|----------|-----------|-------------|
39
+ | Cached model load | ~1s | ~5ms | **200x faster** |
40
+ | Batch 4 inference (MPS) | 3.32 img/s | 3.78 img/s | **1.14x faster** |
41
+ | Cold model load | 1.28s | 0.77s | **1.7x faster** |
42
+
43
+ ---
44
+
45
+ <div align="center">
46
+
47
+ ## Original Depth Anything 3
48
+
49
+ <h3>Recovering the Visual Space from Any Views</h3>
50
+
51
+ [**Haotong Lin**](https://haotongl.github.io/)<sup>&ast;</sup> · [**Sili Chen**](https://github.com/SiliChen321)<sup>&ast;</sup> · [**Jun Hao Liew**](https://liewjunhao.github.io/)<sup>&ast;</sup> · [**Donny Y. Chen**](https://donydchen.github.io)<sup>&ast;</sup> · [**Zhenyu Li**](https://zhyever.github.io/) · [**Guang Shi**](https://scholar.google.com/citations?user=MjXxWbUAAAAJ&hl=en) · [**Jiashi Feng**](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en)
52
+ <br>
53
+ [**Bingyi Kang**](https://bingykang.github.io/)<sup>&ast;&dagger;</sup>
54
+
55
+ &dagger;project lead&emsp;&ast;Equal Contribution
56
+
57
+ <a href="https://arxiv.org/abs/2511.10647"><img src='https://img.shields.io/badge/arXiv-Depth Anything 3-red' alt='Paper PDF'></a>
58
+ <a href='https://depth-anything-3.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything 3-green' alt='Project Page'></a>
59
+ <a href='https://huggingface.co/spaces/depth-anything/Depth-Anything-3'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Official Demo-blue'></a>
60
+
61
+ </div>
62
+
63
+ This work presents **Depth Anything 3 (DA3)**, a model that predicts spatially consistent geometry from
64
+ arbitrary visual inputs, with or without known camera poses.
65
+ In pursuit of minimal modeling, DA3 yields two key insights:
66
+ - 💎 A **single plain transformer** (e.g., vanilla DINO encoder) is sufficient as a backbone without architectural specialization,
67
+ - ✨ A singular **depth-ray representation** obviates the need for complex multi-task learning.
68
+
69
+ 🏆 DA3 significantly outperforms
70
+ [DA2](https://github.com/DepthAnything/Depth-Anything-V2) for monocular depth estimation,
71
+ and [VGGT](https://github.com/facebookresearch/vggt) for multi-view depth estimation and pose estimation.
72
+ All models are trained exclusively on **public academic datasets**.
73
+
74
+ <!-- <p align="center">
75
+ <img src="assets/images/da3_teaser.png" alt="Depth Anything 3" width="100%">
76
+ </p> -->
77
+ <p align="center">
78
+ <img src="assets/images/demo320-2.gif" alt="Depth Anything 3 - Left" width="70%">
79
+ </p>
80
+ <p align="center">
81
+ <img src="assets/images/da3_radar.png" alt="Depth Anything 3" width="100%">
82
+ </p>
83
+
84
+
85
+ ## 📰 News
86
+ - **30-11-2025:** Add [`use_ray_pose`](#use-ray-pose) and [`ref_view_strategy`](docs/funcs/ref_view_strategy.md) (reference view selection for multi-view inputs).
87
+ - **25-11-2025:** Add [Awesome DA3 Projects](#-awesome-da3-projects), a community-driven section featuring DA3-based applications.
88
+ - **14-11-2025:** Paper, project page, code and models are all released.
89
+
90
+ ## ✨ Highlights
91
+
92
+ ### 🏆 Model Zoo
93
+ We release three series of models, each tailored for specific use cases in visual geometry.
94
+
95
+ - 🌟 **DA3 Main Series** (`DA3-Giant`, `DA3-Large`, `DA3-Base`, `DA3-Small`) These are our flagship foundation models, trained with a unified depth-ray representation. By varying the input configuration, a single model can perform a wide range of tasks:
96
+ + 🌊 **Monocular Depth Estimation**: Predicts a depth map from a single RGB image.
97
+ + 🌊 **Multi-View Depth Estimation**: Generates consistent depth maps from multiple images for high-quality fusion.
98
+ + 🎯 **Pose-Conditioned Depth Estimation**: Achieves superior depth consistency when camera poses are provided as input.
99
+ + 📷 **Camera Pose Estimation**: Estimates camera extrinsics and intrinsics from one or more images.
100
+ + 🟡 **3D Gaussian Estimation**: Directly predicts 3D Gaussians, enabling high-fidelity novel view synthesis.
101
+
102
+ - 📐 **DA3 Metric Series** (`DA3Metric-Large`) A specialized model fine-tuned for metric depth estimation in monocular settings, ideal for applications requiring real-world scale.
103
+
104
+ - 🔍 **DA3 Monocular Series** (`DA3Mono-Large`). A dedicated model for high-quality relative monocular depth estimation. Unlike disparity-based models (e.g., [Depth Anything 2](https://github.com/DepthAnything/Depth-Anything-V2)), it directly predicts depth, resulting in superior geometric accuracy.
105
+
106
+ 🔗 Leveraging these available models, we developed a **nested series** (`DA3Nested-Giant-Large`). This series combines an any-view giant model with a metric model to reconstruct visual geometry at a real-world metric scale.
107
+
108
+ ### 🛠️ Codebase Features
109
+ Our repository is designed to be a powerful and user-friendly toolkit for both practical application and future research.
110
+ - 🎨 **Interactive Web UI & Gallery**: Visualize model outputs and compare results with an easy-to-use Gradio-based web interface.
111
+ - ⚡ **Flexible Command-Line Interface (CLI)**: Powerful and scriptable CLI for batch processing and integration into custom workflows.
112
+ - 💾 **Multiple Export Formats**: Save your results in various formats, including `glb`, `npz`, depth images, `ply`, 3DGS videos, etc., to seamlessly connect with other tools.
113
+ - 🔧 **Extensible and Modular Design**: The codebase is structured to facilitate future research and the integration of new models or functionalities.
114
+
115
+
116
+ <!-- ### 🎯 Visual Geometry Benchmark
117
+ We introduce a new benchmark to rigorously evaluate geometry prediction models on three key tasks: pose estimation, 3D reconstruction, and visual rendering (novel view synthesis) quality.
118
+
119
+ - 🔄 **Broad Model Compatibility**: Our benchmark is designed to be versatile, supporting the evaluation of various models, including both monocular and multi-view depth estimation approaches.
120
+ - 🔬 **Robust Evaluation Pipeline**: We provide a standardized pipeline featuring RANSAC-based pose alignment, TSDF fusion for dense reconstruction, and a principled view selection strategy for novel view synthesis.
121
+ - 📊 **Standardized Metrics**: Performance is measured using established metrics: AUC for pose accuracy, F1-score and Chamfer Distance for reconstruction, and PSNR/SSIM/LPIPS for rendering quality.
122
+ - 🌍 **Diverse and Challenging Datasets**: The benchmark spans a wide range of scenes from datasets like HiRoom, ETH3D, DTU, 7Scenes, ScanNet++, DL3DV, Tanks and Temples, and MegaDepth. -->
123
+
124
+
125
+ ## 🚀 Quick Start
126
+
127
+ ### 📦 Installation
128
+
129
+ ```bash
130
+ # From PyPI (recommended)
131
+ pip install awesome-depth-anything-3
132
+
133
+ # With Gradio web UI
134
+ pip install awesome-depth-anything-3[app]
135
+
136
+ # With CUDA optimizations (xformers + gsplat)
137
+ pip install awesome-depth-anything-3[cuda]
138
+
139
+ # Everything
140
+ pip install awesome-depth-anything-3[all]
141
+ ```
142
+
143
+ <details>
144
+ <summary><b>Development installation</b></summary>
145
+
146
+ ```bash
147
+ git clone https://github.com/Aedelon/awesome-depth-anything-3.git
148
+ cd awesome-depth-anything-3
149
+ pip install -e ".[dev]"
150
+
151
+ # Optional: 3D Gaussian Splatting head
152
+ pip install --no-build-isolation git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf
153
+ ```
154
+ </details>
155
+
156
+ For detailed model information, please refer to the [Model Cards](#-model-cards) section below.
157
+
158
+ ### 💻 Basic Usage
159
+
160
+ ```python
161
+ import glob, os, torch
162
+ from depth_anything_3.api import DepthAnything3
163
+ device = torch.device("cuda")
164
+ model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
165
+ model = model.to(device=device)
166
+ example_path = "assets/examples/SOH"
167
+ images = sorted(glob.glob(os.path.join(example_path, "*.png")))
168
+ prediction = model.inference(
169
+ images,
170
+ )
171
+ # prediction.processed_images : [N, H, W, 3] uint8 array
172
+ print(prediction.processed_images.shape)
173
+ # prediction.depth : [N, H, W] float32 array
174
+ print(prediction.depth.shape)
175
+ # prediction.conf : [N, H, W] float32 array
176
+ print(prediction.conf.shape)
177
+ # prediction.extrinsics : [N, 3, 4] float32 array # opencv w2c or colmap format
178
+ print(prediction.extrinsics.shape)
179
+ # prediction.intrinsics : [N, 3, 3] float32 array
180
+ print(prediction.intrinsics.shape)
181
+ ```
182
+
183
+ ```bash
184
+
185
+ export MODEL_DIR=depth-anything/DA3NESTED-GIANT-LARGE
186
+ # This can be a Hugging Face repository or a local directory
187
+ # If you encounter network issues, consider using the following mirror: export HF_ENDPOINT=https://hf-mirror.com
188
+ # Alternatively, you can download the model directly from Hugging Face
189
+ export GALLERY_DIR=workspace/gallery
190
+ mkdir -p $GALLERY_DIR
191
+
192
+ # CLI auto mode with backend reuse
193
+ da3 backend --model-dir ${MODEL_DIR} --gallery-dir ${GALLERY_DIR} # Cache model to gpu
194
+ da3 auto assets/examples/SOH \
195
+ --export-format glb \
196
+ --export-dir ${GALLERY_DIR}/TEST_BACKEND/SOH \
197
+ --use-backend
198
+
199
+ # CLI video processing with feature visualization
200
+ da3 video assets/examples/robot_unitree.mp4 \
201
+ --fps 15 \
202
+ --use-backend \
203
+ --export-dir ${GALLERY_DIR}/TEST_BACKEND/robo \
204
+ --export-format glb-feat_vis \
205
+ --feat-vis-fps 15 \
206
+ --process-res-method lower_bound_resize \
207
+ --export-feat "11,21,31"
208
+
209
+ # CLI auto mode without backend reuse
210
+ da3 auto assets/examples/SOH \
211
+ --export-format glb \
212
+ --export-dir ${GALLERY_DIR}/TEST_CLI/SOH \
213
+ --model-dir ${MODEL_DIR}
214
+
215
+ ```
216
+
217
+ The model architecture is defined in [`DepthAnything3Net`](src/depth_anything_3/model/da3.py), and specified with a Yaml config file located at [`src/depth_anything_3/configs`](src/depth_anything_3/configs). The input and output processing are handled by [`DepthAnything3`](src/depth_anything_3/api.py). To customize the model architecture, simply create a new config file (*e.g.*, `path/to/new/config`) as:
218
+
219
+ ```yaml
220
+ __object__:
221
+ path: depth_anything_3.model.da3
222
+ name: DepthAnything3Net
223
+ args: as_params
224
+
225
+ net:
226
+ __object__:
227
+ path: depth_anything_3.model.dinov2.dinov2
228
+ name: DinoV2
229
+ args: as_params
230
+
231
+ name: vitb
232
+ out_layers: [5, 7, 9, 11]
233
+ alt_start: 4
234
+ qknorm_start: 4
235
+ rope_start: 4
236
+ cat_token: True
237
+
238
+ head:
239
+ __object__:
240
+ path: depth_anything_3.model.dualdpt
241
+ name: DualDPT
242
+ args: as_params
243
+
244
+ dim_in: &head_dim_in 1536
245
+ output_dim: 2
246
+ features: &head_features 128
247
+ out_channels: &head_out_channels [96, 192, 384, 768]
248
+ ```
249
+
250
+ Then, the model can be created with the following code snippet.
251
+ ```python
252
+ from depth_anything_3.cfg import create_object, load_config
253
+
254
+ Model = create_object(load_config("path/to/new/config"))
255
+ ```
256
+
257
+
258
+
259
+ ## 📚 Useful Documentation
260
+
261
+ - 🖥️ [Command Line Interface](docs/CLI.md)
262
+ - 📑 [Python API](docs/API.md)
263
+ <!-- - 🏁 [Visual Geometry Benchmark](docs/BENCHMARK.md) -->
264
+
265
+ ## 🗂️ Model Cards
266
+
267
+ Generally, you should observe that DA3-LARGE achieves comparable results to VGGT.
268
+
269
+ The Nested series uses an Any-view model to estimate pose and depth, and a monocular metric depth estimator for scaling.
270
+
271
+ | 🗃️ Model Name | 📏 Params | 📊 Rel. Depth | 📷 Pose Est. | 🧭 Pose Cond. | 🎨 GS | 📐 Met. Depth | ☁️ Sky Seg | 📄 License |
272
+ |-------------------------------|-----------|---------------|--------------|---------------|-------|---------------|-----------|----------------|
273
+ | **Nested** | | | | | | | | |
274
+ | [DA3NESTED-GIANT-LARGE](https://huggingface.co/depth-anything/DA3NESTED-GIANT-LARGE) | 1.40B | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | CC BY-NC 4.0 |
275
+ | **Any-view Model** | | | | | | | | |
276
+ | [DA3-GIANT](https://huggingface.co/depth-anything/DA3-GIANT) | 1.15B | ✅ | ✅ | ✅ | ✅ | | | CC BY-NC 4.0 |
277
+ | [DA3-LARGE](https://huggingface.co/depth-anything/DA3-LARGE) | 0.35B | ✅ | ✅ | ✅ | | | | CC BY-NC 4.0 |
278
+ | [DA3-BASE](https://huggingface.co/depth-anything/DA3-BASE) | 0.12B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
279
+ | [DA3-SMALL](https://huggingface.co/depth-anything/DA3-SMALL) | 0.08B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
280
+ | | | | | | | | | |
281
+ | **Monocular Metric Depth** | | | | | | | | |
282
+ | [DA3METRIC-LARGE](https://huggingface.co/depth-anything/DA3METRIC-LARGE) | 0.35B | ✅ | | | | ✅ | ✅ | Apache 2.0 |
283
+ | | | | | | | | | |
284
+ | **Monocular Depth** | | | | | | | | |
285
+ | [DA3MONO-LARGE](https://huggingface.co/depth-anything/DA3MONO-LARGE) | 0.35B | ✅ | | | | | ✅ | Apache 2.0 |
286
+
287
+
288
+ ## ⚡ Performance Benchmarks
289
+
290
+ Inference throughput measured with PyTorch 2.9.0, on Apple Silicon (MPS) unless noted otherwise. For detailed benchmarks, see [BENCHMARKS.md](BENCHMARKS.md).
291
+
292
+ ### Apple Silicon (MPS) - Batch Size 1
293
+
294
+ | Model | Latency | Throughput |
295
+ |-------|---------|------------|
296
+ | DA3-Small | 46 ms | **22 img/s** |
297
+ | DA3-Base | 93 ms | **11 img/s** |
298
+ | DA3-Large | 265 ms | **3.8 img/s** |
299
+ | DA3-Giant | 618 ms | **1.6 img/s** |
300
+
301
+ ### Cross-Device Comparison (DA3-Large)
302
+
303
+ | Device | Throughput | vs CPU |
304
+ |--------|------------|--------|
305
+ | CPU | 0.3 img/s | 1.0x |
306
+ | Apple Silicon (MPS) | 3.8 img/s | **13x** |
307
+ | NVIDIA L4 (CUDA) | 10.3 img/s | **34x** |
308
+
309
+ ### Batch Processing
310
+
311
+ ```python
312
+ from depth_anything_3.api import DepthAnything3
313
+
314
+ model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
315
+
316
+ # Adaptive batching (recommended for large image sets)
317
+ results = model.batch_inference(
318
+ images=image_paths,
319
+ batch_size="auto", # Automatically selects optimal batch size
320
+ target_memory_utilization=0.85,
321
+ )
322
+
323
+ # Fixed batch size
324
+ results = model.batch_inference(
325
+ images=image_paths,
326
+ batch_size=4,
327
+ )
328
+ ```
329
+
330
+ > See [BENCHMARKS.md](BENCHMARKS.md) for comprehensive benchmarks including preprocessing, attention mechanisms, and adaptive batching strategies.
331
+
332
+
333
+ ## ❓ FAQ
334
+
335
+ - **Monocular Metric Depth**: To obtain metric depth in meters from `DA3METRIC-LARGE`, use `metric_depth = focal * net_output / 300.`, where `focal` is the focal length in pixels (typically the average of fx and fy from the camera intrinsic matrix K). Note that the output from `DA3NESTED-GIANT-LARGE` is already in meters.
336
+
337
+ - <a id="use-ray-pose"></a>**Ray Head (`use_ray_pose`)**: Our API and CLI support the `use_ray_pose` argument. When enabled, the model derives camera poses from the ray head, which is generally slightly slower but more accurate; the default is `False` for faster inference.
338
+ <details>
339
+ <summary>AUC3 Results for DA3NESTED-GIANT-LARGE</summary>
340
+
341
+ | Model | HiRoom | ETH3D | DTU | 7Scenes | ScanNet++ |
342
+ |-------|------|-------|-----|---------|-----------|
343
+ | `ray_head` | 84.4 | 52.6 | 93.9 | 29.5 | 89.4 |
344
+ | `cam_head` | 80.3 | 48.4 | 94.1 | 28.5 | 85.0 |
345
+
346
+ </details>
347
+
348
+
349
+
350
+
351
+ - **Older GPUs without XFormers support**: See [Issue #11](https://github.com/ByteDance-Seed/Depth-Anything-3/issues/11). Thanks to [@S-Mahoney](https://github.com/S-Mahoney) for the solution!
352
+
353
+
354
+ ## 🏢 Awesome DA3 Projects
355
+
356
+ A community-curated list of Depth Anything 3 integrations across 3D tools, creative pipelines, robotics, and web/VR viewers. You are welcome to submit your DA3-based project via PR; we will review and feature it where applicable.
357
+
358
+ - [DA3-blender](https://github.com/xy-gao/DA3-blender): Blender addon for DA3-based 3D reconstruction from a set of images.
359
+
360
+ - [ComfyUI-DepthAnythingV3](https://github.com/PozzettiAndrea/ComfyUI-DepthAnythingV3): ComfyUI nodes for Depth Anything 3, supporting single/multi-view and video-consistent depth with optional point‑cloud export.
361
+
362
+ - [DA3-ROS2-Wrapper](https://github.com/GerdsenAI/GerdsenAI-Depth-Anything-3-ROS2-Wrapper): Real-time DA3 depth in ROS2 with multi-camera support.
363
+
364
+ - [VideoDepthViewer3D](https://github.com/amariichi/VideoDepthViewer3D): Streaming videos with DA3 metric depth to a Three.js/WebXR 3D viewer for VR/stereo playback.
365
+
366
+
367
+ ## 📝 Credits
368
+
369
+ ### Original Authors
370
+
371
+ This package is built on top of **Depth Anything 3**, created by the ByteDance Seed team:
372
+
373
+ - [Haotong Lin](https://haotongl.github.io/), [Sili Chen](https://github.com/SiliChen321), [Jun Hao Liew](https://liewjunhao.github.io/), [Donny Y. Chen](https://donydchen.github.io), [Zhenyu Li](https://zhyever.github.io/), [Guang Shi](https://scholar.google.com/citations?user=MjXxWbUAAAAJ), [Jiashi Feng](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ), [Bingyi Kang](https://bingykang.github.io/)
374
+
375
+ All model weights, architecture, and core algorithms are their work. This fork only adds production optimizations and deployment tooling.
376
+
377
+ ### Fork Maintainer
378
+
379
+ This optimized fork is maintained by [Delanoe Pirard (Aedelon)](https://github.com/Aedelon).
380
+
381
+ Contributions:
382
+ - Model caching system
383
+ - Adaptive batching
384
+ - Apple Silicon (MPS) optimizations
385
+ - PyPI packaging and CI/CD
386
+ - Comprehensive benchmarking
387
+
388
+ ### Citation
389
+
390
+ If you use Depth Anything 3 in your research, please cite the original paper:
391
+
392
+ ```bibtex
393
+ @article{depthanything3,
394
+ title={Depth Anything 3: Recovering the visual space from any views},
395
+ author={Haotong Lin and Sili Chen and Jun Hao Liew and Donny Y. Chen and Zhenyu Li and Guang Shi and Jiashi Feng and Bingyi Kang},
396
+ journal={arXiv preprint arXiv:2511.10647},
397
+ year={2025}
398
+ }
399
+ ```
400
+
401
+ If you specifically use features from this fork (caching, batching, MPS optimizations), you may additionally reference:
402
+
403
+ ```
404
+ awesome-depth-anything-3: https://github.com/Aedelon/awesome-depth-anything-3
405
+ ```
app.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Delanoe Pirard / Aedelon
3
+ # Licensed under the Apache License, Version 2.0
4
+ """
5
+ Hugging Face Spaces entry point for awesome-depth-anything-3.
6
+
7
+ This file is the main entry point for the HF Spaces deployment.
8
+ It launches the Gradio web interface with optimized settings for cloud deployment.
9
+ """
10
+
11
+ import os
12
+ import tempfile
13
+
14
+ # Disable analytics and configure for HF Spaces
15
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
16
+ os.environ["DA3_LOG_LEVEL"] = "WARNING"
17
+
18
+ from depth_anything_3.app.gradio_app import DepthAnything3App
19
+
20
+
21
+ def main():
22
+ """Launch the Gradio app for HF Spaces."""
23
+ # Use DA3-LARGE for good balance of quality and speed
24
+ workspace_dir = "/tmp/workspace"
25
+ gallery_dir = "/tmp/gallery"
26
+
27
+ # Create directories
28
+ os.makedirs(workspace_dir, exist_ok=True)
29
+ os.makedirs(gallery_dir, exist_ok=True)
30
+
31
+ app = DepthAnything3App(
32
+ model_dir="depth-anything/DA3-LARGE",
33
+ workspace_dir=workspace_dir,
34
+ gallery_dir=gallery_dir,
35
+ )
36
+
37
+ demo = app.create_app()
38
+
39
+ # Build allowed paths for Gradio file access
40
+ allowed_paths = [
41
+ os.getcwd(),
42
+ tempfile.gettempdir(),
43
+ workspace_dir,
44
+ gallery_dir,
45
+ "/tmp",
46
+ ]
47
+
48
+ # Launch for HF Spaces (theme/css already set in create_app via gr.Blocks())
49
+ demo.queue(max_size=10).launch(
50
+ server_name="0.0.0.0",
51
+ server_port=7860,
52
+ share=True,
53
+ show_error=True,
54
+ allowed_paths=allowed_paths,
55
+ )
56
+
57
+
58
+ if __name__ == "__main__":
59
+ main()
assets/examples/SOH/000.png ADDED

Git LFS Details

  • SHA256: ea78c3b872b1e8b27de48cadf1d4a692cd42ddf5f72fcab78e2be2937935fb79
  • Pointer size: 132 Bytes
  • Size of remote file: 1.09 MB
assets/examples/SOH/010.png ADDED

Git LFS Details

  • SHA256: c91a69d4e050e75e3760fbacda18a452b9abcf3065e6d6bd940b4a99d48f7982
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
assets/examples/robot_unitree.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bc274f7613a665c6135085fe01691ebfaa9033101319071f37c550ab21d1ea
3
+ size 1964268
assets/images/da3_radar.png ADDED

Git LFS Details

  • SHA256: d85aa2a89c1959e31ea627474360720b94a393126e8102399ce4dbc355d81d47
  • Pointer size: 131 Bytes
  • Size of remote file: 215 kB
assets/images/demo320-2.gif ADDED

Git LFS Details

  • SHA256: dc02d85064d875f7679c70b7156e7b5c7eef872895dbe45a241400c6f99e22f9
  • Pointer size: 133 Bytes
  • Size of remote file: 18.8 MB
benchmarks/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Copyright (c) Delanoe Pirard / Aedelon
2
+ # Licensed under the Apache License, Version 2.0
3
+ """Benchmark scripts for Depth Anything 3."""
benchmarks/comparative_benchmark.py ADDED
@@ -0,0 +1,436 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Delanoe Pirard / Aedelon
3
+ # Licensed under the Apache License, Version 2.0
4
+ """
5
+ Comparative Benchmark: awesome-depth-anything-3 vs upstream (vanilla)
6
+
7
+ Compares performance between the optimized fork and the original upstream.
8
+
9
+ Usage:
10
+ python benchmarks/comparative_benchmark.py --device mps
11
+ python benchmarks/comparative_benchmark.py --device cuda
12
+ python benchmarks/comparative_benchmark.py --device all
13
+ python benchmarks/comparative_benchmark.py --quick
14
+ """
15
+
16
+ import argparse
17
+ import contextlib
18
+ import gc
19
+ import io
20
+ import logging
21
+ import os
22
+ import shutil
23
+ import sys
24
+ import time
25
+ import warnings
26
+
27
+ # Suppress ALL logging before any imports
28
+ logging.disable(logging.CRITICAL)
29
+ os.environ["DA3_LOG_LEVEL"] = "CRITICAL"
30
+ os.environ["PYTHONWARNINGS"] = "ignore"
31
+ warnings.filterwarnings("ignore")
32
+
33
+ import numpy as np
34
+ import torch
35
+ from PIL import Image
36
+
37
+ # Suppress all loggers
38
+ logging.getLogger("depth_anything_3").disabled = True
39
+ logging.getLogger("dinov2").disabled = True
40
+ logging.getLogger().setLevel(logging.CRITICAL)
41
+
42
+
43
+ @contextlib.contextmanager
44
+ def suppress_output():
45
+ """Context manager to suppress stdout and stderr."""
46
+ with contextlib.redirect_stdout(io.StringIO()), \
47
+ contextlib.redirect_stderr(io.StringIO()):
48
+ # Also suppress all loggers again
49
+ logging.disable(logging.CRITICAL)
50
+ yield
51
+
52
+ # ============================================================================
53
+ # CONFIGURATION
54
+ # ============================================================================
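+ # NOTE: these absolute paths are machine-specific; point them at your own local checkouts before running.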
55
+
56
+ AWESOME_REPO = "/Users/aedelon/Workspace/awesome-depth-anything-3"
57
+ UPSTREAM_REPO = "/Users/aedelon/Workspace/depth-anything-3-upstream"
58
+ MODEL_NAME = "da3-large"
59
+
60
+
61
+ # ============================================================================
62
+ # UTILITIES
63
+ # ============================================================================
64
+
65
+ def cleanup():
66
+ gc.collect()
67
+ if torch.cuda.is_available():
68
+ torch.cuda.empty_cache()
69
+ torch.cuda.reset_peak_memory_stats()
70
+ if torch.backends.mps.is_available():
71
+ torch.mps.empty_cache()
72
+
73
+
74
+ def sync_device(device):
75
+ if device.type == "cuda":
76
+ torch.cuda.synchronize()
77
+ elif device.type == "mps":
78
+ torch.mps.synchronize()
79
+
80
+
81
+ def clear_modules():
82
+ """Clear depth_anything_3 from sys.modules."""
83
+ to_remove = [k for k in sys.modules.keys() if "depth_anything_3" in k]
84
+ for k in to_remove:
85
+ del sys.modules[k]
86
+
87
+
88
+ def suppress_logging():
89
+ """Suppress all logging after module import."""
90
+ logging.disable(logging.CRITICAL)
91
+ try:
92
+ from depth_anything_3.utils.logger import logger
93
+ logger.level = 100
94
+ except:
95
+ pass
96
+
97
+
98
+ def get_available_devices():
99
+ """Get available devices."""
100
+ devices = [torch.device("cpu")]
101
+ if torch.backends.mps.is_available():
102
+ devices.append(torch.device("mps"))
103
+ if torch.cuda.is_available():
104
+ devices.append(torch.device("cuda"))
105
+ return devices
106
+
107
+
108
+ def get_device_name(device):
109
+ if device.type == "cuda":
110
+ return torch.cuda.get_device_name(device)
111
+ elif device.type == "mps":
112
+ return "Apple Silicon (MPS)"
113
+ return "CPU"
114
+
115
+
116
+ # ============================================================================
117
+ # BENCHMARK: UPSTREAM (VANILLA)
118
+ # ============================================================================
119
+
120
+ def benchmark_upstream(device, pil_images, process_res=504, runs=3):
121
+ """Benchmark upstream/vanilla depth-anything-3."""
122
+
123
+ # Setup path
124
+ clear_modules()
125
+ upstream_src = os.path.join(UPSTREAM_REPO, "src")
126
+ if upstream_src in sys.path:
127
+ sys.path.remove(upstream_src)
128
+ sys.path.insert(0, upstream_src)
129
+
130
+ with suppress_output():
131
+ from depth_anything_3.api import DepthAnything3
132
+ suppress_logging()
133
+
134
+ cleanup()
135
+
136
+ # Cold load
137
+ start = time.perf_counter()
138
+ model = DepthAnything3(model_name=MODEL_NAME)
139
+ model = model.to(device)
140
+ model.eval()
141
+ cold_load_time = time.perf_counter() - start
142
+
143
+ # Warmup
144
+ for _ in range(2):
145
+ model.inference(pil_images[:1], process_res=process_res)
146
+ sync_device(device)
147
+ cleanup()
148
+
149
+ # Benchmark inference
150
+ times = []
151
+ for _ in range(runs):
152
+ cleanup()
153
+ sync_device(device)
154
+ start = time.perf_counter()
155
+ model.inference(pil_images, process_res=process_res)
156
+ sync_device(device)
157
+ times.append(time.perf_counter() - start)
158
+
159
+ avg_time = np.mean(times)
160
+ std_time = np.std(times)
161
+ throughput = len(pil_images) / avg_time
162
+
163
+ del model
164
+ cleanup()
165
+
166
+ # Cleanup path
167
+ sys.path.remove(upstream_src)
168
+ clear_modules()
169
+
170
+ return {
171
+ "cold_load": cold_load_time,
172
+ "inference_time": avg_time,
173
+ "inference_std": std_time,
174
+ "throughput": throughput,
175
+ }
176
+
177
+
178
+ # ============================================================================
179
+ # BENCHMARK: AWESOME (OPTIMIZED)
180
+ # ============================================================================
181
+
182
+ def benchmark_awesome(device, pil_images, process_res=504, runs=3, use_cache=True):
183
+ """Benchmark awesome (optimized) depth-anything-3."""
184
+
185
+ # Setup path
186
+ clear_modules()
187
+ awesome_src = os.path.join(AWESOME_REPO, "src")
188
+ if awesome_src in sys.path:
189
+ sys.path.remove(awesome_src)
190
+ sys.path.insert(0, awesome_src)
191
+
192
+ with suppress_output():
193
+ from depth_anything_3.api import DepthAnything3
194
+ from depth_anything_3.cache import get_model_cache
195
+ suppress_logging()
196
+
197
+ # Clear cache if testing cold load
198
+ if not use_cache:
199
+ cache = get_model_cache()
200
+ cache.clear()
201
+
202
+ cleanup()
203
+
204
+ # Cold/warm load
205
+ start = time.perf_counter()
206
+ model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=use_cache)
207
+ load_time = time.perf_counter() - start
208
+
209
+ # For cache test, do a second load
210
+ cached_load_time = None
211
+ if use_cache:
212
+ del model
213
+ cleanup()
214
+ start = time.perf_counter()
215
+ model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=True)
216
+ cached_load_time = time.perf_counter() - start
217
+
218
+ # Warmup
219
+ for _ in range(2):
220
+ model.inference(pil_images[:1], process_res=process_res)
221
+ sync_device(device)
222
+ cleanup()
223
+
224
+ # Benchmark inference
225
+ times = []
226
+ for _ in range(runs):
227
+ cleanup()
228
+ sync_device(device)
229
+ start = time.perf_counter()
230
+ model.inference(pil_images, process_res=process_res)
231
+ sync_device(device)
232
+ times.append(time.perf_counter() - start)
233
+
234
+ avg_time = np.mean(times)
235
+ std_time = np.std(times)
236
+ throughput = len(pil_images) / avg_time
237
+
238
+ del model
239
+ cleanup()
240
+
241
+ # Cleanup path
242
+ sys.path.remove(awesome_src)
243
+ clear_modules()
244
+
245
+ return {
246
+ "cold_load": load_time,
247
+ "cached_load": cached_load_time,
248
+ "inference_time": avg_time,
249
+ "inference_std": std_time,
250
+ "throughput": throughput,
251
+ }
252
+
253
+
254
+ # ============================================================================
255
+ # MAIN
256
+ # ============================================================================
257
+
258
+ def run_comparison(device, batch_sizes, process_res=504, runs=3):
259
+ """Run comparison for a specific device."""
260
+
261
+ results = {}
262
+ temp_dir = "temp_compare"
263
+ os.makedirs(temp_dir, exist_ok=True)
264
+
265
+ try:
266
+ # Create test images
267
+ max_batch = max(batch_sizes)
268
+ pil_images = []
269
+ for i in range(max_batch):
270
+ img = Image.new("RGB", (1280, 720), color=(100 + i*10, 150, 200))
271
+ pil_images.append(img)
272
+
273
+ for batch_size in batch_sizes:
274
+ test_images = pil_images[:batch_size]
275
+ results[batch_size] = {}
276
+
277
+ print(f"\n Batch size: {batch_size}")
278
+ print(f" {'-'*50}")
279
+
280
+ # Upstream
281
+ print(" Testing UPSTREAM (vanilla)...", end=" ", flush=True)
282
+ try:
283
+ upstream = benchmark_upstream(device, test_images, process_res, runs)
284
+ results[batch_size]["upstream"] = upstream
285
+ print(f"{upstream['throughput']:.2f} img/s")
286
+ except Exception as e:
287
+ print(f"ERROR: {e}")
288
+ results[batch_size]["upstream"] = None
289
+
290
+ # Awesome (no cache - fair comparison)
291
+ print(" Testing AWESOME (no cache)...", end=" ", flush=True)
292
+ try:
293
+ awesome_nc = benchmark_awesome(device, test_images, process_res, runs, use_cache=False)
294
+ results[batch_size]["awesome_nocache"] = awesome_nc
295
+ print(f"{awesome_nc['throughput']:.2f} img/s")
296
+ except Exception as e:
297
+ print(f"ERROR: {e}")
298
+ results[batch_size]["awesome_nocache"] = None
299
+
300
+ # Awesome (with cache)
301
+ print(" Testing AWESOME (cached)...", end=" ", flush=True)
302
+ try:
303
+ awesome_c = benchmark_awesome(device, test_images, process_res, runs, use_cache=True)
304
+ results[batch_size]["awesome_cached"] = awesome_c
305
+ print(f"{awesome_c['throughput']:.2f} img/s")
306
+ except Exception as e:
307
+ print(f"ERROR: {e}")
308
+ results[batch_size]["awesome_cached"] = None
309
+
310
+ finally:
311
+ shutil.rmtree(temp_dir, ignore_errors=True)
312
+
313
+ return results
314
+
315
+
316
+ def print_results_table(results, device):
317
+ """Print formatted results table."""
318
+
319
+ print(f"\n{'='*70}")
320
+ print(f" RESULTS: {device.type.upper()}")
321
+ print(f"{'='*70}")
322
+
323
+ # Header
324
+ print(f"\n{'Batch':<8} {'Metric':<18} {'Upstream':<12} {'Awesome':<12} {'Speedup':<10}")
325
+ print("-" * 60)
326
+
327
+ for batch_size, data in sorted(results.items()):
328
+ upstream = data.get("upstream")
329
+ awesome = data.get("awesome_nocache") or data.get("awesome_cached")
330
+
331
+ if not upstream or not awesome:
332
+ continue
333
+
334
+ # Inference throughput
335
+ u_thr = upstream["throughput"]
336
+ a_thr = awesome["throughput"]
337
+ speedup = a_thr / u_thr if u_thr > 0 else 0
338
+ print(f"{batch_size:<8} {'Throughput (img/s)':<18} {u_thr:<12.2f} {a_thr:<12.2f} {speedup:<10.2f}x")
339
+
340
+ # Inference time
341
+ u_time = upstream["inference_time"] * 1000
342
+ a_time = awesome["inference_time"] * 1000
343
+ speedup = u_time / a_time if a_time > 0 else 0
344
+ print(f"{'':<8} {'Latency (ms)':<18} {u_time:<12.1f} {a_time:<12.1f} {speedup:<10.2f}x")
345
+
346
+ # Cold load time
347
+ u_load = upstream["cold_load"]
348
+ a_load = awesome["cold_load"]
349
+ speedup = u_load / a_load if a_load > 0 else 0
350
+ print(f"{'':<8} {'Cold load (s)':<18} {u_load:<12.2f} {a_load:<12.2f} {speedup:<10.2f}x")
351
+
352
+ # Cached load (awesome only)
353
+ cached = data.get("awesome_cached")
354
+ if cached and cached.get("cached_load"):
355
+ c_load = cached["cached_load"]
356
+ speedup = u_load / c_load if c_load > 0 else 0
357
+ print(f"{'':<8} {'Cached load (s)':<18} {'-':<12} {c_load:<12.3f} {speedup:<10.1f}x")
358
+
359
+ print()
360
+
361
+
362
+ def main():
363
+ parser = argparse.ArgumentParser(description="Comparative Benchmark: Awesome vs Upstream")
364
+ parser.add_argument("--device", "-d", type=str, default="auto",
365
+ choices=["auto", "cpu", "mps", "cuda", "all"],
366
+ help="Device to benchmark")
367
+ parser.add_argument("--batch-sizes", type=int, nargs="+", default=[1, 2, 4],
368
+ help="Batch sizes to test")
369
+ parser.add_argument("--runs", type=int, default=3, help="Number of runs per test")
370
+ parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
371
+ args = parser.parse_args()
372
+
373
+ if args.quick:
374
+ args.batch_sizes = [1, 2]
375
+ args.runs = 2
376
+
377
+ # Determine devices
378
+ available = get_available_devices()
379
+ if args.device == "auto":
380
+ devices = [available[-1]]
381
+ elif args.device == "all":
382
+ devices = available
383
+ else:
384
+ requested = torch.device(args.device)
385
+ if requested in available:
386
+ devices = [requested]
387
+ else:
388
+ print(f"Device '{args.device}' not available. Available: {[d.type for d in available]}")
389
+ return
390
+
391
+ # Header
392
+ print("\n" + "=" * 70)
393
+ print(" COMPARATIVE BENCHMARK: AWESOME vs UPSTREAM (VANILLA)")
394
+ print("=" * 70)
395
+ print(f" Model: {MODEL_NAME}")
396
+ print(f" PyTorch: {torch.__version__}")
397
+ print(f" Batch sizes: {args.batch_sizes}")
398
+ print(f" Runs per test: {args.runs}")
399
+ print(f" Devices: {[d.type.upper() for d in devices]}")
400
+ for d in available:
401
+ status = "✓" if d in devices else "○"
402
+ print(f" {status} {d.type.upper()}: {get_device_name(d)}")
403
+ print("=" * 70)
404
+
405
+ all_results = {}
406
+
407
+ for device in devices:
408
+ print(f"\n{'#'*70}")
409
+ print(f" DEVICE: {device.type.upper()} ({get_device_name(device)})")
410
+ print(f"{'#'*70}")
411
+
412
+ results = run_comparison(device, args.batch_sizes, runs=args.runs)
413
+ all_results[device.type] = results
414
+ print_results_table(results, device)
415
+
416
+ # Final summary
417
+ print("\n" + "=" * 70)
418
+ print(" SUMMARY")
419
+ print("=" * 70)
420
+
421
+ for device_type, results in all_results.items():
422
+ print(f"\n {device_type.upper()}:")
423
+
424
+ for batch_size, data in sorted(results.items()):
425
+ upstream = data.get("upstream")
426
+ awesome = data.get("awesome_nocache")
427
+
428
+ if upstream and awesome:
429
+ speedup = awesome["throughput"] / upstream["throughput"]
430
+ print(f" Batch {batch_size}: {speedup:.2f}x faster inference")
431
+
432
+ print("\n" + "=" * 70 + "\n")
433
+
434
+
435
+ if __name__ == "__main__":
436
+ main()
benchmarks/flash_attention_benchmark.py ADDED
@@ -0,0 +1,488 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Delanoe Pirard / Aedelon - Apache 2.0
3
+ """
4
+ Flash Attention Benchmark for Depth Anything 3.
5
+
6
+ Provides clear performance comparison with tables and analysis.
7
+
8
+ Usage:
9
+ python benchmarks/flash_attention_benchmark.py
10
+ python benchmarks/flash_attention_benchmark.py --detailed
11
+ """
12
+
13
+ import argparse
14
+ import gc
15
+ import os
16
+ import sys
17
+ import time
18
+ from dataclasses import dataclass
19
+
20
+ import torch
21
+
22
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
23
+
24
+ from depth_anything_3.model.dinov2.layers import (
25
+ FLASH_ATTN_AVAILABLE,
26
+ FLASH_ATTN_VERSION,
27
+ Attention,
28
+ )
29
+
30
+
31
+ @dataclass
32
+ class BenchmarkConfig:
33
+ """Configuration for a benchmark test case."""
34
+
35
+ name: str
36
+ seq_len: int
37
+ batch_size: int
38
+ embed_dim: int
39
+ num_heads: int
40
+ image_size: str # Description of corresponding image size
41
+
42
+ @property
43
+ def description(self):
44
+ return f"{self.name} ({self.image_size})"
45
+
46
+
47
+ # Depth Anything 3 model configurations
48
+ DA3_CONFIGS = {
49
+ "vitb": {"embed_dim": 768, "num_heads": 12, "depth": 12},
50
+ "vitl": {"embed_dim": 1024, "num_heads": 16, "depth": 24},
51
+ "vitg": {"embed_dim": 1536, "num_heads": 24, "depth": 40},
52
+ }
53
+
54
+
55
+ def get_device_info():
56
+ """Get device information."""
57
+ if torch.cuda.is_available():
58
+ device = torch.device("cuda")
59
+ device_name = torch.cuda.get_device_name()
60
+ memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
61
+ compute_cap = torch.cuda.get_device_capability()
62
+ return {
63
+ "type": "cuda",
64
+ "device": device,
65
+ "name": device_name,
66
+ "memory_gb": memory_gb,
67
+ "compute_capability": f"{compute_cap[0]}.{compute_cap[1]}",
68
+ }
69
+ elif torch.backends.mps.is_available():
70
+ return {
71
+ "type": "mps",
72
+ "device": torch.device("mps"),
73
+ "name": "Apple Silicon",
74
+ "memory_gb": None,
75
+ "compute_capability": None,
76
+ }
77
+ else:
78
+ return {
79
+ "type": "cpu",
80
+ "device": torch.device("cpu"),
81
+ "name": "CPU",
82
+ "memory_gb": None,
83
+ "compute_capability": None,
84
+ }
85
+
86
+
87
+ def benchmark_attention(attn_module, x, warmup=5, runs=20):
88
+ """Run benchmark for a single attention module."""
89
+ device = x.device
90
+
91
+ # Warmup
92
+ with torch.no_grad():
93
+ for _ in range(warmup):
94
+ _ = attn_module(x)
95
+ if device.type == "cuda":
96
+ torch.cuda.synchronize()
97
+
98
+ # Reset memory tracking
99
+ if device.type == "cuda":
100
+ torch.cuda.reset_peak_memory_stats()
101
+
102
+ # Benchmark
103
+ times = []
104
+ with torch.no_grad():
105
+ for _ in range(runs):
106
+ if device.type == "cuda":
107
+ torch.cuda.synchronize()
108
+ start = time.perf_counter()
109
+ _ = attn_module(x)
110
+ if device.type == "cuda":
111
+ torch.cuda.synchronize()
112
+ times.append((time.perf_counter() - start) * 1000)
113
+
114
+ # Memory
115
+ peak_mem_mb = 0
116
+ if device.type == "cuda":
117
+ peak_mem_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
118
+
119
+ times_tensor = torch.tensor(times)
120
+ return {
121
+ "mean_ms": times_tensor.mean().item(),
122
+ "std_ms": times_tensor.std().item(),
123
+ "min_ms": times_tensor.min().item(),
124
+ "peak_mem_mb": peak_mem_mb,
125
+ }
126
+
127
+
128
+ def print_header():
129
+ """Print benchmark header."""
130
+ print("\n" + "=" * 80)
131
+ print(" " * 20 + "FLASH ATTENTION BENCHMARK - DEPTH ANYTHING 3")
132
+ print("=" * 80 + "\n")
133
+
134
+
135
+ def get_sdpa_backend_info():
136
+ """Get info about which SDPA backend is being used."""
137
+ info = {}
138
+ if torch.cuda.is_available():
139
+ from torch.backends.cuda import (
140
+ flash_sdp_enabled,
141
+ mem_efficient_sdp_enabled,
142
+ math_sdp_enabled,
143
+ )
144
+ info["flash_sdp"] = flash_sdp_enabled()
145
+ info["mem_efficient_sdp"] = mem_efficient_sdp_enabled()
146
+ info["math_sdp"] = math_sdp_enabled()
147
+ return info
148
+
149
+
150
+ def print_device_info(device_info):
151
+ """Print device information."""
152
+ print("📊 HARDWARE CONFIGURATION")
153
+ print("─" * 80)
154
+ print(f" Device Type : {device_info['type'].upper()}")
155
+ print(f" Device Name : {device_info['name']}")
156
+ if device_info["memory_gb"]:
157
+ print(f" Memory : {device_info['memory_gb']:.1f} GB")
158
+ if device_info["compute_capability"]:
159
+ print(f" Compute Cap. : {device_info['compute_capability']}")
160
+ cc = float(device_info["compute_capability"])
161
+ if cc >= 7.5:
162
+ print(" ✅ Flash Attention supported (≥7.5)")
163
+ else:
164
+ print(" ❌ Flash Attention requires ≥7.5")
165
+
166
+ # SDPA backend info
167
+ sdpa_info = get_sdpa_backend_info()
168
+ if sdpa_info:
169
+ print("\n PyTorch SDPA Backends:")
170
+ print(f" Flash SDP : {'✅ Enabled' if sdpa_info.get('flash_sdp') else '❌ Disabled'}")
171
+ print(f" MemEfficient : {'✅ Enabled' if sdpa_info.get('mem_efficient_sdp') else '❌ Disabled'}")
172
+ print(f" Math SDP : {'✅ Enabled' if sdpa_info.get('math_sdp') else '❌ Disabled'}")
173
+
174
+ if sdpa_info.get('flash_sdp'):
175
+ print("\n ⚡ PyTorch SDPA uses Flash Attention internally!")
176
+ print(" (No need for flash-attn package with PyTorch >= 2.2)")
177
+
178
+ print(f"\n flash-attn pkg : {'✅ Installed v' + FLASH_ATTN_VERSION if FLASH_ATTN_AVAILABLE else '❌ Not installed (optional)'}")
179
+ print()
180
+
181
+
182
+ def print_table_header():
183
+ """Print benchmark table header."""
184
+ print(
185
+ "┌──────────────────────────┬──────────────┬──────────────┬──────────────┬────────────┐"
186
+ )
187
+ print(
188
+ "│ Configuration │ flash_attn │ sdpa │ manual │ Speedup │"
189
+ )
190
+ print(
191
+ "├──────────────────────────┼──────────────┼──────────────┼──────────────┼────────────┤"
192
+ )
193
+
194
+
195
+ def print_table_row(config_desc, results, baseline="sdpa"):
196
+ """Print a benchmark result row."""
197
+ backends = ["flash_attn", "sdpa", "manual"]
198
+
199
+ # Format times
200
+ time_strs = []
201
+ for backend in backends:
202
+ if backend in results and results[backend]:
203
+ time_ms = results[backend]["mean_ms"]
204
+ time_strs.append(f"{time_ms:6.2f} ms")
205
+ else:
206
+ time_strs.append(" N/A")
207
+
208
+ # Calculate speedup
209
+ speedup_str = " -"
210
+ if "flash_attn" in results and results["flash_attn"] and baseline in results:
211
+ if results[baseline]:
212
+ speedup = results[baseline]["mean_ms"] / results["flash_attn"]["mean_ms"]
213
+ speedup_str = f" {speedup:.2f}x ⚡" if speedup > 1.1 else f" {speedup:.2f}x"
214
+
215
+ print(
216
+ f"│ {config_desc:24s} │ {time_strs[0]:12s} │ {time_strs[1]:12s} │ {time_strs[2]:12s} │ {speedup_str:10s} │"
217
+ )
218
+
219
+
220
+ def print_table_footer():
221
+ """Print benchmark table footer."""
222
+ print(
223
+ "└──────────────────────────┴──────────────┴──────────────┴──────────────┴────────────┘"
224
+ )
225
+
226
+
227
+ def print_model_analysis(model_name, config, results, num_layers):
228
+ """Print detailed analysis for a specific model."""
229
+ if "flash_attn" not in results or not results["flash_attn"]:
230
+ return
231
+
232
+ flash_time = results["flash_attn"]["mean_ms"]
233
+ sdpa_time = results["sdpa"]["mean_ms"] if "sdpa" in results else flash_time
234
+
235
+ speedup = sdpa_time / flash_time
236
+ time_saved_per_layer = (sdpa_time - flash_time) / num_layers
237
+ total_time_saved = time_saved_per_layer * num_layers
238
+
239
+ print(f"\n 📈 {model_name} Analysis:")
240
+ print(f" • Attention time per layer: {flash_time:.2f} ms (flash) vs {sdpa_time:.2f} ms (sdpa)")
241
+ print(f" • Time saved per layer: {time_saved_per_layer:.2f} ms")
242
+ print(f" • Total time saved ({num_layers} layers): {total_time_saved:.1f} ms")
243
+ print(f" • Speedup: {speedup:.2f}x on attention")
244
+
245
+ # Estimate full inference impact
246
+ # Attention is ~15-20% of total inference time
247
+ attn_fraction = 0.175
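+ # Amdahl's law: overall_speedup = 1 / ((1 - f) + f / s), with f = attention fraction, s = attention speedup.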
248
+ overall_speedup = 1 / (1 - attn_fraction + attn_fraction / speedup)
249
+ overall_improvement = (1 - 1 / overall_speedup) * 100
250
+
251
+ print(
252
+ f" • Estimated full inference speedup: {overall_speedup:.2f}x (~{overall_improvement:.1f}% faster)"
253
+ )
254
+
255
+
256
+ def run_benchmark(test_configs, backends, warmup=5, runs=20, detailed=False):
257
+ """Run complete benchmark suite."""
258
+ device_info = get_device_info()
259
+ device = device_info["device"]
260
+ dtype = torch.float16 if device.type == "cuda" else torch.float32
261
+
262
+ print_header()
263
+ print_device_info(device_info)
264
+
265
+ # Filter backends based on availability
266
+ available_backends = []
267
+ if FLASH_ATTN_AVAILABLE and device.type == "cuda":
268
+ available_backends.append("flash_attn")
269
+ available_backends.append("sdpa")
270
+ if detailed:
271
+ available_backends.append("manual")
272
+
273
+ all_results = {}
274
+
275
+ # Run benchmarks by model
276
+ for model_name, model_config in DA3_CONFIGS.items():
277
+ print(f"\n🔬 MODEL: {model_name.upper()} (dim={model_config['embed_dim']}, heads={model_config['num_heads']}, depth={model_config['depth']})")
278
+ print("─" * 80)
279
+ print_table_header()
280
+
281
+ model_results = {}
282
+
283
+ for test_config in test_configs:
284
+ # Adjust config for this model
285
+ config = BenchmarkConfig(
286
+ name=test_config.name,
287
+ seq_len=test_config.seq_len,
288
+ batch_size=test_config.batch_size,
289
+ embed_dim=model_config["embed_dim"],
290
+ num_heads=model_config["num_heads"],
291
+ image_size=test_config.image_size,
292
+ )
293
+
294
+ x = torch.randn(
295
+ config.batch_size, config.seq_len, config.embed_dim, device=device, dtype=dtype
296
+ )
297
+
298
+ results = {}
299
+ for backend in available_backends:
300
+ gc.collect()
301
+ if device.type == "cuda":
302
+ torch.cuda.empty_cache()
303
+
304
+ try:
305
+ attn = Attention(
306
+ dim=config.embed_dim,
307
+ num_heads=config.num_heads,
308
+ attn_backend=backend,
309
+ ).to(device, dtype)
310
+ attn.eval()
311
+
312
+ result = benchmark_attention(attn, x, warmup=warmup, runs=runs)
313
+ results[backend] = result
314
+
315
+ del attn
316
+ except Exception as e:
317
+ results[backend] = None
318
+ if detailed:
319
+ print(f" {backend} failed: {e}")
320
+
321
+ model_results[config.name] = results
322
+ print_table_row(config.description, results)
323
+
324
+ print_table_footer()
325
+
326
+ # Analysis for this model
327
+ if detailed and model_results:
328
+ # Use medium config for analysis
329
+ medium_key = next(
330
+ (k for k in model_results.keys() if "1024" in k.lower() or "medium" in k.lower()),
331
+ list(model_results.keys())[0],
332
+ )
333
+ print_model_analysis(
334
+ model_name.upper(),
335
+ test_configs[0],
336
+ model_results[medium_key],
337
+ model_config["depth"],
338
+ )
339
+
340
+ all_results[model_name] = model_results
341
+
342
+ # Final summary
343
+ print("\n" + "=" * 80)
344
+ print("📋 SUMMARY & RECOMMENDATIONS")
345
+ print("=" * 80)
346
+
347
+ sdpa_info = get_sdpa_backend_info()
348
+
349
+ if device.type == "cuda":
350
+ # Check if PyTorch SDPA has Flash enabled
351
+ if sdpa_info.get('flash_sdp'):
352
+ print("\n✅ Flash Attention is ACTIVE via PyTorch SDPA!")
353
+ print("\n Your setup:")
354
+ print(f" • PyTorch {torch.__version__} with native Flash Attention")
355
+ print(" • SDPA backend: Flash SDP ⚡")
356
+ print(" • No additional packages needed!")
357
+ print("\n Benefits you're already getting:")
358
+ print(" • 2-4x faster attention vs manual implementation")
359
+ print(" • Memory-efficient attention computation")
360
+ print(" • Automatic kernel selection per input size")
361
+
362
+ if FLASH_ATTN_AVAILABLE:
363
+ print(f"\n ℹ️ flash-attn v{FLASH_ATTN_VERSION} also installed")
364
+ print(" (May provide slight additional optimization in some cases)")
365
+ else:
366
+ print("\n ℹ️ flash-attn package: Not needed!")
367
+ print(" PyTorch >= 2.2 includes Flash Attention natively.")
368
+
369
+ elif FLASH_ATTN_AVAILABLE:
370
+ print("\n✅ Flash Attention is ACTIVE via flash-attn package")
371
+ print(f"\n Using flash-attn v{FLASH_ATTN_VERSION}")
372
+ print("\n Benefits:")
373
+ print(" • 2-3x faster attention computation")
374
+ print(" • ~15-25% overall inference speedup")
375
+ print(" • Lower memory usage")
376
+
377
+ else:
378
+ print("\n⚠️ Flash Attention not available")
379
+ print("\n Options to enable:")
380
+ print(" 1. Upgrade PyTorch to >= 2.2 (recommended)")
381
+ print(" 2. Install flash-attn: pip install flash-attn --no-build-isolation")
382
+
383
+ elif device.type == "mps":
384
+ print("\n📱 Apple Silicon (MPS) detected")
385
+ print("\n • Flash Attention not available for MPS")
386
+ print(" • PyTorch SDPA uses optimized Metal kernels")
387
+ print(" • Already running at optimal speed for your hardware")
388
+
389
+ else:
390
+ print("\n💻 CPU detected")
391
+ print("\n • Consider using GPU for faster inference")
392
+ print(" • Flash Attention is CUDA-only")
393
+
394
+ # Print SDPA vs Manual speedup summary
395
+ print("\n" + "─" * 80)
396
+ print("⚡ PERFORMANCE COMPARISON")
397
+ print("─" * 80)
398
+ print("\n SDPA vs Manual attention speedup (per layer):")
399
+
400
+ for model_name, model_results in all_results.items():
401
+ if model_results:
402
+ # Get XLarge config results for most impact
403
+ xlarge_key = next((k for k in model_results.keys() if "xlarge" in k.lower()), list(model_results.keys())[-1])
404
+ if xlarge_key in model_results:
405
+ res = model_results[xlarge_key]
406
+ if res.get("sdpa") and res.get("manual"):
407
+ speedup = res["manual"]["mean_ms"] / res["sdpa"]["mean_ms"]
408
+ print(f" • {model_name.upper():6s}: {speedup:.1f}x faster (sdpa: {res['sdpa']['mean_ms']:.2f}ms vs manual: {res['manual']['mean_ms']:.2f}ms)")
409
+
410
+ print("\n" + "=" * 80)
411
+ print()
412
+
413
+ return all_results
414
+
415
+
416
+ def main():
417
+ parser = argparse.ArgumentParser(description="Flash Attention benchmark for DA3")
418
+ parser.add_argument(
419
+ "--detailed",
420
+ action="store_true",
421
+ help="Show detailed analysis and include manual backend",
422
+ )
423
+ parser.add_argument(
424
+ "--warmup",
425
+ type=int,
426
+ default=5,
427
+ help="Warmup iterations (default: 5)",
428
+ )
429
+ parser.add_argument(
430
+ "--runs",
431
+ type=int,
432
+ default=20,
433
+ help="Benchmark runs (default: 20)",
434
+ )
435
+
436
+ args = parser.parse_args()
437
+
438
+ # Test configurations based on common image sizes
439
+ test_configs = [
440
+ BenchmarkConfig(
441
+ name="Small",
442
+ seq_len=256,
443
+ batch_size=1,
444
+ embed_dim=768, # Will be overridden per model
445
+ num_heads=12, # Will be overridden per model
446
+ image_size="392px image",
447
+ ),
448
+ BenchmarkConfig(
449
+ name="Medium",
450
+ seq_len=529,
451
+ batch_size=1,
452
+ embed_dim=768,
453
+ num_heads=12,
454
+ image_size="518px image",
455
+ ),
456
+ BenchmarkConfig(
457
+ name="Large",
458
+ seq_len=1024,
459
+ batch_size=1,
460
+ embed_dim=768,
461
+ num_heads=12,
462
+ image_size="742px image",
463
+ ),
464
+ BenchmarkConfig(
465
+ name="XLarge",
466
+ seq_len=1369,
467
+ batch_size=1,
468
+ embed_dim=768,
469
+ num_heads=12,
470
+ image_size="1024px image",
471
+ ),
472
+ ]
473
+
474
+ backends = ["flash_attn", "sdpa"]
475
+ if args.detailed:
476
+ backends.append("manual")
477
+
478
+ run_benchmark(
479
+ test_configs=test_configs,
480
+ backends=backends,
481
+ warmup=args.warmup,
482
+ runs=args.runs,
483
+ detailed=args.detailed,
484
+ )
485
+
486
+
487
+ if __name__ == "__main__":
488
+ main()
benchmarks/full_benchmark.py ADDED
@@ -0,0 +1,696 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025 Delanoe Pirard / Aedelon - Apache 2.0
3
+ """
4
+ Full Benchmark Suite for Depth Anything 3
5
+
6
+ Tests ALL optimization combinations for each device (CPU, MPS, CUDA).
7
+
8
+ Optimizations tested:
9
+ - Preprocessing: CPU (PIL) vs GPU (NVJPEG on CUDA)
10
+ - Attention: SDPA (Flash Attention) vs Manual
11
+
12
+ Usage:
13
+ python benchmarks/full_benchmark.py # Best device only
14
+ python benchmarks/full_benchmark.py -d all # All devices
15
+ python benchmarks/full_benchmark.py -d cuda # CUDA only
16
+ python benchmarks/full_benchmark.py --quick # Quick mode
17
+ """
18
+
19
+ import argparse
20
+ import gc
21
+ import logging
22
+ import os
23
+ import shutil
24
+ import sys
25
+ import time
26
+ import warnings
27
+ from dataclasses import dataclass
28
+ from typing import Dict, List
29
+
30
+ # Suppress ALL logging before the heavy imports (numpy/torch/package) below
31
+ logging.disable(logging.CRITICAL)
32
+ os.environ["DA3_LOG_LEVEL"] = "ERROR"
33
+ warnings.filterwarnings("ignore")
34
+
35
+ import numpy as np
36
+ import torch
37
+ from PIL import Image
38
+
39
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
40
+
41
+ # Suppress depth_anything_3 logger specifically
42
+ logging.getLogger("depth_anything_3").disabled = True
43
+ logging.getLogger("dinov2").disabled = True
44
+
45
+
46
+ # ============================================================================
47
+ # STYLES
48
+ # ============================================================================
49
+
50
+ class Style:
51
+ CYAN = "\033[96m"
52
+ GREEN = "\033[92m"
53
+ YELLOW = "\033[93m"
54
+ RED = "\033[91m"
55
+ BOLD = "\033[1m"
56
+ DIM = "\033[2m"
57
+ RESET = "\033[0m"
58
+
59
+
60
+ def colored(text, color, bold=False):
61
+ prefix = Style.BOLD if bold else ""
62
+ return f"{prefix}{color}{text}{Style.RESET}"
63
+
64
+
65
+ # ============================================================================
66
+ # UTILITIES
67
+ # ============================================================================
68
+
69
+ def cleanup():
70
+ gc.collect()
71
+ if torch.cuda.is_available():
72
+ torch.cuda.empty_cache()
73
+ torch.cuda.reset_peak_memory_stats()
74
+ if torch.backends.mps.is_available():
75
+ torch.mps.empty_cache()
76
+
77
+
78
+ def sync_device(device):
79
+ if device.type == "cuda":
80
+ torch.cuda.synchronize()
81
+ elif device.type == "mps":
82
+ torch.mps.synchronize()
83
+
84
+
85
+ def get_available_devices() -> List[torch.device]:
86
+ """Get all available devices for benchmarking."""
87
+ devices = [torch.device("cpu")]
88
+ if torch.backends.mps.is_available():
89
+ devices.append(torch.device("mps"))
90
+ if torch.cuda.is_available():
91
+ devices.append(torch.device("cuda"))
92
+ return devices
93
+
94
+
95
+ def get_device_name(device: torch.device) -> str:
96
+ """Get human-readable device name."""
97
+ if device.type == "cuda":
98
+ return torch.cuda.get_device_name(device)
99
+ elif device.type == "mps":
100
+ return "Apple Silicon (MPS)"
101
+ else:
102
+ import platform
103
+ return f"CPU ({platform.processor() or 'Unknown'})"
104
+
105
+
106
+ # ============================================================================
107
+ # DATA CLASSES
108
+ # ============================================================================
109
+
110
+ @dataclass
111
+ class BenchmarkResult:
112
+ """Single benchmark result."""
113
+ mean_ms: float
114
+ std_ms: float
115
+ fps: float
116
+
117
+ @classmethod
118
+ def from_times(cls, times: List[float], batch_size: int = 1):
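+ # Descriptive note: `times` holds per-run latencies in milliseconds; fps scales by batch size.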
119
+ mean_ms = np.mean(times)
120
+ std_ms = np.std(times)
121
+ fps = 1000 / mean_ms * batch_size
122
+ return cls(mean_ms=mean_ms, std_ms=std_ms, fps=fps)
123
+
124
+
125
+ @dataclass
126
+ class OptimizationConfig:
127
+ """Configuration for a specific optimization combination."""
128
+ name: str
129
+ preprocessing: str # "cpu" or "gpu"
130
+ attention: str # "sdpa" or "manual"
131
+ description: str
132
+
133
+ @property
134
+ def short_name(self) -> str:
135
+ prep = "GPU" if self.preprocessing == "gpu" else "CPU"
136
+ attn = "SDPA" if self.attention == "sdpa" else "Manual"
137
+ return f"{prep}+{attn}"
138
+
139
+
140
+ # ============================================================================
141
+ # BENCHMARK FUNCTIONS
142
+ # ============================================================================
143
+
144
+ def get_optimization_configs(device: torch.device) -> List[OptimizationConfig]:
145
+ """Get all valid optimization configurations for a device."""
146
+ configs = []
147
+
148
+ if device.type == "cuda":
149
+ # CUDA: All 4 combinations
150
+ configs = [
151
+ OptimizationConfig("gpu_sdpa", "gpu", "sdpa", "GPU Decode (NVJPEG) + SDPA (Flash)"),
152
+ OptimizationConfig("gpu_manual", "gpu", "manual", "GPU Decode (NVJPEG) + Manual Attn"),
153
+ OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA (Flash)"),
154
+ OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
155
+ ]
156
+ elif device.type == "mps":
157
+ # MPS: CPU preprocessing is better, 2 combinations
158
+ configs = [
159
+ OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA"),
160
+ OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
161
+ ]
162
+ else:
163
+ # CPU: 2 combinations
164
+ configs = [
165
+ OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "SDPA Attention"),
166
+ OptimizationConfig("cpu_manual", "cpu", "manual", "Manual Attention"),
167
+ ]
168
+
169
+ return configs
170
+
171
+
172
+ def benchmark_preprocessing_detailed(device: torch.device, runs: int = 5) -> Dict:
173
+ """Benchmark preprocessing in detail."""
174
+ from depth_anything_3.utils.io.input_processor import InputProcessor
175
+ from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
176
+
177
+ results = {}
178
+ temp_dir = "temp_bench_preproc"
179
+
180
+ sizes = [
181
+ ("720p", 1280, 720),
182
+ ("1080p", 1920, 1080),
183
+ ("4K", 3840, 2160),
184
+ ]
185
+
186
+ os.makedirs(temp_dir, exist_ok=True)
187
+
188
+ try:
189
+ cpu_proc = InputProcessor()
190
+ gpu_proc = None
191
+ if device.type == "cuda":
192
+ gpu_proc = GPUInputProcessor(device=device)
193
+
194
+ for name, w, h in sizes:
195
+ results[name] = {}
196
+
197
+ # Create test files
198
+ files = []
199
+ pil_imgs = []
200
+ for i in range(4):
201
+ img = Image.new("RGB", (w, h), color=(100 + i*10, 150, 200))
202
+ fpath = f"{temp_dir}/{name}_{i}.jpg"
203
+ img.save(fpath, quality=95)
204
+ files.append(fpath)
205
+ pil_imgs.append(img.copy())
206
+
207
+ # CPU benchmark
208
+ cleanup()
209
+ for _ in range(2):
210
+ cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
211
+
212
+ times = []
213
+ for _ in range(runs):
214
+ start = time.perf_counter()
215
+ cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
216
+ times.append((time.perf_counter() - start) * 1000)
217
+ results[name]["cpu"] = BenchmarkResult.from_times(times, batch_size=4)
218
+
219
+ # GPU benchmark (NVJPEG for CUDA)
220
+ if gpu_proc and gpu_proc.use_gpu:
221
+ cleanup()
222
+ for _ in range(2):
223
+ gpu_proc(image=files, process_res=518, num_workers=1)
224
+ sync_device(device)
225
+
226
+ times = []
227
+ for _ in range(runs):
228
+ sync_device(device)
229
+ start = time.perf_counter()
230
+ gpu_proc(image=files, process_res=518, num_workers=1)
231
+ sync_device(device)
232
+ times.append((time.perf_counter() - start) * 1000)
233
+ results[name]["gpu"] = BenchmarkResult.from_times(times, batch_size=4)
234
+
235
+ finally:
236
+ shutil.rmtree(temp_dir, ignore_errors=True)
237
+
238
+ return results
239
+
240
+
241
+ def benchmark_attention_detailed(device: torch.device, runs: int = 10) -> Dict:
242
+ """Benchmark attention backends in detail."""
243
+ from depth_anything_3.model.dinov2.layers import Attention
244
+
245
+ results = {}
246
+ dtype = torch.float16 if device.type == "cuda" else torch.float32
247
+
248
+ configs = [
249
+ ("ViT-S (518px)", 384, 6, 529),
250
+ ("ViT-L (518px)", 1024, 16, 529),
251
+ ("ViT-L (770px)", 1024, 16, 1156),
252
+ ]
253
+
254
+ for name, dim, heads, seq_len in configs:
255
+ results[name] = {}
256
+ x = torch.randn(1, seq_len, dim, device=device, dtype=dtype)
257
+
258
+ for backend in ["sdpa", "manual"]:
259
+ cleanup()
260
+ attn = Attention(dim=dim, num_heads=heads, attn_backend=backend).to(device, dtype)
261
+ attn.eval()
262
+
263
+ # Warmup
264
+ with torch.no_grad():
265
+ for _ in range(3):
266
+ attn(x)
267
+ sync_device(device)
268
+
269
+ # Benchmark
270
+ times = []
271
+ with torch.no_grad():
272
+ for _ in range(runs):
273
+ sync_device(device)
274
+ start = time.perf_counter()
275
+ attn(x)
276
+ sync_device(device)
277
+ times.append((time.perf_counter() - start) * 1000)
278
+
279
+ results[name][backend] = BenchmarkResult.from_times(times)
280
+ del attn
281
+
282
+ return results
283
+
284
+
285
+ def benchmark_inference_matrix(
286
+ device: torch.device,
287
+ models: List[str],
288
+ runs: int = 3,
289
+ ) -> Dict:
290
+ """Benchmark all optimization combinations for inference."""
291
+ from depth_anything_3.api import DepthAnything3
292
+
293
+ results = {}
294
+ temp_dir = "temp_bench_infer"
295
+ configs = get_optimization_configs(device)
296
+
297
+ os.makedirs(temp_dir, exist_ok=True)
298
+
299
+ # Create test images (720p)
300
+ img_paths = []
301
+ pil_imgs = []
302
+ for i in range(4):
303
+ img = Image.new("RGB", (1280, 720), color=(100 + i*20, 150, 200))
304
+ path = f"{temp_dir}/test_{i}.jpg"
305
+ img.save(path, quality=95)
306
+ img_paths.append(path)
307
+ pil_imgs.append(img.copy())
308
+
309
+ try:
310
+ for model_name in models:
311
+ results[model_name] = {}
312
+
313
+ for config in configs:
314
+ cleanup()
315
+
316
+ # Set attention backend
317
+ os.environ["DA3_ATTENTION_BACKEND"] = config.attention
318
+
319
+ # Load model fresh (to apply attention backend)
320
+ model = DepthAnything3(
321
+ model_name=model_name,
322
+ device=device,
323
+ use_cache=False,
324
+ )
325
+
326
+ # Choose input based on preprocessing
327
+ if config.preprocessing == "gpu" and device.type == "cuda":
328
+ test_input = img_paths[:1] # File paths for NVJPEG
329
+ else:
330
+ test_input = pil_imgs[:1] # PIL for CPU preprocessing
331
+
332
+ # Warmup
333
+ for _ in range(3):
334
+ model.inference(test_input, process_res=518)
335
+ sync_device(device)
336
+
337
+ # Benchmark
338
+ times = []
339
+ for _ in range(runs):
340
+ sync_device(device)
341
+ start = time.perf_counter()
342
+ model.inference(test_input, process_res=518)
343
+ sync_device(device)
344
+ times.append((time.perf_counter() - start) * 1000)
345
+
346
+ results[model_name][config.name] = {
347
+ "result": BenchmarkResult.from_times(times, batch_size=1),
348
+ "config": config,
349
+ }
350
+
351
+ del model
352
+ cleanup()
353
+
354
+ finally:
355
+ shutil.rmtree(temp_dir, ignore_errors=True)
356
+
357
+ return results
358
+
359
+
360
+ # ============================================================================
361
+ # DISPLAY FUNCTIONS
362
+ # ============================================================================
363
+
364
+ def print_header(title: str):
365
+ """Print section header."""
366
+ print()
367
+ print(colored("═" * 70, Style.CYAN))
368
+ print(colored("║", Style.CYAN) + colored(f" {title}", Style.BOLD).center(77) + colored("║", Style.CYAN))
369
+ print(colored("═" * 70, Style.CYAN))
370
+
371
+
372
+ def print_subheader(title: str):
373
+ """Print subsection header."""
374
+ print()
375
+ print(colored(f"▶ {title}", Style.YELLOW, bold=True))
376
+ print(colored("─" * 70, Style.DIM))
377
+
378
+
379
+ def format_speedup(speedup: float) -> str:
380
+ """Format speedup with color."""
381
+ if speedup >= 1.5:
382
+ return colored(f"{speedup:.2f}x", Style.GREEN, bold=True)
383
+ elif speedup >= 1.1:
384
+ return colored(f"{speedup:.2f}x", Style.GREEN)
385
+ elif speedup >= 0.95:
386
+ return f"{speedup:.2f}x"
387
+ else:
388
+ return colored(f"{speedup:.2f}x", Style.RED)
389
+
390
+
391
+ def print_preprocessing_results(results: Dict, device: torch.device):
392
+ """Print preprocessing benchmark results."""
393
+ print_subheader("PREPROCESSING (4 images batch)")
394
+
395
+ has_gpu = any("gpu" in r for r in results.values())
396
+
397
+ if has_gpu:
398
+ print(f" {'Resolution':<12} {'CPU (PIL)':<14} {'GPU (NVJPEG)':<14} {'Speedup':<10}")
399
+ print(f" {'-'*50}")
400
+
401
+ for name, data in results.items():
402
+ cpu_ms = data["cpu"].mean_ms
403
+ if "gpu" in data:
404
+ gpu_ms = data["gpu"].mean_ms
405
+ speedup = cpu_ms / gpu_ms
406
+ print(f" {name:<12} {cpu_ms:>8.1f} ms {gpu_ms:>8.1f} ms {format_speedup(speedup)}")
407
+ else:
408
+ print(f" {name:<12} {cpu_ms:>8.1f} ms {'N/A':<14}")
409
+ else:
410
+ print(f" {'Resolution':<12} {'CPU (PIL)':<14}")
411
+ print(f" {'-'*30}")
412
+ for name, data in results.items():
413
+ cpu_ms = data["cpu"].mean_ms
414
+ print(f" {name:<12} {cpu_ms:>8.1f} ms")
415
+
416
+ # Summary
417
+ if has_gpu:
418
+ speedups = []
419
+ for data in results.values():
420
+ if "gpu" in data:
421
+ speedups.append(data["cpu"].mean_ms / data["gpu"].mean_ms)
422
+ if speedups:
423
+ avg = np.mean(speedups)
424
+ print()
425
+ print(f" {colored('→', Style.GREEN)} GPU preprocessing avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster")
426
+
427
+
428
+ def print_attention_results(results: Dict, device: torch.device):
429
+ """Print attention benchmark results."""
430
+ print_subheader("ATTENTION (per layer forward pass)")
431
+
432
+ print(f" {'Config':<18} {'SDPA':<12} {'Manual':<12} {'Speedup':<10}")
433
+ print(f" {'-'*52}")
434
+
435
+ for name, data in results.items():
436
+ sdpa_ms = data["sdpa"].mean_ms
437
+ manual_ms = data["manual"].mean_ms
438
+ speedup = manual_ms / sdpa_ms
439
+ print(f" {name:<18} {sdpa_ms:>6.3f} ms {manual_ms:>6.3f} ms {format_speedup(speedup)}")
440
+
441
+ # Summary
442
+ speedups = [d["manual"].mean_ms / d["sdpa"].mean_ms for d in results.values()]
443
+ avg = np.mean(speedups)
444
+ print()
445
+ print(f" {colored('→', Style.GREEN)} SDPA avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster than manual")
446
+
447
+ # Check Flash SDP
448
+ if device.type == "cuda":
449
+ from torch.backends.cuda import flash_sdp_enabled
450
+ if flash_sdp_enabled():
451
+ print(f" {colored('→', Style.GREEN)} Flash Attention: {colored('ENABLED', Style.GREEN, bold=True)} (PyTorch native)")
452
+
453
+
454
+ def print_inference_matrix(results: Dict, device: torch.device):
455
+ """Print inference benchmark matrix."""
456
+ print_subheader("END-TO-END INFERENCE (720p input, batch=1)")
457
+
458
+ configs = get_optimization_configs(device)
459
+
460
+ # Header
461
+ header = f" {'Model':<12}"
462
+ for cfg in configs:
463
+ header += f" {cfg.short_name:<14}"
464
+ header += " Best"
465
+ print(header)
466
+ print(f" {'-'*(14 + 15*len(configs) + 6)}")
467
+
468
+ # Results per model
469
+ for model_name, model_results in results.items():
470
+ row = f" {model_name:<12}"
471
+
472
+ best_fps = 0
473
+ best_config = None
474
+ worst_fps = float('inf')
475
+
476
+ for cfg in configs:
477
+ if cfg.name in model_results:
478
+ result = model_results[cfg.name]["result"]
479
+ fps = result.fps
480
+ row += f" {fps:>6.1f} img/s "
481
+
482
+ if fps > best_fps:
483
+ best_fps = fps
484
+ best_config = cfg
485
+ if fps < worst_fps:
486
+ worst_fps = fps
487
+ else:
488
+ row += f" {'N/A':<14}"
489
+
490
+ # Best indicator
491
+ if best_config:
492
+ row += f" {colored(best_config.short_name, Style.GREEN, bold=True)}"
493
+
494
+ print(row)
495
+
496
+ # Summary
497
+ print()
498
+ print(f" {Style.DIM}Legend: GPU=NVJPEG decode, CPU=PIL decode, SDPA=Flash Attention{Style.RESET}")
499
+
500
+
501
+ def print_device_summary(
502
+ device: torch.device,
503
+ preproc_results: Dict,
504
+ attn_results: Dict,
505
+ infer_results: Dict,
506
+ ):
507
+ """Print summary for a device."""
508
+ print()
509
+ print(colored("─" * 70, Style.CYAN))
510
+ print(colored(f" {device.type.upper()} - OPTIMIZATION SUMMARY", Style.BOLD))
511
+ print(colored("─" * 70, Style.CYAN))
512
+
513
+ # Best configuration
514
+ if infer_results:
515
+ print()
516
+ print(f" {colored('Best configuration per model:', Style.CYAN)}")
517
+
518
+ for model_name, model_results in infer_results.items():
519
+ if not model_results:
520
+ continue
521
+
522
+ best_name = max(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
523
+ best = model_results[best_name]
524
+ worst_name = min(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
525
+ worst = model_results[worst_name]
526
+
527
+ speedup = best["result"].fps / worst["result"].fps if worst["result"].fps > 0 else 1
528
+
529
+ print(f" {model_name:<12} {colored(best['config'].description, Style.GREEN)}")
530
+ print(f" {'':<12} {best['result'].fps:.1f} img/s ({speedup:.1f}x vs worst)")
531
+
532
+ # Recommendations
533
+ print()
534
+ print(f" {colored('Recommendations:', Style.CYAN)}")
535
+
536
+ if device.type == "cuda":
537
+ print(f" ✓ Use {colored('GPU preprocessing (NVJPEG)', Style.GREEN)} for file inputs")
538
+ print(f" ✓ {colored('SDPA (Flash Attention)', Style.GREEN)} is enabled by default")
539
+ print(f" ✓ Pass file paths (not PIL images) to leverage NVJPEG")
540
+ elif device.type == "mps":
541
+ print(f" ✓ Use {colored('CPU preprocessing', Style.GREEN)} (faster than GPU on MPS)")
542
+ print(f" ✓ {colored('SDPA', Style.GREEN)} provides moderate speedup")
543
+ else:
544
+ print(f" ✓ {colored('SDPA', Style.GREEN)} provides speedup over manual attention")
545
+ print(f" ○ Consider using GPU (CUDA/MPS) for better performance")
546
+
547
+
548
+ # ============================================================================
549
+ # MAIN
550
+ # ============================================================================
551
+
552
+ def main():
553
+ parser = argparse.ArgumentParser(
554
+ description="DA3 Full Benchmark - Test all optimization combinations",
555
+ formatter_class=argparse.RawDescriptionHelpFormatter,
556
+ epilog="""
557
+ Examples:
558
+ python benchmarks/full_benchmark.py # Best device only
559
+ python benchmarks/full_benchmark.py -d all # All devices
560
+ python benchmarks/full_benchmark.py -d cuda # CUDA only
561
+ python benchmarks/full_benchmark.py --quick # Quick mode (fewer runs)
562
+ python benchmarks/full_benchmark.py --models da3-small da3-large
563
+ """
564
+ )
565
+ parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
566
+ parser.add_argument("--skip-preprocessing", action="store_true", help="Skip preprocessing benchmark")
567
+ parser.add_argument("--skip-attention", action="store_true", help="Skip attention benchmark")
568
+ parser.add_argument("--skip-inference", action="store_true", help="Skip inference benchmark")
569
+ parser.add_argument("-d", "--device", type=str, default="auto",
570
+ choices=["auto", "cpu", "mps", "cuda", "all"],
571
+ help="Device to benchmark (default: auto)")
572
+ parser.add_argument("--models", nargs="+", default=None,
573
+ help="Models to benchmark (default: all)")
574
+ args = parser.parse_args()
575
+
576
+ # Configure runs
577
+ runs_preproc = 3 if args.quick else 5
578
+ runs_attn = 5 if args.quick else 10
579
+ runs_infer = 2 if args.quick else 4
580
+
581
+ # Determine models
582
+ if args.models:
583
+ models = args.models
584
+ elif args.quick:
585
+ models = ["da3-small", "da3-large"]
586
+ else:
587
+ models = ["da3-small", "da3-base", "da3-large"]
588
+
589
+ # Determine devices
590
+ available_devices = get_available_devices()
591
+ if args.device == "auto":
592
+ devices_to_test = [available_devices[-1]] # Best available
593
+ elif args.device == "all":
594
+ devices_to_test = available_devices
595
+ else:
596
+ requested = torch.device(args.device)
597
+ if requested in available_devices:
598
+ devices_to_test = [requested]
599
+ else:
600
+ print(f"Error: Device '{args.device}' not available.")
601
+ print(f"Available: {[d.type for d in available_devices]}")
602
+ return
603
+
604
+ # Main header
605
+ print()
606
+ print(colored("╔" + "═" * 68 + "╗", Style.CYAN))
607
+ print(colored("║", Style.CYAN) + colored(" DEPTH ANYTHING 3 - FULL BENCHMARK", Style.BOLD).center(77) + colored("║", Style.CYAN))
608
+ print(colored("║", Style.CYAN) + colored(" All Optimization Combinations", Style.DIM).center(77) + colored("║", Style.CYAN))
609
+ print(colored("╚" + "═" * 68 + "╝", Style.CYAN))
610
+
611
+ print(f"\n {Style.DIM}PyTorch{Style.RESET} : {colored(torch.__version__, Style.CYAN)}")
612
+ print(f" {Style.DIM}Models{Style.RESET} : {colored(', '.join(models), Style.CYAN)}")
613
+ print(f" {Style.DIM}Mode{Style.RESET} : {colored('Quick' if args.quick else 'Full', Style.CYAN)}")
614
+
615
+ print(f"\n {Style.DIM}Available devices:{Style.RESET}")
616
+ for d in available_devices:
617
+ status = colored("●", Style.GREEN) if d in devices_to_test else colored("○", Style.DIM)
618
+ print(f" {status} {d.type.upper():<6} {get_device_name(d)}")
619
+
620
+ all_results = {}
621
+
622
+ # Run benchmarks for each device
623
+ for device in devices_to_test:
624
+ device_name = get_device_name(device)
625
+ all_results[device.type] = {}
626
+
627
+ print_header(f"{device.type.upper()} - {device_name}")
628
+
629
+ # 1. Preprocessing
630
+ preproc_results = {}
631
+ if not args.skip_preprocessing and device.type != "cpu":
632
+ preproc_results = benchmark_preprocessing_detailed(device, runs=runs_preproc)
633
+ all_results[device.type]["preprocessing"] = preproc_results
634
+ print_preprocessing_results(preproc_results, device)
635
+ elif device.type == "cpu":
636
+ print_subheader("PREPROCESSING")
637
+ print(f" {Style.DIM}Skipped (CPU only - no GPU comparison){Style.RESET}")
638
+
639
+ # 2. Attention
640
+ attn_results = {}
641
+ if not args.skip_attention:
642
+ attn_results = benchmark_attention_detailed(device, runs=runs_attn)
643
+ all_results[device.type]["attention"] = attn_results
644
+ print_attention_results(attn_results, device)
645
+
646
+ # 3. Inference Matrix
647
+ infer_results = {}
648
+ if not args.skip_inference:
649
+ infer_results = benchmark_inference_matrix(device, models, runs=runs_infer)
650
+ all_results[device.type]["inference"] = infer_results
651
+ print_inference_matrix(infer_results, device)
652
+
653
+ # Device Summary
654
+ print_device_summary(device, preproc_results, attn_results, infer_results)
655
+
656
+ cleanup()
657
+
658
+ # Cross-device comparison
659
+ if len(devices_to_test) > 1 and not args.skip_inference:
660
+ print_header("CROSS-DEVICE COMPARISON")
661
+
662
+ # Find common model
663
+ common_model = models[-1] # Usually largest tested
664
+
665
+ print()
666
+ print(f" {colored(f'{common_model} (best config per device):', Style.CYAN)}")
667
+ print(f" {'Device':<10} {'Config':<30} {'Performance':<15}")
668
+ print(f" {'-'*55}")
669
+
670
+ base_fps = None
671
+ for device in devices_to_test:
672
+ if device.type in all_results and "inference" in all_results[device.type]:
673
+ infer = all_results[device.type]["inference"].get(common_model, {})
674
+ if infer:
675
+ best_name = max(infer.keys(), key=lambda k: infer[k]["result"].fps)
676
+ best = infer[best_name]
677
+ fps = best["result"].fps
678
+
679
+ if base_fps is None:
680
+ base_fps = fps
681
+
682
+ speedup = fps / base_fps if base_fps else 1
683
+ speedup_str = f"({speedup:.1f}x)" if device != devices_to_test[0] else "(baseline)"
684
+
685
+ print(f" {device.type.upper():<10} {best['config'].description:<30} {fps:>5.1f} img/s {speedup_str}")
686
+
687
+ # Final summary
688
+ print()
689
+ print(colored("═" * 70, Style.CYAN))
690
+ print(colored("║", Style.CYAN) + colored(" BENCHMARK COMPLETE", Style.BOLD).center(77) + colored("║", Style.CYAN))
691
+ print(colored("═" * 70, Style.CYAN))
692
+ print()
693
+
694
+
695
+ if __name__ == "__main__":
696
+ main()
benchmarks/gpu_preprocessing_benchmark.py ADDED
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025 Delanoe Pirard
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ GPU Preprocessing Benchmark
18
+
19
+ Compares CPU vs GPU preprocessing performance across different image sizes.
20
+ Measures:
21
+ - Preprocessing time only
22
+ - Total inference time (preprocessing + model forward)
23
+ - Memory usage
24
+ - Speedup percentages
25
+ """
26
+
27
+ import os
+ import shutil
+ import time
28
+ from typing import List, Tuple
29
+
30
+ import numpy as np
31
+ import torch
32
+ from PIL import Image
33
+
34
+ from depth_anything_3.utils.io.input_processor import InputProcessor
35
+ from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
36
+
40
+
41
+ def create_test_files(sizes: List[Tuple[int, int, str]], count: int = 4, temp_dir: str = "temp_bench_imgs") -> Tuple[List[List[str]], str]:
42
+ """Create test image files on disk.
43
+
44
+ Args:
45
+ sizes: List of (width, height, description) tuples
46
+ count: Number of images per size
47
+ temp_dir: Directory to save images
48
+
49
+ Returns:
50
+ List of image path batches, one per size
51
+ Path to temp directory
52
+ """
53
+ if os.path.exists(temp_dir):
54
+ shutil.rmtree(temp_dir)
55
+ os.makedirs(temp_dir)
56
+
57
+ batches = []
58
+ for w, h, _ in sizes:
59
+ batch = []
60
+ for i in range(count):
61
+ img = Image.new("RGB", (w, h), color=(i * 50, 100, 150))
62
+ fname = f"{temp_dir}/{w}x{h}_{i}.jpg"
63
+ img.save(fname, quality=95, subsampling=0)
64
+ batch.append(fname)
65
+ batches.append(batch)
66
+ return batches, temp_dir
67
+
68
+ def benchmark_gpu_decode_files(
69
+ processor,
70
+ image_paths: List[str],
71
+ process_res: int = 504,
72
+ warmup_runs: int = 2,
73
+ benchmark_runs: int = 10,
74
+ num_workers: int = 8,
75
+ ) -> float:
76
+ """Benchmark GPU decoding (from file path)."""
77
+ # Warmup
78
+ for _ in range(warmup_runs):
79
+ processor(
80
+ image=image_paths,
81
+ process_res=process_res,
82
+ process_res_method="upper_bound_resize",
83
+ num_workers=num_workers,
84
+ )
85
+
86
+ # Benchmark
87
+ times = []
88
+ for _ in range(benchmark_runs):
89
+ if hasattr(processor, 'device') and processor.device.type == "cuda":
90
+ torch.cuda.synchronize()
91
+
92
+ start = time.perf_counter()
93
+ # Pass file paths directly to GPUInputProcessor
94
+ tensor, _, _ = processor(
95
+ image=image_paths,
96
+ process_res=process_res,
97
+ process_res_method="upper_bound_resize",
98
+ num_workers=num_workers,
99
+ )
100
+
101
+ if hasattr(processor, 'device') and processor.device.type == "cuda":
102
+ torch.cuda.synchronize()
103
+
104
+ elapsed = time.perf_counter() - start
105
+ times.append(elapsed)
106
+
107
+ return np.mean(times)
108
+
109
+ def create_test_images(sizes: List[Tuple[int, int]], count: int = 4) -> List[List[Image.Image]]:
110
+ """Create test images for each size.
111
+
112
+ Args:
113
+ sizes: List of (width, height) tuples
114
+ count: Number of images per size
115
+
116
+ Returns:
117
+ List of image batches, one per size
118
+ """
119
+ batches = []
120
+ for w, h in sizes:
121
+ batch = [Image.new("RGB", (w, h), color=(i * 50, 100, 150)) for i in range(count)]
122
+ batches.append(batch)
123
+ return batches
124
+
125
+
126
+ def benchmark_hybrid(
127
+ processor,
128
+ images: List[Image.Image],
129
+ process_res: int = 504,
130
+ warmup_runs: int = 2,
131
+ benchmark_runs: int = 10,
132
+ num_workers: int = 8,
133
+ device=torch.device("cuda")
134
+ ) -> float:
135
+ """Benchmark hybrid preprocessing (CPU resize -> GPU normalize)."""
136
+
137
+ # Warmup
138
+ for _ in range(warmup_runs):
139
+ imgs_cpu, _, _ = processor(
140
+ image=images,
141
+ process_res=process_res,
142
+ process_res_method="upper_bound_resize",
143
+ num_workers=num_workers,
144
+ perform_normalization=False
145
+ )
146
+ imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
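+ # Normalize on-device with the standard ImageNet mean/std used throughout this file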
147
+ _ = InputProcessor.normalize_tensor(imgs_gpu, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
148
+
149
+ # Benchmark
150
+ times = []
151
+ for _ in range(benchmark_runs):
152
+ if device.type == "cuda":
153
+ torch.cuda.synchronize()
154
+
155
+ start = time.perf_counter()
156
+
157
+ # 1. CPU Preprocessing (uint8)
158
+ imgs_cpu, _, _ = processor(
159
+ image=images,
160
+ process_res=process_res,
161
+ process_res_method="upper_bound_resize",
162
+ num_workers=num_workers,
163
+ perform_normalization=False
164
+ )
165
+
166
+ # 2. Transfer + Normalize
167
+ imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
168
+ _ = InputProcessor.normalize_tensor(imgs_gpu, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
169
+
170
+ if device.type == "cuda":
171
+ torch.cuda.synchronize()
172
+
173
+ elapsed = time.perf_counter() - start
174
+ times.append(elapsed)
175
+
176
+ return np.mean(times)
177
+
178
+ def benchmark_preprocessing(
179
+ processor,
180
+ images: List[Image.Image],
181
+ process_res: int = 504,
182
+ warmup_runs: int = 2,
183
+ benchmark_runs: int = 10,
184
+ num_workers: int = 8,
185
+ ) -> float:
186
+ """Benchmark preprocessing performance.
187
+
188
+ Args:
189
+ processor: InputProcessor or GPUInputProcessor instance
190
+ images: List of test images
191
+ process_res: Processing resolution
192
+ warmup_runs: Number of warmup runs to discard
193
+ benchmark_runs: Number of benchmark runs to average
194
+ num_workers: Number of parallel workers (for CPU processor)
195
+
196
+ Returns:
197
+ Average preprocessing time in seconds
198
+ """
199
+ # Warmup
200
+ for _ in range(warmup_runs):
201
+ processor(
202
+ image=images,
203
+ process_res=process_res,
204
+ process_res_method="upper_bound_resize",
205
+ num_workers=num_workers,
206
+ )
207
+
208
+ # Benchmark
209
+ times = []
210
+ for _ in range(benchmark_runs):
211
+ if hasattr(processor, 'device') and processor.device.type == "cuda":
212
+ torch.cuda.synchronize()
213
+
214
+ start = time.perf_counter()
215
+ tensor, _, _ = processor(
216
+ image=images,
217
+ process_res=process_res,
218
+ process_res_method="upper_bound_resize",
219
+ num_workers=num_workers,
220
+ )
221
+
222
+ if hasattr(processor, 'device') and processor.device.type == "cuda":
223
+ torch.cuda.synchronize()
224
+
225
+ elapsed = time.perf_counter() - start
226
+ times.append(elapsed)
227
+
228
+ return np.mean(times)
229
+
230
+
231
+ def print_results_table(results: List[dict]):
232
+ """Pretty print benchmark results as table."""
233
+ print("\n" + "=" * 140)
234
+ print("GPU PREPROCESSING BENCHMARK RESULTS")
235
+ print("=" * 140)
236
+ print(f"{'Image Size':<15} {'CPU Time':<12} {'GPU Time':<12} {'Hybrid Time':<12} {'GPU Decode':<12} {'Best Method':<15}")
237
+ print("-" * 140)
238
+
239
+ for result in results:
240
+ size_str = f"{result['width']}x{result['height']}"
241
+ cpu_time = f"{result['cpu_time']*1000:.2f} ms"
242
+ gpu_time = f"{result['gpu_time']*1000:.2f} ms"
243
+ hybrid_time = f"{result['hybrid_time']*1000:.2f} ms"
244
+ gpu_decode_time = f"{result['gpu_decode_time']*1000:.2f} ms"
245
+
246
+ times = [result['cpu_time'], result['gpu_time'], result['hybrid_time'], result['gpu_decode_time']]
247
+ labels = ["CPU", "GPU", "Hybrid", "GPU Decode"]
248
+ best_idx = np.argmin(times)
249
+ best = labels[best_idx]
250
+
251
+ print(f"{size_str:<15} {cpu_time:<12} {gpu_time:<12} {hybrid_time:<12} {gpu_decode_time:<12} {best:<15}")
252
+
253
+ print("=" * 140 + "\n")
254
+
255
+
256
+ def main():
257
+ """Run comprehensive benchmark."""
258
+ print("\n" + "=" * 100)
259
+ print("INITIALIZING GPU PREPROCESSING BENCHMARK")
260
+ print("=" * 100)
261
+
262
+ # Check GPU availability
263
+ if torch.cuda.is_available():
264
+ device_name = "cuda"
265
+ device_info = torch.cuda.get_device_name(0)
266
+ print(f"✓ GPU Device: {device_info}")
267
+ print("✓ GPU preprocessing: ENABLED (NVJPEG + Kornia)")
268
+ elif torch.backends.mps.is_available():
269
+ device_name = "mps"
270
+ device_info = "Apple MPS"
271
+ print(f"✓ GPU Device: {device_info}")
272
+ print("ℹ GPU preprocessing: DISABLED on MPS (CPU is faster on Apple Silicon)")
273
+ print(" → GPUInputProcessor will use CPU path automatically")
274
+ print(" → GPU reserved for model inference (5-10x speedup there)")
275
+ else:
276
+ print("✗ No GPU available - benchmark will show CPU vs CPU (no speedup expected)")
277
+ device_name = "cpu"
278
+ device_info = "CPU only"
279
+
280
+ device = torch.device(device_name)
281
+
282
+ # Create processors
283
+ cpu_proc = InputProcessor()
284
+ gpu_proc = GPUInputProcessor(device=device_name)
285
+ print(f"✓ Processors initialized: CPU vs {device_name.upper()}")
286
+
287
+ # Test configurations
288
+ # Format: (width, height, description)
289
+ test_sizes = [
290
+ (640, 480, "Small (VGA)"),
291
+ (1280, 720, "Medium (HD)"),
292
+ (1920, 1080, "Large (Full HD)"),
293
+ (3840, 2160, "XLarge (4K)"),
294
+ ]
295
+
296
+ process_res = 504
297
+ num_images = 4
298
+ num_workers = 8
299
+
300
+ print(f"✓ Test config: {num_images} images per batch, process_res={process_res}, num_workers={num_workers}")
301
+ print(f"✓ Testing {len(test_sizes)} image sizes: {', '.join([desc for _, _, desc in test_sizes])}")
302
+
303
+ # Create test images
304
+ print("\nGenerating test images (PIL & Files)...")
305
+ image_batches_pil = create_test_images([(w, h) for w, h, _ in test_sizes], count=num_images)
306
+ image_batches_files, temp_dir = create_test_files(test_sizes, count=num_images)
307
+ print("✓ Test images generated")
308
+
309
+ # Run benchmarks
310
+ print("\nRunning benchmarks (this may take a minute)...\n")
311
+ results = []
312
+
313
+ try:
314
+ for (w, h, desc), imgs_pil, imgs_files in zip(test_sizes, image_batches_pil, image_batches_files):
315
+ print(f"Benchmarking {desc} ({w}x{h})...", end=" ", flush=True)
316
+
317
+ cpu_time = benchmark_preprocessing(cpu_proc, imgs_pil, process_res, num_workers=num_workers)
318
+ gpu_time = benchmark_preprocessing(gpu_proc, imgs_pil, process_res, num_workers=num_workers)
319
+ hybrid_time = benchmark_hybrid(cpu_proc, imgs_pil, process_res, num_workers=num_workers, device=device)
320
+
321
+ # GPU Decode uses file paths
322
+ gpu_decode_time = benchmark_gpu_decode_files(gpu_proc, imgs_files, process_res, num_workers=num_workers)
323
+
324
+ results.append({
325
+ 'width': w,
326
+ 'height': h,
327
+ 'description': desc,
328
+ 'cpu_time': cpu_time,
329
+ 'gpu_time': gpu_time,
330
+ 'hybrid_time': hybrid_time,
331
+ 'gpu_decode_time': gpu_decode_time
332
+ })
333
+
334
+ best_time = min(cpu_time, gpu_time, hybrid_time, gpu_decode_time)
335
+ if best_time == gpu_decode_time:
336
+ win = "GPU Decode"
337
+ elif best_time == hybrid_time:
338
+ win = "Hybrid"
339
+ elif best_time == gpu_time:
340
+ win = "GPU"
341
+ else:
342
+ win = "CPU"
343
+
344
+ print(f"✓ Best: {win}")
345
+
346
+ # Print results table
347
+ print_results_table(results)
348
+
349
+ # Memory info (CUDA only)
350
+ if device_name == "cuda":
351
+ print("\nGPU Memory Usage:")
352
+ print(f" Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.1f} MB")
353
+ print(f" Cached: {torch.cuda.memory_reserved(0) / 1024**2:.1f} MB")
354
+
355
+ finally:
356
+ # Cleanup
357
+ if os.path.exists(temp_dir):
358
+ shutil.rmtree(temp_dir)
359
+ print(f"\n✓ Cleaned up temp directory: {temp_dir}")
360
+
361
+ if __name__ == "__main__":
362
+ main()
363
+
benchmarks/results/temp_images/test_image_0000.jpg ADDED
Git LFS Details
  • SHA256: b66fe582ae8d3bb62f6dcff97784125ae59ea7e62bdd49e89699fb7664c41a08
  • Pointer size: 131 Bytes
  • Size of remote file: 360 kB
benchmarks/results/temp_images/test_image_0001.jpg ADDED
Git LFS Details
  • SHA256: 24b4ecf7b65cc1f6c85453a03417da5a48d23295824d275e0cc9361a4feb36c5
  • Pointer size: 131 Bytes
  • Size of remote file: 361 kB
benchmarks/results/temp_images/test_image_0002.jpg ADDED
Git LFS Details
  • SHA256: 1dc218e581801d2332d2b7fe4eb35a997dc8d564d643dadeb2ee7539c7bf76f9
  • Pointer size: 131 Bytes
  • Size of remote file: 360 kB
benchmarks/results/temp_images/test_image_0003.jpg ADDED
Git LFS Details
  • SHA256: 4749f606ec40b7385eda5edecf5be20d02caed202fca5d39e198765dc93557f7
  • Pointer size: 131 Bytes
  • Size of remote file: 360 kB
docs/API.md ADDED
@@ -0,0 +1,465 @@
1
+ # 📚 DepthAnything3 API Documentation
2
+
3
+ ## 📑 Table of Contents
4
+
5
+ 1. [📖 Overview](#overview)
6
+ 2. [💡 Usage Examples](#usage-examples)
7
+ 3. [🔧 Core API](#core-api)
8
+ - [DepthAnything3 Class](#depthanything3-class)
9
+ - [inference() Method](#inference-method)
10
+ 4. [⚙️ Parameters](#parameters)
11
+ - [Input Parameters](#input-parameters)
12
+ - [Pose Alignment Parameters](#pose-alignment-parameters)
13
+ - [Feature Export Parameters](#feature-export-parameters)
14
+ - [Rendering Parameters](#rendering-parameters)
15
+ - [Processing Parameters](#processing-parameters)
16
+ - [Export Parameters](#export-parameters)
17
+ 5. [📤 Export Formats](#export-formats)
18
+ 6. [↩️ Return Value](#return-value)
19
+
20
+ ## 📖 Overview
21
+
22
+ This documentation provides a comprehensive API reference for DepthAnything3, including usage examples, parameter specifications, export formats, and advanced features. It covers both basic pose and depth estimation workflows and advanced pose-conditioned processing with multiple export capabilities.
23
+
24
+ ## 💡 Usage Examples
25
+
26
+ Here are quick examples to get you started:
27
+
28
+ ### 🚀 Basic Depth Estimation
29
+ ```python
30
+ from depth_anything_3.api import DepthAnything3
31
+
32
+ # Initialize and run inference
33
+ model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE").to("cuda")
34
+ prediction = model.inference(["image1.jpg", "image2.jpg"])
35
+ ```
36
+
37
+ ### 📷 Pose-Conditioned Depth Estimation
38
+ ```python
39
+ import numpy as np
40
+
41
+ # With camera parameters for better consistency
42
+ prediction = model.inference(
43
+ image=["image1.jpg", "image2.jpg"],
44
+ extrinsics=extrinsics_array, # (N, 4, 4)
45
+ intrinsics=intrinsics_array # (N, 3, 3)
46
+ )
47
+ ```
48
+
49
+ ### 📤 Export Results
50
+ ```python
51
+ # Export depth data and 3D visualization
52
+ prediction = model.inference(
53
+ image=image_paths,
54
+ export_dir="./output",
55
+ export_format="mini_npz-glb"
56
+ )
57
+ ```
58
+
59
+ ### 🔍 Feature Extraction
60
+ ```python
61
+ # Export intermediate features from specific layers
62
+ prediction = model.inference(
63
+ image=image_paths,
64
+ export_dir="./output",
65
+ export_format="feat_vis",
66
+ export_feat_layers=[0, 1, 2] # Export features from layers 0, 1, 2
67
+ )
68
+ ```
69
+
70
+ ### ✨ Advanced Export with Gaussian Splatting
71
+ ```python
72
+ # Export multiple formats including Gaussian Splatting
73
+ # Note: infer_gs=True requires da3-giant or da3nested-giant-large model
74
+ model = DepthAnything3(model_name="da3-giant").to("cuda")
75
+
76
+ prediction = model.inference(
77
+ image=image_paths,
78
+ extrinsics=extrinsics_array,
79
+ intrinsics=intrinsics_array,
80
+ export_dir="./output",
81
+ export_format="npz-glb-gs_ply-gs_video",
82
+ align_to_input_ext_scale=True,
83
+ infer_gs=True, # Required for gs_ply and gs_video exports
84
+ )
85
+ ```
86
+
87
+ ### 🎨 Advanced Export with Feature Visualization
88
+ ```python
89
+ # Export with intermediate feature visualization
90
+ prediction = model.inference(
91
+ image=image_paths,
92
+ export_dir="./output",
93
+ export_format="mini_npz-glb-depth_vis-feat_vis",
94
+ export_feat_layers=[0, 5, 10, 15, 20],
95
+ feat_vis_fps=30,
96
+ )
97
+ ```
98
+
99
+ ### 📐 Using Ray-Based Pose Estimation
100
+ ```python
101
+ # Use ray-based pose estimation instead of camera decoder
102
+ prediction = model.inference(
103
+ image=image_paths,
104
+ export_dir="./output",
105
+ export_format="glb",
106
+ use_ray_pose=True, # Enable ray-based pose estimation
107
+ )
108
+ ```
109
+
110
+ ### 🎯 Reference View Selection
111
+ ```python
112
+ # For multi-view inputs, automatically select the best reference view
113
+ prediction = model.inference(
114
+ image=image_paths,
115
+ ref_view_strategy="saddle_balanced", # Default: balanced selection
116
+ )
117
+
118
+ # For video sequences, use middle frame as reference
119
+ prediction = model.inference(
120
+ image=video_frames,
121
+ ref_view_strategy="middle", # Good for temporally ordered inputs
122
+ )
123
+ ```
124
+
125
+ ## 🔧 Core API
126
+
127
+ ### 🔨 DepthAnything3 Class
128
+
129
+ The main API class that provides depth estimation capabilities with optional pose conditioning.
130
+
131
+ #### 🎯 Initialization
132
+
133
+ ```python
134
+ from depth_anything_3 import DepthAnything3
135
+
136
+ # Initialize the model with a model name
137
+ model = DepthAnything3(model_name="da3-large")
138
+ model = model.to("cuda") # Move to GPU
139
+ ```
140
+
141
+ **Parameters:**
142
+ - `model_name` (str, default: "da3-large"): The name of the model preset to use.
143
+ - **Available models:**
144
+ - 🦾 `"da3-giant"` - 1.15B params, any-view model with GS support
145
+ - ⭐ `"da3-large"` - 0.35B params, any-view model (recommended for most use cases)
146
+ - 📦 `"da3-base"` - 0.12B params, any-view model
147
+ - 🪶 `"da3-small"` - 0.08B params, any-view model
148
+ - 👁️ `"da3mono-large"` - 0.35B params, monocular depth only
149
+ - 📏 `"da3metric-large"` - 0.35B params, metric depth with sky segmentation
150
+ - 🎯 `"da3nested-giant-large"` - 1.40B params, nested model with all features
151
+
152
+ ### 🚀 inference() Method
153
+
154
+ The primary inference method that processes images and returns depth predictions.
155
+
156
+ ```python
157
+ prediction = model.inference(
158
+ image=image_list,
159
+ extrinsics=extrinsics_array, # Optional
160
+ intrinsics=intrinsics_array, # Optional
161
+ align_to_input_ext_scale=True, # Whether to align predicted poses to input scale
162
+ infer_gs=True, # Enable Gaussian branch for gs exports
163
+ use_ray_pose=False, # Use ray-based pose estimation instead of camera decoder
164
+ ref_view_strategy="saddle_balanced", # Reference view selection strategy
165
+ render_exts=render_extrinsics, # Optional renders for gs_video
166
+ render_ixts=render_intrinsics, # Optional renders for gs_video
167
+ render_hw=(height, width), # Optional renders for gs_video
168
+ process_res=504,
169
+ process_res_method="upper_bound_resize",
170
+ export_dir="output_directory", # Optional
171
+ export_format="mini_npz",
172
+ export_feat_layers=[], # List of layer indices to export features from
173
+ conf_thresh_percentile=40.0, # Confidence threshold percentile for depth map in GLB export
174
+ num_max_points=1_000_000, # Maximum number of points to export in GLB export
175
+ show_cameras=True, # Whether to show cameras in GLB export
176
+ feat_vis_fps=15, # Frames per second for feature visualization in feat_vis export
177
+ export_kwargs={} # Optional, additional arguments to export functions. export_format:key:val, see 'Parameters/Export Parameters' for details
178
+ )
179
+ ```
180
+
181
+ ## ⚙️ Parameters
182
+
183
+ ### 📸 Input Parameters
184
+
185
+ #### `image` (required)
186
+ - **Type**: `List[Union[np.ndarray, Image.Image, str]]`
187
+ - **Description**: List of input images. Can be numpy arrays, PIL Images, or file paths.
188
+ - **Example**:
189
+ ```python
190
+ # From file paths
191
+ image = ["image1.jpg", "image2.jpg", "image3.jpg"]
192
+
193
+ # From numpy arrays
194
+ image = [np.array(img1), np.array(img2)]
195
+
196
+ # From PIL Images
197
+ image = [Image.open("image1.jpg"), Image.open("image2.jpg")]
198
+ ```
199
+
200
+ #### `extrinsics` (optional)
201
+ - **Type**: `Optional[np.ndarray]`
202
+ - **Shape**: `(N, 4, 4)` where N is the number of input images
203
+ - **Description**: Camera extrinsic matrices (world-to-camera transformation). When provided, enables pose-conditioned depth estimation mode.
204
+ - **Note**: If not provided, the model operates in standard depth estimation mode.
205
+
206
+ #### `intrinsics` (optional)
207
+ - **Type**: `Optional[np.ndarray]`
208
+ - **Shape**: `(N, 3, 3)` where N is the number of input images
209
+ - **Description**: Camera intrinsic matrices containing focal length and principal point information. When provided, enables pose-conditioned depth estimation mode.
210
+
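+ A minimal sketch of building both arrays for two views (the focal length `f` and principal point `(cx, cy)` below are placeholder values, not outputs of the library):
+
+ ```python
+ import numpy as np
+
+ N = 2
+ # World-to-camera extrinsics; identity poses used purely as placeholders
+ extrinsics = np.tile(np.eye(4, dtype=np.float32), (N, 1, 1))  # (N, 4, 4)
+
+ f, cx, cy = 500.0, 320.0, 240.0  # hypothetical pinhole parameters
+ intrinsics = np.tile(
+     np.array([[f, 0, cx], [0, f, cy], [0, 0, 1]], dtype=np.float32),
+     (N, 1, 1),
+ )  # (N, 3, 3)
+ ```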
211
+ ### 🎯 Pose Alignment Parameters
212
+
213
+ #### `align_to_input_ext_scale` (default: True)
214
+ - **Type**: `bool`
215
+ - **Description**: When True, the predicted extrinsics are replaced with the input
216
+ ones and the depth maps are rescaled to match their metric scale. When False, the
217
+ method returns the internally aligned poses computed via Umeyama alignment.
218
+
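+ Both modes in one hedged sketch (assuming `exts` and `ixts` follow the shapes documented above):
+
+ ```python
+ # Keep the input poses and rescale depth to their metric scale (default)
+ pred_metric = model.inference(image_paths, extrinsics=exts, intrinsics=ixts,
+                               align_to_input_ext_scale=True)
+
+ # Return the internally (Umeyama-)aligned poses instead
+ pred_aligned = model.inference(image_paths, extrinsics=exts, intrinsics=ixts,
+                                align_to_input_ext_scale=False)
+ ```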
219
+ #### `infer_gs` (default: False)
220
+ - **Type**: `bool`
221
+ - **Description**: Enable Gaussian Splatting branch for gaussian splatting exports. Required when using `gs_ply` or `gs_video` export formats.
222
+
223
+ #### `use_ray_pose` (default: False)
224
+ - **Type**: `bool`
225
+ - **Description**: Use ray-based pose estimation instead of camera decoder for pose prediction. When True, the model uses ray prediction heads to estimate camera poses; when False, it uses the camera decoder approach.
226
+
227
+ #### `ref_view_strategy` (default: "saddle_balanced")
228
+ - **Type**: `str`
229
+ - **Description**: Strategy for selecting the reference view from multiple input views. Options: `"first"`, `"middle"`, `"saddle_balanced"`, `"saddle_sim_range"`. Only applied when number of views ≥ 3. See [detailed documentation](funcs/ref_view_strategy.md) for strategy comparisons.
230
+ - **Available strategies**:
231
+ - `"saddle_balanced"`: Selects view with balanced features across multiple metrics (recommended default)
232
+ - `"saddle_sim_range"`: Selects view with largest similarity range
233
+ - `"first"`: Always uses first view (not recommended, equivalent to no reordering for views < 3)
234
+ - `"middle"`: Uses middle view (recommended for video sequences)
235
+
236
+ ### 🔍 Feature Export Parameters
237
+
238
+ #### `export_feat_layers` (default: [])
239
+ - **Type**: `List[int]`
240
+ - **Description**: List of layer indices to export intermediate features from. Features are stored in the `aux` dictionary of the Prediction object with keys like `feat_layer_0`, `feat_layer_1`, etc.
241
+
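+ A short sketch of reading the exported features back, assuming the `aux` dictionary keys described above:
+
+ ```python
+ prediction = model.inference(image_paths, export_feat_layers=[0, 5, 10])
+
+ feat0 = prediction.aux["feat_layer_0"]   # intermediate features from layer 0
+ feat5 = prediction.aux["feat_layer_5"]   # ... and from layer 5
+ ```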
242
+ ### 🎥 Rendering Parameters
243
+
244
+ These arguments are only used when exporting Gaussian-splatting videos (include
245
+ `"gs_video"` in `export_format`). They describe an auxiliary camera trajectory
246
+ with ``M`` views.
247
+
248
+ #### `render_exts` (optional)
249
+ - **Type**: `Optional[np.ndarray]`
250
+ - **Shape**: `(M, 4, 4)`
251
+ - **Description**: Camera extrinsics for the synthesized trajectory. If omitted,
252
+ the exporter falls back to the predicted poses.
253
+
254
+ #### `render_ixts` (optional)
255
+ - **Type**: `Optional[np.ndarray]`
256
+ - **Shape**: `(M, 3, 3)`
257
+ - **Description**: Camera intrinsics for each rendered frame. Leave `None` to
258
+ reuse the input intrinsics.
259
+
260
+ #### `render_hw` (optional)
261
+ - **Type**: `Optional[Tuple[int, int]]`
262
+ - **Description**: Explicit output resolution `(height, width)` for the rendered
263
+ frames. Defaults to the input resolution when not provided.
264
+
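+ One way to build a simple `(M, 4, 4)` trajectory is to blend the translation columns of the first and last input poses (assuming `extrinsics` is the `(N, 4, 4)` input array from the Input Parameters section). This is only an illustrative sketch; a real trajectory should also interpolate rotations, e.g. with slerp:
+
+ ```python
+ import numpy as np
+
+ M = 60
+ render_exts = np.repeat(extrinsics[:1], M, axis=0).copy()  # start from pose 0
+ alphas = np.linspace(0.0, 1.0, M)[:, None]
+ # Linearly blend the translation column; the rotation stays that of the first view
+ render_exts[:, :3, 3] = (1 - alphas) * extrinsics[0, :3, 3] + alphas * extrinsics[-1, :3, 3]
+ ```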
265
+ ### ⚡ Processing Parameters
266
+
267
+ #### `process_res` (default: 504)
268
+ - **Type**: `int`
269
+ - **Description**: Base resolution for processing. The model will resize images to this resolution for inference.
270
+
271
+ #### `process_res_method` (default: "upper_bound_resize")
272
+ - **Type**: `str`
273
+ - **Description**: Method for resizing images to the target resolution.
274
+ - **Options**:
275
+ - `"upper_bound_resize"`: Resize so that the specified dimension (504) becomes the longer side
276
+ - `"lower_bound_resize"`: Resize so that the specified dimension (504) becomes the shorter side
277
+ - **Example**:
278
+ - Input: 1200×1600 → Output: 378×504 (with `process_res=504`, `process_res_method="upper_bound_resize"`)
279
+ - Input: 504×672 → Output: 504×672 (no change needed with `process_res_method="lower_bound_resize"`: the shorter side is already 504)
280
+
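+ A hedged sketch of the resize rule, matching the first example above (`upper_bound_resize_hw` is a hypothetical helper, not a function exported by the library):
+
+ ```python
+ def upper_bound_resize_hw(h: int, w: int, process_res: int = 504) -> tuple[int, int]:
+     """Scale so the longer side becomes `process_res` (no-op if it already is)."""
+     scale = process_res / max(h, w)
+     return round(h * scale), round(w * scale)
+
+ upper_bound_resize_hw(1600, 1200)  # -> (504, 378), as in the example above
+ ```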
281
+ ### 📦 Export Parameters
282
+
283
+ #### `export_dir` (optional)
284
+ - **Type**: `Optional[str]`
285
+ - **Description**: Directory path where exported files will be saved. If not provided, no files will be exported.
286
+
287
+ #### `export_format` (default: "mini_npz")
288
+ - **Type**: `str`
289
+ - **Description**: Format for exporting results. Supports multiple formats separated by `-`.
290
+ - **Example**: `"mini_npz-glb"` exports both mini_npz and glb formats.
291
+
292
+ #### 🌐 GLB Export Parameters
293
+
294
+ These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"glb"`.
295
+
296
+ ##### `conf_thresh_percentile` (default: 40.0)
297
+ - **Type**: `float`
298
+ - **Description**: Lower percentile for adaptive confidence threshold. Points below this confidence percentile will be filtered out from the point cloud.
299
+
300
+ ##### `num_max_points` (default: 1,000,000)
301
+ - **Type**: `int`
302
+ - **Description**: Maximum number of points in the exported point cloud. If the point cloud exceeds this limit, it will be downsampled.
303
+
304
+ ##### `show_cameras` (default: True)
305
+ - **Type**: `bool`
306
+ - **Description**: Whether to include camera wireframes in the exported GLB file for visualization.
307
+
308
+ #### 🎨 Feature Visualization Parameters
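+ Combining the three GLB options documented above in one call (the values are illustrative, not tuned recommendations):
+
+ ```python
+ prediction = model.inference(
+     image=image_paths,
+     export_dir="./output",
+     export_format="glb",
+     conf_thresh_percentile=60.0,  # filter more aggressively than the 40.0 default
+     num_max_points=500_000,       # cap the exported point cloud size
+     show_cameras=False,           # omit camera wireframes
+ )
+ ```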
309
+
310
+ These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"feat_vis"`.
311
+
312
+ ##### `feat_vis_fps` (default: 15)
313
+ - **Type**: `int`
314
+ - **Description**: Frame rate for the output video when visualizing features across multiple images.
315
+
316
+ #### ✨🎥 3DGS and 3DGS Video Parameters
317
+
318
+ These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"gs_ply"` or `"gs_video"`.
319
+
320
+ ##### `export_kwargs` (default: `{}`)
321
+ - Type: `dict[str, dict[str, Any]]`
322
+ - Description: Per-format extra arguments passed to export functions, mainly for `"gs_ply"` and `"gs_video"`.
323
+ - Access pattern: `export_kwargs[export_format][key] = value`
324
+ - Example:
325
+ ```python
326
+ {
327
+ "gs_ply": {
328
+ "gs_views_interval": 1,
329
+ },
330
+ "gs_video": {
331
+ "trj_mode": "interpolate_smooth",
332
+ "chunk_size": 1,
333
+ "vis_depth": None,
334
+ },
335
+ }
336
+ ```
337
+
338
+ ## 📤 Export Formats
339
+
340
+ The API supports multiple export formats for different use cases:
341
+
342
+ ### 📊 `mini_npz`
343
+ - **Description**: Minimal NPZ format containing essential data
344
+ - **Contents**: `depth`, `conf`, `exts`, `ixts`
345
+ - **Use case**: Lightweight storage for depth data with camera parameters
346
+
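+ Reading the archive back follows directly from the listed keys (the path below is illustrative):
+
+ ```python
+ import numpy as np
+
+ data = np.load("export/scene.npz")  # hypothetical export path
+ depth = data["depth"]  # (N, H, W) depth maps
+ conf = data["conf"]    # (N, H, W) confidence maps
+ exts = data["exts"]    # camera extrinsics
+ ixts = data["ixts"]    # camera intrinsics
+ ```
+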
347
+ ### 📦 `npz`
348
+ - **Description**: Full NPZ format with comprehensive data
349
+ - **Contents**: `depth`, `conf`, `exts`, `ixts`, `image`, etc.
350
+ - **Use case**: Complete data export for advanced processing
351
+
352
+ ### 🌐 `glb`
353
+ - **Description**: 3D visualization format with point cloud and camera poses
354
+ - **Contents**:
355
+ - Point cloud with colors from original images
356
+ - Camera wireframes for visualization
357
+ - Confidence-based filtering and downsampling
358
+ - **Use case**: 3D visualization, inspection, and analysis
359
+ - **Features**:
360
+ - Automatic sky depth handling
361
+ - Confidence threshold filtering
362
+ - Background filtering (black/white)
363
+ - Scene scale normalization
364
+ - **Parameters** (passed via `inference()` method directly):
365
+ - `conf_thresh_percentile` (float, default: 40.0): Lower percentile for adaptive confidence threshold. Points below this confidence percentile will be filtered out.
366
+ - `num_max_points` (int, default: 1,000,000): Maximum number of points in the exported point cloud. If exceeded, points will be downsampled.
367
+ - `show_cameras` (bool, default: True): Whether to include camera wireframes in the exported GLB file for visualization.
368
+
369
+ ### ✨ `gs_ply`
370
+ - **Description**: Gaussian Splatting point cloud format
371
+ - **Contents**: 3DGS data in PLY format. Compatible with standard 3DGS viewers such as [SuperSplat](https://superspl.at/editor) (recommended) and [SPARK](https://sparkjs.dev/viewer/).
372
+ - **Use case**: Gaussian Splatting reconstruction
373
+ - **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by `da3-giant` and `da3nested-giant-large` models.
374
+ - **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
375
+ - `gs_views_interval`: Export to 3DGS every N views, default: `1`.
376
+
377
+ ### 🎥 `gs_video`
378
+ - **Description**: Rasterizes the 3DGS to render videos
379
+ - **Contents**: A video of 3DGS-rasterized views using either provided viewpoints or a predefined camera trajectory.
380
+ - **Use case**: Video rendering for Gaussian Splatting
381
+ - **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by `da3-giant` and `da3nested-giant-large` models.
382
+ - **Note**: The `render_exts`, `render_ixts`, and `render_hw` parameters of `inference()` can optionally be used to specify novel viewpoints.
383
+ - **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
384
+ - `extrinsics`: Optional world-to-camera poses for novel views. Falls back to the predicted poses of input views if not provided. (Alternatively, use `render_exts` parameter in `inference()`)
385
+ - `intrinsics`: Optional camera intrinsics for novel views. Falls back to the predicted intrinsics of input views if not provided. (Alternatively, use `render_ixts` parameter in `inference()`)
386
+ - `out_image_hw`: Optional output resolution `H x W`. Falls back to input resolution if not provided. (Alternatively, use `render_hw` parameter in `inference()`)
387
+ - `chunk_size`: Number of views rasterized per batch. Default: `8`.
388
+ - `trj_mode`: Predefined camera trajectory for novel-view rendering.
389
+ - `color_mode`: Same as `render_mode` in [gsplat](https://docs.gsplat.studio/main/apis/rasterization.html#gsplat.rasterization).
390
+ - `vis_depth`: How depth is combined with RGB. Default: `hcat` (horizontal concatenation).
391
+ - `enable_tqdm`: Whether to display a tqdm progress bar during rendering.
392
+ - `output_name`: File name of the rendered video.
393
+ - `video_quality`: Video quality to save. Default: `high`.
394
+ - `high`: High quality video (default)
395
+ - `medium`: Medium quality video (balances storage space and quality)
395
+ - `low`: Low quality video (less storage space)
397
+
398
+ ### 🔍 `feat_vis`
399
+ - **Description**: Feature visualization format
400
+ - **Contents**: PCA-visualized intermediate features from specified layers
401
+ - **Use case**: Model interpretability and feature analysis
402
+ - **Note**: Requires `export_feat_layers` to be specified
403
+ - **Parameters** (passed via `inference()` method directly):
404
+ - `feat_vis_fps` (int, default: 15): Frame rate for the output video when visualizing features across multiple images.
405
+
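+ A minimal sketch (assuming `export_feat_layers` accepts a list of layer indices, mirroring the CLI's `--export-feat "9,19,29,39"`):
+
+ ```python
+ prediction = model.inference(
+     image=["img1.jpg", "img2.jpg"],
+     export_dir="./output",
+     export_format="feat_vis",
+     export_feat_layers=[9, 19, 29, 39],  # hypothetical list form of the layer indices
+     feat_vis_fps=15,
+ )
+ ```
+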
406
+ ### 🎨 `depth_vis`
407
+ - **Description**: Depth visualization format
408
+ - **Contents**: Color-coded depth maps alongside original images
409
+ - **Use case**: Visual inspection of depth estimation quality
410
+
411
+ ### 🔗 Multiple Format Export
412
+ You can export multiple formats simultaneously by separating them with `-`:
413
+
414
+ ```python
415
+ # Export both mini_npz and glb formats
416
+ export_format = "mini_npz-glb"
417
+
418
+ # Export multiple formats
419
+ export_format = "npz-glb-gs_ply"
420
+ ```
421
+
422
+ ## ↩️ Return Value
423
+
424
+ The `inference()` method returns a `Prediction` object with the following attributes:
425
+
426
+ ### 📊 Core Outputs
427
+
428
+ - **depth**: `np.ndarray` - Estimated depth maps with shape `(N, H, W)` where N is the number of images, H is height, and W is width.
429
+ - **conf**: `np.ndarray` - Confidence maps with shape `(N, H, W)` indicating prediction reliability (optional, depends on model).
430
+
431
+ ### 📷 Camera Parameters
432
+
433
+ - **extrinsics**: `np.ndarray` - Camera extrinsic matrices with shape `(N, 3, 4)` representing world-to-camera transformations. Only present if camera poses were estimated or provided as input.
434
+ - **intrinsics**: `np.ndarray` - Camera intrinsic matrices with shape `(N, 3, 3)` containing focal length and principal point information. Only present if poses were estimated or provided as input.
435
+
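+ Because the extrinsics are world-to-camera, a camera's position in world coordinates is recovered with a single inversion step:
+
+ ```python
+ import numpy as np
+
+ # prediction obtained from model.inference(...)
+ R = prediction.extrinsics[0, :3, :3]  # (3, 3) rotation
+ t = prediction.extrinsics[0, :3, 3]   # (3,) translation
+ cam_center = -R.T @ t                 # camera position in world space
+ ```
+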
436
+ ### 🎁 Additional Outputs
437
+
438
+ - **processed_images**: `np.ndarray` - Preprocessed input images with shape `(N, H, W, 3)` in RGB format (0-255 uint8).
439
+ - **aux**: `dict` - Auxiliary outputs including:
440
+ - `feat_layer_X`: Intermediate features from layer X (if `export_feat_layers` was specified)
441
+ - `gaussians`: 3D Gaussian Splats data (if `infer_gs=True`)
442
+
443
+ ### 💻 Usage Example
444
+
445
+ ```python
446
+ prediction = model.inference(image=["img1.jpg", "img2.jpg"])
447
+
448
+ # Access depth maps
449
+ depth_maps = prediction.depth # shape: (2, H, W)
450
+
451
+ # Access confidence
452
+ if hasattr(prediction, 'conf'):
453
+ confidence = prediction.conf
454
+
455
+ # Access camera parameters (if available)
456
+ if hasattr(prediction, 'extrinsics'):
457
+ camera_poses = prediction.extrinsics # shape: (2, 3, 4)
458
+
459
+ if hasattr(prediction, 'intrinsics'):
460
+ camera_intrinsics = prediction.intrinsics # shape: (2, 3, 3)
461
+
462
+ # Access intermediate features (if export_feat_layers was set)
463
+ if hasattr(prediction, 'aux') and 'feat_layer_0' in prediction.aux:
464
+ features = prediction.aux['feat_layer_0']
465
+ ```
docs/CLI.md ADDED
@@ -0,0 +1,654 @@
1
+ # 🚀 Depth Anything 3 Command Line Interface
2
+
3
+ ## 📋 Table of Contents
4
+
5
+ - [📖 Overview](#overview)
6
+ - [⚡ Quick Start](#quick-start)
7
+ - [📚 Command Reference](#command-reference)
8
+ - [🤖 auto - Auto Mode](#auto---auto-mode)
9
+ - [🖼️ image - Single Image Processing](#image---single-image-processing)
10
+ - [🗂️ images - Image Directory Processing](#images---image-directory-processing)
11
+ - [🎬 video - Video Processing](#video---video-processing)
12
+ - [📐 colmap - COLMAP Dataset Processing](#colmap---colmap-dataset-processing)
13
+ - [🔧 backend - Backend Service](#backend---backend-service)
14
+ - [🎨 gradio - Gradio Application](#gradio---gradio-application)
15
+ - [🖼️ gallery - Gallery Server](#gallery---gallery-server)
16
+ - [⚙️ Parameter Details](#parameter-details)
17
+ - [💡 Usage Examples](#usage-examples)
18
+
19
+ ## 📖 Overview
20
+
21
+ The Depth Anything 3 CLI provides a comprehensive command-line toolkit supporting image depth estimation, video processing, COLMAP dataset handling, and web applications.
22
+
23
+ The backend service keeps the model cached on the GPU, so it does not need to be reloaded for each command.
24
+
25
+ ## ⚡ Quick Start
26
+
27
+ The CLI can run fully offline or connect to the backend for cached weights and task scheduling:
28
+
29
+ ```bash
30
+ # 🔧 Start backend service (optional, keeps model resident in GPU memory)
31
+ da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE
32
+
33
+ # 🚀 Use auto mode to process input
34
+ da3 auto path/to/input --export-dir ./workspace/scene001
35
+
36
+ # ♻️ Reuse backend for next job
37
+ da3 auto path/to/video.mp4 \
38
+ --export-dir ./workspace/scene002 \
39
+ --use-backend \
40
+ --backend-url http://localhost:8008
41
+ ```
42
+
43
+ Each export directory contains `scene.glb`, `scene.jpg`, and optional extras such as `depth_vis/` or `gs_video/` depending on the requested format.
44
+
45
+ ## 📚 Command Reference
46
+
47
+ ### 🤖 auto - Auto Mode
48
+
49
+ Automatically detect input type and dispatch to the appropriate handler.
50
+
51
+ **Usage:**
52
+
53
+ ```bash
54
+ da3 auto INPUT_PATH [OPTIONS]
55
+ ```
56
+
57
+ **Input Type Detection:**
58
+ - 🖼️ Single image file (.jpg, .png, .jpeg, .webp, .bmp, .tiff, .tif)
59
+ - 📁 Image directory
60
+ - 🎬 Video file (.mp4, .avi, .mov, .mkv, .flv, .wmv, .webm, .m4v)
61
+ - 📐 COLMAP directory (containing `images/` and `sparse/` subdirectories)
62
+
63
+ **Parameters:**
64
+
65
+ | Parameter | Type | Default | Description |
66
+ |-----------|------|---------|-------------|
67
+ | `INPUT_PATH` | str | Required | Input path (image, directory, video, or COLMAP) |
68
+ | `--model-dir` | str | Default model | Model directory path |
69
+ | `--export-dir` | str | `debug` | Export directory |
70
+ | `--export-format` | str | `glb` | Export format (supports `mini_npz`, `glb`, `feat_vis`, etc., can be combined with hyphens) |
71
+ | `--device` | str | `cuda` | Device to use |
72
+ | `--use-backend` | bool | `False` | Use backend service for inference |
73
+ | `--backend-url` | str | `http://localhost:8008` | Backend service URL |
74
+ | `--process-res` | int | `504` | Processing resolution |
75
+ | `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
76
+ | `--export-feat` | str | `""` | Export features from specified layers, comma-separated (e.g., `"0,1,2"`) |
77
+ | `--auto-cleanup` | bool | `False` | Automatically clean export directory without confirmation |
78
+ | `--fps` | float | `1.0` | [Video] Frame sampling FPS |
79
+ | `--sparse-subdir` | str | `""` | [COLMAP] Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
80
+ | `--align-to-input-ext-scale` | bool | `True` | [COLMAP] Align prediction to input extrinsics scale |
81
+ | `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
82
+ | `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy: `first`, `middle`, `saddle_balanced`, `saddle_sim_range`. See [docs](funcs/ref_view_strategy.md) |
83
+ | `--conf-thresh-percentile` | float | `40.0` | [GLB] Lower percentile for adaptive confidence threshold |
84
+ | `--num-max-points` | int | `1000000` | [GLB] Maximum number of points in the point cloud |
85
+ | `--show-cameras` | bool | `True` | [GLB] Show camera wireframes in the exported scene |
86
+ | `--feat-vis-fps` | int | `15` | [FEAT_VIS] Frame rate for output video |
87
+
88
+ **Examples:**
89
+
90
+ ```bash
91
+ # 🖼️ Auto-process an image
92
+ da3 auto path/to/image.jpg --export-dir ./output
93
+
94
+ # 🎬 Auto-process a video
95
+ da3 auto path/to/video.mp4 --fps 2.0 --export-dir ./output
96
+
97
+ # 🔧 Use backend service
98
+ da3 auto path/to/input \
99
+ --export-format mini_npz-glb \
100
+ --use-backend \
101
+ --backend-url http://localhost:8008 \
102
+ --export-dir ./output
103
+ ```
104
+
105
+ ---
106
+
107
+ ### 🖼️ image - Single Image Processing
108
+
109
+ Process a single image for camera pose and depth estimation.
110
+
111
+ **Usage:**
112
+
113
+ ```bash
114
+ da3 image IMAGE_PATH [OPTIONS]
115
+ ```
116
+
117
+ **Parameters:**
118
+
119
+ | Parameter | Type | Default | Description |
120
+ |-----------|------|---------|-------------|
121
+ | `IMAGE_PATH` | str | Required | Input image file path |
122
+ | `--model-dir` | str | Default model | Model directory path |
123
+ | `--export-dir` | str | `debug` | Export directory |
124
+ | `--export-format` | str | `glb` | Export format |
125
+ | `--device` | str | `cuda` | Device to use |
126
+ | `--use-backend` | bool | `False` | Use backend service for inference |
127
+ | `--backend-url` | str | `http://localhost:8008` | Backend service URL |
128
+ | `--process-res` | int | `504` | Processing resolution |
129
+ | `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
130
+ | `--export-feat` | str | `""` | Export feature layer indices (comma-separated) |
131
+ | `--auto-cleanup` | bool | `False` | Automatically clean export directory |
132
+ | `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
133
+ | `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
134
+ | `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
135
+ | `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
136
+ | `--show-cameras` | bool | `True` | [GLB] Show cameras |
137
+ | `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
138
+
139
+ **Examples:**
140
+
141
+ ```bash
142
+ # ✨ Basic usage
143
+ da3 image path/to/image.png --export-dir ./output
144
+
145
+ # ⚡ With backend acceleration
146
+ da3 image path/to/image.png \
147
+ --use-backend \
148
+ --backend-url http://localhost:8008 \
149
+ --export-dir ./output
150
+
151
+ # 🔍 Export feature visualization
152
+ da3 image image.jpg \
153
+ --export-format feat_vis \
154
+ --export-feat "9,19,29,39" \
155
+ --export-dir ./results
156
+ ```
157
+
158
+ ---
159
+
160
+ ### 🗂️ images - Image Directory Processing
161
+
162
+ Process a directory of images for batch depth estimation.
163
+
164
+ **Usage:**
165
+
166
+ ```bash
167
+ da3 images IMAGES_DIR [OPTIONS]
168
+ ```
169
+
170
+ **Parameters:**
171
+
172
+ | Parameter | Type | Default | Description |
173
+ |-----------|------|---------|-------------|
174
+ | `IMAGES_DIR` | str | Required | Directory path containing images |
175
+ | `--image-extensions` | str | `png,jpg,jpeg` | Image file extensions to process (comma-separated) |
176
+ | `--model-dir` | str | Default model | Model directory path |
177
+ | `--export-dir` | str | `debug` | Export directory |
178
+ | `--export-format` | str | `glb` | Export format |
179
+ | `--device` | str | `cuda` | Device to use |
180
+ | `--use-backend` | bool | `False` | Use backend service for inference |
181
+ | `--backend-url` | str | `http://localhost:8008` | Backend service URL |
182
+ | `--process-res` | int | `504` | Processing resolution |
183
+ | `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
184
+ | `--export-feat` | str | `""` | Export feature layer indices |
185
+ | `--auto-cleanup` | bool | `False` | Automatically clean export directory |
186
+ | `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
187
+ | `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
188
+ | `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
189
+ | `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
190
+ | `--show-cameras` | bool | `True` | [GLB] Show cameras |
191
+ | `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
192
+
193
+ **Examples:**
194
+
195
+ ```bash
196
+ # 📁 Process directory (defaults to png/jpg/jpeg)
197
+ da3 images ./image_folder --export-dir ./output
198
+
199
+ # 🎯 Custom extensions
200
+ da3 images ./dataset --image-extensions "png,jpg,webp" --export-dir ./output
201
+
202
+ # 🔧 Use backend service
203
+ da3 images ./dataset \
204
+ --use-backend \
205
+ --backend-url http://localhost:8008 \
206
+ --export-dir ./output
207
+ ```
208
+
209
+ ---
210
+
211
+ ### 🎬 video - Video Processing
212
+
213
+ Process video by extracting frames for depth estimation.
214
+
215
+ **Usage:**
216
+
217
+ ```bash
218
+ da3 video VIDEO_PATH [OPTIONS]
219
+ ```
220
+
221
+ **Parameters:**
222
+
223
+ | Parameter | Type | Default | Description |
224
+ |-----------|------|---------|-------------|
225
+ | `VIDEO_PATH` | str | Required | Input video file path |
226
+ | `--fps` | float | `1.0` | Frame extraction sampling FPS |
227
+ | `--model-dir` | str | Default model | Model directory path |
228
+ | `--export-dir` | str | `debug` | Export directory |
229
+ | `--export-format` | str | `glb` | Export format |
230
+ | `--device` | str | `cuda` | Device to use |
231
+ | `--use-backend` | bool | `False` | Use backend service for inference |
232
+ | `--backend-url` | str | `http://localhost:8008` | Backend service URL |
233
+ | `--process-res` | int | `504` | Processing resolution |
234
+ | `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
235
+ | `--export-feat` | str | `""` | Export feature layer indices |
236
+ | `--auto-cleanup` | bool | `False` | Automatically clean export directory |
237
+ | `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
238
+ | `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
239
+ | `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
240
+ | `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
241
+ | `--show-cameras` | bool | `True` | [GLB] Show cameras |
242
+ | `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
243
+
244
+ **Examples:**
245
+
246
+ ```bash
247
+ # 🎬 Basic video processing
248
+ da3 video path/to/video.mp4 --export-dir ./output
249
+
250
+ # ⚙️ Control frame sampling and resolution
251
+ da3 video path/to/video.mp4 \
252
+ --fps 2.0 \
253
+ --process-res 1024 \
254
+ --export-dir ./output
255
+
256
+ # 🔧 Use backend service
257
+ da3 video path/to/video.mp4 \
258
+ --use-backend \
259
+ --backend-url http://localhost:8008 \
260
+ --export-dir ./output
261
+ ```
262
+
263
+ ---
264
+
265
+ ### 📐 colmap - COLMAP Dataset Processing
266
+
267
+ Run pose-conditioned depth estimation on COLMAP data.
268
+
269
+ **Usage:**
270
+
271
+ ```bash
272
+ da3 colmap COLMAP_DIR [OPTIONS]
273
+ ```
274
+
275
+ **Parameters:**
276
+
277
+ | Parameter | Type | Default | Description |
278
+ |-----------|------|---------|-------------|
279
+ | `COLMAP_DIR` | str | Required | COLMAP directory containing `images/` and `sparse/` subdirectories |
280
+ | `--sparse-subdir` | str | `""` | Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
281
+ | `--align-to-input-ext-scale` | bool | `True` | Align prediction to input extrinsics scale |
282
+ | `--model-dir` | str | Default model | Model directory path |
283
+ | `--export-dir` | str | `debug` | Export directory |
284
+ | `--export-format` | str | `glb` | Export format |
285
+ | `--device` | str | `cuda` | Device to use |
286
+ | `--use-backend` | bool | `False` | Use backend service for inference |
287
+ | `--backend-url` | str | `http://localhost:8008` | Backend service URL |
288
+ | `--process-res` | int | `504` | Processing resolution |
289
+ | `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
290
+ | `--export-feat` | str | `""` | Export feature layer indices |
291
+ | `--auto-cleanup` | bool | `False` | Automatically clean export directory |
292
+ | `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
293
+ | `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
294
+ | `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
295
+ | `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
296
+ | `--show-cameras` | bool | `True` | [GLB] Show cameras |
297
+ | `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
298
+
299
+ **Examples:**
300
+
301
+ ```bash
302
+ # 📐 Process COLMAP dataset
303
+ da3 colmap ./colmap_dataset --export-dir ./output
304
+
305
+ # 🎯 Use specific sparse subdirectory and align scale
306
+ da3 colmap ./colmap_dataset \
307
+ --sparse-subdir 0 \
308
+ --align-to-input-ext-scale \
309
+ --export-dir ./output
310
+
311
+ # 🔧 Use backend service
312
+ da3 colmap ./colmap_dataset \
313
+ --use-backend \
314
+ --backend-url http://localhost:8008 \
315
+ --export-dir ./output
316
+ ```
317
+
318
+ ---
319
+
320
+ ### 🔧 backend - Backend Service
321
+
322
+ Start model backend service with integrated gallery.
323
+
324
+ **Usage:**
325
+
326
+ ```bash
327
+ da3 backend [OPTIONS]
328
+ ```
329
+
330
+ **Parameters:**
331
+
332
+ | Parameter | Type | Default | Description |
333
+ |-----------|------|---------|-------------|
334
+ | `--model-dir` | str | Default model | Model directory path |
335
+ | `--device` | str | `cuda` | Device to use |
336
+ | `--host` | str | `127.0.0.1` | Host address to bind to |
337
+ | `--port` | int | `8008` | Port number to bind to |
338
+ | `--gallery-dir` | str | Default gallery dir | Gallery directory path (optional) |
339
+
340
+ **Features:**
341
+ - 🎯 Keeps model resident in GPU memory
342
+ - 🔌 Provides REST inference API
343
+ - 📊 Integrated dashboard and status monitoring
344
+ - 🖼️ Optional gallery browser (if `--gallery-dir` is provided)
345
+
346
+ **Available Endpoints:**
347
+ - 🏠 `/` - Home page
348
+ - 📊 `/dashboard` - Dashboard
349
+ - ✅ `/status` - API status
350
+ - 🖼️ `/gallery/` - Gallery browser (if enabled)
351
+
352
+ **Examples:**
353
+
354
+ ```bash
355
+ # 🚀 Basic backend service
356
+ da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE
357
+
358
+ # 🖼️ Backend with gallery
359
+ da3 backend \
360
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
361
+ --device cuda \
362
+ --host 0.0.0.0 \
363
+ --port 8008 \
364
+ --gallery-dir ./workspace
365
+
366
+ # 💻 Use CPU
367
+ da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --device cpu
368
+ ```
369
+
370
+ ---
371
+
372
+ ### 🎨 gradio - Gradio Application
373
+
374
+ Launch Depth Anything 3 Gradio interactive web application.
375
+
376
+ **Usage:**
377
+
378
+ ```bash
379
+ da3 gradio [OPTIONS]
380
+ ```
381
+
382
+ **Parameters:**
383
+
384
+ | Parameter | Type | Default | Description |
385
+ |-----------|------|---------|-------------|
386
+ | `--model-dir` | str | Required | Model directory path |
387
+ | `--workspace-dir` | str | Required | Workspace directory path |
388
+ | `--gallery-dir` | str | Required | Gallery directory path |
389
+ | `--host` | str | `127.0.0.1` | Host address to bind to |
390
+ | `--port` | int | `7860` | Port number to bind to |
391
+ | `--share` | bool | `False` | Create a public link |
392
+ | `--debug` | bool | `False` | Enable debug mode |
393
+ | `--cache-examples` | bool | `False` | Pre-cache all example scenes at startup |
394
+ | `--cache-gs-tag` | str | `""` | Tag to match scene names for high-res+3DGS caching |
395
+
396
+ **Examples:**
397
+
398
+ ```bash
399
+ # 🎨 Basic Gradio application
400
+ da3 gradio \
401
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
402
+ --workspace-dir ./workspace \
403
+ --gallery-dir ./gallery
404
+
405
+ # 🌐 Enable sharing and debug
406
+ da3 gradio \
407
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
408
+ --workspace-dir ./workspace \
409
+ --gallery-dir ./gallery \
410
+ --share \
411
+ --debug
412
+
413
+ # ⚡ Pre-cache examples
414
+ da3 gradio \
415
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
416
+ --workspace-dir ./workspace \
417
+ --gallery-dir ./gallery \
418
+ --cache-examples \
419
+ --cache-gs-tag "dl3dv"
420
+ ```
421
+
422
+ ---
423
+
424
+ ### 🖼️ gallery - Gallery Server
425
+
426
+ Launch standalone Depth Anything 3 Gallery server.
427
+
428
+ **Usage:**
429
+
430
+ ```bash
431
+ da3 gallery [OPTIONS]
432
+ ```
433
+
434
+ **Parameters:**
435
+
436
+ | Parameter | Type | Default | Description |
437
+ |-----------|------|---------|-------------|
438
+ | `--gallery-dir` | str | Default gallery dir | Gallery root directory |
439
+ | `--host` | str | `127.0.0.1` | Host address to bind to |
440
+ | `--port` | int | `8007` | Port number to bind to |
441
+ | `--open-browser` | bool | `False` | Open browser after launch |
442
+
443
+ **Note:**
444
+ The gallery expects each scene folder to contain at least `scene.glb` and `scene.jpg`, with optional subfolders such as `depth_vis/` or `gs_video/`.
445
+
446
+ **Examples:**
447
+
448
+ ```bash
449
+ # 🖼️ Basic gallery server
450
+ da3 gallery --gallery-dir ./workspace
451
+
452
+ # 🌐 Custom host and port
453
+ da3 gallery \
454
+ --gallery-dir ./workspace \
455
+ --host 0.0.0.0 \
456
+ --port 8007
457
+
458
+ # 🚀 Auto-open browser
459
+ da3 gallery --gallery-dir ./workspace --open-browser
460
+ ```
461
+
462
+ ---
463
+
464
+ ## ⚙️ Parameter Details
465
+
466
+ ### 🔧 Common Parameters
467
+
468
+ - **`--export-dir`**: Output directory, defaults to `debug`
469
+ - **`--export-format`**: Export format, supports combining multiple formats with hyphens:
470
+ - 📦 `mini_npz`: Compressed NumPy format
471
+ - 🎨 `glb`: glTF binary format (3D scene)
472
+ - 🔍 `feat_vis`: Feature visualization
473
+ - Example: `mini_npz-glb` exports both formats
474
+
475
+ - **`--process-res`** / **`--process-res-method`**: Control preprocessing resolution strategy
476
+ - `process-res`: Target resolution (default 504)
477
+ - `process-res-method`: Resize method (default `upper_bound_resize`)
478
+
479
+ - **`--auto-cleanup`**: Remove existing export directory without confirmation
480
+
481
+ - **`--use-backend`** / **`--backend-url`**: Reuse running backend service
482
+ - ⚡ Reduces model loading time
483
+ - 🌐 Supports distributed processing
484
+
485
+ - **`--export-feat`**: Layer indices for exporting intermediate features (comma-separated)
486
+ - Example: `"9,19,29,39"`
487
+
488
+ ### 🎨 GLB Export Parameters
489
+
490
+ - **`--conf-thresh-percentile`**: Lower percentile for adaptive confidence threshold (default 40.0)
491
+ - Used to filter low-confidence points
492
+
493
+ - **`--num-max-points`**: Maximum number of points in point cloud (default 1,000,000)
494
+ - Controls output file size and performance
495
+
496
+ - **`--show-cameras`**: Show camera wireframes in exported scene (default True)
497
+
498
+ ### 🔍 Feature Visualization Parameters
499
+
500
+ - **`--feat-vis-fps`**: Frame rate for feature visualization output video (default 15)
501
+
502
+ ### 🎬 Video-Specific Parameters
503
+
504
+ - **`--fps`**: Video frame extraction sampling rate (default 1.0 FPS)
505
+ - Higher values extract more frames
506
+
507
+ ### 📐 COLMAP-Specific Parameters
508
+
509
+ - **`--sparse-subdir`**: Sparse reconstruction subdirectory
510
+ - Empty string uses `sparse/` directory
511
+ - `"0"` uses `sparse/0/` directory
512
+
513
+ - **`--align-to-input-ext-scale`**: Align prediction to input extrinsics scale (default True)
514
+ - Ensures depth estimation is consistent with COLMAP scale
515
+
516
+ ---
517
+
518
+ ## 💡 Usage Examples
519
+
520
+ ### 1️⃣ Basic Workflow
521
+
522
+ ```bash
523
+ # 🔧 Start backend service
524
+ da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --host 0.0.0.0 --port 8008
525
+
526
+ # 🖼️ Process single image
527
+ da3 image image.jpg --export-dir ./output1 --use-backend
528
+
529
+ # 🎬 Process video
530
+ da3 video video.mp4 --fps 2.0 --export-dir ./output2 --use-backend
531
+
532
+ # 📐 Process COLMAP dataset
533
+ da3 colmap ./colmap_data --export-dir ./output3 --use-backend
534
+ ```
535
+
536
+ ### 2️⃣ Using Auto Mode
537
+
538
+ ```bash
539
+ # 🤖 Auto-detect and process
540
+ da3 auto ./unknown_input --export-dir ./output
541
+
542
+ # ⚡ With backend acceleration
543
+ da3 auto ./unknown_input \
544
+ --use-backend \
545
+ --backend-url http://localhost:8008 \
546
+ --export-dir ./output
547
+ ```
548
+
549
+ ### 3️⃣ Multi-Format Export
550
+
551
+ ```bash
552
+ # 📦 Export both NPZ and GLB formats
553
+ da3 auto assets/examples/SOH \
554
+ --export-format mini_npz-glb \
555
+ --export-dir ./workspace/soh
556
+
557
+ # 🔍 Export feature visualization
558
+ da3 image image.jpg \
559
+ --export-format feat_vis \
560
+ --export-feat "9,19,29,39" \
561
+ --export-dir ./results
562
+ ```
563
+
564
+ ### 4️⃣ Advanced Configuration
565
+
566
+ ```bash
567
+ # ⚙️ Custom resolution and point cloud density
568
+ da3 image image.jpg \
569
+ --process-res 1024 \
570
+ --num-max-points 2000000 \
571
+ --conf-thresh-percentile 30.0 \
572
+ --export-dir ./output
573
+
574
+ # 📐 COLMAP advanced options
575
+ da3 colmap ./colmap_data \
576
+ --sparse-subdir 0 \
577
+ --align-to-input-ext-scale \
578
+ --process-res 756 \
579
+ --export-dir ./output
580
+ ```
581
+
582
+ ### 5️⃣ Batch Processing Workflow
583
+
584
+ ```bash
585
+ # 🔧 Start backend
586
+ da3 backend \
587
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
588
+ --device cuda \
589
+ --host 0.0.0.0 \
590
+ --port 8008 \
591
+ --gallery-dir ./workspace
592
+
593
+ # 🔄 Batch process multiple scenes
594
+ for scene in scene1 scene2 scene3; do
595
+ da3 auto ./data/$scene \
596
+ --export-dir ./workspace/$scene \
597
+ --use-backend \
598
+ --auto-cleanup
599
+ done
600
+
601
+ # 🖼️ Launch gallery to view results
602
+ da3 gallery --gallery-dir ./workspace --open-browser
603
+ ```
604
+
605
+ ### 6️⃣ Web Applications
606
+
607
+ ```bash
608
+ # 🎨 Launch Gradio application
609
+ da3 gradio \
610
+ --model-dir depth-anything/DA3NESTED-GIANT-LARGE \
611
+ --workspace-dir workspace/gradio \
612
+ --gallery-dir ./gallery \
613
+ --host 0.0.0.0 \
614
+ --port 7860 \
615
+ --share
616
+ ```
617
+
618
+ ### 7️⃣ Transformer Feature Visualization
619
+
620
+ ```bash
621
+ # 🔍 Export Transformer features
622
+ # 📦 Combined with numerical output
623
+ da3 auto video.mp4 \
624
+ --export-format glb-feat_vis \
625
+ --export-feat "11,21,31" \
626
+ --export-dir ./debug \
627
+ --use-backend
628
+ ```
629
+
630
+ ---
631
+
632
+ ## 📝 Notes
633
+
634
+ 1. **🔧 Backend Service**: Recommended for processing multiple tasks to improve efficiency
635
+ 2. **💾 GPU Memory**: Be mindful of GPU memory usage when processing high-resolution inputs
636
+ 3. **📁 Export Directory**: Use `--auto-cleanup` to avoid manual confirmation for deletion
637
+ 4. **🔀 Format Combination**: Multiple export formats can be combined with hyphens (e.g., `mini_npz-glb-feat_vis`)
638
+ 5. **📐 COLMAP Data**: Ensure COLMAP directory structure is correct (contains `images/` and `sparse/` subdirectories)
639
+
640
+ ---
641
+
642
+ ## ❓ Getting Help
643
+
644
+ View detailed help for any command:
645
+
646
+ ```bash
647
+ # 📖 View main help
648
+ da3 --help
649
+
650
+ # 🔍 View specific command help
651
+ da3 auto --help
652
+ da3 image --help
653
+ da3 backend --help
654
+ ```
docs/funcs/ref_view_strategy.md ADDED
@@ -0,0 +1,183 @@
1
+ # 📐 Reference View Selection Strategy
2
+
3
+ ## 📖 Overview
4
+
5
+ Reference view selection is a component in multi-view depth estimation. When processing multiple input views, the model needs to determine which view should serve as the primary reference frame for depth prediction, defining the world coordinate system.
6
+
7
+ Different reference views lead to different reconstruction results. This is a known consideration in multi-view geometry and was analyzed in [PI3](https://arxiv.org/abs/2507.13347). The choice of reference view can affect the quality and consistency of depth predictions across the scene.
8
+
9
+
10
+ ## 🚀 Our Simple Solution: Automatic Reference View Selection
11
+
12
+ DA3 provides a simple approach to address this through **automatic reference view selection** based on **class tokens**. Instead of relying on heuristics or manual selection, the model analyzes the class token features from all input views and intelligently selects the most suitable reference frame.
13
+
14
+ ---
15
+
16
+ ## 🎨 Available Strategies
17
+
18
+ ### 1. ⚖️ `saddle_balanced` (Recommended, Default)
19
+
20
+ **Philosophy:**
21
+ Select a view that achieves balance across multiple feature metrics. This strategy looks for a "middle ground" view that is neither too similar nor too different from other views, making it a stable reference point.
22
+
23
+ **How it works:**
24
+ 1. Extracts and normalizes class tokens from all views
25
+ 2. Computes three complementary metrics for each view:
26
+ - **Similarity score**: Average cosine similarity with other views
27
+ - **Feature norm**: L2 norm of the original features
28
+ - **Feature variance**: Variance across feature dimensions
29
+ 3. Normalizes each metric to [0, 1] range
30
+ 4. Selects the view closest to 0.5 (median) across all three metrics
31
+
32
+ ### 2. 🎢 `saddle_sim_range`
33
+
34
+ **Philosophy:**
35
+ Select a view with the largest similarity range to other views. This identifies "saddle point" views that are highly similar to some views but dissimilar to others, making them information-rich anchor points.
36
+
37
+ **How it works:**
38
+ 1. Computes pairwise cosine similarity between all views
39
+ 2. For each view, calculates the range (max - min) of similarities to other views
40
+ 3. Selects the view with the maximum similarity range
41
+
42
+ ---
43
+
44
+ ### 3. 1️⃣ `first` (Not Recommended)
45
+
46
+ **Philosophy:**
47
+ Always use the first view in the input sequence as the reference.
48
+
49
+ **How it works:**
50
+ Simply returns index 0.
51
+
52
+ **When to use:**
53
+ - ⛔ **Not recommended** in general
54
+ - 🔧 Only use when you have manually pre-sorted your views and know the first view is optimal
55
+ - 🐛 Debugging or baseline comparisons
56
+
57
+ ---
58
+
59
+ ### 4. ⏸️ `middle`
60
+
61
+ **Philosophy:**
62
+ Select the view in the middle of the input sequence.
63
+
64
+ **How it works:**
65
+ Returns the view at index `S // 2` where S is the number of views.
66
+
67
+ **When to use:**
68
+ - ⏱️ **Only recommended when input images are temporally ordered**
69
+ - 🎬 Video sequences (e.g., **DA3-LONG** setting)
70
+ - 📹 Sequential captures where the middle frame likely has the most stable viewpoint
71
+
72
+ **Specific use case: DA3-LONG** 🎬
73
+ In video-based depth estimation scenarios (like DA3-LONG), where inputs are consecutive frames, `middle` is often the **optimal choice** because the middle frame has maximum overlap with all other frames.
74
+
75
+
76
+ ## 💻 Usage
77
+
78
+ ### 🐍 Python API
79
+
80
+ ```python
81
+ from depth_anything_3 import DepthAnything3
82
+
83
+ model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
84
+
85
+ # Use default (saddle_balanced)
86
+ prediction = model.inference(
87
+ images,
88
+ ref_view_strategy="saddle_balanced"
89
+ )
90
+
91
+ # For video sequences, consider using middle
92
+ prediction = model.inference(
93
+ video_frames,
94
+ ref_view_strategy="middle" # Good for temporal sequences
95
+ )
96
+
97
+ # For complex scenes with wide baselines
98
+ prediction = model.inference(
99
+ images,
100
+ ref_view_strategy="saddle_sim_range"
101
+ )
102
+ ```
103
+
104
+ ### 🖥️ Command Line Interface
105
+
106
+ ```bash
107
+ # Default (saddle_balanced)
108
+ da3 auto input/ --export-dir output/
109
+
110
+ # Explicitly specify strategy
111
+ da3 auto input/ --ref-view-strategy saddle_balanced
112
+
113
+ # For video processing
114
+ da3 video input.mp4 --ref-view-strategy middle
115
+
116
+ # For wide-baseline multi-view
117
+ da3 images captures/ --ref-view-strategy saddle_sim_range
118
+ ```
119
+
120
+ ---
121
+
122
+ ### 🎯 When Selection Is Applied
123
+
124
+ Reference view selection is applied when:
125
+ - 3️⃣ Number of views S ≥ 3
126
+
127
+ ---
128
+
129
+ ## 💡 Recommendations
130
+
131
+ ### 📋 Quick Guide
132
+
133
+ | Scenario | Recommended Strategy | Rationale |
134
+ |----------|---------------------|-----------|
135
+ | **Default / Unknown** | `saddle_balanced` | Robust, balanced, works well across diverse scenarios |
136
+ | **Video frames** | `middle` | Temporal coherence, stable middle frame |
137
+ | **Wide-baseline multi-view** | `saddle_sim_range` | Maximizes information coverage |
138
+ | **Pre-sorted inputs** | `first` | Use only if you've manually optimized ordering |
139
+ | **Single image** | `first` | Automatically used (no reordering needed for S ≤ 2) |
140
+
141
+ ### ✨ Best Practices
142
+
143
+ 1. 🎯 **Start with defaults**: `saddle_balanced` works well in most cases
144
+ 2. 🎬 **Consider your input type**: Use `middle` for videos, `saddle_balanced` for photos
145
+ 3. 🔬 **Experiment if needed**: Try different strategies if results are suboptimal
146
+ 4. 📊 **Monitor performance**: Check `glb` quality and consistency across views.
147
+
148
+ ---
149
+
150
+ ## 🔧 Technical Details
151
+
152
+ ### 🎚️ Selection Threshold
153
+
154
+ The reference view selection is only triggered when:
155
+ ```python
156
+ num_views >= 3 # At least 3 views required
157
+ ```
158
+
159
+ For 1-2 views, no reordering is performed (equivalent to using `first`).
160
+
161
+ ### ⚙️ Implementation
162
+
163
+ The selection happens at layer `alt_start - 1` in the vision transformer, before the first global attention layer. This ensures the selected reference view influences the entire depth prediction pipeline.
164
+
165
+ ---
166
+
167
+ ## ❓ FAQ
168
+
169
+ **Q: 🤔 Why is this feature provided?**
170
+ A: The model can handle any view order, but this feature provides automatic optimization for reference view selection, which can help improve depth prediction quality in multi-view scenarios.
171
+
172
+ **Q: ⏱️ Does this add computational cost?**
173
+ A: The overhead is negligible.
174
+
175
+ **Q: 🎮 Can I manually specify which view to use as reference?**
176
+ A: Not directly through this parameter. You can pre-sort your input images to place your preferred reference view first and use `ref_view_strategy="first"`.
177
+
178
+ **Q: ⚙️ What happens if I don't specify this parameter?**
179
+ A: The default `saddle_balanced` strategy is used automatically.
180
+
181
+ **Q: 📊 Is this feature used in the DA3 paper benchmarks?**
182
+ A: No, the paper used `first` as the default strategy for all multi-view experiments. The current default has been updated to `saddle_balanced` for better robustness.
183
+
notebooks/da3.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/da3_tutorial.ipynb ADDED
@@ -0,0 +1,667 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🌊 Depth Anything 3 — From Images to 3D in Seconds\n",
8
+ "\n",
9
+ "<div align=\"center\">\n",
10
+ "\n",
11
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Aedelon/awesome-depth-anything-3/blob/main/notebooks/da3_tutorial.ipynb)\n",
12
+ "[![GitHub Stars](https://img.shields.io/github/stars/Aedelon/awesome-depth-anything-3?style=social)](https://github.com/Aedelon/awesome-depth-anything-3)\n",
13
+ "[![PyPI](https://img.shields.io/pypi/v/awesome-depth-anything-3)](https://pypi.org/project/awesome-depth-anything-3/)\n",
14
+ "[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)\n",
15
+ "\n",
16
+ "**State-of-the-art monocular depth estimation + 3D reconstruction**\n",
17
+ "\n",
18
+ "</div>\n",
19
+ "\n",
20
+ "---\n",
21
+ "\n",
22
+ "### What you'll get:\n",
23
+ "\n",
24
+ "| Input | Output |\n",
25
+ "|-------|--------|\n",
26
+ "| 📸 Single image | 🌊 Metric depth map |\n",
27
+ "| 🎬 Video / Multi-view | ☁️ 3D Point Cloud + Camera poses |\n",
28
+ "| 🖼️ Any scene | 📦 Downloadable GLB file |\n",
29
+ "\n",
30
+ "---\n",
31
+ "\n",
32
+ "### ⚡ Quick Start\n",
33
+ "\n",
34
+ "1. **Runtime → Change runtime type → T4 GPU** (free tier works!)\n",
35
+ "2. **Run all cells** (Ctrl+F9) or click ▶️ on each cell\n",
36
+ "3. **Upload your images** in Section 4\n",
37
+ "4. **Download your 3D model** (.glb file)\n",
38
+ "\n",
39
+ "⏱️ **Total time: ~5 minutes** (including model download)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "#@title 🚀 **1. Install** (run this first!) { display-mode: \"form\" }\n",
49
+ "#@markdown > ⏱️ Takes ~2 minutes on first run\n",
50
+ "\n",
51
+ "%%capture\n",
52
+ "!pip install awesome-depth-anything-3\n",
53
+ "\n",
54
+ "# Verify installation\n",
55
+ "import torch\n",
56
+ "from IPython.display import HTML, display\n",
57
+ "\n",
58
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
59
+ "gpu_name = torch.cuda.get_device_name(0) if device == \"cuda\" else \"None\"\n",
60
+ "vram = torch.cuda.get_device_properties(0).total_memory / 1e9 if device == \"cuda\" else 0\n",
61
+ "\n",
62
+ "if device == \"cuda\":\n",
63
+ " status = f'''\n",
64
+ " <div style=\"background: linear-gradient(135deg, #10B981, #059669); padding: 20px; border-radius: 12px; color: white; font-family: system-ui;\">\n",
65
+ " <h3 style=\"margin: 0 0 10px 0;\">✅ Ready to go!</h3>\n",
66
+ " <p style=\"margin: 5px 0;\"><b>GPU:</b> {gpu_name}</p>\n",
67
+ " <p style=\"margin: 5px 0;\"><b>VRAM:</b> {vram:.1f} GB</p>\n",
68
+ " <p style=\"margin: 5px 0;\"><b>PyTorch:</b> {torch.__version__}</p>\n",
69
+ " </div>\n",
70
+ " '''\n",
71
+ "else:\n",
72
+ " status = '''\n",
73
+ " <div style=\"background: linear-gradient(135deg, #EF4444, #DC2626); padding: 20px; border-radius: 12px; color: white; font-family: system-ui;\">\n",
74
+ " <h3 style=\"margin: 0 0 10px 0;\">⚠️ No GPU detected!</h3>\n",
75
+ " <p style=\"margin: 5px 0;\">Go to <b>Runtime → Change runtime type → GPU</b></p>\n",
76
+ " <p style=\"margin: 5px 0;\">Then restart the notebook.</p>\n",
77
+ " </div>\n",
78
+ " '''\n",
79
+ "\n",
80
+ "display(HTML(status))"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "#@title 🧠 **2. Load Model** { display-mode: \"form\" }\n",
90
+ "#@markdown Choose model size:\n",
91
+ "model_size = \"DA3-LARGE\" #@param [\"DA3-SMALL\", \"DA3-BASE\", \"DA3-LARGE\", \"DA3-GIANT\", \"DA3NESTED-GIANT-LARGE\"]\n",
92
+ "#@markdown ---\n",
93
+ "#@markdown | Model | Speed | Quality | VRAM |\n",
94
+ "#@markdown |-------|-------|---------|------|\n",
95
+ "#@markdown | SMALL | ⚡⚡⚡ | ★★☆ | 4GB |\n",
96
+ "#@markdown | BASE | ⚡⚡ | ★★★ | 6GB |\n",
97
+ "#@markdown | LARGE | ⚡ | ★★★★ | 8GB |\n",
98
+ "#@markdown | GIANT | 🐢 | ★★★★★ | 12GB |\n",
99
+ "#@markdown | NESTED | 🐢 | ★★★★★+ | 16GB |\n",
100
+ "\n",
101
+ "from depth_anything_3.api import DepthAnything3\n",
102
+ "import time\n",
103
+ "\n",
104
+ "print(f\"📥 Loading {model_size}...\")\n",
105
+ "start = time.time()\n",
106
+ "\n",
107
+ "model = DepthAnything3.from_pretrained(f\"depth-anything/{model_size}\")\n",
108
+ "model = model.to(device).eval()\n",
109
+ "\n",
110
+ "print(f\"✅ Model loaded in {time.time()-start:.1f}s\")"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "#@title 🖼️ **3. Try with Sample Image** { display-mode: \"form\" }\n",
120
+ "#@markdown Run depth estimation on a sample image\n",
121
+ "\n",
122
+ "import matplotlib.pyplot as plt\n",
123
+ "import numpy as np\n",
124
+ "from PIL import Image\n",
125
+ "import urllib.request\n",
126
+ "import os\n",
127
+ "\n",
128
+ "# Download sample\n",
129
+ "os.makedirs(\"samples\", exist_ok=True)\n",
130
+ "url = \"https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=1280\"\n",
131
+ "urllib.request.urlretrieve(url, \"samples/mountain.jpg\")\n",
132
+ "\n",
133
+ "# Run inference\n",
134
+ "result = model.inference([\"samples/mountain.jpg\"])\n",
135
+ "\n",
136
+ "# Visualize\n",
137
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
138
+ "\n",
139
+ "axes[0].imshow(result.processed_images[0])\n",
140
+ "axes[0].set_title(\"📸 Input\", fontsize=14, fontweight='bold')\n",
141
+ "axes[0].axis(\"off\")\n",
142
+ "\n",
143
+ "depth = result.depth[0]\n",
144
+ "im = axes[1].imshow(depth, cmap='Spectral_r')\n",
145
+ "axes[1].set_title(f\"🌊 Depth (range: {depth.min():.1f}m - {depth.max():.1f}m)\", fontsize=14, fontweight='bold')\n",
146
+ "axes[1].axis(\"off\")\n",
147
+ "plt.colorbar(im, ax=axes[1], fraction=0.046, pad=0.04, label='Depth (m)')\n",
148
+ "\n",
149
+ "plt.tight_layout()\n",
150
+ "plt.show()\n",
151
+ "\n",
152
+ "print(f\"\\n📊 Output shapes:\")\n",
153
+ "print(f\" Depth: {result.depth.shape}\")\n",
154
+ "print(f\" Confidence: {result.conf.shape}\")\n",
155
+ "print(f\" Camera intrinsics: {result.intrinsics.shape}\")"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "markdown",
160
+ "metadata": {},
161
+ "source": [
162
+ "---\n",
163
+ "\n",
164
+ "## 📤 4. Use Your Own Images\n",
165
+ "\n",
166
+ "Upload your images and get a 3D point cloud!"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "#@title 📁 **Upload Images** { display-mode: \"form\" }\n",
176
+ "#@markdown Upload **2-50 images** of the same scene from different angles.\n",
177
+ "#@markdown \n",
178
+ "#@markdown 💡 **Tips for best results:**\n",
179
+ "#@markdown - Move the camera, not the objects\n",
180
+ "#@markdown - 30-50% overlap between consecutive images\n",
181
+ "#@markdown - Avoid motion blur\n",
182
+ "#@markdown - Good lighting helps!\n",
183
+ "\n",
184
+ "from google.colab import files\n",
185
+ "import shutil\n",
186
+ "\n",
187
+ "# Clean up previous uploads\n",
188
+ "upload_dir = \"my_images\"\n",
189
+ "if os.path.exists(upload_dir):\n",
190
+ " shutil.rmtree(upload_dir)\n",
191
+ "os.makedirs(upload_dir, exist_ok=True)\n",
192
+ "\n",
193
+ "print(\"📤 Select your images...\")\n",
194
+ "uploaded = files.upload()\n",
195
+ "\n",
196
+ "# Save uploaded files\n",
197
+ "for filename, data in uploaded.items():\n",
198
+ " with open(f\"{upload_dir}/{filename}\", 'wb') as f:\n",
199
+ " f.write(data)\n",
200
+ "\n",
201
+ "image_files = sorted([f\"{upload_dir}/{f}\" for f in os.listdir(upload_dir) \n",
202
+ " if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp'))])\n",
203
+ "\n",
204
+ "print(f\"\\n✅ Uploaded {len(image_files)} images\")\n",
205
+ "\n",
206
+ "# Preview\n",
207
+ "n_preview = min(6, len(image_files))\n",
208
+ "fig, axes = plt.subplots(1, n_preview, figsize=(3*n_preview, 3))\n",
209
+ "if n_preview == 1:\n",
210
+ " axes = [axes]\n",
211
+ "for i, img_path in enumerate(image_files[:n_preview]):\n",
212
+ " img = Image.open(img_path)\n",
213
+ " axes[i].imshow(img)\n",
214
+ " axes[i].set_title(f\"#{i+1}\", fontsize=10)\n",
215
+ " axes[i].axis(\"off\")\n",
216
+ "if len(image_files) > n_preview:\n",
217
+ " print(f\" (showing first {n_preview} of {len(image_files)})\")\n",
218
+ "plt.tight_layout()\n",
219
+ "plt.show()"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": null,
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "#@title ⚡ **Run 3D Reconstruction** { display-mode: \"form\" }\n",
229
+ "#@markdown This will:\n",
230
+ "#@markdown 1. Estimate depth for each image\n",
231
+ "#@markdown 2. Compute camera poses\n",
232
+ "#@markdown 3. Generate a 3D point cloud\n",
233
+ "#@markdown 4. Export to GLB format\n",
234
+ "\n",
235
+ "from depth_anything_3.utils.export.glb import export_to_glb\n",
236
+ "import time\n",
237
+ "\n",
238
+ "print(f\"🔄 Processing {len(image_files)} images...\")\n",
239
+ "start = time.time()\n",
240
+ "\n",
241
+ "# Run inference\n",
242
+ "result = model.inference(\n",
243
+ " image_files,\n",
244
+ " process_res_method=\"upper_bound_resize\",\n",
245
+ ")\n",
246
+ "\n",
247
+ "inference_time = time.time() - start\n",
248
+ "print(f\"✅ Inference done in {inference_time:.1f}s ({len(image_files)/inference_time:.1f} img/s)\")\n",
249
+ "\n",
250
+ "# Export to GLB\n",
251
+ "output_dir = \"output_3d\"\n",
252
+ "os.makedirs(output_dir, exist_ok=True)\n",
253
+ "\n",
254
+ "print(\"📦 Generating 3D point cloud...\")\n",
255
+ "export_to_glb(\n",
256
+ " result,\n",
257
+ " export_dir=output_dir,\n",
258
+ " show_cameras=True,\n",
259
+ " conf_thresh_percentile=20, # Filter low-confidence points\n",
260
+ " num_max_points=500_000,\n",
261
+ ")\n",
262
+ "\n",
263
+ "print(f\"\\n✅ 3D model saved to {output_dir}/\")\n",
264
+ "!ls -lh {output_dir}/"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "#@title 📥 **Download Your 3D Model** { display-mode: \"form\" }\n",
274
+ "#@markdown Downloads a `.glb` file you can view in:\n",
275
+ "#@markdown - [glTF Viewer](https://gltf-viewer.donmccurdy.com/)\n",
276
+ "#@markdown - Blender\n",
277
+ "#@markdown - Windows 3D Viewer\n",
278
+ "#@markdown - Any 3D software\n",
279
+ "\n",
280
+ "from google.colab import files\n",
281
+ "\n",
282
+ "glb_file = f\"{output_dir}/point_cloud.glb\"\n",
283
+ "if os.path.exists(glb_file):\n",
284
+ " files.download(glb_file)\n",
285
+ " print(\"\\n🎉 Download started!\")\n",
286
+ " print(\"\\n👉 View your model: https://gltf-viewer.donmccurdy.com/\")\n",
287
+ "else:\n",
288
+ " print(\"❌ GLB file not found. Run the previous cell first.\")"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "markdown",
293
+ "metadata": {},
294
+ "source": [
295
+ "---\n",
296
+ "\n",
297
+ "## 📊 5. Visualize Results"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "metadata": {},
304
+ "outputs": [],
305
+ "source": [
306
+ "#@title 🌊 **View All Depth Maps** { display-mode: \"form\" }\n",
307
+ "\n",
308
+ "n_images = len(result.depth)\n",
309
+ "cols = min(4, n_images)\n",
310
+ "rows = (n_images + cols - 1) // cols\n",
311
+ "\n",
312
+ "fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))\n",
313
+ "axes = np.array(axes).flatten() if n_images > 1 else [axes]\n",
314
+ "\n",
315
+ "for i in range(n_images):\n",
316
+ " depth = result.depth[i]\n",
317
+ " axes[i].imshow(depth, cmap='Spectral_r')\n",
318
+ " axes[i].set_title(f\"Frame {i+1}\", fontsize=10)\n",
319
+ " axes[i].axis(\"off\")\n",
320
+ "\n",
321
+ "# Hide unused subplots\n",
322
+ "for i in range(n_images, len(axes)):\n",
323
+ " axes[i].axis(\"off\")\n",
324
+ "\n",
325
+ "plt.suptitle(\"🌊 Depth Maps\", fontsize=16, fontweight='bold')\n",
326
+ "plt.tight_layout()\n",
327
+ "plt.show()"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "metadata": {},
334
+ "outputs": [],
335
+ "source": [
336
+ "#@title 📷 **View Camera Poses** { display-mode: \"form\" }\n",
337
+ "#@markdown Visualize estimated camera positions in 3D\n",
338
+ "\n",
339
+ "from mpl_toolkits.mplot3d import Axes3D\n",
340
+ "\n",
341
+ "# Extract camera positions from extrinsics\n",
342
+ "positions = []\n",
343
+ "for ext in result.extrinsics:\n",
344
+ " # Extrinsic is world-to-camera, invert to get camera-to-world\n",
345
+ " R = ext[:3, :3]\n",
346
+ " t = ext[:3, 3]\n",
347
+ " cam_pos = -R.T @ t # Camera position in world coordinates\n",
348
+ " positions.append(cam_pos)\n",
349
+ "\n",
350
+ "positions = np.array(positions)\n",
351
+ "\n",
352
+ "fig = plt.figure(figsize=(10, 8))\n",
353
+ "ax = fig.add_subplot(111, projection='3d')\n",
354
+ "\n",
355
+ "# Plot camera positions\n",
356
+ "ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], \n",
357
+ " c=range(len(positions)), cmap='viridis', s=100, marker='o')\n",
358
+ "\n",
359
+ "# Connect cameras with lines\n",
360
+ "ax.plot(positions[:, 0], positions[:, 1], positions[:, 2], \n",
361
+ " 'b-', alpha=0.5, linewidth=1)\n",
362
+ "\n",
363
+ "# Mark first and last\n",
364
+ "ax.scatter(*positions[0], c='green', s=200, marker='^', label='First')\n",
365
+ "ax.scatter(*positions[-1], c='red', s=200, marker='v', label='Last')\n",
366
+ "\n",
367
+ "ax.set_xlabel('X')\n",
368
+ "ax.set_ylabel('Y')\n",
369
+ "ax.set_zlabel('Z')\n",
370
+ "ax.set_title('📷 Camera Trajectory', fontsize=14, fontweight='bold')\n",
371
+ "ax.legend()\n",
372
+ "\n",
373
+ "plt.tight_layout()\n",
374
+ "plt.show()\n",
375
+ "\n",
376
+ "print(f\"📍 {len(positions)} camera poses estimated\")"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "markdown",
381
+ "metadata": {},
382
+ "source": [
383
+ "---\n",
384
+ "\n",
385
+ "## 🎬 6. Process Video"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": null,
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "#@title 🎬 **Upload Video** { display-mode: \"form\" }\n",
395
+ "#@markdown Upload a short video (< 30 seconds recommended)\n",
396
+ "\n",
397
+ "fps_extract = 2 #@param {type:\"slider\", min:1, max:10, step:1}\n",
398
+ "#@markdown ↑ Frames per second to extract (lower = faster, higher = more detail)\n",
399
+ "\n",
400
+ "from google.colab import files\n",
401
+ "import subprocess\n",
402
+ "\n",
403
+ "print(\"📤 Select a video file...\")\n",
404
+ "uploaded = files.upload()\n",
405
+ "\n",
406
+ "video_file = list(uploaded.keys())[0]\n",
407
+ "frames_dir = \"video_frames\"\n",
408
+ "\n",
409
+ "# Extract frames\n",
410
+ "if os.path.exists(frames_dir):\n",
411
+ " shutil.rmtree(frames_dir)\n",
412
+ "os.makedirs(frames_dir, exist_ok=True)\n",
413
+ "\n",
414
+ "print(f\"🎞️ Extracting frames at {fps_extract} FPS...\")\n",
415
+ "subprocess.run([\n",
416
+ " \"ffmpeg\", \"-i\", video_file, \n",
417
+ " \"-vf\", f\"fps={fps_extract}\",\n",
418
+ " f\"{frames_dir}/frame_%04d.jpg\",\n",
419
+ " \"-hide_banner\", \"-loglevel\", \"error\"\n",
420
+ "])\n",
421
+ "\n",
422
+ "video_images = sorted([f\"{frames_dir}/{f}\" for f in os.listdir(frames_dir)])\n",
423
+ "print(f\"✅ Extracted {len(video_images)} frames\")\n",
424
+ "\n",
425
+ "# Preview\n",
426
+ "n_preview = min(8, len(video_images))\n",
427
+ "fig, axes = plt.subplots(1, n_preview, figsize=(2*n_preview, 2))\n",
428
+ "step = max(1, len(video_images) // n_preview)\n",
429
+ "for i, ax in enumerate(axes):\n",
430
+ " idx = i * step\n",
431
+ " if idx < len(video_images):\n",
432
+ " ax.imshow(Image.open(video_images[idx]))\n",
433
+ " ax.axis(\"off\")\n",
434
+ "plt.suptitle(f\"🎬 Video Frames ({len(video_images)} total)\", fontsize=12)\n",
435
+ "plt.tight_layout()\n",
436
+ "plt.show()"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "#@title ⚡ **Process Video Frames** { display-mode: \"form\" }\n",
446
+ "\n",
447
+ "print(f\"🔄 Processing {len(video_images)} frames...\")\n",
448
+ "start = time.time()\n",
449
+ "\n",
450
+ "result_video = model.inference(\n",
451
+ " video_images,\n",
452
+ " process_res_method=\"upper_bound_resize\",\n",
453
+ ")\n",
454
+ "\n",
455
+ "elapsed = time.time() - start\n",
456
+ "print(f\"✅ Done in {elapsed:.1f}s ({len(video_images)/elapsed:.1f} FPS)\")\n",
457
+ "\n",
458
+ "# Export\n",
459
+ "video_output = \"video_3d\"\n",
460
+ "os.makedirs(video_output, exist_ok=True)\n",
461
+ "\n",
462
+ "export_to_glb(\n",
463
+ " result_video,\n",
464
+ " export_dir=video_output,\n",
465
+ " show_cameras=True,\n",
466
+ " conf_thresh_percentile=15,\n",
467
+ " num_max_points=1_000_000,\n",
468
+ ")\n",
469
+ "\n",
470
+ "print(f\"\\n📦 3D model saved!\")\n",
471
+ "!ls -lh {video_output}/"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": null,
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "#@title 📥 **Download Video 3D Model** { display-mode: \"form\" }\n",
481
+ "\n",
482
+ "glb_file = f\"{video_output}/point_cloud.glb\"\n",
483
+ "if os.path.exists(glb_file):\n",
484
+ " files.download(glb_file)\n",
485
+ " print(\"🎉 Download started!\")\n",
486
+ "else:\n",
487
+ " print(\"❌ Run the previous cell first.\")"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "markdown",
492
+ "metadata": {},
493
+ "source": [
494
+ "---\n",
495
+ "\n",
496
+ "## 🔧 7. Advanced: Python API"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": null,
502
+ "metadata": {},
503
+ "outputs": [],
504
+ "source": [
505
+ "#@title 💻 **API Reference** { display-mode: \"form\" }\n",
506
+ "#@markdown Quick code snippets for common tasks\n",
507
+ "\n",
508
+ "from IPython.display import Markdown\n",
509
+ "\n",
510
+ "api_docs = '''\n",
511
+ "### Basic Usage\n",
512
+ "\n",
513
+ "```python\n",
514
+ "from depth_anything_3.api import DepthAnything3\n",
515
+ "\n",
516
+ "# Load model\n",
517
+ "model = DepthAnything3.from_pretrained(\"depth-anything/DA3-LARGE\")\n",
518
+ "model = model.to(\"cuda\").eval()\n",
519
+ "\n",
520
+ "# Single image\n",
521
+ "result = model.inference([\"image.jpg\"])\n",
522
+ "depth = result.depth[0] # Shape: (H, W)\n",
523
+ "\n",
524
+ "# Multiple images\n",
525
+ "result = model.inference([\"img1.jpg\", \"img2.jpg\", \"img3.jpg\"])\n",
526
+ "depths = result.depth # Shape: (N, H, W)\n",
527
+ "```\n",
528
+ "\n",
529
+ "### Output Attributes\n",
530
+ "\n",
531
+ "| Attribute | Shape | Description |\n",
532
+ "|-----------|-------|-------------|\n",
533
+ "| `depth` | `(N, H, W)` | Metric depth in meters |\n",
534
+ "| `conf` | `(N, H, W)` | Confidence [0-1] |\n",
535
+ "| `extrinsics` | `(N, 3, 4)` | Camera poses (world-to-cam) |\n",
536
+ "| `intrinsics` | `(N, 3, 3)` | Camera K matrix |\n",
537
+ "| `processed_images` | `(N, H, W, 3)` | Resized inputs (uint8) |\n",
538
+ "\n",
539
+ "### Export to 3D\n",
540
+ "\n",
541
+ "```python\n",
542
+ "from depth_anything_3.utils.export.glb import export_to_glb\n",
543
+ "\n",
544
+ "export_to_glb(\n",
545
+ " result,\n",
546
+ " export_dir=\"output\",\n",
547
+ " show_cameras=True, # Show camera frustums\n",
548
+ " conf_thresh_percentile=20, # Filter low confidence\n",
549
+ " num_max_points=500_000, # Max points in cloud\n",
550
+ ")\n",
551
+ "```\n",
552
+ "\n",
553
+ "### CLI Usage\n",
554
+ "\n",
555
+ "```bash\n",
556
+ "# Single image\n",
557
+ "da3 infer image.jpg -o output/\n",
558
+ "\n",
559
+ "# Directory of images\n",
560
+ "da3 infer images/ -o output/ --model DA3-LARGE\n",
561
+ "\n",
562
+ "# Video\n",
563
+ "da3 infer video.mp4 -o output/ --fps 2\n",
564
+ "```\n",
565
+ "'''\n",
566
+ "\n",
567
+ "display(Markdown(api_docs))"
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "markdown",
572
+ "metadata": {},
573
+ "source": [
574
+ "---\n",
575
+ "\n",
576
+ "## 💾 8. Save to Google Drive"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": null,
582
+ "metadata": {},
583
+ "outputs": [],
584
+ "source": [
585
+ "#@title 💾 **Mount Google Drive** { display-mode: \"form\" }\n",
586
+ "\n",
587
+ "from google.colab import drive\n",
588
+ "drive.mount('/content/drive')\n",
589
+ "\n",
590
+ "drive_output = \"/content/drive/MyDrive/DepthAnything3_Results\"\n",
591
+ "os.makedirs(drive_output, exist_ok=True)\n",
592
+ "print(f\"✅ Drive mounted at: {drive_output}\")"
593
+ ]
594
+ },
595
+ {
596
+ "cell_type": "code",
597
+ "execution_count": null,
598
+ "metadata": {},
599
+ "outputs": [],
600
+ "source": [
601
+ "#@title 💾 **Save Results to Drive** { display-mode: \"form\" }\n",
602
+ "\n",
603
+ "import shutil\n",
604
+ "from datetime import datetime\n",
605
+ "\n",
606
+ "# Create timestamped folder\n",
607
+ "timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
608
+ "save_dir = f\"{drive_output}/{timestamp}\"\n",
609
+ "os.makedirs(save_dir, exist_ok=True)\n",
610
+ "\n",
611
+ "# Copy all outputs\n",
612
+ "for folder in [\"output_3d\", \"video_3d\"]:\n",
613
+ " if os.path.exists(folder):\n",
614
+ " for f in os.listdir(folder):\n",
615
+ " shutil.copy(f\"{folder}/{f}\", save_dir)\n",
616
+ " print(f\" ✓ {f}\")\n",
617
+ "\n",
618
+ "print(f\"\\n✅ Saved to: {save_dir}\")"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "markdown",
623
+ "metadata": {},
624
+ "source": [
625
+ "---\n",
626
+ "\n",
627
+ "## 🙏 Credits & Links\n",
628
+ "\n",
629
+ "<div align=\"center\">\n",
630
+ "\n",
631
+ "**Depth Anything 3** by ByteDance Research\n",
632
+ "\n",
633
+ "[📄 Paper](https://arxiv.org/abs/2511.10647) • [🌐 Project](https://depth-anything-3.github.io) • [🤗 Models](https://huggingface.co/collections/depth-anything/depth-anything-3)\n",
634
+ "\n",
635
+ "---\n",
636
+ "\n",
637
+ "**awesome-depth-anything-3** — Optimized fork with batching, caching & CLI\n",
638
+ "\n",
639
+ "[⭐ GitHub](https://github.com/Aedelon/awesome-depth-anything-3) • [📦 PyPI](https://pypi.org/project/awesome-depth-anything-3/)\n",
640
+ "\n",
641
+ "---\n",
642
+ "\n",
643
+ "Made with ❤️ by [Delanoe Pirard](https://github.com/Aedelon)\n",
644
+ "\n",
645
+ "</div>"
646
+ ]
647
+ }
648
+ ],
649
+ "metadata": {
650
+ "accelerator": "GPU",
651
+ "colab": {
652
+ "gpuType": "T4",
653
+ "provenance": [],
654
+ "toc_visible": true
655
+ },
656
+ "kernelspec": {
657
+ "display_name": "Python 3",
658
+ "name": "python3"
659
+ },
660
+ "language_info": {
661
+ "name": "python",
662
+ "version": "3.10.0"
663
+ }
664
+ },
665
+ "nbformat": 4,
666
+ "nbformat_minor": 0
667
+ }
pyproject.toml ADDED
@@ -0,0 +1,144 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25", "hatch-vcs>=0.4"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "awesome-depth-anything-3"
7
+ version = "0.0.0"
8
+ description = "Optimized wrapper for Depth Anything 3 - Metric depth, point clouds, camera poses and novel views from any images"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10,<3.14"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [{ name = "Delanoe Pirard", email = "delanoe.pirard.pro@gmail.com" }]
13
+ keywords = [
14
+ "depth-estimation",
15
+ "3d-reconstruction",
16
+ "computer-vision",
17
+ "pytorch",
18
+ "monocular-depth",
19
+ "multi-view",
20
+ "pose-estimation",
21
+ "point-cloud",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Developers",
26
+ "Intended Audience :: Science/Research",
27
+ "License :: OSI Approved :: Apache Software License",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
32
+ "Topic :: Scientific/Engineering :: Image Processing",
33
+ ]
34
+
35
+ dependencies = [
36
+ "torch>=2",
37
+ "torchvision",
38
+ "kornia>=0.7.0",
39
+ "einops",
40
+ "huggingface_hub",
41
+ "imageio",
42
+ "numpy<2",
43
+ "opencv-python",
44
+ "open3d",
45
+ "fastapi",
46
+ "uvicorn",
47
+ "requests",
48
+ "typer>=0.9.0,<0.13.0",
49
+ "pillow",
50
+ "omegaconf",
51
+ "evo",
52
+ "e3nn",
53
+ "moviepy==1.0.3",
54
+ "trimesh",
55
+ "plyfile",
56
+ "pillow_heif",
57
+ "safetensors",
58
+ "pycolmap",
60
+ ]
61
+
62
+ [project.optional-dependencies]
63
+ app = ["gradio==4.44.1", "huggingface_hub>=0.19,<1.0", "pillow>=9.0"]
64
+ dev = ["pre-commit", "pytest", "ruff"]
65
+ # CUDA acceleration packages (may require manual install steps)
66
+ xformers = ["xformers; platform_system!='Darwin'"]
67
+ gs = ["gsplat>=1.0.0; platform_system!='Darwin'"]
68
+ # Note: flash-attn package is optional. PyTorch >= 2.2 includes Flash Attention
69
+ # natively via F.scaled_dot_product_attention(). Only install flash-attn if you
70
+ # need the absolute latest optimizations:
71
+ # pip install flash-attn --no-build-isolation (requires CUDA toolkit)
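+ # A quick sanity check (sketch) that the native SDPA path is available in your install:
+ #   python -c "import torch, torch.nn.functional as F; q = torch.randn(1, 2, 4, 8); \
+ #     print(F.scaled_dot_product_attention(q, q, q).shape)"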
72
+ # Convenience bundles
73
+ cuda = ["awesome-depth-anything-3[xformers,gs]"]
74
+ all = ["awesome-depth-anything-3[app,cuda]"]
75
+
76
+
77
+ [project.scripts]
78
+ da3 = "depth_anything_3.cli:app"
79
+
80
+ [project.urls]
81
+ Homepage = "https://github.com/Aedelon/awesome-depth-anything-3"
82
+ Repository = "https://github.com/Aedelon/awesome-depth-anything-3"
83
+ Documentation = "https://github.com/Aedelon/awesome-depth-anything-3#readme"
84
+ Issues = "https://github.com/Aedelon/awesome-depth-anything-3/issues"
85
+ Changelog = "https://github.com/Aedelon/awesome-depth-anything-3/blob/main/CHANGELOG.md"
86
+ Upstream = "https://github.com/ByteDance-Seed/Depth-Anything-3"
87
+
88
+ [tool.hatch.version]
89
+ source = "vcs"
90
+
91
+ [tool.hatch.build.targets.wheel]
92
+ packages = ["src/depth_anything_3"]
93
+
94
+ [tool.hatch.build.targets.sdist]
95
+ include = [
96
+ "/README.md",
97
+ "/pyproject.toml",
98
+ "/src/depth_anything_3",
99
+ ]
100
+
101
+ [tool.hatch.metadata]
102
+ allow-direct-references = true
103
+
104
+ [tool.mypy]
105
+ plugins = ["jaxtyping.mypy_plugin"]
106
+
107
+ [tool.black]
108
+ line-length = 99
109
+ target-version = ['py310', 'py311', 'py312', 'py313']
110
+ include = '\.pyi?$'
111
+ exclude = '''
112
+ /(
113
+ | \.git
114
+ )/
115
+ '''
116
+
117
+ [tool.isort]
118
+ profile = "black"
119
+ multi_line_output = 3
120
+ include_trailing_comma = true
121
+ known_third_party = ["bson","cruise","cv2","dataloader","diffusers","omegaconf","tensorflow","torch","torchvision","transformers","gsplat"]
122
+ known_first_party = ["common", "data", "models", "projects", "depth_anything_3"]
123
+ sections = ["FUTURE","STDLIB","THIRDPARTY","FIRSTPARTY","LOCALFOLDER"]
124
+ skip_gitignore = true
125
+ line_length = 99
126
+ no_lines_before="THIRDPARTY"
127
+
128
+ [tool.pytest.ini_options]
129
+ testpaths = ["tests"]
130
+ python_files = ["test_*.py"]
131
+ python_functions = ["test_*"]
132
+ addopts = "-v --tb=short"
133
+ filterwarnings = [
134
+ "ignore::DeprecationWarning",
135
+ "ignore::UserWarning",
136
+ ]
137
+
138
+ [tool.ruff]
139
+ line-length = 99
140
+ target-version = "py310"
141
+
142
+ [tool.ruff.lint]
143
+ select = ["E", "F", "W", "I"]
144
+ ignore = ["E501"] # Line too long (handled by formatter)
requirements.txt ADDED
@@ -0,0 +1,38 @@
1
+ # Install this package from GitHub
2
+ git+https://github.com/Aedelon/awesome-depth-anything-3.git
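+ # Optional CUDA extras (xformers, gsplat) are defined in pyproject.toml:
+ #   pip install "awesome-depth-anything-3[cuda]"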
3
+
4
+ # Core dependencies - torch MUST be first for xformers
5
+ torch>=2
6
+ torchvision
7
+ numpy<2
8
+
9
+ # ML/Vision libraries
10
+ einops
11
+ kornia>=0.7.0
12
+ safetensors
13
+
14
+ # Image/Video processing
15
+ pillow
16
+ pillow_heif
17
+ imageio
18
+ opencv-python
19
+ moviepy==1.0.3
20
+
21
+ # 3D and geometry
22
+ trimesh
23
+ plyfile
24
+ open3d
25
+ e3nn
26
+ evo
27
+ pycolmap
28
+
29
+ # API and config
30
+ fastapi
31
+ uvicorn
32
+ requests
33
+ typer>=0.9.0,<0.13.0
34
+ omegaconf
35
+
36
+ # Gradio app
37
+ gradio>=5.50.0,<6.0
38
+ huggingface_hub>=0.33.5,<2.0
scripts/deploy_hf.sh ADDED
@@ -0,0 +1,97 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Delanoe Pirard / Aedelon
3
+ # Licensed under the Apache License, Version 2.0
4
+ #
5
+ # Deploy to HuggingFace Spaces with LFS for binary files
6
+ # This script temporarily enables LFS, pushes to HF, then restores normal state
7
+
8
+ set -e
9
+
10
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
+ PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
12
+ cd "$PROJECT_ROOT"
13
+
14
+ # HuggingFace Spaces YAML front matter
15
+ HF_YAML='---
16
+ title: Awesome Depth Anything 3
17
+ emoji: 🌊
18
+ colorFrom: blue
19
+ colorTo: purple
20
+ sdk: gradio
21
+ sdk_version: 5.50.0
22
+ app_file: app.py
23
+ pinned: false
24
+ license: apache-2.0
25
+ short_description: Metric 3D reconstruction from images/video
26
+ ---
27
+
28
+ '
29
+
30
+ echo "=== HuggingFace Deployment Script ==="
31
+
32
+ # Save current HEAD
33
+ CURRENT_SHA=$(git rev-parse HEAD)
34
+ echo "Current commit: $CURRENT_SHA"
35
+
36
+ # Step 1: Configure LFS for binary files
37
+ echo ""
38
+ echo "Step 1: Configuring Git LFS..."
39
+ cat > .gitattributes << 'EOF'
40
+ *.png filter=lfs diff=lfs merge=lfs -text
41
+ *.jpg filter=lfs diff=lfs merge=lfs -text
42
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
43
+ *.gif filter=lfs diff=lfs merge=lfs -text
44
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
45
+ *.webm filter=lfs diff=lfs merge=lfs -text
46
+ EOF
47
+
48
+ # Step 2: Create deployment branch
49
+ echo ""
50
+ echo "Step 2: Creating deployment branch..."
51
+ git checkout --orphan hf-deploy-temp 2>/dev/null || git checkout hf-deploy-temp
52
+
53
+ # Reset to get clean state
54
+ git reset
55
+
56
+ # Step 3: Add YAML to README
57
+ echo ""
58
+ echo "Step 3: Adding YAML front matter to README..."
59
+ cp README.md README.md.original
60
+ echo "$HF_YAML$(cat README.md.original)" > README.md
61
+
62
+ # Step 4: Stage all files (LFS will handle binaries)
63
+ echo ""
64
+ echo "Step 4: Staging files with LFS..."
65
+ git add .gitattributes
66
+ git add -A
67
+
68
+ # Step 5: Commit
69
+ echo ""
70
+ echo "Step 5: Committing..."
71
+ git commit -m "Deploy to HuggingFace Spaces" --no-verify || true
72
+
73
+ # Step 6: Push to HuggingFace
74
+ echo ""
75
+ echo "Step 6: Pushing to HuggingFace Spaces..."
76
+ git push huggingface hf-deploy-temp:main --force
77
+
78
+ # Step 7: Cleanup - return to main branch
79
+ echo ""
80
+ echo "Step 7: Cleaning up..."
81
+ git checkout main --force
82
+ git branch -D hf-deploy-temp 2>/dev/null || true
83
+
84
+ # Restore original .gitattributes (no LFS)
85
+ cat > .gitattributes << 'EOF'
86
+ *.png !text !filter !merge !diff
87
+ *.jpg !text !filter !merge !diff
88
+ *.jpeg !text !filter !merge !diff
89
+ *.gif !text !filter !merge !diff
90
+ *.mp4 !text !filter !merge !diff
91
+ *.webm !text !filter !merge !diff
92
+ EOF
93
+
94
+ echo ""
95
+ echo "=== Done! ==="
96
+ echo "HuggingFace updated with YAML metadata and LFS binaries."
97
+ echo "Local repo restored to normal state (no LFS)."
src/depth_anything_3/api.py ADDED
@@ -0,0 +1,718 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Depth Anything 3 API module.
16
+
17
+ This module provides the main API for Depth Anything 3, including model loading,
18
+ inference, and export capabilities. It supports both single and nested model architectures.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import time
24
+ from typing import Optional, Sequence
25
+
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn as nn
29
+ from huggingface_hub import PyTorchModelHubMixin
30
+ from PIL import Image
31
+
32
+ from depth_anything_3.cache import get_model_cache
33
+ from depth_anything_3.cfg import create_object, load_config
34
+ from depth_anything_3.registry import MODEL_REGISTRY
35
+ from depth_anything_3.specs import Prediction
36
+ from depth_anything_3.utils.adaptive_batching import (
37
+ AdaptiveBatchConfig,
38
+ AdaptiveBatchSizeCalculator,
39
+ adaptive_batch_iterator,
40
+ estimate_max_batch_size,
41
+ )
42
+ from depth_anything_3.utils.export import export
43
+ from depth_anything_3.utils.geometry import affine_inverse
44
+ from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
45
+ from depth_anything_3.utils.io.input_processor import InputProcessor
46
+ from depth_anything_3.utils.io.output_processor import OutputProcessor
47
+ from depth_anything_3.utils.logger import logger
48
+ from depth_anything_3.utils.pose_align import align_poses_umeyama
49
+
50
+ torch.backends.cudnn.benchmark = False
51
+ # logger.info("CUDNN Benchmark Disabled")
52
+
53
+ SAFETENSORS_NAME = "model.safetensors"
54
+ CONFIG_NAME = "config.json"
55
+
56
+
57
+ class DepthAnything3(nn.Module, PyTorchModelHubMixin):
58
+ """
59
+ Depth Anything 3 main API class.
60
+
61
+ This class provides a high-level interface for depth estimation using Depth Anything 3.
62
+ It supports both single and nested model architectures with metric scaling capabilities.
63
+
64
+ Features:
65
+ - Hugging Face Hub integration via PyTorchModelHubMixin
66
+ - Support for multiple model presets (large, giant, metric, and nested variants)
67
+ - Automatic mixed precision inference
68
+ - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
69
+ - Camera pose estimation and metric depth scaling
70
+
71
+ Usage:
72
+ # Load from Hugging Face Hub
73
+ model = DepthAnything3.from_pretrained("huggingface/model-name")
74
+
75
+ # Or create with a specific model preset
76
+ model = DepthAnything3(model_name="da3-giant")
77
+
78
+ # Run inference
79
+ prediction = model.inference(images, export_dir="output", export_format="glb")
80
+ """
81
+
82
+ _commit_hash: str | None = None # Set by mixin when loading from Hub
83
+
84
+ def __init__(self, model_name: str = "da3-large", device: str | torch.device | None = None, use_cache: bool = True, **kwargs):
85
+ """
86
+ Initialize DepthAnything3 with specified preset.
87
+
88
+ Args:
89
+ model_name: The name of the model preset to use.
90
+ Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
91
+ device: Target device ('cuda', 'mps', 'cpu'). If None, auto-detect.
92
+ use_cache: Whether to use model caching (default: True).
93
+ Set to False to force reload model from disk.
94
+ **kwargs: Additional keyword arguments (currently unused).
95
+ """
96
+ super().__init__()
97
+ self.model_name = model_name
98
+ self.use_cache = use_cache
99
+
100
+ # Determine device
101
+ if device is None:
102
+ device = self._auto_detect_device()
103
+ self.device = torch.device(device) if isinstance(device, str) else device
104
+
105
+ # Load model configuration
106
+ self.config = load_config(MODEL_REGISTRY[self.model_name])
107
+
108
+ # Build or retrieve model from cache
109
+ if use_cache:
110
+ cache = get_model_cache()
111
+ self.model = cache.get(
112
+ model_name=self.model_name,
113
+ device=self.device,
114
+ loader_fn=lambda: self._create_model()
115
+ )
116
+ else:
117
+ logger.info(f"Model cache disabled, loading {self.model_name} from disk")
118
+ self.model = self._create_model()
119
+
120
+ # Ensure model is on correct device and in eval mode
121
+ self.model = self.model.to(self.device)
122
+ self.model.eval()
123
+
124
+ # Initialize processors
125
+ # Use GPUInputProcessor for CUDA/MPS devices to enable GPU ops
126
+ # Note: NVJPEG decoding is specific to CUDA, MPS will use optimized CPU decoding + GPU resize
127
+ if self.device.type in ("cuda", "mps"):
128
+ self.input_processor = GPUInputProcessor(device=self.device)
129
+ decoding_info = "NVJPEG support enabled" if self.device.type == "cuda" else "TorchVision decoding"
130
+ logger.info(f"Using GPUInputProcessor ({decoding_info} on {self.device})")
131
+ else:
132
+ self.input_processor = InputProcessor()
133
+ logger.info("Using standard InputProcessor (optimized CPU pipeline)")
134
+
135
+ self.output_processor = OutputProcessor()
136
+
137
+ def _auto_detect_device(self) -> torch.device:
138
+ """Auto-detect best available device."""
139
+ if torch.cuda.is_available():
140
+ return torch.device("cuda")
141
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
142
+ return torch.device("mps")
143
+ else:
144
+ return torch.device("cpu")
145
+
146
+ def _create_model(self) -> nn.Module:
147
+ """Create and return new model instance on correct device."""
148
+ model = create_object(self.config)
149
+ model = model.to(self.device) # Move to device before caching
150
+ model.eval()
151
+ return model
152
+
153
+ @torch.inference_mode()
154
+ def forward(
155
+ self,
156
+ image: torch.Tensor,
157
+ extrinsics: torch.Tensor | None = None,
158
+ intrinsics: torch.Tensor | None = None,
159
+ export_feat_layers: list[int] | None = None,
160
+ infer_gs: bool = False,
161
+ use_ray_pose: bool = False,
162
+ ref_view_strategy: str = "saddle_balanced",
163
+ ) -> dict[str, torch.Tensor]:
164
+ """
165
+ Forward pass through the model.
166
+
167
+ Args:
168
+ image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
169
+ extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
170
+ intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
171
+ export_feat_layers: Layer indices to return intermediate features for.
172
+ infer_gs: Enable Gaussian Splatting branch.
173
+ use_ray_pose: Use ray-based pose estimation instead of camera decoder.
174
+ ref_view_strategy: Strategy for selecting reference view from multiple views.
175
+
176
+ Returns:
177
+ Dictionary containing model predictions
178
+ """
179
+ with torch.no_grad():
180
+ # MPS doesn't support autocast well - use float32 for stability
181
+ if image.device.type == "mps":
182
+ return self.model(
183
+ image, extrinsics, intrinsics, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
184
+ )
185
+ else:
186
+ # CUDA: use autocast for performance
187
+ autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
188
+ with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
189
+ return self.model(
190
+ image, extrinsics, intrinsics, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
191
+ )
192
+
193
+ def inference(
194
+ self,
195
+ image: list[np.ndarray | Image.Image | str],
196
+ extrinsics: np.ndarray | None = None,
197
+ intrinsics: np.ndarray | None = None,
198
+ align_to_input_ext_scale: bool = True,
199
+ infer_gs: bool = False,
200
+ use_ray_pose: bool = False,
201
+ ref_view_strategy: str = "saddle_balanced",
202
+ render_exts: np.ndarray | None = None,
203
+ render_ixts: np.ndarray | None = None,
204
+ render_hw: tuple[int, int] | None = None,
205
+ process_res: int = 504,
206
+ process_res_method: str = "upper_bound_resize",
207
+ export_dir: str | None = None,
208
+ export_format: str = "mini_npz",
209
+ export_feat_layers: Sequence[int] | None = None,
210
+ # GLB export parameters
211
+ conf_thresh_percentile: float = 40.0,
212
+ num_max_points: int = 1_000_000,
213
+ show_cameras: bool = True,
214
+ # Feat_vis export parameters
215
+ feat_vis_fps: int = 15,
216
+ # Other export parameters, e.g., gs_ply, gs_video
217
+ export_kwargs: Optional[dict] = None,
218
+ ) -> Prediction:
219
+ """
220
+ Run inference on input images.
221
+
222
+ Args:
223
+ image: List of input images (numpy arrays, PIL Images, or file paths)
224
+ extrinsics: Camera extrinsics (N, 4, 4)
225
+ intrinsics: Camera intrinsics (N, 3, 3)
226
+ align_to_input_ext_scale: whether to align the input pose scale to the prediction
227
+ infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
228
+ use_ray_pose: Use ray-based pose estimation instead of camera decoder (default: False)
229
+ ref_view_strategy: Strategy for selecting reference view from multiple views.
230
+ Options: "first", "middle", "saddle_balanced", "saddle_sim_range".
231
+ Default: "saddle_balanced". For single view input (S ≤ 2), no reordering is performed.
232
+ render_exts: Optional render extrinsics for Gaussian video export
233
+ render_ixts: Optional render intrinsics for Gaussian video export
234
+ render_hw: Optional render resolution for Gaussian video export
235
+ process_res: Processing resolution
236
+ process_res_method: Resize method for processing
237
+ export_dir: Directory to export results
238
+ export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
239
+ export_feat_layers: Layer indices to export intermediate features from
240
+ conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0) # noqa: E501
241
+ num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
242
+ show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
243
+ feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
244
+ export_kwargs: additional arguments to export functions.
245
+
246
+ Returns:
247
+ Prediction object containing depth maps and camera parameters
248
+ """
249
+ if "gs" in export_format:
250
+ assert infer_gs, "must set `infer_gs=True` to perform gs-related export."
251
+
252
+ if "colmap" in export_format:
253
+ assert isinstance(image[0], str), "`image` must be image paths for COLMAP export."
254
+
255
+ # Preprocess images
256
+ imgs_cpu, extrinsics, intrinsics = self._preprocess_inputs(
257
+ image, extrinsics, intrinsics, process_res, process_res_method
258
+ )
259
+
260
+ # Prepare tensors for model
261
+ imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)
262
+
263
+ # Normalize extrinsics
264
+ ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)
265
+
266
+ # Run model forward pass
267
+ export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []
268
+
269
+ raw_output = self._run_model_forward(
270
+ imgs, ex_t_norm, in_t, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
271
+ )
272
+
273
+ # Convert raw output to prediction
274
+ prediction = self._convert_to_prediction(raw_output)
275
+
276
+ # Align prediction to input extrinsics
277
+ prediction = self._align_to_input_extrinsics_intrinsics(
278
+ extrinsics, intrinsics, prediction, align_to_input_ext_scale
279
+ )
280
+
281
+ # Add processed images for visualization
282
+ prediction = self._add_processed_images(prediction, imgs_cpu)
283
+
284
+ # Export if requested
285
+ if export_dir is not None:
286
+
287
+ if "gs" in export_format:
288
+ if infer_gs and "gs_video" not in export_format:
289
+ export_format = f"{export_format}-gs_video"
290
+ if "gs_video" in export_format:
291
+ if "gs_video" not in export_kwargs:
292
+ export_kwargs["gs_video"] = {}
293
+ export_kwargs["gs_video"].update(
294
+ {
295
+ "extrinsics": render_exts,
296
+ "intrinsics": render_ixts,
297
+ "out_image_hw": render_hw,
298
+ }
299
+ )
300
+ # Add GLB export parameters
301
+ if "glb" in export_format:
302
+ if "glb" not in export_kwargs:
303
+ export_kwargs["glb"] = {}
304
+ export_kwargs["glb"].update(
305
+ {
306
+ "conf_thresh_percentile": conf_thresh_percentile,
307
+ "num_max_points": num_max_points,
308
+ "show_cameras": show_cameras,
309
+ }
310
+ )
311
+ # Add Feat_vis export parameters
312
+ if "feat_vis" in export_format:
313
+ if "feat_vis" not in export_kwargs:
314
+ export_kwargs["feat_vis"] = {}
315
+ export_kwargs["feat_vis"].update(
316
+ {
317
+ "fps": feat_vis_fps,
318
+ }
319
+ )
320
+ # Add COLMAP export parameters
321
+ if "colmap" in export_format:
322
+ if "colmap" not in export_kwargs:
323
+ export_kwargs["colmap"] = {}
324
+ export_kwargs["colmap"].update(
325
+ {
326
+ "image_paths": image,
327
+ "conf_thresh_percentile": conf_thresh_percentile,
328
+ "process_res_method": process_res_method,
329
+ }
330
+ )
331
+ self._export_results(prediction, export_format, export_dir, **export_kwargs)
332
+
333
+ return prediction
334
+
335
+ def _preprocess_inputs(
336
+ self,
337
+ image: list[np.ndarray | Image.Image | str],
338
+ extrinsics: np.ndarray | None = None,
339
+ intrinsics: np.ndarray | None = None,
340
+ process_res: int = 504,
341
+ process_res_method: str = "upper_bound_resize",
342
+ ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
343
+ """Preprocess input images using input processor."""
344
+ start_time = time.time()
345
+
346
+ # Determine normalization strategy:
347
+ # 1. Hybrid (CPU Proc + GPU Device): Skip CPU norm (return uint8), norm on GPU later.
348
+ # 2. GPU Proc (NVJPEG/Kornia): Perform norm on GPU immediately.
349
+ # 3. Standard CPU: Perform norm on CPU.
350
+
351
+ perform_norm = True
352
+ if self.device.type in ("cuda", "mps") and not isinstance(self.input_processor, GPUInputProcessor):
353
+ perform_norm = False
354
+
355
+ imgs_cpu, extrinsics, intrinsics = self.input_processor(
356
+ image,
357
+ extrinsics.copy() if extrinsics is not None else None,
358
+ intrinsics.copy() if intrinsics is not None else None,
359
+ process_res,
360
+ process_res_method,
361
+ perform_normalization=perform_norm,
362
+ )
363
+ end_time = time.time()
364
+ logger.info(f"Processed images in {end_time - start_time:.3f}s. Shape: {imgs_cpu.shape}")
370
+ return imgs_cpu, extrinsics, intrinsics
371
+
372
+ def _prepare_model_inputs(
373
+ self,
374
+ imgs_cpu: torch.Tensor,
375
+ extrinsics: torch.Tensor | None,
376
+ intrinsics: torch.Tensor | None,
377
+ ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
378
+ """
379
+ Prepare tensors for model input with optimized device transfer.
380
+ """
381
+ device = self._get_model_device()
382
+
383
+ # 1. Handle Image Tensor
384
+ # Compare device types (handles cuda:0 vs cuda comparison)
385
+ imgs_on_target_device = (imgs_cpu.device.type == device.type)
386
+ if imgs_on_target_device:
387
+ # Case A: Already on correct device (GPUInputProcessor)
388
+ # Ensure correct shape: (B, S, C, H, W) where B=1
389
+ imgs = imgs_cpu
390
+ if imgs.dim() == 3:
391
+ # Single image (C, H, W) -> (1, 1, C, H, W)
392
+ imgs = imgs.unsqueeze(0).unsqueeze(0)
393
+ elif imgs.dim() == 4:
394
+ # Batch of images (N, C, H, W) -> (1, N, C, H, W)
395
+ imgs = imgs.unsqueeze(0)
396
+ # dim() == 5 means already correct shape
397
+ if imgs.dtype == torch.uint8:
398
+ # Should not happen with GPUInputProcessor default, but safety fallback
399
+ imgs = imgs.float() / 255.0
400
+ imgs = InputProcessor.normalize_tensor(
401
+ imgs,
402
+ mean=[0.485, 0.456, 0.406],
403
+ std=[0.229, 0.224, 0.225]
404
+ )
405
+ else:
406
+ # Case B & C: Needs transfer from CPU
407
+ if imgs_cpu.dtype == torch.uint8:
408
+ # Hybrid mode: uint8 -> GPU -> float -> normalize
409
+ if device.type == "cuda":
410
+ imgs_cpu = imgs_cpu.pin_memory()
411
+
412
+ imgs = imgs_cpu.to(device, non_blocking=True).float() / 255.0
413
+ imgs = InputProcessor.normalize_tensor(
414
+ imgs,
415
+ mean=[0.485, 0.456, 0.406],
416
+ std=[0.229, 0.224, 0.225]
417
+ )
418
+ imgs = imgs[None] # Add batch dimension (1, N, 3, H, W)
419
+ else:
420
+ # Standard mode: float -> GPU
421
+ if device.type == "cuda":
422
+ imgs_cpu = imgs_cpu.pin_memory()
423
+ imgs = imgs_cpu.to(device, non_blocking=True)[None].float()
424
+
425
+ # Convert camera parameters to tensors with non-blocking transfer
+ def _to_model_tensor(t: torch.Tensor | None) -> torch.Tensor | None:
+ if t is None:
+ return None
+ # Pin CPU memory so the CUDA host-to-device copy can be asynchronous
+ if device.type == "cuda" and t.device.type == "cpu":
+ t = t.pin_memory()
+ if t.device != device:
+ t = t.to(device, non_blocking=True)
+ return t[None].float()  # add batch dimension: (N, ...) -> (1, N, ...)
+
+ ex_t = _to_model_tensor(extrinsics)
+ in_t = _to_model_tensor(intrinsics)
444
+
445
+ return imgs, ex_t, in_t
446
+
447
+ def _normalize_extrinsics(self, ex_t: torch.Tensor | None) -> torch.Tensor | None:
448
+ """Normalize extrinsics"""
449
+ if ex_t is None:
450
+ return None
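+ # Re-express all poses relative to view 0 so the first camera becomes the identity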
451
+ transform = affine_inverse(ex_t[:, :1])
452
+ ex_t_norm = ex_t @ transform
453
+ c2ws = affine_inverse(ex_t_norm)
454
+ translations = c2ws[..., :3, 3]
455
+ dists = translations.norm(dim=-1)
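+ # Normalize scale: the median camera distance becomes 1 (clamped to avoid dividing by ~0)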
456
+ median_dist = torch.median(dists)
457
+ median_dist = torch.clamp(median_dist, min=1e-1)
458
+ ex_t_norm[..., :3, 3] = ex_t_norm[..., :3, 3] / median_dist
459
+ return ex_t_norm
460
+
461
+ def _align_to_input_extrinsics_intrinsics(
462
+ self,
463
+ extrinsics: torch.Tensor | None,
464
+ intrinsics: torch.Tensor | None,
465
+ prediction: Prediction,
466
+ align_to_input_ext_scale: bool = True,
467
+ ransac_view_thresh: int = 10,
468
+ ) -> Prediction:
469
+ """Align depth map to input extrinsics"""
470
+ if extrinsics is None:
471
+ return prediction
472
+ if intrinsics is not None:
+ prediction.intrinsics = intrinsics.numpy()
473
+ _, _, scale, aligned_extrinsics = align_poses_umeyama(
474
+ prediction.extrinsics,
475
+ extrinsics.numpy(),
476
+ ransac=len(extrinsics) >= ransac_view_thresh,
477
+ return_aligned=True,
478
+ random_state=42,
479
+ )
480
+ if align_to_input_ext_scale:
481
+ prediction.extrinsics = extrinsics[..., :3, :].numpy()
482
+ prediction.depth /= scale
483
+ else:
484
+ prediction.extrinsics = aligned_extrinsics
485
+ return prediction
486
+
487
+ def _run_model_forward(
488
+ self,
489
+ imgs: torch.Tensor,
490
+ ex_t: torch.Tensor | None,
491
+ in_t: torch.Tensor | None,
492
+ export_feat_layers: Sequence[int] | None = None,
493
+ infer_gs: bool = False,
494
+ use_ray_pose: bool = False,
495
+ ref_view_strategy: str = "saddle_balanced",
496
+ ) -> dict[str, torch.Tensor]:
497
+ """Run model forward pass."""
498
+ device = imgs.device
499
+ need_sync = device.type == "cuda"
500
+ if need_sync:
501
+ torch.cuda.synchronize(device)
502
+ start_time = time.time()
503
+ feat_layers = list(export_feat_layers) if export_feat_layers is not None else None
504
+ output = self.forward(imgs, ex_t, in_t, feat_layers, infer_gs, use_ray_pose, ref_view_strategy)
505
+ if need_sync:
506
+ torch.cuda.synchronize(device)
507
+ end_time = time.time()
508
+ logger.info(f"Model Forward Pass Done. Time: {end_time - start_time} seconds")
509
+ return output
510
+
511
+ def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
512
+ """Convert raw model output to Prediction object."""
513
+ start_time = time.time()
514
+ output = self.output_processor(raw_output)
515
+ end_time = time.time()
516
+ logger.info(f"Conversion to Prediction Done. Time: {end_time - start_time} seconds")
517
+ return output
518
+
519
+ def _add_processed_images(self, prediction: Prediction, imgs_cpu: torch.Tensor) -> Prediction:
520
+ """Add processed images to prediction for visualization."""
521
+ # Convert from (N, 3, H, W) to (N, H, W, 3)
522
+ processed_imgs = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy() # (N, H, W, 3)
523
+
524
+ # uint8 images are already display-ready; float images need de-normalizing
+ if imgs_cpu.dtype != torch.uint8:
528
+ # Denormalize from ImageNet normalization
529
+ mean = np.array([0.485, 0.456, 0.406])
530
+ std = np.array([0.229, 0.224, 0.225])
531
+ processed_imgs = processed_imgs * std + mean
532
+ processed_imgs = np.clip(processed_imgs, 0, 1)
533
+ processed_imgs = (processed_imgs * 255).astype(np.uint8)
534
+
535
+ prediction.processed_images = processed_imgs
536
+ return prediction
537
+
538
+ def _export_results(
539
+ self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
540
+ ) -> None:
541
+ """Export results to specified format and directory."""
542
+ start_time = time.time()
543
+ export(prediction, export_format, export_dir, **kwargs)
544
+ end_time = time.time()
545
+ logger.info(f"Export Results Done. Time: {end_time - start_time} seconds")
546
+
547
+ def _get_model_device(self) -> torch.device:
548
+ """
549
+ Get the device where the model is located.
550
+
551
+ Returns:
552
+ Device where the model parameters are located
553
+
554
+ Raises:
555
+ ValueError: If no tensors are found in the model
556
+ """
557
+ if self.device is not None:
558
+ return self.device
559
+
560
+ # Find device from parameters
561
+ for param in self.parameters():
562
+ self.device = param.device
563
+ return param.device
564
+
565
+ # Find device from buffers
566
+ for buffer in self.buffers():
567
+ self.device = buffer.device
568
+ return buffer.device
569
+
570
+ raise ValueError("No tensor found in model")
571
+
572
+ # =========================================================================
573
+ # Adaptive Batching Methods
574
+ # =========================================================================
575
+
576
+ def batch_inference(
577
+ self,
578
+ images: list[np.ndarray | Image.Image | str],
579
+ process_res: int = 504,
580
+ batch_size: int | str = "auto",
581
+ max_batch_size: int = 64,
582
+ target_memory_utilization: float = 0.85,
583
+ progress_callback: callable | None = None,
584
+ ) -> list[Prediction]:
585
+ """
586
+ Run inference on multiple images with adaptive batching.
587
+
588
+ This method automatically determines optimal batch sizes based on
589
+ available GPU memory, maximizing throughput while preventing OOM errors.
590
+
591
+ Args:
592
+ images: List of input images (numpy arrays, PIL Images, or file paths)
593
+ process_res: Processing resolution (default: 504)
594
+ batch_size: Batch size or "auto" for adaptive batching (default: "auto")
595
+ max_batch_size: Maximum batch size when using adaptive batching (default: 64)
596
+ target_memory_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)
597
+ progress_callback: Optional callback(processed, total) for progress updates
598
+
599
+ Returns:
600
+ List of Prediction objects, one per batch
601
+
602
+ Example:
603
+ >>> model = DepthAnything3(model_name="da3-large")
604
+ >>> images = ["img1.jpg", "img2.jpg", ..., "img100.jpg"]
605
+ >>>
606
+ >>> # Adaptive batching (recommended)
607
+ >>> results = model.batch_inference(images, process_res=518)
608
+ >>>
609
+ >>> # Fixed batch size
610
+ >>> results = model.batch_inference(images, batch_size=4)
611
+ >>>
612
+ >>> # With progress callback
613
+ >>> def on_progress(done, total):
614
+ ... print(f"Processed {done}/{total}")
615
+ >>> results = model.batch_inference(images, progress_callback=on_progress)
616
+ """
617
+ import gc
618
+
619
+ num_images = len(images)
620
+ if num_images == 0:
621
+ return []
622
+
623
+ results: list[Prediction] = []
624
+
625
+ # Determine batch size
626
+ if batch_size == "auto":
627
+ config = AdaptiveBatchConfig(
628
+ max_batch_size=max_batch_size,
629
+ target_memory_utilization=target_memory_utilization,
630
+ )
631
+ calculator = AdaptiveBatchSizeCalculator(
632
+ model_name=self.model_name,
633
+ device=self.device,
634
+ config=config,
635
+ )
636
+
637
+ for batch_info in adaptive_batch_iterator(images, calculator, process_res):
638
+ # Run inference on this batch
639
+ prediction = self.inference(
640
+ image=batch_info.items,
641
+ process_res=process_res,
642
+ )
643
+ results.append(prediction)
644
+
645
+ # Progress callback
646
+ if progress_callback:
647
+ progress_callback(batch_info.end_idx, num_images)
648
+
649
+ # Memory cleanup between batches
650
+ if not batch_info.is_last:
651
+ gc.collect()
652
+ if self.device.type == "cuda":
653
+ torch.cuda.empty_cache()
654
+ elif self.device.type == "mps":
655
+ torch.mps.empty_cache()
656
+
657
+ # Update profiling data for better estimates
658
+ if calculator.config.enable_profiling and self.device.type == "cuda":
659
+ memory_used = torch.cuda.max_memory_allocated(self.device) / (1024 * 1024)
660
+ calculator.update_from_profiling(
661
+ batch_size=batch_info.batch_size,
662
+ memory_used_mb=memory_used,
663
+ process_res=process_res,
664
+ )
665
+ torch.cuda.reset_peak_memory_stats(self.device)
666
+
667
+ else:
668
+ # Fixed batch size
669
+ fixed_batch_size = int(batch_size)
670
+ for i in range(0, num_images, fixed_batch_size):
671
+ end_idx = min(i + fixed_batch_size, num_images)
672
+ batch_images = images[i:end_idx]
673
+
674
+ prediction = self.inference(
675
+ image=batch_images,
676
+ process_res=process_res,
677
+ )
678
+ results.append(prediction)
679
+
680
+ if progress_callback:
681
+ progress_callback(end_idx, num_images)
682
+
683
+ # Memory cleanup
684
+ if end_idx < num_images:
685
+ gc.collect()
686
+ if self.device.type == "cuda":
687
+ torch.cuda.empty_cache()
688
+ elif self.device.type == "mps":
689
+ torch.mps.empty_cache()
690
+
691
+ return results
692
+
693
+ def get_optimal_batch_size(
694
+ self,
695
+ process_res: int = 504,
696
+ target_utilization: float = 0.85,
697
+ ) -> int:
698
+ """
699
+ Get the optimal batch size for current GPU memory state.
700
+
701
+ Args:
702
+ process_res: Processing resolution (default: 504)
703
+ target_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)
704
+
705
+ Returns:
706
+ Recommended batch size
707
+
708
+ Example:
709
+ >>> model = DepthAnything3(model_name="da3-large")
710
+ >>> batch_size = model.get_optimal_batch_size(process_res=518)
711
+ >>> print(f"Optimal batch size: {batch_size}")
712
+ """
713
+ return estimate_max_batch_size(
714
+ model_name=self.model_name,
715
+ device=self.device,
716
+ process_res=process_res,
717
+ target_utilization=target_utilization,
718
+ )
src/depth_anything_3/app/css_and_html.py ADDED
@@ -0,0 +1,623 @@
1
+ # flake8: noqa: E501
2
+
3
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ CSS and HTML content for the Depth Anything 3 Gradio application.
19
+ This module contains all the CSS styles and HTML content blocks
20
+ used in the Gradio interface.
21
+ """
22
+
23
+ # CSS Styles for the Gradio interface
24
+ # Color palette:
25
+ # - Primary: #2563EB (Modern Blue)
26
+ # - Secondary: #14B8A6 (Vibrant Teal)
27
+ # - Accent: #F97316 (Electric Orange)
28
+ # - Neutrals: #F9FAFB to #111827
29
+ GRADIO_CSS = """
30
+ /* Add Font Awesome CDN with all styles including brands and colors */
31
+ @import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
32
+
33
+ /* Force light mode */
34
+ html, body, .gradio-container {
35
+ color-scheme: light !important;
36
+ }
37
+
38
+ /* CSS Custom Properties for theming */
39
+ :root {
40
+ --primary: #2563EB;
41
+ --primary-light: #3B82F6;
42
+ --primary-dark: #1D4ED8;
43
+ --secondary: #14B8A6;
44
+ --secondary-light: #2DD4BF;
45
+ --secondary-dark: #0D9488;
46
+ --accent: #F97316;
47
+ --accent-light: #FB923C;
48
+ --accent-dark: #EA580C;
49
+ --neutral-50: #F9FAFB;
50
+ --neutral-100: #F3F4F6;
51
+ --neutral-200: #E5E7EB;
52
+ --neutral-300: #D1D5DB;
53
+ --neutral-400: #9CA3AF;
54
+ --neutral-500: #6B7280;
55
+ --neutral-600: #4B5563;
56
+ --neutral-700: #374151;
57
+ --neutral-800: #1F2937;
58
+ --neutral-900: #111827;
59
+ }
60
+
61
+ /* Add custom styles for colored icons */
62
+ .fa-color-blue {
63
+ color: var(--primary);
64
+ }
65
+
66
+ .fa-color-purple {
67
+ color: #8B5CF6;
68
+ }
69
+
70
+ .fa-color-cyan {
71
+ color: var(--secondary);
72
+ }
73
+
74
+ .fa-color-green {
75
+ color: #10B981;
76
+ }
77
+
78
+ .fa-color-yellow {
79
+ color: var(--accent);
80
+ }
81
+
82
+ .fa-color-red {
83
+ color: #EF4444;
84
+ }
85
+
86
+ .link-btn {
87
+ display: inline-flex;
88
+ align-items: center;
89
+ gap: 8px;
90
+ text-decoration: none;
91
+ padding: 12px 24px;
92
+ border-radius: 50px;
93
+ font-weight: 500;
94
+ transition: all 0.3s ease;
95
+ }
96
+
97
+ /* Dark mode theme */
98
+ @media (prefers-color-scheme: dark) {
99
+ html, body {
100
+ background: var(--neutral-800);
101
+ color: var(--neutral-50);
102
+ }
103
+
104
+ .gradio-container {
105
+ background: var(--neutral-800);
106
+ color: var(--neutral-50);
107
+ }
108
+
109
+ .link-btn {
110
+ background: rgba(20, 184, 166, 0.2);
111
+ color: white;
112
+ border: 1px solid rgba(20, 184, 166, 0.4);
113
+ }
114
+
115
+ .link-btn:hover {
116
+ background: rgba(20, 184, 166, 0.35);
117
+ transform: translateY(-2px);
118
+ box-shadow: 0 8px 25px rgba(20, 184, 166, 0.25);
119
+ }
120
+
121
+ .tech-bg {
122
+ background: linear-gradient(135deg, var(--neutral-900), var(--neutral-800));
123
+ position: relative;
124
+ overflow: hidden;
125
+ }
126
+
127
+ .tech-bg::before {
128
+ content: '';
129
+ position: absolute;
130
+ top: 0;
131
+ left: 0;
132
+ right: 0;
133
+ bottom: 0;
134
+ background:
135
+ radial-gradient(circle at 20% 80%, rgba(37, 99, 235, 0.15) 0%, transparent 50%),
136
+ radial-gradient(circle at 80% 20%, rgba(20, 184, 166, 0.15) 0%, transparent 50%),
137
+ radial-gradient(circle at 40% 40%, rgba(249, 115, 22, 0.1) 0%, transparent 50%);
138
+ animation: techPulse 8s ease-in-out infinite;
139
+ }
140
+
141
+ .gradio-container .panel,
142
+ .gradio-container .block,
143
+ .gradio-container .form {
144
+ background: rgba(0, 0, 0, 0.3);
145
+ border: 1px solid rgba(20, 184, 166, 0.2);
146
+ border-radius: 10px;
147
+ }
148
+
149
+ .gradio-container * {
150
+ color: var(--neutral-50);
151
+ }
152
+
153
+ .gradio-container label {
154
+ color: var(--neutral-200);
155
+ }
156
+
157
+ .gradio-container .markdown {
158
+ color: var(--neutral-200);
159
+ }
160
+ }
161
+
162
+ /* Light mode theme */
163
+ @media (prefers-color-scheme: light) {
164
+ html, body {
165
+ background: var(--neutral-50);
166
+ color: var(--neutral-800);
167
+ }
168
+
169
+ .gradio-container {
170
+ background: var(--neutral-50);
171
+ color: var(--neutral-800);
172
+ }
173
+
174
+ .tech-bg {
175
+ background: linear-gradient(135deg, var(--neutral-50), var(--neutral-100));
176
+ position: relative;
177
+ overflow: hidden;
178
+ }
179
+
180
+ .link-btn {
181
+ background: rgba(20, 184, 166, 0.12);
182
+ color: var(--neutral-700);
183
+ border: 1px solid rgba(20, 184, 166, 0.3);
184
+ }
185
+
186
+ .link-btn:hover {
187
+ background: rgba(20, 184, 166, 0.2);
188
+ transform: translateY(-2px);
189
+ box-shadow: 0 8px 25px rgba(20, 184, 166, 0.2);
190
+ }
191
+
192
+ .tech-bg::before {
193
+ content: '';
194
+ position: absolute;
195
+ top: 0;
196
+ left: 0;
197
+ right: 0;
198
+ bottom: 0;
199
+ background:
200
+ radial-gradient(circle at 20% 80%, rgba(37, 99, 235, 0.08) 0%, transparent 50%),
201
+ radial-gradient(circle at 80% 20%, rgba(20, 184, 166, 0.08) 0%, transparent 50%),
202
+ radial-gradient(circle at 40% 40%, rgba(249, 115, 22, 0.06) 0%, transparent 50%);
203
+ animation: techPulse 8s ease-in-out infinite;
204
+ }
205
+
206
+ .gradio-container .panel,
207
+ .gradio-container .block,
208
+ .gradio-container .form {
209
+ background: rgba(255, 255, 255, 0.9);
210
+ border: 1px solid rgba(20, 184, 166, 0.2);
211
+ border-radius: 10px;
212
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
213
+ }
214
+
215
+ .gradio-container * {
216
+ color: var(--neutral-800);
217
+ }
218
+
219
+ .gradio-container label {
220
+ color: var(--neutral-600);
221
+ }
222
+
223
+ .gradio-container .markdown {
224
+ color: var(--neutral-600);
225
+ }
226
+ }
227
+
228
+
229
+
230
+
231
+ @keyframes techPulse {
232
+ 0%, 100% { opacity: 0.5; }
233
+ 50% { opacity: 0.8; }
234
+ }
235
+
236
+ /* Custom log with tech gradient */
237
+ .custom-log * {
238
+ font-style: italic;
239
+ font-size: 22px !important;
240
+ background: linear-gradient(135deg, var(--primary), var(--secondary));
241
+ background-size: 400% 400%;
242
+ -webkit-background-clip: text;
243
+ background-clip: text;
244
+ font-weight: bold !important;
245
+ color: transparent !important;
246
+ text-align: center !important;
247
+ animation: techGradient 3s ease infinite;
248
+ }
249
+
250
+ @keyframes techGradient {
251
+ 0% { background-position: 0% 50%; }
252
+ 50% { background-position: 100% 50%; }
253
+ 100% { background-position: 0% 50%; }
254
+ }
255
+
256
+ @keyframes metricPulse {
257
+ 0%, 100% { background-position: 0% 50%; }
258
+ 50% { background-position: 100% 50%; }
259
+ }
260
+
261
+ @keyframes pointcloudPulse {
262
+ 0%, 100% { background-position: 0% 50%; }
263
+ 50% { background-position: 100% 50%; }
264
+ }
265
+
266
+ @keyframes camerasPulse {
267
+ 0%, 100% { background-position: 0% 50%; }
268
+ 50% { background-position: 100% 50%; }
269
+ }
270
+
271
+ @keyframes gaussiansPulse {
272
+ 0%, 100% { background-position: 0% 50%; }
273
+ 50% { background-position: 100% 50%; }
274
+ }
275
+
276
+ /* Special colors for key terms - Global styles with gradient animations */
277
+ .metric-text {
278
+ background: linear-gradient(135deg, #10B981, #059669);
279
+ background-size: 200% 200%;
280
+ -webkit-background-clip: text;
281
+ background-clip: text;
282
+ color: transparent !important;
283
+ font-weight: 700;
284
+ animation: metricPulse 3s ease infinite;
285
+ }
286
+
287
+ .pointcloud-text {
288
+ background: linear-gradient(135deg, #10B981, #059669);
289
+ background-size: 200% 200%;
290
+ -webkit-background-clip: text;
291
+ background-clip: text;
292
+ color: transparent !important;
293
+ font-weight: 700;
294
+ animation: pointcloudPulse 3s ease infinite;
295
+ }
296
+
297
+ .cameras-text {
298
+ background: linear-gradient(135deg, #F97316, #EA580C);
299
+ background-size: 200% 200%;
300
+ -webkit-background-clip: text;
301
+ background-clip: text;
302
+ color: transparent !important;
303
+ font-weight: 700;
304
+ animation: camerasPulse 3s ease infinite;
305
+ }
306
+
307
+ .gaussians-text {
308
+ background: linear-gradient(135deg, #2563EB, #1D4ED8);
309
+ background-size: 200% 200%;
310
+ -webkit-background-clip: text;
311
+ background-clip: text;
312
+ color: transparent !important;
313
+ font-weight: 700;
314
+ animation: gaussiansPulse 3s ease infinite;
315
+ }
316
+
317
+ .example-log * {
318
+ font-style: italic;
319
+ font-size: 16px !important;
320
+ background: linear-gradient(135deg, var(--primary), var(--secondary));
321
+ -webkit-background-clip: text;
322
+ background-clip: text;
323
+ color: transparent !important;
324
+ }
325
+
326
+ #my_radio .wrap {
327
+ display: flex;
328
+ flex-wrap: nowrap;
329
+ justify-content: center;
330
+ align-items: center;
331
+ }
332
+
333
+ #my_radio .wrap label {
334
+ display: flex;
335
+ width: 50%;
336
+ justify-content: center;
337
+ align-items: center;
338
+ margin: 0;
339
+ padding: 10px 0;
340
+ box-sizing: border-box;
341
+ }
342
+
343
+ /* Align navigation buttons with dropdown bottom */
344
+ .navigation-row {
345
+ display: flex !important;
346
+ align-items: flex-end !important;
347
+ gap: 8px !important;
348
+ }
349
+
350
+ .navigation-row > div:nth-child(1),
351
+ .navigation-row > div:nth-child(3) {
352
+ align-self: flex-end !important;
353
+ }
354
+
355
+ .navigation-row > div:nth-child(2) {
356
+ flex: 1 !important;
357
+ }
358
+
359
+ /* Make thumbnails clickable with pointer cursor */
360
+ .clickable-thumbnail img {
361
+ cursor: pointer !important;
362
+ }
363
+
364
+ .clickable-thumbnail:hover img {
365
+ cursor: pointer !important;
366
+ opacity: 0.8;
367
+ transition: opacity 0.3s ease;
368
+ }
369
+
370
+ /* Make thumbnail containers narrower horizontally */
371
+ .clickable-thumbnail {
372
+ padding: 5px 2px !important;
373
+ margin: 0 2px !important;
374
+ }
375
+
376
+ .clickable-thumbnail .image-container {
377
+ margin: 0 !important;
378
+ padding: 0 !important;
379
+ }
380
+
381
+ .scene-info {
382
+ text-align: center !important;
383
+ padding: 5px 2px !important;
384
+ margin: 0 !important;
385
+ }
386
+ """
387
+
388
+
389
+ def get_header_html(logo_base64=None):
390
+ """
391
+ Generate the main header HTML with logo and title.
392
+
393
+ Args:
394
+ logo_base64 (str, optional): Base64 encoded logo image (currently unused in the returned markup)
395
+
396
+ Returns:
397
+ str: HTML string for the header
398
+ """
399
+ return """
400
+ <div class="tech-bg" style="text-align: center; margin-bottom: 5px; padding: 40px 20px; border-radius: 15px; position: relative; overflow: hidden;">
401
+ <div style="position: relative; z-index: 2;">
402
+ <h1 style="margin: 0; font-size: 3.5em; font-weight: 700;
403
+ background: linear-gradient(135deg, #2563EB, #14B8A6);
404
+ background-size: 400% 400%;
405
+ -webkit-background-clip: text;
406
+ background-clip: text;
407
+ color: transparent;
408
+ animation: techGradient 3s ease infinite;
409
+ text-shadow: 0 0 30px rgba(20, 184, 166, 0.4);
410
+ letter-spacing: 2px;">
411
+ Depth Anything 3
412
+ </h1>
413
+ <p style="margin: 15px 0 0 0; font-size: 2.16em; font-weight: 300;" class="header-subtitle">
414
+ Recovering the Visual Space from Any Views
415
+ </p>
416
+ <div style="margin-top: 20px;">
417
+ <a href="https://depth-anything-3.github.io" target="_blank" class="link-btn" style="margin: 0.5em;">
418
+ <i class="fas fa-globe" style="margin-right: 8px;"></i> Project Page
419
+ </a>
420
+ <a href="https://arxiv.org/abs/2406.09414" target="_blank" class="link-btn" style="margin: 0.5em;">
421
+ <i class="fas fa-file-pdf" style="margin-right: 8px;"></i> Paper
422
+ </a>
423
+ <a href="https://github.com/Aedelon/awesome-depth-anything-3" target="_blank" class="link-btn" style="margin: 0.5em; background: var(--secondary); color: white; border: none; font-weight: 600;">
424
+ <i class="fab fa-github" style="margin-right: 8px;"></i> Awesome Optimized Fork
425
+ </a>
426
+ <a href="https://github.com/ByteDance-Seed/Depth-Anything-3" target="_blank" class="link-btn" style="margin: 0.5em;">
427
+ <i class="fab fa-github" style="margin-right: 8px;"></i> Original
428
+ </a>
429
+ </div>
430
+ </div>
431
+ </div>
432
+
433
+ <style>
434
+ .header-subtitle {
435
+ color: #4B5563;
436
+ }
437
+ .tech-bg {
438
+ background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%) !important;
439
+ }
440
+ </style>
441
+ <script>
442
+ document.body.classList.add('light');
443
+ document.documentElement.classList.add('light');
444
+ </script>
445
+ """
446
+
447
+
448
+ def get_description_html():
449
+ """
450
+ Generate the main description and getting started HTML.
451
+
452
+ Returns:
453
+ str: HTML string for the description
454
+ """
455
+ return """
456
+ <div class="description-container" style="padding: 25px; border-radius: 15px; margin: 0 0 20px 0;">
457
+ <h2 class="description-title" style="margin-top: 0; font-size: 1.6em; text-align: center;">
458
+ <i class="fas fa-bullseye fa-color-red" style="margin-right: 8px;"></i> What This Demo Does
459
+ </h2>
460
+ <div class="description-content" style="padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
461
+ <p class="description-main" style="line-height: 1.6; margin: 0; font-size: 1.45em;">
462
+ <strong>Upload images or videos</strong> → <strong>Get <span class="metric-text">Metric</span> <span class="pointcloud-text">Point Clouds</span>, <span class="cameras-text">Cameras</span> and <span class="gaussians-text">Novel Views</span></strong> → <strong>Explore in 3D</strong>
463
+ </p>
464
+ </div>
465
+
466
+ <div style="text-align: center; margin-top: 15px;">
467
+ <p class="description-tip" style="font-style: italic; margin: 0;">
468
+ <i class="fas fa-lightbulb fa-color-yellow" style="margin-right: 8px;"></i> <strong>Tip:</strong> Landscape-oriented images or videos are preferred for best 3D recovering.
469
+ </p>
470
+ </div>
471
+ </div>
472
+
473
+ <style>
474
+ @media (prefers-color-scheme: dark) {
475
+ .description-container {
476
+ background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%);
477
+ border: 1px solid rgba(20, 184, 166, 0.2);
478
+ }
479
+ .description-title { color: #14B8A6; }
480
+ .description-content { background: rgba(0, 0, 0, 0.3); }
481
+ .description-main { color: #E5E7EB; }
482
+ .description-text { color: #D1D5DB; }
483
+ .description-tip { color: #D1D5DB; }
484
+ }
485
+
486
+ @media (prefers-color-scheme: light) {
487
+ .description-container {
488
+ background: linear-gradient(135deg, rgba(20, 184, 166, 0.05) 0%, rgba(37, 99, 235, 0.05) 100%);
489
+ border: 1px solid rgba(20, 184, 166, 0.2);
490
+ }
491
+ .description-title { color: #14B8A6; }
492
+ .description-content { background: transparent; }
493
+ .description-main { color: #1F2937; }
494
+ .description-text { color: #4B5563; }
495
+ .description-tip { color: #4B5563; }
496
+ }
497
+ </style>
498
+ """
499
+
500
+
501
+ def get_acknowledgements_html():
502
+ """
503
+ Generate the acknowledgements section HTML.
504
+
505
+ Returns:
506
+ str: HTML string for the acknowledgements
507
+ """
508
+ return """
509
+ <div style="background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%);
510
+ padding: 25px; border-radius: 15px; margin: 20px 0; border: 1px solid rgba(20, 184, 166, 0.2);">
511
+ <h3 style="color: #14B8A6; margin-top: 0; text-align: center; font-size: 1.4em;">
512
+ <i class="fas fa-trophy fa-color-yellow" style="margin-right: 8px;"></i> Research Credits & Acknowledgments
513
+ </h3>
514
+
515
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 15px 0;">
516
+ <!-- Original Research Section (Left) -->
517
+ <div style="text-align: center;">
518
+ <h4 style="color: #2563EB; margin: 10px 0;"><i class="fas fa-flask fa-color-green" style="margin-right: 8px;"></i> Original Research</h4>
519
+ <p style="color: #9CA3AF; margin: 5px 0;">
520
+ <a href="https://depth-anything-3.github.io" target="_blank"
521
+ style="color: #14B8A6; text-decoration: none; font-weight: 600;">
522
+ Depth Anything 3
523
+ </a>
524
+ </p>
525
+ </div>
526
+
527
+ <!-- Previous Versions Section (Right) -->
528
+ <div style="text-align: center;">
529
+ <h4 style="color: #2563EB; margin: 10px 0;"><i class="fas fa-history fa-color-blue" style="margin-right: 8px;"></i> Previous Versions</h4>
530
+ <div style="display: flex; flex-direction: row; gap: 15px; justify-content: center; align-items: center;">
531
+ <p style="color: #9CA3AF; margin: 0;">
532
+ <a href="https://huggingface.co/spaces/LiheYoung/Depth-Anything" target="_blank"
533
+ style="color: #14B8A6; text-decoration: none; font-weight: 600;">
534
+ Depth-Anything
535
+ </a>
536
+ </p>
537
+ <span style="color: #9CA3AF;">•</span>
538
+ <p style="color: #9CA3AF; margin: 0;">
539
+ <a href="https://huggingface.co/spaces/depth-anything/Depth-Anything-V2" target="_blank"
540
+ style="color: #14B8A6; text-decoration: none; font-weight: 600;">
541
+ Depth-Anything-V2
542
+ </a>
543
+ </p>
544
+ </div>
545
+ </div>
546
+ </div>
547
+
548
+ <!-- HF Demo Adapted from - Centered at the bottom of the whole block -->
549
+ <div style="margin-top: 20px; padding-top: 15px; border-top: 1px solid rgba(20, 184, 166, 0.2); text-align: center;">
550
+ <p style="color: #6B7280; font-size: 0.9em; margin: 0;">
551
+ <i class="fas fa-code-branch" style="margin-right: 5px; color: #9CA3AF;"></i> HF demo adapted from <a href="https://huggingface.co/spaces/facebook/map-anything" target="_blank" style="color: inherit; text-decoration: none;">Map Anything</a>
552
+ </p>
553
+ </div>
554
+ </div>
555
+ """
556
+
557
+
558
+ def get_gradio_theme():
559
+ """
560
+ Get the configured Gradio theme with modern teal/blue colors.
561
+
562
+ Color palette:
563
+ - Primary: Teal (#14B8A6)
564
+ - Secondary: Blue (#2563EB)
565
+ - Accent: Orange (#F97316)
566
+ - Neutrals: Clean grays (#F9FAFB to #111827)
567
+
568
+ Returns:
569
+ gr.themes.Base: Configured Gradio theme
570
+ """
571
+ import gradio as gr
572
+
573
+ return gr.themes.Base(
574
+ # Primary hue: Teal
575
+ primary_hue=gr.themes.Color(
576
+ c50="#F0FDFA",
577
+ c100="#CCFBF1",
578
+ c200="#99F6E4",
579
+ c300="#5EEAD4",
580
+ c400="#2DD4BF",
581
+ c500="#14B8A6",
582
+ c600="#0D9488",
583
+ c700="#0F766E",
584
+ c800="#115E59",
585
+ c900="#134E4A",
586
+ c950="#042F2E",
587
+ ),
588
+ # Secondary hue: Blue
589
+ secondary_hue=gr.themes.Color(
590
+ c50="#EFF6FF",
591
+ c100="#DBEAFE",
592
+ c200="#BFDBFE",
593
+ c300="#93C5FD",
594
+ c400="#60A5FA",
595
+ c500="#3B82F6",
596
+ c600="#2563EB",
597
+ c700="#1D4ED8",
598
+ c800="#1E40AF",
599
+ c900="#1E3A8A",
600
+ c950="#172554",
601
+ ),
602
+ # Neutral hue: Clean grays
603
+ neutral_hue=gr.themes.Color(
604
+ c50="#F9FAFB",
605
+ c100="#F3F4F6",
606
+ c200="#E5E7EB",
607
+ c300="#D1D5DB",
608
+ c400="#9CA3AF",
609
+ c500="#6B7280",
610
+ c600="#4B5563",
611
+ c700="#374151",
612
+ c800="#1F2937",
613
+ c900="#111827",
614
+ c950="#030712",
615
+ ),
616
+ )
617
+
618
+
619
+ # Measure tab instructions HTML
620
+ MEASURE_INSTRUCTIONS_HTML = """
621
+ ### Click points on the image to compute distance.
622
+ > <i class="fas fa-triangle-exclamation fa-color-red" style="margin-right: 5px;"></i> Metric scale estimation is difficult on aerial/drone images.
623
+ """
src/depth_anything_3/app/gradio_app.py ADDED
@@ -0,0 +1,743 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Refactored Gradio App for Depth Anything 3.
17
+
18
+ This is the main application file that orchestrates all components.
19
+ The original functionality has been split into modular components for better maintainability.
20
+ """
21
+
22
+ import argparse
23
+ import os
24
+ from typing import Any, Dict, List
25
+
26
+ import gradio as gr
27
+
28
+ from depth_anything_3.app.css_and_html import GRADIO_CSS, get_gradio_theme
29
+ from depth_anything_3.app.modules.event_handlers import EventHandlers
30
+ from depth_anything_3.app.modules.ui_components import UIComponents
31
+
32
+ # Set environment variables
33
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
34
+
35
+
36
+ class DepthAnything3App:
37
+ """
38
+ Main application class for Depth Anything 3 Gradio app.
39
+ """
40
+
41
+ def __init__(self, model_dir: str = None, workspace_dir: str = None, gallery_dir: str = None):
42
+ """
43
+ Initialize the application.
44
+
45
+ Args:
46
+ model_dir: Path to the model directory
47
+ workspace_dir: Path to the workspace directory
48
+ gallery_dir: Path to the gallery directory
49
+ """
50
+ self.model_dir = model_dir
51
+ self.workspace_dir = workspace_dir
52
+ self.gallery_dir = gallery_dir
53
+
54
+ # Set environment variables for directories
55
+ if self.model_dir:
56
+ os.environ["DA3_MODEL_DIR"] = self.model_dir
57
+ if self.workspace_dir:
58
+ os.environ["DA3_WORKSPACE_DIR"] = self.workspace_dir
59
+ if self.gallery_dir:
60
+ os.environ["DA3_GALLERY_DIR"] = self.gallery_dir
61
+
62
+ self.event_handlers = EventHandlers()
63
+ self.ui_components = UIComponents()
64
+
65
+ def cache_examples(
66
+ self,
67
+ show_cam: bool = True,
68
+ filter_black_bg: bool = False,
69
+ filter_white_bg: bool = False,
70
+ save_percentage: float = 20.0,
71
+ num_max_points: int = 1000,
72
+ cache_gs_tag: str = "",
73
+ gs_trj_mode: str = "smooth",
74
+ gs_video_quality: str = "low",
75
+ ) -> None:
76
+ """
77
+ Pre-cache all example scenes at startup.
78
+
79
+ Args:
80
+ show_cam: Whether to show camera in visualization
81
+ filter_black_bg: Whether to filter black background
82
+ filter_white_bg: Whether to filter white background
83
+ save_percentage: Filter percentage for point cloud
84
+ num_max_points: Maximum number of points
85
+ cache_gs_tag: Tag to match scene names for high-res+3DGS caching (e.g., "dl3dv")
86
+ gs_trj_mode: Trajectory mode for 3DGS
87
+ gs_video_quality: Video quality for 3DGS
88
+ """
89
+ from depth_anything_3.app.modules.utils import get_scene_info
90
+
91
+ examples_dir = os.path.join(self.workspace_dir, "examples")
92
+ if not os.path.exists(examples_dir):
93
+ print(f"Examples directory not found: {examples_dir}")
94
+ return
95
+
96
+ scenes = get_scene_info(examples_dir)
97
+ if not scenes:
98
+ print("No example scenes found to cache.")
99
+ return
100
+
101
+ print(f"\n{'='*60}")
102
+ print(f"Caching {len(scenes)} example scenes...")
103
+ print(f"{'='*60}\n")
104
+
105
+ for i, scene in enumerate(scenes, 1):
106
+ scene_name = scene["name"]
107
+
108
+ # Check if scene name matches the gs tag for high-res+3DGS caching
109
+ use_high_res_gs = cache_gs_tag and cache_gs_tag.lower() in scene_name.lower()
110
+
111
+ if use_high_res_gs:
112
+ print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (HIGH-RES + 3DGS)")
113
+ print(f" - Number of images: {scene['num_images']}")
114
+ print(f" - Matched tag: '{cache_gs_tag}' - using high_res + 3DGS")
115
+ else:
116
+ print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (LOW-RES)")
117
+ print(f" - Number of images: {scene['num_images']}")
118
+
119
+ try:
120
+ # Load example scene
121
+ _, target_dir, _, _, _, _, _, _ = self.event_handlers.load_example_scene(
122
+ scene_name
123
+ )
124
+
125
+ if target_dir and target_dir != "None":
126
+ # Run reconstruction with appropriate settings
127
+ print(" - Running reconstruction...")
128
+ result = self.event_handlers.gradio_demo(
129
+ target_dir=target_dir,
130
+ show_cam=show_cam,
131
+ filter_black_bg=filter_black_bg,
132
+ filter_white_bg=filter_white_bg,
133
+ process_res_method="high_res" if use_high_res_gs else "low_res",
134
+ save_percentage=save_percentage,
135
+ num_max_points=num_max_points,
136
+ infer_gs=use_high_res_gs,
137
+ ref_view_strategy="saddle_balanced",
138
+ gs_trj_mode=gs_trj_mode,
139
+ gs_video_quality=gs_video_quality,
140
+ )
141
+
142
+ # Check if successful
143
+ if result[0] is not None: # reconstruction_output
144
+ print(f" ✓ Scene '{scene_name}' cached successfully")
145
+ else:
146
+ print(f" ✗ Scene '{scene_name}' caching failed: {result[1]}")
147
+ else:
148
+ print(f" ✗ Scene '{scene_name}' loading failed")
149
+
150
+ except Exception as e:
151
+ print(f" ✗ Error caching scene '{scene_name}': {str(e)}")
152
+
153
+ print()
154
+
155
+ print("=" * 60)
156
+ print("Example scene caching completed!")
157
+ print("=" * 60 + "\n")
158
+
159
+ def create_app(self) -> gr.Blocks:
160
+ """
161
+ Create and configure the Gradio application.
162
+
163
+ Returns:
164
+ Configured Gradio Blocks interface
165
+ """
166
+ # Get theme and CSS
167
+ self._theme = get_gradio_theme()
168
+ self._css = GRADIO_CSS
169
+
170
+ with gr.Blocks(theme=self._theme, css=self._css) as demo:
171
+ # State variables for the tabbed interface
172
+ is_example = gr.Textbox(label="is_example", visible=False, value="None")
173
+ processed_data_state = gr.State(value=None)
174
+ measure_points_state = gr.State(value=[])
175
+ selected_image_index_state = gr.State(value=0) # Track selected image index
176
+ # current_view_index = gr.State(value=0) # noqa: F841 Track current view index
177
+
178
+ # Header and description
179
+ self.ui_components.create_header_section()
180
+ self.ui_components.create_description_section()
181
+
182
+ target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
183
+
184
+ # Main content area
185
+ with gr.Row():
186
+ with gr.Column(scale=2):
187
+ # Upload section
188
+ (
189
+ input_video,
190
+ s_time_interval,
191
+ input_images,
192
+ image_gallery,
193
+ ) = self.ui_components.create_upload_section()
194
+
195
+ with gr.Column(scale=4):
196
+ with gr.Column():
197
+ # gr.Markdown("**Metric 3D Reconstruction (Point Cloud and Camera Poses)**")
198
+ # Reconstruction control section (buttons) - moved below tabs
199
+
200
+ log_output = gr.Markdown(
201
+ "Please upload a video or images, then click Reconstruct.",
202
+ elem_classes=["custom-log"],
203
+ )
204
+
205
+ # Tabbed interface
206
+ with gr.Tabs():
207
+ with gr.Tab("Point Cloud & Cameras"):
208
+ reconstruction_output = (
209
+ self.ui_components.create_3d_viewer_section()
210
+ )
211
+
212
+ with gr.Tab("Metric Depth"):
213
+ (
214
+ prev_measure_btn,
215
+ measure_view_selector,
216
+ next_measure_btn,
217
+ measure_image,
218
+ measure_depth_image,
219
+ measure_text,
220
+ ) = self.ui_components.create_measure_section()
221
+
222
+ with gr.Tab("3DGS Rendered Novel Views"):
223
+ gs_video, gs_info = self.ui_components.create_nvs_video()
224
+
225
+ # Inference control section (before inference)
226
+ (
227
+ model_selector,
228
+ process_res_method_dropdown,
229
+ infer_gs,
230
+ ref_view_strategy_dropdown,
231
+ ) = self.ui_components.create_inference_control_section()
232
+
233
+ # Display control section - includes 3DGS options, buttons, and Visualization Options # noqa: E501
234
+ (
235
+ show_cam,
236
+ filter_black_bg,
237
+ filter_white_bg,
238
+ save_percentage,
239
+ num_max_points,
240
+ gs_trj_mode,
241
+ gs_video_quality,
242
+ submit_btn,
243
+ clear_btn,
244
+ ) = self.ui_components.create_display_control_section()
245
+
246
+ # bind visibility of gs_trj_mode to infer_gs
247
+ infer_gs.change(
248
+ fn=lambda checked: (
249
+ gr.update(visible=checked),
250
+ gr.update(visible=checked),
251
+ gr.update(visible=checked),
252
+ gr.update(visible=(not checked)),
253
+ ),
254
+ inputs=infer_gs,
255
+ outputs=[gs_trj_mode, gs_video_quality, gs_video, gs_info],
256
+ )
257
+
258
+ # Example scenes section
259
+ gr.Markdown("## Example Scenes")
260
+
261
+ scenes = self.ui_components.create_example_scenes_section()
262
+ scene_components = self.ui_components.create_example_scene_grid(scenes)
263
+
264
+ # Set up event handlers
265
+ self._setup_event_handlers(
266
+ demo,
267
+ is_example,
268
+ processed_data_state,
269
+ measure_points_state,
270
+ target_dir_output,
271
+ input_video,
272
+ input_images,
273
+ s_time_interval,
274
+ image_gallery,
275
+ reconstruction_output,
276
+ log_output,
277
+ show_cam,
278
+ filter_black_bg,
279
+ filter_white_bg,
280
+ process_res_method_dropdown,
281
+ save_percentage,
282
+ submit_btn,
283
+ clear_btn,
284
+ num_max_points,
285
+ infer_gs,
286
+ ref_view_strategy_dropdown,
287
+ selected_image_index_state,
288
+ measure_view_selector,
289
+ measure_image,
290
+ measure_depth_image,
291
+ measure_text,
292
+ prev_measure_btn,
293
+ next_measure_btn,
294
+ scenes,
295
+ scene_components,
296
+ gs_video,
297
+ gs_info,
298
+ gs_trj_mode,
299
+ gs_video_quality,
300
+ model_selector,
302
+ )
303
+
304
+ # Acknowledgements
305
+ self.ui_components.create_acknowledgements_section()
306
+
307
+ return demo
308
+
309
+ def _setup_event_handlers(
310
+ self,
311
+ demo: gr.Blocks,
312
+ is_example: gr.Textbox,
313
+ processed_data_state: gr.State,
314
+ measure_points_state: gr.State,
315
+ target_dir_output: gr.Textbox,
316
+ input_video: gr.Video,
317
+ input_images: gr.File,
318
+ s_time_interval: gr.Slider,
319
+ image_gallery: gr.Gallery,
320
+ reconstruction_output: gr.Model3D,
321
+ log_output: gr.Markdown,
322
+ show_cam: gr.Checkbox,
323
+ filter_black_bg: gr.Checkbox,
324
+ filter_white_bg: gr.Checkbox,
325
+ process_res_method_dropdown: gr.Dropdown,
326
+ save_percentage: gr.Slider,
327
+ submit_btn: gr.Button,
328
+ clear_btn: gr.ClearButton,
329
+ num_max_points: gr.Slider,
330
+ infer_gs: gr.Checkbox,
331
+ ref_view_strategy_dropdown: gr.Dropdown,
332
+ selected_image_index_state: gr.State,
333
+ measure_view_selector: gr.Dropdown,
334
+ measure_image: gr.Image,
335
+ measure_depth_image: gr.Image,
336
+ measure_text: gr.Markdown,
337
+ prev_measure_btn: gr.Button,
338
+ next_measure_btn: gr.Button,
339
+ scenes: List[Dict[str, Any]],
340
+ scene_components: List, # List of gr.Image or gr.Video
341
+ gs_video: gr.Video,
342
+ gs_info: gr.Markdown,
343
+ gs_trj_mode: gr.Dropdown,
344
+ gs_video_quality: gr.Dropdown,
345
+ model_selector: gr.Dropdown,
347
+ ) -> None:
348
+ """
349
+ Set up all event handlers for the application.
350
+
351
+ Args:
352
+ demo: Gradio Blocks interface
353
+ All other arguments: Gradio components to connect
354
+ """
355
+ # Configure clear button
356
+ clear_btn.add(
357
+ [
358
+ input_video,
359
+ input_images,
360
+ reconstruction_output,
361
+ log_output,
362
+ target_dir_output,
363
+ image_gallery,
364
+ gs_video,
365
+ ]
366
+ )
367
+
368
+ # Main reconstruction button
369
+ submit_btn.click(
370
+ fn=self.event_handlers.gradio_demo,
371
+ inputs=[
372
+ target_dir_output,
373
+ show_cam,
374
+ filter_black_bg,
375
+ filter_white_bg,
376
+ process_res_method_dropdown,
377
+ save_percentage,
378
+ num_max_points,
379
+ infer_gs,
380
+ ref_view_strategy_dropdown,
381
+ gs_trj_mode,
382
+ gs_video_quality,
383
+ model_selector,
384
+ ],
385
+ outputs=[
386
+ reconstruction_output,
387
+ log_output,
388
+ processed_data_state,
389
+ measure_image,
390
+ measure_depth_image,
391
+ measure_text,
392
+ measure_view_selector,
393
+ gs_video,
394
+ gs_info,
395
+ ],
396
+ )
397
+
398
+ # Real-time visualization updates
399
+ self._setup_visualization_handlers(
400
+ show_cam,
401
+ filter_black_bg,
402
+ filter_white_bg,
403
+ process_res_method_dropdown,
404
+ target_dir_output,
405
+ is_example,
406
+ reconstruction_output,
407
+ log_output,
408
+ )
409
+
410
+ # File upload handlers
411
+ input_video.change(
412
+ fn=self.event_handlers.handle_uploads,
413
+ inputs=[input_video, input_images, s_time_interval],
414
+ outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
415
+ )
416
+ input_images.change(
417
+ fn=self.event_handlers.handle_uploads,
418
+ inputs=[input_video, input_images, s_time_interval],
419
+ outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
420
+ )
421
+
422
+ # Navigation handlers
423
+ self._setup_navigation_handlers(
424
+ prev_measure_btn,
425
+ next_measure_btn,
426
+ measure_view_selector,
427
+ measure_image,
428
+ measure_depth_image,
429
+ measure_points_state,
430
+ processed_data_state,
431
+ )
432
+
433
+ # Measurement handler
434
+ measure_image.select(
435
+ fn=self.event_handlers.measure,
436
+ inputs=[processed_data_state, measure_points_state, measure_view_selector],
437
+ outputs=[measure_image, measure_depth_image, measure_points_state, measure_text],
438
+ )
439
+
440
+ # Example scene handlers
441
+ self._setup_example_scene_handlers(
442
+ scenes,
443
+ scene_components,
444
+ reconstruction_output,
445
+ target_dir_output,
446
+ image_gallery,
447
+ log_output,
448
+ is_example,
449
+ processed_data_state,
450
+ measure_view_selector,
451
+ measure_image,
452
+ measure_depth_image,
453
+ gs_video,
454
+ gs_info,
455
+ s_time_interval,
456
+ )
457
+
458
+ def _setup_visualization_handlers(
459
+ self,
460
+ show_cam: gr.Checkbox,
461
+ filter_black_bg: gr.Checkbox,
462
+ filter_white_bg: gr.Checkbox,
463
+ process_res_method_dropdown: gr.Dropdown,
464
+ target_dir_output: gr.Textbox,
465
+ is_example: gr.Textbox,
466
+ reconstruction_output: gr.Model3D,
467
+ log_output: gr.Markdown,
468
+ ) -> None:
469
+ """Set up visualization update handlers."""
470
+ # Common inputs for visualization updates
471
+ viz_inputs = [
472
+ target_dir_output,
473
+ show_cam,
474
+ is_example,
475
+ filter_black_bg,
476
+ filter_white_bg,
477
+ process_res_method_dropdown,
478
+ ]
479
+
480
+ # Set up change handlers for all visualization controls
481
+ for component in [show_cam, filter_black_bg, filter_white_bg]:
482
+ component.change(
483
+ fn=self.event_handlers.update_visualization,
484
+ inputs=viz_inputs,
485
+ outputs=[reconstruction_output, log_output],
486
+ )
487
+
488
+ def _setup_navigation_handlers(
489
+ self,
490
+ prev_measure_btn: gr.Button,
491
+ next_measure_btn: gr.Button,
492
+ measure_view_selector: gr.Dropdown,
493
+ measure_image: gr.Image,
494
+ measure_depth_image: gr.Image,
495
+ measure_points_state: gr.State,
496
+ processed_data_state: gr.State,
497
+ ) -> None:
498
+ """Set up navigation handlers for measure tab."""
499
+ # Measure tab navigation
500
+ prev_measure_btn.click(
501
+ fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
502
+ processed_data, current_selector, -1
503
+ ),
504
+ inputs=[processed_data_state, measure_view_selector],
505
+ outputs=[
506
+ measure_view_selector,
507
+ measure_image,
508
+ measure_depth_image,
509
+ measure_points_state,
510
+ ],
511
+ )
512
+
513
+ next_measure_btn.click(
514
+ fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
515
+ processed_data, current_selector, 1
516
+ ),
517
+ inputs=[processed_data_state, measure_view_selector],
518
+ outputs=[
519
+ measure_view_selector,
520
+ measure_image,
521
+ measure_depth_image,
522
+ measure_points_state,
523
+ ],
524
+ )
525
+
526
+ measure_view_selector.change(
527
+ fn=lambda processed_data, selector_value: (
528
+ self.event_handlers.update_measure_view(
529
+ processed_data, int(selector_value.split()[1]) - 1
530
+ )
531
+ if selector_value
532
+ else (None, None, [])
533
+ ),
534
+ inputs=[processed_data_state, measure_view_selector],
535
+ outputs=[measure_image, measure_depth_image, measure_points_state],
536
+ )
537
+
538
+ def _setup_example_scene_handlers(
539
+ self,
540
+ scenes: List[Dict[str, Any]],
541
+ scene_components: List, # List of gr.Image
542
+ reconstruction_output: gr.Model3D,
543
+ target_dir_output: gr.Textbox,
544
+ image_gallery: gr.Gallery,
545
+ log_output: gr.Markdown,
546
+ is_example: gr.Textbox,
547
+ processed_data_state: gr.State,
548
+ measure_view_selector: gr.Dropdown,
549
+ measure_image: gr.Image,
550
+ measure_depth_image: gr.Image,
551
+ gs_video: gr.Video,
552
+ gs_info: gr.Markdown,
553
+ s_time_interval: gr.Slider,
554
+ ) -> None:
555
+ """Set up example scene handlers."""
556
+ # Use assets/examples directory
557
+ examples_dir = os.environ.get("DA3_EXAMPLES_DIR", "assets/examples")
558
+
559
+ def load_and_update_measure(scene_name: str, fps: float):
560
+ """Load example scene and update measure view."""
561
+ print(f"[load_and_update_measure] Called with scene_name={scene_name}, fps={fps}", flush=True)
562
+ result = self.event_handlers.load_example_scene(scene_name, examples_dir, fps)
563
+ print(f"[load_and_update_measure] target_dir from result[1]: {result[1]}", flush=True)
564
+
565
+ # Update measure view if processed_data is available
566
+ measure_img = None
567
+ measure_depth = None
568
+ if result[4] is not None: # processed_data exists
569
+ measure_img, measure_depth, _ = (
570
+ self.event_handlers.visualization_handler.update_measure_view(result[4], 0)
571
+ )
572
+
573
+ final_result = result + ("True", measure_img, measure_depth)
574
+ print(f"[load_and_update_measure] Returning {len(final_result)} values", flush=True)
575
+ return final_result
576
+
577
+ def create_scene_handler(scene_name: str):
578
+ """Create a handler function for a specific scene."""
579
+ def handler(fps: float):
580
+ return load_and_update_measure(scene_name, fps)
581
+ return handler
582
+
583
+ for i, scene in enumerate(scenes):
584
+ if i < len(scene_components):
585
+ component = scene_components[i]
586
+ # Create handler with scene name bound
587
+ handler_fn = create_scene_handler(scene["name"])
588
+ outputs = [
589
+ reconstruction_output,
590
+ target_dir_output,
591
+ image_gallery,
592
+ log_output,
593
+ processed_data_state,
594
+ measure_view_selector,
595
+ gs_video,
596
+ gs_info,
597
+ is_example,
598
+ measure_image,
599
+ measure_depth_image,
600
+ ]
601
+
602
+ # Use click event - s_time_interval value is passed as input
603
+ component.select(fn=handler_fn, inputs=[s_time_interval], outputs=outputs)
604
+
605
+ def launch(self, host: str = "127.0.0.1", port: int = 7860, **kwargs) -> None:
606
+ """
607
+ Launch the application.
608
+
609
+ Args:
610
+ host: Host address to bind to
611
+ port: Port number to bind to
612
+ **kwargs: Additional arguments for demo.launch()
613
+ """
614
+ demo = self.create_app()
615
+ demo.queue(max_size=20).launch(
616
+ show_error=True,
617
+ server_name=host,
618
+ server_port=port,
619
+ **kwargs,
620
+ )
621
+
622
+
623
+ def main():
624
+ """Main function to run the application."""
625
+ parser = argparse.ArgumentParser(
626
+ description="Depth Anything 3 Gradio Application",
627
+ formatter_class=argparse.RawDescriptionHelpFormatter,
628
+ epilog="""
629
+ Examples:
630
+ # Basic usage
631
+ python gradio_app.py --help
632
+ python gradio_app.py --host 0.0.0.0 --port 8080
633
+ python gradio_app.py --model-dir /path/to/model --workspace-dir /path/to/workspace
634
+
635
+ # Cache examples at startup (all low-res)
636
+ python gradio_app.py --cache-examples
637
+
638
+ # Cache with selective high-res+3DGS for scenes matching tag
639
+ python gradio_app.py --cache-examples --cache-gs-tag dl3dv
640
+ # This will use high-res + 3DGS for scenes containing "dl3dv" in their name,
641
+ # and low-res only for other scenes
642
+ """,
643
+ )
644
+
645
+ # Server configuration
646
+ parser.add_argument(
647
+ "--host", default="127.0.0.1", help="Host address to bind to (default: 127.0.0.1)"
648
+ )
649
+ parser.add_argument(
650
+ "--port", type=int, default=7860, help="Port number to bind to (default: 7860)"
651
+ )
652
+
653
+ # Directory configuration
654
+ parser.add_argument(
655
+ "--model-dir",
656
+ default="depth-anything/DA3NESTED-GIANT-LARGE",
657
+ help="Path to the model directory (default: depth-anything/DA3NESTED-GIANT-LARGE)",
658
+ )
659
+ parser.add_argument(
660
+ "--workspace-dir",
661
+ default="workspace/gradio", # noqa: E501
662
+ help="Path to the workspace directory (default: workspace/gradio)", # noqa: E501
663
+ )
664
+ parser.add_argument(
665
+ "--gallery-dir",
666
+ default="workspace/gallery",
667
+ help="Path to the gallery directory (default: workspace/gallery)", # noqa: E501
668
+ )
669
+
670
+ # Additional Gradio options
671
+ parser.add_argument("--share", action="store_true", help="Create a public link for the app")
672
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
673
+
674
+ # Example caching options
675
+ parser.add_argument(
676
+ "--cache-examples",
677
+ action="store_true",
678
+ help="Pre-cache all example scenes at startup for faster loading",
679
+ )
680
+ parser.add_argument(
681
+ "--cache-gs-tag",
682
+ type=str,
683
+ default="",
684
+ help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.", # noqa: E501
685
+ )
686
+
687
+ args = parser.parse_args()
688
+
689
+ # Create directories if they don't exist
690
+ os.makedirs(args.workspace_dir, exist_ok=True)
691
+ os.makedirs(args.gallery_dir, exist_ok=True)
692
+
693
+ # Initialize and launch the application
694
+ app = DepthAnything3App(
695
+ model_dir=args.model_dir, workspace_dir=args.workspace_dir, gallery_dir=args.gallery_dir
696
+ )
697
+
698
+ # Prepare launch arguments
699
+ launch_kwargs = {"share": args.share, "debug": args.debug}
700
+
701
+ print("Starting Depth Anything 3 Gradio App...")
702
+ print(f"Host: {args.host}")
703
+ print(f"Port: {args.port}")
704
+ print(f"Model Directory: {args.model_dir}")
705
+ print(f"Workspace Directory: {args.workspace_dir}")
706
+ print(f"Gallery Directory: {args.gallery_dir}")
707
+ print(f"Share: {args.share}")
708
+ print(f"Debug: {args.debug}")
709
+ print(f"Cache Examples: {args.cache_examples}")
710
+ if args.cache_examples:
711
+ if args.cache_gs_tag:
712
+ print(
713
+ f"Cache GS Tag: '{args.cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)" # noqa: E501
714
+ ) # noqa: E501
715
+ else:
716
+ print("Cache GS Tag: None (all scenes will use low-res only)")
717
+
718
+ # Pre-cache examples if requested
719
+ if args.cache_examples:
720
+ print("\n" + "=" * 60)
721
+ print("Pre-caching mode enabled")
722
+ if args.cache_gs_tag:
723
+ print(f"Scenes containing '{args.cache_gs_tag}' will use HIGH-RES + 3DGS")
724
+ print("Other scenes will use LOW-RES only")
725
+ else:
726
+ print("All scenes will use LOW-RES only")
727
+ print("=" * 60)
728
+ app.cache_examples(
729
+ show_cam=True,
730
+ filter_black_bg=False,
731
+ filter_white_bg=False,
732
+ save_percentage=5.0,
733
+ num_max_points=1000,
734
+ cache_gs_tag=args.cache_gs_tag,
735
+ gs_trj_mode="smooth",
736
+ gs_video_quality="low",
737
+ )
738
+
739
+ app.launch(host=args.host, port=args.port, **launch_kwargs)
740
+
741
+
742
+ if __name__ == "__main__":
743
+ main()
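For reference, a short sketch (not part of the commit) of driving the app programmatically instead of through the CLI entry point above; the values mirror the argparse defaults, and `cache_examples`/`launch` are the methods defined on `DepthAnything3App` in this file.

    from depth_anything_3.app.gradio_app import DepthAnything3App

    app = DepthAnything3App(
        model_dir="depth-anything/DA3NESTED-GIANT-LARGE",
        workspace_dir="workspace/gradio",
        gallery_dir="workspace/gallery",
    )
    # Optional pre-caching: scenes whose names contain the tag get high-res
    # inference plus a 3DGS novel-view video; the rest stay low-res.
    app.cache_examples(cache_gs_tag="dl3dv")
    app.launch(host="0.0.0.0", port=8080, share=False)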
src/depth_anything_3/app/modules/__init__.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Modules package for Depth Anything 3 Gradio app.
17
+
18
+ This package contains all the modular components for the Gradio application.
19
+ """
20
+
21
+ from depth_anything_3.app.modules.event_handlers import EventHandlers
22
+ from depth_anything_3.app.modules.file_handlers import FileHandler
23
+ from depth_anything_3.app.modules.model_inference import ModelInference
24
+ from depth_anything_3.app.modules.ui_components import UIComponents
25
+ from depth_anything_3.app.modules.utils import (
26
+ create_depth_visualization,
27
+ get_logo_base64,
28
+ get_scene_info,
29
+ save_to_gallery_func,
30
+ )
31
+ from depth_anything_3.app.modules.visualization import VisualizationHandler
32
+
33
+ __all__ = [
34
+ "ModelInference",
35
+ "FileHandler",
36
+ "VisualizationHandler",
37
+ "EventHandlers",
38
+ "UIComponents",
39
+ "create_depth_visualization",
40
+ "save_to_gallery_func",
41
+ "get_scene_info",
42
+ "get_logo_base64",
43
+ ]
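A quick usage sketch (not part of the commit) of the public surface re-exported above; it assumes constructing the handlers is lightweight (model weights load later, during inference).

    from depth_anything_3.app.modules import EventHandlers, UIComponents

    handlers = EventHandlers()  # bundles ModelInference, FileHandler, VisualizationHandler
    ui = UIComponents()         # builds the Gradio layout sections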
src/depth_anything_3/app/modules/event_handlers.py ADDED
@@ -0,0 +1,624 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Event handling module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles all event callbacks and user interactions.
19
+ """
20
+
21
+ import os
22
+ import time
23
+ from glob import glob
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+ import gradio as gr
27
+ import numpy as np
28
+ import torch
29
+
30
+ from depth_anything_3.app.modules.file_handlers import FileHandler
31
+ from depth_anything_3.app.modules.model_inference import ModelInference
32
+ from depth_anything_3.app.modules.visualization import VisualizationHandler
33
+ from depth_anything_3.utils.memory import cleanup_cuda_memory
34
+
35
+
36
+ class EventHandlers:
37
+ """
38
+ Handles all event callbacks and user interactions for the Gradio app.
39
+ """
40
+
41
+ def __init__(self):
42
+ """Initialize the event handlers."""
43
+ self.model_inference = ModelInference()
44
+ self.file_handler = FileHandler()
45
+ self.visualization_handler = VisualizationHandler()
46
+
47
+ def clear_fields(self) -> None:
48
+ """
49
+ Returns None so the bound Gradio outputs (3D viewer, stored target_dir, gallery) are cleared.
50
+ """
51
+ return None
52
+
53
+ def update_log(self) -> str:
54
+ """
55
+ Display a quick log message while waiting.
56
+ """
57
+ return "Loading and Reconstructing..."
58
+
59
+ def save_current_visualization(
60
+ self,
61
+ target_dir: str,
62
+ save_percentage: float,
63
+ show_cam: bool,
64
+ filter_black_bg: bool,
65
+ filter_white_bg: bool,
66
+ processed_data: Optional[Dict],
67
+ scene_name: str = "",
68
+ ) -> str:
69
+ """
70
+ Save current visualization results to gallery with specified save percentage.
71
+
72
+ Args:
73
+ target_dir: Directory containing results
74
+ save_percentage: Percentage of points to save (0-100)
75
+ show_cam: Whether to show cameras
76
+ filter_black_bg: Whether to filter black background
77
+ filter_white_bg: Whether to filter white background
78
+ processed_data: Processed data from reconstruction
79
+
80
+ Returns:
81
+ Status message
82
+ """
83
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
84
+ return "No reconstruction available. Please run 'Reconstruct' first."
85
+
86
+ if processed_data is None:
87
+ return "No processed data available. Please run 'Reconstruct' first."
88
+
89
+ try:
90
+ import datetime
91
+
92
+ from .utils import save_to_gallery_func
93
+
94
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
95
+ if scene_name and scene_name.strip():
96
+ gallery_name = f"{scene_name.strip()}_{timestamp}_pct{save_percentage:.0f}"
97
+ else:
98
+ gallery_name = f"save_{timestamp}_pct{save_percentage:.0f}"
99
+
100
+ success, message = save_to_gallery_func(
101
+ target_dir=target_dir, processed_data=processed_data, gallery_name=gallery_name
102
+ )
103
+
104
+ if success:
105
+ return (
106
+ "Successfully saved to gallery!\n"
107
+ f"Gallery name: {gallery_name}\n"
108
+ f"Save percentage: {save_percentage}%\n"
109
+ f"Show cameras: {show_cam}\n"
110
+ f"Filter black bg: {filter_black_bg}\n"
111
+ f"Filter white bg: {filter_white_bg}\n\n"
112
+ f"{message}"
113
+ )
114
+ else:
115
+ return f"Failed to save to gallery: {message}"
116
+
117
+ except Exception as e:
118
+ return f"Error saving visualization: {str(e)}"
119
+
120
+ def gradio_demo(
121
+ self,
122
+ target_dir: str,
123
+ show_cam: bool = True,
124
+ filter_black_bg: bool = False,
125
+ filter_white_bg: bool = False,
126
+ process_res_method: str = "upper_bound_resize",
127
+ save_percentage: float = 30.0,
128
+ num_max_points: int = 1_000,  # in thousands (K); converted via * 1000 below
129
+ infer_gs: bool = False,
130
+ ref_view_strategy: str = "saddle_balanced",
131
+ gs_trj_mode: str = "extend",
132
+ gs_video_quality: str = "high",
133
+ model_name: str = None,
134
+ ):
135
+ """
136
+ Perform reconstruction using the already-created target_dir/images.
137
+
138
+ Args:
139
+ target_dir: Directory containing images
140
+ show_cam: Whether to show camera
141
+ filter_black_bg: Whether to filter black background
142
+ filter_white_bg: Whether to filter white background
143
+ process_res_method: Method for resizing input images
144
+ save_percentage: Filter percentage for point cloud
145
+ num_max_points: Maximum number of points, in thousands (K)
146
+ infer_gs: Whether to infer 3D Gaussian Splatting
147
+ ref_view_strategy: Reference view selection strategy
148
+ gs_trj_mode: Trajectory mode for the rendered 3DGS video
+ gs_video_quality: Quality preset for the rendered 3DGS video
+ model_name: Model to use (da3-base, da3-large, da3nested-giant-large)
149
+
150
+ Returns:
151
+ 9-tuple matching the Gradio outputs: (glb_file, log_msg, processed_data,
+ measure_image, measure_depth_image, measure_text, measure_view_selector,
+ gs_video_update, gs_info_update)
152
+ """
153
+ from depth_anything_3.app.modules.model_inference import DEFAULT_MODEL
154
+
155
+ if model_name is None:
156
+ model_name = DEFAULT_MODEL
157
+
158
+ print(f"[gradio_demo] Called with target_dir={target_dir}, model={model_name}", flush=True)
159
+
160
+ if target_dir is None or not os.path.isdir(target_dir) or target_dir == "None":
161
+ print("[gradio_demo] Invalid target_dir, returning early")
162
+ return (
163
+ None,
164
+ "No valid target directory found. Please upload first.",
165
+ None,
166
+ None,
167
+ None,
168
+ "",
169
+ None,
170
+ gr.update(value=None, visible=False), # gs_video
171
+ gr.update(visible=True), # gs_info
172
+ )
173
+
174
+ start_time = time.time()
175
+ cleanup_cuda_memory()
176
+
177
+ # Get image files for logging
178
+ target_dir_images = os.path.join(target_dir, "images")
179
+ all_files = (
180
+ sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
181
+ )
182
+
183
+ print(f"[gradio_demo] Running {model_name} on {len(all_files)} images...")
184
+ print(f"[gradio_demo] Reference view strategy: {ref_view_strategy}")
185
+
186
+ try:
187
+ with torch.no_grad():
188
+ prediction, processed_data = self.model_inference.run_inference(
189
+ target_dir,
190
+ process_res_method=process_res_method,
191
+ show_camera=show_cam,
192
+ save_percentage=save_percentage,
193
+ num_max_points=int(num_max_points * 1000), # Convert K to actual count
194
+ infer_gs=infer_gs,
195
+ ref_view_strategy=ref_view_strategy,
196
+ gs_trj_mode=gs_trj_mode,
197
+ gs_video_quality=gs_video_quality,
198
+ model_name=model_name,
199
+ )
200
+ except Exception as e:
201
+ error_msg = f"Reconstruction failed: {str(e)}"
202
+ print(f"[ERROR] {error_msg}")
203
+ import traceback
204
+ traceback.print_exc()
205
+ return (
206
+ None,
207
+ error_msg,
208
+ None,
209
+ None,
210
+ None,
211
+ "",
212
+ None,
213
+ gr.update(value=None, visible=False),
214
+ gr.update(visible=True),
215
+ )
216
+
217
+ # The GLB file is already generated by the API
218
+ glbfile = os.path.join(target_dir, "scene.glb")
219
+
220
+ # Handle 3DGS video based on infer_gs flag
221
+ gsvideo_path = None
222
+ gs_video_visible = False
223
+ gs_info_visible = True
224
+
225
+ if infer_gs:
226
+ try:
227
+ gsvideo_path = sorted(glob(os.path.join(target_dir, "gs_video", "*.mp4")))[-1]
228
+ gs_video_visible = True
229
+ gs_info_visible = False
230
+ except IndexError:
231
+ gsvideo_path = None
232
+ print("3DGS video not found, but infer_gs was enabled")
233
+
234
+ # Cleanup
235
+ cleanup_cuda_memory()
236
+
237
+ end_time = time.time()
238
+ print(f"Total time: {end_time - start_time:.2f} seconds")
239
+ log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
240
+
241
+ # Populate visualization tabs with processed data
242
+ depth_vis, measure_img, measure_depth_vis, measure_pts = (
243
+ self.visualization_handler.populate_visualization_tabs(processed_data)
244
+ )
245
+
246
+ # Update view selectors based on available views
247
+ depth_selector, measure_selector = self.visualization_handler.update_view_selectors(
248
+ processed_data
249
+ )
250
+
251
+ return (
252
+ glbfile,
253
+ log_msg,
254
+ processed_data,
255
+ measure_img, # measure_image
256
+ measure_depth_vis, # measure_depth_image
257
+ "", # measure_text (empty initially)
258
+ measure_selector, # measure_view_selector
259
+ gr.update(value=gsvideo_path, visible=gs_video_visible), # gs_video
260
+ gr.update(visible=gs_info_visible), # gs_info visibility
261
+ )
262
+
263
+ def update_visualization(
264
+ self,
265
+ target_dir: str,
266
+ show_cam: bool,
267
+ is_example: str,
268
+ filter_black_bg: bool = False,
269
+ filter_white_bg: bool = False,
270
+ process_res_method: str = "upper_bound_resize",
271
+ ) -> Tuple[gr.update, str]:
272
+ """
273
+ Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
274
+ and return it for the 3D viewer.
275
+
276
+ Args:
277
+ target_dir: Directory containing results
278
+ show_cam: Whether to show camera
279
+ is_example: Whether this is an example scene
280
+ filter_black_bg: Whether to filter black background
281
+ filter_white_bg: Whether to filter white background
282
+ process_res_method: Method for resizing input images
283
+
284
+ Returns:
285
+ Tuple of (glb_file, log_message)
286
+ """
287
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
288
+ return (
289
+ gr.update(),
290
+ "No reconstruction available. Please click the Reconstruct button first.",
291
+ )
292
+
293
+ # Check if GLB exists (could be cached example or reconstructed scene)
294
+ glbfile = os.path.join(target_dir, "scene.glb")
295
+ if os.path.exists(glbfile):
296
+ return (
297
+ glbfile,
298
+ (
299
+ "Visualization loaded from cache."
300
+ if is_example == "True"
301
+ else "Visualization updated."
302
+ ),
303
+ )
304
+
305
+ # If no GLB but it's an example that hasn't been reconstructed yet
306
+ if is_example == "True":
307
+ return (
308
+ gr.update(),
309
+ "No reconstruction available. Please click the Reconstruct button first.",
310
+ )
311
+
312
+ # For non-examples, check predictions.npz
313
+ predictions_path = os.path.join(target_dir, "predictions.npz")
314
+ if not os.path.exists(predictions_path):
315
+ error_message = (
316
+ f"No reconstruction available at {predictions_path}. "
317
+ "Please run 'Reconstruct' first."
318
+ )
319
+ return gr.update(), error_message
320
+
321
+ loaded = np.load(predictions_path, allow_pickle=True)
322
+ predictions = {key: loaded[key] for key in loaded.keys()} # noqa: F841
323
+
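+ # NOTE: the GLB is not rebuilt from predictions.npz on this path, so the
+ # `glbfile` path returned below may not exist yet.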
324
+ return (
325
+ glbfile,
326
+ "Visualization updated.",
327
+ )
328
+
329
+ def handle_uploads(
330
+ self,
331
+ input_video: Optional[str],
332
+ input_images: Optional[List],
333
+ s_time_interval: float = 10.0,
334
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
335
+ """
336
+ Handle file uploads and update gallery.
337
+
338
+ Args:
339
+ input_video: Path to input video file
340
+ input_images: List of input image files
341
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
342
+
343
+ Returns:
344
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
345
+ """
346
+ return self.file_handler.update_gallery_on_upload(
347
+ input_video, input_images, s_time_interval
348
+ )
349
+
350
+ def load_example_scene(
351
+ self, scene_name: str, examples_dir: str = None, s_time_interval: float = None
352
+ ) -> Tuple[
353
+ Optional[str],
354
+ Optional[str],
355
+ Optional[List],
356
+ str,
357
+ Optional[Dict],
358
+ gr.Dropdown, # measure_view_selector
359
+ dict, # gs_video update (value + visibility)
360
+ dict, # gs_info update (visibility)
361
+ ]:
362
+ """
363
+ Load a scene from examples directory.
364
+
365
+ Args:
366
+ scene_name: Name of the scene to load
367
+ examples_dir: Path to examples directory (if None, uses workspace_dir/examples)
368
+ s_time_interval: Sampling FPS for video frame extraction (default 1.0)
369
+
370
+ Returns:
371
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video_update, gs_info_update) # noqa: E501
372
+ """
373
+ if examples_dir is None:
374
+ # Get workspace directory from environment variable
375
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
376
+ examples_dir = os.path.join(workspace_dir, "examples")
377
+
378
+ # Default FPS for video extraction
379
+ if s_time_interval is None:
380
+ s_time_interval = 1.0
381
+
382
+ reconstruction_output, target_dir, image_paths, log_message = (
383
+ self.file_handler.load_example_scene(scene_name, examples_dir, s_time_interval)
384
+ )
385
+
386
+ # Try to load cached processed data if available
387
+ processed_data = None
388
+ measure_view_selector = gr.Dropdown(choices=["View 1"], value="View 1")
389
+ gs_video_path = None
390
+ gs_video_visible = False
391
+ gs_info_visible = True
392
+
393
+ if target_dir and target_dir != "None":
394
+ predictions_path = os.path.join(target_dir, "predictions.npz")
395
+ if os.path.exists(predictions_path):
396
+ try:
397
+ # Load predictions from cache
398
+ loaded = np.load(predictions_path, allow_pickle=True)
399
+ predictions = {key: loaded[key] for key in loaded.keys()}
400
+
401
+ # Reconstruct processed_data structure
402
+ num_images = len(predictions.get("images", []))
403
+ processed_data = {}
404
+
405
+ for i in range(num_images):
406
+ processed_data[i] = {
407
+ "image": predictions["images"][i] if "images" in predictions else None,
408
+ "depth": predictions["depths"][i] if "depths" in predictions else None,
409
+ "depth_image": os.path.join(
410
+ target_dir, "depth_vis", f"{i:04d}.jpg" # Fixed: use .jpg not .png
411
+ ),
412
+ "intrinsics": (
413
+ predictions["intrinsics"][i]
414
+ if "intrinsics" in predictions
415
+ and i < len(predictions["intrinsics"])
416
+ else None
417
+ ),
418
+ "mask": None,
419
+ }
420
+
421
+ # Update measure view selector
422
+ choices = [f"View {i + 1}" for i in range(num_images)]
423
+ measure_view_selector = gr.Dropdown(choices=choices, value=choices[0])
424
+
425
+ except Exception as e:
426
+ print(f"Error loading cached data: {e}")
427
+
428
+ # Check for cached 3DGS video
429
+ gs_video_dir = os.path.join(target_dir, "gs_video")
430
+ if os.path.exists(gs_video_dir):
431
+ try:
432
+ from glob import glob
433
+
434
+ gs_videos = sorted(glob(os.path.join(gs_video_dir, "*.mp4")))
435
+ if gs_videos:
436
+ gs_video_path = gs_videos[-1]
437
+ gs_video_visible = True
438
+ gs_info_visible = False
439
+ print(f"Loaded cached 3DGS video: {gs_video_path}")
440
+ except Exception as e:
441
+ print(f"Error loading cached 3DGS video: {e}")
442
+
443
+ return (
444
+ reconstruction_output,
445
+ target_dir,
446
+ image_paths,
447
+ log_message,
448
+ processed_data,
449
+ measure_view_selector,
450
+ gr.update(value=gs_video_path, visible=gs_video_visible), # gs_video
451
+ gr.update(visible=gs_info_visible), # gs_info
452
+ )
453
+
454
+ def navigate_depth_view(
455
+ self,
456
+ processed_data: Optional[dict],
457
+ current_selector: str,
458
+ direction: int,
459
+ ) -> Tuple[str, Optional[str]]:
460
+ """
461
+ Navigate depth view.
462
+
463
+ Args:
464
+ processed_data: Processed data dictionary
465
+ current_selector: Current selector value
466
+ direction: Direction to navigate
467
+
468
+ Returns:
469
+ Tuple of (new_selector_value, depth_vis)
470
+ """
471
+ return self.visualization_handler.navigate_depth_view(
472
+ processed_data, current_selector, direction
473
+ )
474
+
475
+ def update_depth_view(
476
+ self, processed_data: Optional[dict], view_index: int
477
+ ) -> Optional[str]:
478
+ """
479
+ Update depth view for a specific view index.
480
+
481
+ Args:
482
+ processed_data: Processed data dictionary
483
+ view_index: Index of the view to update
484
+
485
+ Returns:
486
+ Path to depth visualization image or None
487
+ """
488
+ return self.visualization_handler.update_depth_view(processed_data, view_index)
489
+
490
+ def navigate_measure_view(
491
+ self,
492
+ processed_data: Optional[dict],
493
+ current_selector: str,
494
+ direction: int,
495
+ ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
496
+ """
497
+ Navigate measure view.
498
+
499
+ Args:
500
+ processed_data: Processed data dictionary
501
+ current_selector: Current selector value
502
+ direction: Direction to navigate
503
+
504
+ Returns:
505
+ Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
506
+ """
507
+ return self.visualization_handler.navigate_measure_view(
508
+ processed_data, current_selector, direction
509
+ )
510
+
511
+ def update_measure_view(
512
+ self, processed_data: Optional[dict], view_index: int
513
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
514
+ """
515
+ Update measure view for a specific view index.
516
+
517
+ Args:
518
+ processed_data: Processed data dictionary
519
+ view_index: Index of the view to update
520
+
521
+ Returns:
522
+ Tuple of (measure_image, depth_right_half, measure_points)
523
+ """
524
+ return self.visualization_handler.update_measure_view(processed_data, view_index)
525
+
526
+ def measure(
527
+ self,
528
+ processed_data: Optional[dict],
529
+ measure_points: List,
530
+ current_view_selector: str,
531
+ event: gr.SelectData,
532
+ ) -> List:
533
+ """
534
+ Handle measurement on images.
535
+
536
+ Args:
537
+ processed_data: Processed data dictionary
538
+ measure_points: List of current measure points
539
+ current_view_selector: Current view selector value
540
+ event: Gradio select event
541
+
542
+ Returns:
543
+ List of [image, depth_right_half, measure_points, text]
544
+ """
545
+ return self.visualization_handler.measure(
546
+ processed_data, measure_points, current_view_selector, event
547
+ )
548
+
549
+ def select_first_frame(
550
+ self, image_gallery: List, selected_index: int = 0
551
+ ) -> Tuple[List, str, str]:
552
+ """
553
+ Select the first frame from the image gallery.
554
+
555
+ Args:
556
+ image_gallery: List of images in the gallery
557
+ selected_index: Index of the selected image (default: 0)
558
+
559
+ Returns:
560
+ Tuple of (updated_image_gallery, log_message, selected_frame_path)
561
+ """
562
+ try:
563
+ if not image_gallery or len(image_gallery) == 0:
564
+ return image_gallery, "No images available to select as first frame.", ""
565
+
566
+ # Handle None or invalid selected_index
567
+ if (
568
+ selected_index is None
569
+ or selected_index < 0
570
+ or selected_index >= len(image_gallery)
571
+ ):
572
+ print(f"Invalid selected_index: {selected_index}, using default: 0")
573
+ selected_index = 0
574
+
575
+ # Get the selected image based on index
576
+ selected_image = image_gallery[selected_index]
577
+ print(f"Selected image index: {selected_index}")
578
+ print(f"Total images: {len(image_gallery)}")
579
+
580
+ # Extract the file path from the selected image
581
+ selected_frame_path = ""
582
+ print(f"Selected image type: {type(selected_image)}")
583
+ print(f"Selected image: {selected_image}")
584
+
585
+ if isinstance(selected_image, tuple):
586
+ # Gradio Gallery returns tuple (path, None)
587
+ selected_frame_path = selected_image[0]
588
+ elif isinstance(selected_image, str):
589
+ selected_frame_path = selected_image
590
+ elif hasattr(selected_image, "name"):
591
+ selected_frame_path = selected_image.name
592
+ elif isinstance(selected_image, dict):
593
+ if "name" in selected_image:
594
+ selected_frame_path = selected_image["name"]
595
+ elif "path" in selected_image:
596
+ selected_frame_path = selected_image["path"]
597
+ elif "src" in selected_image:
598
+ selected_frame_path = selected_image["src"]
599
+ else:
600
+ # Try to convert to string
601
+ selected_frame_path = str(selected_image)
602
+
603
+ print(f"Extracted path: {selected_frame_path}")
604
+
605
+ # Extract filename from the path for matching
606
+ import os
607
+
608
+ selected_filename = os.path.basename(selected_frame_path)
609
+ print(f"Selected filename: {selected_filename}")
610
+
611
+ # Move the selected image to the front
612
+ updated_gallery = [selected_image] + [
613
+ img for img in image_gallery if img != selected_image
614
+ ]
615
+
616
+ log_message = (
617
+ f"Selected frame: {selected_filename}. "
618
+ f"Moved to first position. Total frames: {len(updated_gallery)}"
619
+ )
620
+ return updated_gallery, log_message, selected_filename
621
+
622
+ except Exception as e:
623
+ print(f"Error selecting first frame: {e}")
624
+ return image_gallery, f"Error selecting first frame: {e}", ""
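The `select_first_frame` handler above has to cope with the several shapes a Gradio gallery item can take. A minimal standalone sketch of that normalization, with hypothetical sample items (Gradio's Gallery commonly yields `(path, caption)` tuples, but other versions may yield strings, file-like objects, or dicts):

```python
# Sketch of the gallery-item normalization used in select_first_frame above.
# The sample items at the bottom are hypothetical placeholders.
import os
from typing import Any


def gallery_item_to_path(item: Any) -> str:
    """Best-effort extraction of a file path from a Gradio gallery item."""
    if isinstance(item, tuple):  # (path, caption) pair
        return item[0]
    if isinstance(item, str):  # plain path string
        return item
    if hasattr(item, "name"):  # tempfile-like object
        return item.name
    if isinstance(item, dict):  # dict payloads from some Gradio versions
        for key in ("name", "path", "src"):
            if key in item:
                return item[key]
    return str(item)  # last resort


for item in [("/tmp/a.png", None), "/tmp/b.png", {"path": "/tmp/c.png"}]:
    print(os.path.basename(gallery_item_to_path(item)))
# -> a.png, b.png, c.png
```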
src/depth_anything_3/app/modules/file_handlers.py ADDED
@@ -0,0 +1,327 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ File handling module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles file uploads, video processing, and file operations.
19
+ """
20
+
21
+ import os
22
+ import shutil
23
+ import time
24
+ from datetime import datetime
25
+ from typing import List, Optional, Tuple
26
+
27
+ import cv2
28
+ from PIL import Image
29
+ from pillow_heif import register_heif_opener
30
+
31
+ register_heif_opener()
32
+
33
+
34
+ class FileHandler:
35
+ """
36
+ Handles file uploads and processing for the Gradio app.
37
+ """
38
+
39
+ def __init__(self):
40
+ """Initialize the file handler."""
41
+
42
+ def handle_uploads(
43
+ self,
44
+ input_video: Optional[str],
45
+ input_images: Optional[List],
46
+ s_time_interval: float = 10.0,
47
+ ) -> Tuple[str, List[str]]:
48
+ """
49
+ Create a new 'target_dir' with an 'images' subfolder, and place user-uploaded
50
+ images or frames extracted from the video into it.
51
+
52
+ Args:
53
+ input_video: Path to input video file
54
+ input_images: List of input image files
55
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
56
+
57
+ Returns:
58
+ Tuple of (target_dir, image_paths)
59
+ """
60
+ start_time = time.time()
61
+
62
+ # Get workspace directory from environment variable or use default
63
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
64
+ if not os.path.exists(workspace_dir):
65
+ os.makedirs(workspace_dir)
66
+
67
+ # Create input_images subdirectory
68
+ input_images_dir = os.path.join(workspace_dir, "input_images")
69
+ if not os.path.exists(input_images_dir):
70
+ os.makedirs(input_images_dir)
71
+
72
+ # Create a unique folder name within input_images
73
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
74
+ target_dir = os.path.join(input_images_dir, f"session_{timestamp}")
75
+ target_dir_images = os.path.join(target_dir, "images")
76
+
77
+ # Clean up if somehow that folder already exists
78
+ if os.path.exists(target_dir):
79
+ shutil.rmtree(target_dir)
80
+ os.makedirs(target_dir)
81
+ os.makedirs(target_dir_images)
82
+
83
+ image_paths = []
84
+
85
+ # Handle images
86
+ if input_images is not None:
87
+ image_paths.extend(self._process_images(input_images, target_dir_images))
88
+
89
+ # Handle video
90
+ if input_video is not None:
91
+ image_paths.extend(
92
+ self._process_video(input_video, target_dir_images, s_time_interval)
93
+ )
94
+
95
+ # Sort final images for gallery
96
+ image_paths = sorted(image_paths)
97
+
98
+ end_time = time.time()
99
+ print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
100
+ return target_dir, image_paths
101
+
102
+ def _process_images(self, input_images: List, target_dir_images: str) -> List[str]:
103
+ """
104
+ Process uploaded images.
105
+
106
+ Args:
107
+ input_images: List of input image files
108
+ target_dir_images: Target directory for images
109
+
110
+ Returns:
111
+ List of processed image paths
112
+ """
113
+ image_paths = []
114
+
115
+ for file_data in input_images:
116
+ if isinstance(file_data, dict) and "name" in file_data:
117
+ file_path = file_data["name"]
118
+ else:
119
+ file_path = file_data
120
+
121
+ # Check if the file is a HEIC image
122
+ file_ext = os.path.splitext(file_path)[1].lower()
123
+ if file_ext in [".heic", ".heif"]:
124
+ # Convert HEIC to JPEG for better gallery compatibility
125
+ try:
126
+ with Image.open(file_path) as img:
127
+ # Convert to RGB if necessary (HEIC can have different color modes)
128
+ if img.mode not in ("RGB", "L"):
129
+ img = img.convert("RGB")
130
+
131
+ # Create JPEG filename
132
+ base_name = os.path.splitext(os.path.basename(file_path))[0]
133
+ dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
134
+
135
+ # Save as JPEG with high quality
136
+ img.save(dst_path, "JPEG", quality=95)
137
+ image_paths.append(dst_path)
138
+ print(
139
+ f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> "
140
+ f"{os.path.basename(dst_path)}"
141
+ )
142
+ except Exception as e:
143
+ print(f"Error converting HEIC file {file_path}: {e}")
144
+ # Fall back to copying as is
145
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
146
+ shutil.copy(file_path, dst_path)
147
+ image_paths.append(dst_path)
148
+ else:
149
+ # Regular image files - copy as is
150
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
151
+ shutil.copy(file_path, dst_path)
152
+ image_paths.append(dst_path)
153
+
154
+ return image_paths
155
+
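Outside the app, the HEIC-to-JPEG conversion above can be reproduced in a few lines; `register_heif_opener()` from `pillow_heif` is what lets `PIL.Image.open` read `.heic` files at all. The file names here are placeholders:

```python
# Minimal HEIC -> JPEG conversion, mirroring _process_images above.
# "photo.heic" is a placeholder path; pillow_heif must be installed.
from PIL import Image
from pillow_heif import register_heif_opener

register_heif_opener()  # teach PIL to decode HEIC/HEIF containers

with Image.open("photo.heic") as img:
    if img.mode not in ("RGB", "L"):  # HEIC may carry RGBA or other modes
        img = img.convert("RGB")
    img.save("photo.jpg", "JPEG", quality=95)
```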
156
+ def _process_video(
157
+ self, input_video: str, target_dir_images: str, s_time_interval: float
158
+ ) -> List[str]:
159
+ """
160
+ Process video file and extract frames.
161
+
162
+ Args:
163
+ input_video: Path to input video file
164
+ target_dir_images: Target directory for extracted frames
165
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
166
+
167
+ Returns:
168
+ List of extracted frame paths
169
+ """
170
+ image_paths = []
171
+
172
+ if isinstance(input_video, dict) and "name" in input_video:
173
+ video_path = input_video["name"]
174
+ else:
175
+ video_path = input_video
176
+
177
+ vs = cv2.VideoCapture(video_path)
178
+ fps = vs.get(cv2.CAP_PROP_FPS)
179
+ frame_interval = max(1, int(fps / s_time_interval)) # Convert FPS to frame interval
180
+
181
+ count = 0
182
+ video_frame_num = 0
183
+ while True:
184
+ gotit, frame = vs.read()
185
+ if not gotit:
186
+ break
187
+ count += 1
188
+ if count % frame_interval == 0:
189
+ image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
190
+ cv2.imwrite(image_path, frame)
191
+ image_paths.append(image_path)
192
+ video_frame_num += 1
193
+ vs.release()  # free the video capture handle
194
+ return image_paths
195
+
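The sampling slider is expressed in frames per second, not seconds, so `_process_video` converts it into a frame stride with `max(1, int(fps / s_time_interval))`. A quick worked example (the video FPS values are illustrative):

```python
# Worked example of the FPS -> frame-stride conversion in _process_video.
def frame_stride(video_fps: float, sampling_fps: float) -> int:
    return max(1, int(video_fps / sampling_fps))


print(frame_stride(30.0, 10.0))  # 3: keep every 3rd frame
print(frame_stride(24.0, 10.0))  # 2: int() truncates 2.4
print(frame_stride(30.0, 60.0))  # 1: cannot sample faster than the source
```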
196
+ def update_gallery_on_upload(
197
+ self,
198
+ input_video: Optional[str],
199
+ input_images: Optional[List],
200
+ s_time_interval: float = 10.0,
201
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
202
+ """
203
+ Handle file uploads and update gallery.
204
+
205
+ Args:
206
+ input_video: Path to input video file
207
+ input_images: List of input image files
208
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
209
+
210
+ Returns:
211
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
212
+ """
213
+ if not input_video and not input_images:
214
+ return None, None, None, None
215
+
216
+ target_dir, image_paths = self.handle_uploads(input_video, input_images, s_time_interval)
217
+ return (
218
+ None,
219
+ target_dir,
220
+ image_paths,
221
+ "Upload complete. Click 'Reconstruct' to begin 3D processing.",
222
+ )
223
+
224
+ def load_example_scene(
225
+ self, scene_name: str, examples_dir: str = "examples", s_time_interval: float = 1.0
226
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], str]:
227
+ """
228
+ Load a scene from examples directory.
229
+
230
+ Args:
231
+ scene_name: Name of the scene to load
232
+ examples_dir: Path to examples directory
233
+ s_time_interval: Sampling FPS for video frame extraction (default 1.0)
234
+
235
+ Returns:
236
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
237
+ """
238
+ from depth_anything_3.app.modules.utils import get_scene_info
239
+
240
+ scenes = get_scene_info(examples_dir)
241
+
242
+ # Find the selected scene
243
+ selected_scene = None
244
+ for scene in scenes:
245
+ if scene["name"] == scene_name:
246
+ selected_scene = scene
247
+ break
248
+
249
+ if selected_scene is None:
250
+ return None, None, None, "Scene not found"
251
+
252
+ # Check if this is a video scene
253
+ is_video_scene = selected_scene.get("type") == "video"
254
+
255
+ # Use fixed directory name for examples (not timestamp-based)
256
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
257
+ input_images_dir = os.path.join(workspace_dir, "input_images")
258
+ if not os.path.exists(input_images_dir):
259
+ os.makedirs(input_images_dir)
260
+
261
+ # For video scenes, include FPS in folder name so different FPS = different cache
262
+ if is_video_scene:
263
+ target_dir = os.path.join(
264
+ input_images_dir, f"example_{scene_name}_fps{s_time_interval:.1f}"
265
+ )
266
+ else:
267
+ target_dir = os.path.join(input_images_dir, f"example_{scene_name}")
268
+ target_dir_images = os.path.join(target_dir, "images")
269
+
270
+ # Check if already cached (GLB file exists)
271
+ glb_path = os.path.join(target_dir, "scene.glb")
272
+ is_cached = os.path.exists(glb_path)
273
+
274
+ # Create directory if it doesn't exist
275
+ if not os.path.exists(target_dir):
276
+ os.makedirs(target_dir)
277
+ os.makedirs(target_dir_images)
278
+
279
+ # Process images or extract video frames if directory is new or empty
280
+ if not os.path.exists(target_dir_images) or len(os.listdir(target_dir_images)) == 0:
281
+ os.makedirs(target_dir_images, exist_ok=True)
282
+ image_paths = []
283
+
284
+ if is_video_scene:
285
+ # Extract frames from video using specified FPS
286
+ video_path = selected_scene.get("video_file")
287
+ if video_path:
288
+ image_paths = self._process_video(
289
+ video_path, target_dir_images, s_time_interval
290
+ )
291
+ else:
292
+ # Copy images
293
+ for file_path in selected_scene["image_files"]:
294
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
295
+ shutil.copy(file_path, dst_path)
296
+ image_paths.append(dst_path)
297
+ else:
298
+ # Use existing images
299
+ image_paths = sorted(
300
+ [
301
+ os.path.join(target_dir_images, f)
302
+ for f in os.listdir(target_dir_images)
303
+ if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"))
304
+ ]
305
+ )
306
+
307
+ num_frames = len(image_paths)
308
+ scene_type = "video" if is_video_scene else "scene"
309
+
310
+ # Return cached GLB if available
311
+ if is_cached:
312
+ return (
313
+ glb_path, # Return cached reconstruction
314
+ target_dir, # Set target directory
315
+ image_paths, # Set gallery
316
+ f"Loaded cached {scene_type} '{scene_name}' with {num_frames} frames.",
317
+ )
318
+ else:
319
+ return (
320
+ None, # No cached reconstruction
321
+ target_dir, # Set target directory
322
+ image_paths, # Set gallery
323
+ (
324
+ f"Loaded {scene_type} '{scene_name}' with {num_frames} frames. "
325
+ "Click 'Reconstruct' to begin 3D processing."
326
+ ),
327
+ )
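Because extracted frames depend on the sampling FPS, `load_example_scene` bakes the FPS into the cache folder name for video scenes, while image scenes get a stable name. A sketch of that keying, under the same default workspace path the code uses:

```python
# Sketch of the example-scene cache keying used in load_example_scene above.
import os

workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
input_images_dir = os.path.join(workspace_dir, "input_images")


def example_cache_dir(scene_name: str, is_video: bool, fps: float) -> str:
    if is_video:
        # Different FPS -> different folder -> no stale extracted frames.
        return os.path.join(input_images_dir, f"example_{scene_name}_fps{fps:.1f}")
    return os.path.join(input_images_dir, f"example_{scene_name}")


print(example_cache_dir("robot_unitree", True, 1.0))
# gradio_workspace/input_images/example_robot_unitree_fps1.0
print(example_cache_dir("SOH", False, 1.0))
# gradio_workspace/input_images/example_SOH
```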
src/depth_anything_3/app/modules/model_inference.py ADDED
@@ -0,0 +1,454 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ # Optimizations (c) Delanoe Pirard / Aedelon - Apache 2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Model inference module for Depth Anything 3 Gradio app.
18
+
19
+ This module handles all model-related operations including inference,
20
+ data processing, and result preparation.
21
+
22
+ Optimizations based on benchmarks:
23
+ - Smart batch sizing per model/device (MPS: B=4 for small/base, B=2 for large, B=1 for giant)
24
+ - CUDA: Adaptive batching at 85% memory utilization
25
+ - CPU: Always batch=1
26
+ - Model caching for 200x faster subsequent loads
27
+ """
28
+
29
+ import glob
30
+ import os
31
+ import time
32
+ from typing import Any, Dict, List, Optional, Tuple
33
+
34
+ import numpy as np
35
+ import torch
36
+
37
+ from depth_anything_3.api import DepthAnything3
38
+ from depth_anything_3.utils.export.glb import export_to_glb
39
+ from depth_anything_3.utils.export.gs import export_to_gs_video
40
+ from depth_anything_3.utils.memory import cleanup_cuda_memory
41
+
42
+ # Available models for UI selection
43
+ AVAILABLE_MODELS = {
44
+ "da3-small": "Small (fastest, ~27 img/s)",
45
+ "da3-base": "Base (fast, ~10 img/s)",
46
+ "da3-large": "Large (balanced, ~4 img/s)",
47
+ "da3-giant": "Giant (high quality, ~1.6 img/s)",
48
+ "da3nested-giant-large": "Giant+Large (best quality, ~1.5 img/s)",
49
+ }
50
+
51
+ # Mapping from UI names to HuggingFace repo IDs
52
+ MODEL_TO_HF_REPO = {
53
+ "da3-small": "depth-anything/DA3-SMALL",
54
+ "da3-base": "depth-anything/DA3-BASE",
55
+ "da3-large": "depth-anything/DA3-LARGE",
56
+ "da3-giant": "depth-anything/DA3-GIANT",
57
+ "da3nested-giant-large": "depth-anything/DA3NESTED-GIANT-LARGE",
58
+ }
59
+
60
+ DEFAULT_MODEL = "da3nested-giant-large"
61
+
62
+
63
+ class ModelInference:
64
+ """
65
+ Handles model inference and data processing for Depth Anything 3.
66
+
67
+ Uses benchmark-optimized batch sizes:
68
+ - MPS: B=4 for small/base, B=2 for large, B=1 for giant
69
+ - CUDA: Adaptive batching (85% VRAM utilization)
70
+ - CPU: B=1 always
71
+ """
72
+
73
+ def __init__(self):
74
+ """Initialize the model inference handler."""
75
+ self.model: Optional[DepthAnything3] = None
76
+ self.current_model_name: Optional[str] = None
77
+ self.device: Optional[torch.device] = None
78
+
79
+ def _get_optimal_batch_size(
80
+ self, num_images: int, model_name: str, device_type: str
81
+ ) -> int:
82
+ """
83
+ Get optimal batch size based on benchmarks.
84
+
85
+ Benchmark results (MPS, 1280x720):
86
+ - da3-small: B=4 → 27.2 img/s (vs B=1 → 22.2 img/s)
87
+ - da3-base: B=4 → 11.6 img/s (vs B=1 → 10.7 img/s)
88
+ - da3-large: B=2 → 3.8 img/s (B=4 slower due to memory pressure)
89
+ - da3-giant: B=1 → 1.6 img/s (B=4 → 1.2 img/s, worse!)
90
+
91
+ Args:
92
+ num_images: Number of images to process
93
+ model_name: Name of the model
94
+ device_type: Device type ('cuda', 'mps', 'cpu')
95
+
96
+ Returns:
97
+ Optimal batch size
98
+ """
99
+ if device_type == "cpu":
100
+ return 1
101
+
102
+ # MPS: Use benchmark-optimized fixed batch sizes
103
+ if device_type == "mps":
104
+ if "small" in model_name:
105
+ return min(4, num_images)
106
+ elif "base" in model_name:
107
+ return min(4, num_images)
108
+ elif "giant" in model_name:
109
+ return 1
110
+ else: # large
111
+ return min(2, num_images)
112
+
113
+ # CUDA: Conservative batch size, can be tuned
114
+ if "giant" in model_name:
115
+ return min(2, num_images)
116
+ elif "large" in model_name:
117
+ return min(4, num_images)
118
+ else:
119
+ return min(8, num_images)
120
+
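A quick check of the benchmark-derived batch-size table, calling `_get_optimal_batch_size` directly (instantiating the class is cheap here, since no model is loaded until `initialize_model()` is called):

```python
# Exercising the batch-size table in _get_optimal_batch_size above.
from depth_anything_3.app.modules.model_inference import ModelInference

mi = ModelInference()  # no model is loaded yet
print(mi._get_optimal_batch_size(16, "da3-small", "mps"))   # 4
print(mi._get_optimal_batch_size(16, "da3-large", "mps"))   # 2
print(mi._get_optimal_batch_size(16, "da3-giant", "mps"))   # 1
print(mi._get_optimal_batch_size(3, "da3-small", "mps"))    # 3 (capped by image count)
print(mi._get_optimal_batch_size(16, "da3-giant", "cuda"))  # 2
print(mi._get_optimal_batch_size(16, "da3-base", "cpu"))    # 1
```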
121
+ def initialize_model(self, device: torch.device, model_name: Optional[str] = None) -> None:
122
+ """
123
+ Initialize the DepthAnything3 model.
124
+
125
+ Args:
126
+ device: Device to load the model on
127
+ model_name: Model name to load (default: da3-base)
128
+ """
129
+ if model_name is None:
130
+ model_name = os.environ.get("DA3_MODEL_NAME", DEFAULT_MODEL)
131
+
132
+ # Check if we need to reload the model
133
+ need_reload = (
134
+ self.model is None
135
+ or self.current_model_name != model_name
136
+ or self.device != device
137
+ )
138
+
139
+ if need_reload:
140
+ # Cleanup old model if exists
141
+ if self.model is not None:
142
+ print(f"[ModelInference] Unloading {self.current_model_name}")
143
+ del self.model
144
+ self.model = None
145
+ cleanup_cuda_memory()
146
+
147
+ # Get HuggingFace repo ID from model name
148
+ hf_repo = MODEL_TO_HF_REPO.get(model_name, model_name)
149
+ print(f"[ModelInference] Loading model: {model_name} ({hf_repo}) on {device}")
150
+ start_time = time.time()
151
+
152
+ # Use from_pretrained to load from HuggingFace
153
+ self.model = DepthAnything3.from_pretrained(hf_repo)
154
+ self.model = self.model.to(device)
155
+ self.current_model_name = model_name
156
+ self.device = device
157
+
158
+ load_time = time.time() - start_time
159
+ print(f"[ModelInference] Model loaded in {load_time:.2f}s")
160
+ else:
161
+ print(f"[ModelInference] Reusing cached model: {model_name}")
162
+
163
+ self.model.eval()
164
+
165
+ def run_inference(
166
+ self,
167
+ target_dir: str,
168
+ filter_black_bg: bool = False,
169
+ filter_white_bg: bool = False,
170
+ process_res_method: str = "upper_bound_resize",
171
+ show_camera: bool = True,
172
+ save_percentage: float = 30.0,
173
+ num_max_points: int = 1_000_000,
174
+ infer_gs: bool = False,
175
+ ref_view_strategy: str = "saddle_balanced",
176
+ gs_trj_mode: str = "extend",
177
+ gs_video_quality: str = "high",
178
+ model_name: Optional[str] = None,
179
+ ) -> Tuple[Any, dict]:
180
+ """
181
+ Run DepthAnything3 model inference on images.
182
+
183
+ All images are processed in a single batch for optimal performance.
184
+
185
+ Args:
186
+ target_dir: Directory containing images
187
+ filter_black_bg: Whether to filter black background
188
+ filter_white_bg: Whether to filter white background
189
+ process_res_method: Method for resizing input images
190
+ show_camera: Whether to show camera in 3D view
191
+ save_percentage: Percentage of points to save (0-100)
192
+ num_max_points: Maximum number of points in point cloud
193
+ infer_gs: Whether to infer 3D Gaussian Splatting
194
+ ref_view_strategy: Reference view selection strategy
195
+ gs_trj_mode: Trajectory mode for 3DGS
196
+ gs_video_quality: Video quality for 3DGS
197
+ model_name: Model to use (default: da3-base)
198
+
199
+ Returns:
200
+ Tuple of (prediction, processed_data)
201
+ """
202
+ inference_start = time.time()
203
+ print(f"[ModelInference] Processing images from {target_dir}")
204
+
205
+ # Device check - support CUDA, MPS (Apple Silicon), and CPU
206
+ if torch.cuda.is_available():
207
+ device = torch.device("cuda")
208
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
209
+ device = torch.device("mps")
210
+ else:
211
+ device = torch.device("cpu")
212
+
213
+ # Initialize model (with caching)
214
+ if model_name is None:
215
+ model_name = DEFAULT_MODEL
216
+ self.initialize_model(device, model_name)
217
+
218
+ # Get image paths
219
+ image_folder_path = os.path.join(target_dir, "images")
220
+ all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))
221
+
222
+ # Filter for image files
223
+ image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
224
+ image_paths = [
225
+ path
226
+ for path in all_image_paths
227
+ if any(path.lower().endswith(ext) for ext in image_extensions)
228
+ ]
229
+
230
+ num_images = len(image_paths)
231
+ print(f"[ModelInference] Found {num_images} images")
232
+
233
+ if num_images == 0:
234
+ raise ValueError("No images found. Check your upload.")
235
+
236
+ # Map UI options to actual method names
237
+ method_mapping = {"high_res": "lower_bound_resize", "low_res": "upper_bound_resize"}
238
+ actual_method = method_mapping.get(process_res_method, process_res_method)
239
+
240
+ # Get optimal batch size based on benchmarks
241
+ batch_size = self._get_optimal_batch_size(num_images, model_name, device.type)
242
+ print(
243
+ f"[ModelInference] Batched inference: model={model_name}, "
244
+ f"device={device.type}, images={num_images}, batch_size={batch_size}"
245
+ )
246
+
247
+ # Run model inference with batching
248
+ with torch.no_grad():
249
+ if num_images <= batch_size:
250
+ # Single batch - process all at once
251
+ prediction = self.model.inference(
252
+ image_paths,
253
+ export_dir=None,
254
+ process_res_method=actual_method,
255
+ infer_gs=infer_gs,
256
+ ref_view_strategy=ref_view_strategy,
257
+ )
258
+ else:
259
+ # Multiple batches - process in chunks and merge
260
+ predictions = []
261
+ for i in range(0, num_images, batch_size):
262
+ batch_paths = image_paths[i : i + batch_size]
263
+ print(
+ f"[ModelInference] Processing batch {i // batch_size + 1}/"
+ f"{(num_images + batch_size - 1) // batch_size} "
+ f"({len(batch_paths)} images)"
+ )
264
+ batch_pred = self.model.inference(
265
+ batch_paths,
266
+ export_dir=None,
267
+ process_res_method=actual_method,
268
+ infer_gs=False, # Only infer GS on final merged result
269
+ ref_view_strategy=ref_view_strategy,
270
+ )
271
+ predictions.append(batch_pred)
272
+
273
+ # Merge all batch predictions
274
+ prediction = self._merge_predictions(predictions)
275
+
276
+ export_to_glb(
277
+ prediction,
278
+ filter_black_bg=filter_black_bg,
279
+ filter_white_bg=filter_white_bg,
280
+ export_dir=target_dir,
281
+ show_cameras=show_camera,
282
+ conf_thresh_percentile=save_percentage,
283
+ num_max_points=int(num_max_points),
284
+ )
285
+
286
+ # export to gs video if needed
287
+ if infer_gs:
288
+ mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
289
+ print(f"GS mode: {gs_trj_mode}; Backend mode: {mode_mapping[gs_trj_mode]}")
290
+ export_to_gs_video(
291
+ prediction,
292
+ export_dir=target_dir,
293
+ chunk_size=4,
294
+ trj_mode=mode_mapping.get(gs_trj_mode, "extend"),
295
+ enable_tqdm=True,
296
+ vis_depth="hcat",
297
+ video_quality=gs_video_quality,
298
+ )
299
+
300
+ # Save predictions.npz for caching metric depth data
301
+ self._save_predictions_cache(target_dir, prediction)
302
+
303
+ # Process results
304
+ processed_data = self._process_results(target_dir, prediction, image_paths)
305
+
306
+ # Clean up using centralized memory utilities for consistency with backend
307
+ cleanup_cuda_memory()
308
+
309
+ inference_time = time.time() - inference_start
310
+ throughput = num_images / inference_time if inference_time > 0 else 0
311
+ print(
312
+ f"[ModelInference] Completed in {inference_time:.2f}s "
313
+ f"({throughput:.1f} img/s)"
314
+ )
315
+
316
+ return prediction, processed_data
317
+
318
+ def _merge_predictions(self, predictions: List[Any]) -> Any:
319
+ """
320
+ Merge multiple batch predictions into a single prediction.
321
+
322
+ Args:
323
+ predictions: List of Prediction objects from batch inference
324
+
325
+ Returns:
326
+ Merged Prediction object
327
+ """
328
+ if not predictions:
329
+ return None
330
+ if len(predictions) == 1:
331
+ return predictions[0]
332
+
333
+ from depth_anything_3.specs import Prediction
334
+
335
+ # Concatenate arrays from all predictions
336
+ merged_depth = np.concatenate([p.depth for p in predictions], axis=0)
337
+ merged_conf = (
338
+ np.concatenate([p.conf for p in predictions], axis=0)
339
+ if predictions[0].conf is not None
340
+ else None
341
+ )
342
+ merged_processed_images = (
343
+ np.concatenate([p.processed_images for p in predictions], axis=0)
344
+ if predictions[0].processed_images is not None
345
+ else None
346
+ )
347
+ merged_extrinsics = (
348
+ np.concatenate([p.extrinsics for p in predictions], axis=0)
349
+ if predictions[0].extrinsics is not None
350
+ else None
351
+ )
352
+ merged_intrinsics = (
353
+ np.concatenate([p.intrinsics for p in predictions], axis=0)
354
+ if predictions[0].intrinsics is not None
355
+ else None
356
+ )
357
+
358
+ # Create merged prediction (use is_metric from first batch)
359
+ merged = Prediction(
360
+ depth=merged_depth,
361
+ is_metric=predictions[0].is_metric,
362
+ conf=merged_conf,
363
+ extrinsics=merged_extrinsics,
364
+ intrinsics=merged_intrinsics,
365
+ processed_images=merged_processed_images,
366
+ )
367
+
368
+ print(f"[ModelInference] Merged {len(predictions)} batches into single prediction")
369
+ return merged
370
+
371
+ def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
372
+ """
373
+ Save predictions data to predictions.npz for caching.
374
+
375
+ Args:
376
+ target_dir: Directory to save the cache
377
+ prediction: Model prediction object
378
+ """
379
+ try:
380
+ output_file = os.path.join(target_dir, "predictions.npz")
381
+
382
+ # Build save dict with prediction data
383
+ save_dict = {}
384
+
385
+ # Save processed images if available
386
+ if prediction.processed_images is not None:
387
+ save_dict["images"] = prediction.processed_images
388
+
389
+ # Save depth data
390
+ if prediction.depth is not None:
391
+ save_dict["depths"] = np.round(prediction.depth, 6)
392
+
393
+ # Save confidence if available
394
+ if prediction.conf is not None:
395
+ save_dict["conf"] = np.round(prediction.conf, 2)
396
+
397
+ # Save camera parameters
398
+ if prediction.extrinsics is not None:
399
+ save_dict["extrinsics"] = prediction.extrinsics
400
+ if prediction.intrinsics is not None:
401
+ save_dict["intrinsics"] = prediction.intrinsics
402
+
403
+ # Save to file
404
+ np.savez_compressed(output_file, **save_dict)
405
+ print(f"Saved predictions cache to: {output_file}")
406
+
407
+ except Exception as e:
408
+ print(f"Warning: Failed to save predictions cache: {e}")
409
+
410
+ def _process_results(
411
+ self, target_dir: str, prediction: Any, image_paths: list
412
+ ) -> dict:
413
+ """
414
+ Process model results into structured data.
415
+
416
+ Args:
417
+ target_dir: Directory containing results
418
+ prediction: Model prediction object
419
+ image_paths: List of input image paths
420
+
421
+ Returns:
422
+ Dictionary containing processed data for each view
423
+ """
424
+ processed_data = {}
425
+
426
+ # Read generated depth visualization files
427
+ depth_vis_dir = os.path.join(target_dir, "depth_vis")
428
+
429
+ if os.path.exists(depth_vis_dir):
430
+ depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
431
+ for i, depth_file in enumerate(depth_files):
432
+ # Use processed images directly from API
433
+ processed_image = None
434
+ if prediction.processed_images is not None and i < len(
435
+ prediction.processed_images
436
+ ):
437
+ processed_image = prediction.processed_images[i]
438
+
439
+ processed_data[i] = {
440
+ "depth_image": depth_file,
441
+ "image": processed_image,
442
+ "original_image_path": image_paths[i] if i < len(image_paths) else None,
443
+ "depth": prediction.depth[i] if i < len(prediction.depth) else None,
444
+ "intrinsics": (
445
+ prediction.intrinsics[i]
446
+ if prediction.intrinsics is not None and i < len(prediction.intrinsics)
447
+ else None
448
+ ),
449
+ "mask": None, # No mask information available
450
+ }
451
+
452
+ return processed_data
453
+
454
+ # cleanup() removed: call cleanup_cuda_memory() directly where needed.
src/depth_anything_3/app/modules/ui_components.py ADDED
@@ -0,0 +1,497 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ UI components module for Depth Anything 3 Gradio app.
17
+
18
+ This module contains UI component definitions and layout functions.
19
+ """
20
+
21
+ import os
22
+ from typing import Any, Dict, List, Tuple
23
+
24
+ import gradio as gr
25
+
26
+ from depth_anything_3.app.modules.utils import get_logo_base64, get_scene_info
27
+
28
+
29
+ class UIComponents:
30
+ """
31
+ Handles UI component creation and layout for the Gradio app.
32
+ """
33
+
34
+ def __init__(self):
35
+ """Initialize the UI components handler."""
36
+
37
+ def create_upload_section(self) -> Tuple[gr.Video, gr.Slider, gr.File, gr.Gallery]:
38
+ """
39
+ Create the upload section with video, images, and gallery components.
40
+
41
+ Returns:
42
+ A tuple of Gradio components: (input_video, s_time_interval, input_images, image_gallery).
43
+ """
44
+ input_video = gr.Video(label="Upload Video", interactive=True)
45
+ s_time_interval = gr.Slider(
46
+ minimum=0.1,
47
+ maximum=60,
48
+ value=10,
49
+ step=0.1,
50
+ label="Sampling FPS (Frames Per Second)",
51
+ interactive=True,
52
+ visible=True,
53
+ )
54
+ input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
55
+ image_gallery = gr.Gallery(
56
+ label="Preview",
57
+ columns=4,
58
+ height="300px",
59
+ object_fit="contain",
60
+ allow_preview=True,
61
+ interactive=False,
62
+ )
63
+
64
+ return input_video, s_time_interval, input_images, image_gallery
65
+
66
+ def create_3d_viewer_section(self) -> gr.Model3D:
67
+ """
68
+ Create the 3D viewer component.
69
+
70
+ Returns:
71
+ 3D model viewer component
72
+ """
73
+ return gr.Model3D(
74
+ height=520,
75
+ zoom_speed=0.5,
76
+ pan_speed=0.5,
77
+ clear_color=[0.0, 0.0, 0.0, 0.0],
78
+ key="persistent_3d_viewer",
79
+ elem_id="reconstruction_3d_viewer",
80
+ )
81
+
82
+ def create_nvs_video(self) -> Tuple[gr.Video, gr.Markdown]:
83
+ """
84
+ Create the 3DGS rendered video display component and info message.
85
+
86
+ Returns:
87
+ Tuple of (video component, info message component)
88
+ """
89
+ with gr.Column():
90
+ gs_info = gr.Markdown(
91
+ (
92
+ "‼️ **3D Gaussian Splatting rendering is currently DISABLED.** <br><br><br>"
93
+ "To render novel views from 3DGS, "
94
+ "enable **Infer 3D Gaussian Splatting** below. <br>"
95
+ "Next, in **Visualization Options**, "
96
+ "*optionally* configure the **rendering trajectory** (default: smooth) "
97
+ "and **video quality** (default: low), "
98
+ "then click **Reconstruct**."
99
+ ),
100
+ visible=True,
101
+ height=520,
102
+ )
103
+ gs_video = gr.Video(
104
+ height=520,
105
+ label="3DGS Rendered NVS Video (depth shown for reference only)",
106
+ interactive=False,
107
+ visible=False,
108
+ )
109
+ return gs_video, gs_info
110
+
111
+ def create_depth_section(self) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image]:
112
+ """
113
+ Create the depth visualization section.
114
+
115
+ Returns:
116
+ A tuple of (prev_depth_btn, depth_view_selector, next_depth_btn, depth_map)
117
+ """
118
+ with gr.Row(elem_classes=["navigation-row"]):
119
+ prev_depth_btn = gr.Button("◀ Previous", size="sm", scale=1)
120
+ depth_view_selector = gr.Dropdown(
121
+ choices=["View 1"],
122
+ value="View 1",
123
+ label="Select View",
124
+ scale=2,
125
+ interactive=True,
126
+ allow_custom_value=True,
127
+ )
128
+ next_depth_btn = gr.Button("Next ▶", size="sm", scale=1)
129
+ depth_map = gr.Image(
130
+ type="numpy",
131
+ label="Colorized Depth Map",
132
+ format="png",
133
+ interactive=False,
134
+ )
135
+
136
+ return prev_depth_btn, depth_view_selector, next_depth_btn, depth_map
137
+
138
+ def create_measure_section(
139
+ self,
140
+ ) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image, gr.Image, gr.Markdown]:
141
+ """
142
+ Create the measurement section.
143
+
144
+ Returns:
145
+ A tuple of (prev_measure_btn, measure_view_selector, next_measure_btn, measure_image,
146
+ measure_depth_image, measure_text)
147
+ """
148
+ from depth_anything_3.app.css_and_html import MEASURE_INSTRUCTIONS_HTML
149
+
150
+ gr.Markdown(MEASURE_INSTRUCTIONS_HTML)
151
+ with gr.Row(elem_classes=["navigation-row"]):
152
+ prev_measure_btn = gr.Button("◀ Previous", size="sm", scale=1)
153
+ measure_view_selector = gr.Dropdown(
154
+ choices=["View 1"],
155
+ value="View 1",
156
+ label="Select View",
157
+ scale=2,
158
+ interactive=True,
159
+ allow_custom_value=True,
160
+ )
161
+ next_measure_btn = gr.Button("Next ▶", size="sm", scale=1)
162
+ with gr.Row():
163
+ measure_image = gr.Image(
164
+ type="numpy",
165
+ show_label=False,
166
+ format="webp",
167
+ interactive=False,
168
+ sources=[],
169
+ label="RGB Image",
170
+ scale=1,
171
+ height=275,
172
+ )
173
+ measure_depth_image = gr.Image(
174
+ type="numpy",
175
+ show_label=False,
176
+ format="webp",
177
+ interactive=False,
178
+ sources=[],
179
+ label="Depth Visualization (Right Half)",
180
+ scale=1,
181
+ height=275,
182
+ )
183
+ gr.Markdown(
184
+ "**Note:** Images have been adjusted to model processing size. "
185
+ "Click two points on the RGB image to measure distance."
186
+ )
187
+ measure_text = gr.Markdown("")
188
+
189
+ return (
190
+ prev_measure_btn,
191
+ measure_view_selector,
192
+ next_measure_btn,
193
+ measure_image,
194
+ measure_depth_image,
195
+ measure_text,
196
+ )
197
+
198
+ def create_inference_control_section(
199
+ self,
200
+ ) -> Tuple[gr.Dropdown, gr.Dropdown, gr.Checkbox, gr.Dropdown]:
201
+ """
202
+ Create the inference control section (before inference).
203
+
204
+ Returns:
205
+ Tuple of (model_selector, process_res_method_dropdown, infer_gs, ref_view_strategy)
206
+ """
207
+ from depth_anything_3.app.modules.model_inference import AVAILABLE_MODELS, DEFAULT_MODEL
208
+
209
+ with gr.Row():
210
+ # Model selector - most important control
211
+ model_selector = gr.Dropdown(
212
+ choices=list(AVAILABLE_MODELS.keys()),
213
+ value=DEFAULT_MODEL,
214
+ label="Model",
215
+ info="da3-base: fast | da3-large: balanced | giant: best quality",
216
+ scale=1,
217
+ )
218
+ process_res_method_dropdown = gr.Dropdown(
219
+ choices=["high_res", "low_res"],
220
+ value="low_res",
221
+ label="Image Processing Method",
222
+ info="low_res handles many more images",
223
+ scale=1,
224
+ )
225
+ infer_gs = gr.Checkbox(
226
+ label="Infer 3D Gaussian Splatting",
227
+ value=False,
228
+ info=(
229
+ 'Enable novel view rendering from 3DGS (<i class="fas fa-triangle-exclamation '
230
+ 'fa-color-red"></i> requires extra processing time)'
231
+ ),
232
+ scale=1,
233
+ )
234
+ ref_view_strategy = gr.Dropdown(
235
+ choices=["saddle_balanced", "saddle_sim_range", "first", "middle"],
236
+ value="saddle_balanced",
237
+ label="Reference View Strategy",
238
+ info="Strategy for selecting reference view from multiple inputs",
239
+ scale=1,
240
+ )
241
+
242
+ return (model_selector, process_res_method_dropdown, infer_gs, ref_view_strategy)
243
+
244
+ def create_display_control_section(
245
+ self,
246
+ ) -> Tuple[
247
+ gr.Checkbox,
248
+ gr.Checkbox,
249
+ gr.Checkbox,
250
+ gr.Slider,
251
+ gr.Slider,
252
+ gr.Dropdown,
253
+ gr.Dropdown,
254
+ gr.Button,
255
+ gr.ClearButton,
256
+ ]:
257
+ """
258
+ Create the display control section (options for visualization).
259
+
260
+ Returns:
261
+ Tuple of display control components including buttons
262
+ """
263
+ with gr.Column():
264
+ # 3DGS options at the top
265
+ with gr.Row():
266
+ gs_trj_mode = gr.Dropdown(
267
+ choices=["smooth", "extend"],
268
+ value="smooth",
269
+ label=("Rendering trajectory for 3DGS viewpoints (requires n_views ≥ 2)"),
270
+ info=("'smooth' for view interpolation; 'extend' for longer trajectory"),
271
+ visible=False, # initially hidden
272
+ )
273
+ gs_video_quality = gr.Dropdown(
274
+ choices=["low", "medium", "high"],
275
+ value="low",
276
+ label=("Video quality for 3DGS rendered outputs"),
277
+ info=("'low' for faster loading speed; 'high' for better visual quality"),
278
+ visible=False, # initially hidden
279
+ )
280
+
281
+ # Reconstruct and Clear buttons (before Visualization Options)
282
+ with gr.Row():
283
+ submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
284
+ clear_btn = gr.ClearButton(scale=1)
285
+
286
+ gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
287
+ show_cam = gr.Checkbox(label="Show Camera", value=True)
288
+ filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
289
+ filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
290
+ save_percentage = gr.Slider(
291
+ minimum=0,
292
+ maximum=100,
293
+ value=10,
294
+ step=1,
295
+ label="Filter Percentage",
296
+ info="Confidence Threshold (%): Higher values filter more points.",
297
+ )
298
+ num_max_points = gr.Slider(
299
+ minimum=1000,
300
+ maximum=100000,
301
+ value=1000,
302
+ step=1000,
303
+ label="Max Points (K points)",
304
+ info="Maximum number of points to export to GLB (in thousands)",
305
+ )
306
+
307
+ return (
308
+ show_cam,
309
+ filter_black_bg,
310
+ filter_white_bg,
311
+ save_percentage,
312
+ num_max_points,
313
+ gs_trj_mode,
314
+ gs_video_quality,
315
+ submit_btn,
316
+ clear_btn,
317
+ )
318
+
319
+ def create_control_section(
320
+ self,
321
+ ) -> Tuple[
322
+ gr.Button,
323
+ gr.ClearButton,
324
+ gr.Dropdown,
325
+ gr.Checkbox,
326
+ gr.Checkbox,
327
+ gr.Checkbox,
328
+ gr.Checkbox,
329
+ gr.Checkbox,
330
+ gr.Dropdown,
331
+ gr.Checkbox,
332
+ gr.Textbox,
333
+ ]:
334
+ """
335
+ Create the control section with buttons and options.
336
+
337
+ Returns:
338
+ Tuple of control components
339
+ """
340
+ with gr.Row():
341
+ submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
342
+ clear_btn = gr.ClearButton(
343
+ scale=1,
344
+ )
345
+
346
+ with gr.Row():
347
+ frame_filter = gr.Dropdown(
348
+ choices=["All"], value="All", label="Show Points from Frame"
349
+ )
350
+ with gr.Column():
351
+ gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
352
+ show_cam = gr.Checkbox(label="Show Camera", value=True)
353
+ show_mesh = gr.Checkbox(label="Show Mesh", value=True)
354
+ filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
355
+ filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
356
+ gr.Markdown("### Reconstruction Options: (updated on next run)")
357
+ apply_mask_checkbox = gr.Checkbox(
358
+ label="Apply mask for predicted ambiguous depth classes & edges",
359
+ value=True,
360
+ )
361
+ process_res_method_dropdown = gr.Dropdown(
362
+ choices=[
363
+ "upper_bound_resize",
364
+ "upper_bound_crop",
365
+ "lower_bound_resize",
366
+ "lower_bound_crop",
367
+ ],
368
+ value="upper_bound_resize",
369
+ label="Image Processing Method",
370
+ info="Method for resizing input images",
371
+ )
372
+ save_to_gallery_checkbox = gr.Checkbox(
373
+ label="Save to Gallery",
374
+ value=False,
375
+ info="Save current reconstruction results to gallery directory",
376
+ )
377
+ gallery_name_input = gr.Textbox(
378
+ label="Gallery Name",
379
+ placeholder="Enter a name for the gallery folder",
380
+ value="",
381
+ info="Leave empty for auto-generated name with timestamp",
382
+ )
383
+
384
+ return (
385
+ submit_btn,
386
+ clear_btn,
387
+ frame_filter,
388
+ show_cam,
389
+ show_mesh,
390
+ filter_black_bg,
391
+ filter_white_bg,
392
+ apply_mask_checkbox,
393
+ process_res_method_dropdown,
394
+ save_to_gallery_checkbox,
395
+ gallery_name_input,
396
+ )
397
+
398
+ def create_example_scenes_section(self) -> List[Dict[str, Any]]:
399
+ """
400
+ Create the example scenes section.
401
+
402
+ Returns:
403
+ List of scene information dictionaries
404
+ """
405
+ # Use assets/examples directory for example scenes
406
+ examples_dir = os.environ.get("DA3_EXAMPLES_DIR", "assets/examples")
407
+
408
+ # Get scene information
409
+ scenes = get_scene_info(examples_dir)
410
+
411
+ return scenes
412
+
413
+ def create_example_scene_grid(self, scenes: List[Dict[str, Any]]) -> List:
414
+ """
415
+ Create the example scene grid.
416
+
417
+ Args:
418
+ scenes: List of scene information dictionaries
419
+
420
+ Returns:
421
+ List of scene components (gr.Image or gr.Video) in same order as scenes
422
+ """
423
+ scene_components = []
424
+
425
+ if scenes:
426
+ for i in range(0, len(scenes), 4): # Process 4 scenes per row
427
+ with gr.Row():
428
+ for j in range(4):
429
+ scene_idx = i + j
430
+ if scene_idx < len(scenes):
431
+ scene = scenes[scene_idx]
432
+ scene_type = scene.get("type", "images")
433
+
434
+ with gr.Column(scale=1, elem_classes=["clickable-thumbnail"]):
435
+ # Use Image for both image and video scenes
436
+ # (video scenes use first frame as thumbnail)
437
+ scene_component = gr.Image(
438
+ value=scene["thumbnail"],
439
+ height=150,
440
+ interactive=False,
441
+ show_label=False,
442
+ elem_id=f"scene_thumb_{scene['name']}",
443
+ sources=[],
444
+ )
445
+ scene_components.append(scene_component)
446
+
447
+ if scene_type == "video":
448
+ # Scene name for video
449
+ gr.Markdown(
450
+ f"**{scene['name']}** \n 🎬 video",
451
+ elem_classes=["scene-info"],
452
+ )
453
+ else:
454
+ # Scene name and image count
455
+ gr.Markdown(
456
+ f"**{scene['name']}** \n {scene['num_images']} images",
457
+ elem_classes=["scene-info"],
458
+ )
459
+ else:
460
+ # Empty column to maintain grid structure
461
+ with gr.Column(scale=1):
462
+ pass
463
+
464
+ return scene_components
465
+
466
+ def create_header_section(self) -> gr.HTML:
467
+ """
468
+ Create the header section with logo and title.
469
+
470
+ Returns:
471
+ Header HTML component
472
+ """
473
+ from depth_anything_3.app.css_and_html import get_header_html
474
+
475
+ return gr.HTML(get_header_html(get_logo_base64()))
476
+
477
+ def create_description_section(self) -> gr.HTML:
478
+ """
479
+ Create the description section.
480
+
481
+ Returns:
482
+ Description HTML component
483
+ """
484
+ from depth_anything_3.app.css_and_html import get_description_html
485
+
486
+ return gr.HTML(get_description_html())
487
+
488
+ def create_acknowledgements_section(self) -> gr.HTML:
489
+ """
490
+ Create the acknowledgements section.
491
+
492
+ Returns:
493
+ Acknowledgements HTML component
494
+ """
495
+ from depth_anything_3.app.css_and_html import get_acknowledgements_html
496
+
497
+ return gr.HTML(get_acknowledgements_html())
src/depth_anything_3/app/modules/utils.py ADDED
@@ -0,0 +1,269 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Utility functions for Depth Anything 3 Gradio app.
17
+
18
+ This module contains helper functions for data processing, visualization,
19
+ and file operations.
20
+ """
21
+
22
+
23
+ import json
24
+ import os
25
+ import shutil
26
+ from datetime import datetime
27
+ from typing import Any, Dict, List, Optional, Tuple
28
+
29
+ import numpy as np
30
+
31
+
32
+ def create_depth_visualization(depth: np.ndarray) -> Optional[np.ndarray]:
33
+ """
34
+ Create a colored depth visualization.
35
+
36
+ Args:
37
+ depth: Depth array
38
+
39
+ Returns:
40
+ Colored depth visualization or None
41
+ """
42
+ if depth is None:
43
+ return None
44
+
45
+ # Normalize depth to 0-1 range
46
+ depth_min = depth[depth > 0].min() if (depth > 0).any() else 0
47
+ depth_max = depth.max()
48
+
49
+ if depth_max <= depth_min:
50
+ return None
51
+
52
+ # Normalize depth
53
+ depth_norm = (depth - depth_min) / (depth_max - depth_min)
54
+ depth_norm = np.clip(depth_norm, 0, 1)
55
+
56
+ # Apply colormap (using matplotlib's viridis colormap)
57
+ import matplotlib.cm as cm
58
+
59
+ # Convert to colored image
60
+ depth_colored = cm.viridis(depth_norm)[:, :, :3] # Remove alpha channel
61
+ depth_colored = (depth_colored * 255).astype(np.uint8)
62
+
63
+ return depth_colored
64
+
65
+
66
+ def save_to_gallery_func(
67
+ target_dir: str, processed_data: dict, gallery_name: Optional[str] = None
68
+ ) -> Tuple[bool, str]:
69
+ """
70
+ Save the current reconstruction results to the gallery directory.
71
+
72
+ Args:
73
+ target_dir: Source directory containing reconstruction results
74
+ processed_data: Processed data dictionary
75
+ gallery_name: Name for the gallery folder
76
+
77
+ Returns:
78
+ Tuple of (success, message)
79
+ """
80
+ try:
81
+ # Get gallery directory from environment variable or use default
82
+ gallery_dir = os.environ.get(
83
+ "DA3_GALLERY_DIR",
84
+ "workspace/gallery",
85
+ )
86
+ if not os.path.exists(gallery_dir):
87
+ os.makedirs(gallery_dir)
88
+
89
+ # Use provided name or create a unique name
90
+ if gallery_name is None or gallery_name.strip() == "":
91
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
92
+ gallery_name = f"reconstruction_{timestamp}"
93
+
94
+ gallery_path = os.path.join(gallery_dir, gallery_name)
95
+
96
+ # Check if directory already exists
97
+ if os.path.exists(gallery_path):
98
+ return False, f"Save failed: folder '{gallery_name}' already exists"
99
+
100
+ # Create the gallery directory
101
+ os.makedirs(gallery_path, exist_ok=True)
102
+
103
+ # Copy GLB file
104
+ glb_source = os.path.join(target_dir, "scene.glb")
105
+ glb_dest = os.path.join(gallery_path, "scene.glb")
106
+ if os.path.exists(glb_source):
107
+ shutil.copy2(glb_source, glb_dest)
108
+
109
+ # Copy depth visualization images
110
+ depth_vis_dir = os.path.join(target_dir, "depth_vis")
111
+ if os.path.exists(depth_vis_dir):
112
+ gallery_depth_vis = os.path.join(gallery_path, "depth_vis")
113
+ shutil.copytree(depth_vis_dir, gallery_depth_vis)
114
+
115
+ # Copy original images
116
+ images_source = os.path.join(target_dir, "images")
117
+ if os.path.exists(images_source):
118
+ gallery_images = os.path.join(gallery_path, "images")
119
+ shutil.copytree(images_source, gallery_images)
120
+
121
+ scene_preview_source = os.path.join(target_dir, "scene.jpg")
122
+ scene_preview_dest = os.path.join(gallery_path, "scene.jpg")
123
+ if os.path.exists(scene_preview_source):
+ shutil.copy2(scene_preview_source, scene_preview_dest)
124
+
125
+ # Save metadata
126
+ metadata = {
127
+ "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
128
+ "num_images": len(processed_data) if processed_data else 0,
129
+ "gallery_name": gallery_name,
130
+ }
131
+
132
+ with open(os.path.join(gallery_path, "metadata.json"), "w") as f:
133
+ json.dump(metadata, f, indent=2)
134
+
135
+ print(f"Saved reconstruction to gallery: {gallery_path}")
136
+ return True, f"Save successful: saved to {gallery_path}"
137
+
138
+ except Exception as e:
139
+ print(f"Error saving to gallery: {e}")
140
+ return False, f"Save failed: {str(e)}"
141
+
142
+
143
+ def _extract_video_thumbnail(video_path: str) -> str:
144
+ """
145
+ Extract the first frame of a video as a thumbnail image.
146
+
147
+ Args:
148
+ video_path: Path to the video file
149
+
150
+ Returns:
151
+ Path to the thumbnail image (or video path if extraction fails)
152
+ """
153
+ import tempfile
154
+
155
+ import cv2
156
+
157
+ try:
158
+ cap = cv2.VideoCapture(video_path)
159
+ ret, frame = cap.read()
160
+ cap.release()
161
+
162
+ if ret and frame is not None:
163
+ # Save thumbnail to temp directory
164
+ video_name = os.path.splitext(os.path.basename(video_path))[0]
165
+ thumbnail_dir = os.path.join(tempfile.gettempdir(), "da3_video_thumbnails")
166
+ os.makedirs(thumbnail_dir, exist_ok=True)
167
+ thumbnail_path = os.path.join(thumbnail_dir, f"{video_name}_thumb.jpg")
168
+ cv2.imwrite(thumbnail_path, frame)
169
+ return thumbnail_path
170
+ except Exception as e:
171
+ print(f"Error extracting video thumbnail: {e}")
172
+
173
+ # Fallback to video path if extraction fails
174
+ return video_path
175
+
176
+
177
+ def get_scene_info(examples_dir: str) -> List[Dict[str, Any]]:
178
+ """
179
+ Get information about scenes in the examples directory.
180
+
181
+ Supports:
182
+ - Folders containing images (scene folders)
183
+ - Video files at the root level
184
+
185
+ Args:
186
+ examples_dir: Path to examples directory
187
+
188
+ Returns:
189
+ List of scene information dictionaries
190
+ """
191
+ import glob
192
+
193
+ scenes = []
194
+ if not os.path.exists(examples_dir):
195
+ return scenes
196
+
197
+ for item in sorted(os.listdir(examples_dir)):
198
+ item_path = os.path.join(examples_dir, item)
199
+
200
+ if os.path.isdir(item_path):
201
+ # Find all image files in the scene folder
202
+ image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
203
+ image_files = []
204
+ for ext in image_extensions:
205
+ image_files.extend(glob.glob(os.path.join(item_path, ext)))
206
+ image_files.extend(glob.glob(os.path.join(item_path, ext.upper())))
207
+
208
+ if image_files:
209
+ # Sort images and get the first one for thumbnail
210
+ image_files = sorted(image_files)
211
+ first_image = image_files[0]
212
+ num_images = len(image_files)
213
+
214
+ scenes.append(
215
+ {
216
+ "name": item,
217
+ "path": item_path,
218
+ "thumbnail": first_image,
219
+ "num_images": num_images,
220
+ "image_files": image_files,
221
+ "type": "images",
222
+ }
223
+ )
224
+
225
+ elif os.path.isfile(item_path):
226
+ # Check if it's a video file
227
+ video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".webm"]
228
+ ext = os.path.splitext(item)[1].lower()
229
+ if ext in video_extensions:
230
+ name = os.path.splitext(item)[0]
231
+ # Extract first frame as thumbnail
232
+ thumbnail_path = _extract_video_thumbnail(item_path)
233
+ scenes.append(
234
+ {
235
+ "name": name,
236
+ "path": item_path,
237
+ "thumbnail": thumbnail_path, # First frame as thumbnail
238
+ "num_images": 0,
239
+ "image_files": [],
240
+ "video_file": item_path,
241
+ "type": "video",
242
+ }
243
+ )
244
+
245
+ return scenes
246
+
247
+
248
+ # NOTE: cleanup was moved to a single canonical helper in
249
+ # `depth_anything_3.utils.memory.cleanup_cuda_memory`.
250
+ # Callers should import and call that directly instead of using this module.
251
+
252
+
253
+ def get_logo_base64() -> Optional[str]:
254
+ """
255
+ Convert WAI logo to base64 for embedding in HTML.
256
+
257
+ Returns:
258
+ Base64 encoded logo string or None
259
+ """
260
+ import base64
261
+
262
+ logo_path = "examples/WAI-Logo/wai_logo.png"
263
+ try:
264
+ with open(logo_path, "rb") as img_file:
265
+ img_data = img_file.read()
266
+ base64_str = base64.b64encode(img_data).decode()
267
+ return f"data:image/png;base64,{base64_str}"
268
+ except FileNotFoundError:
269
+ return None
src/depth_anything_3/app/modules/visualization.py ADDED
@@ -0,0 +1,435 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Visualization module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles visualization updates, navigation, and measurement functionality.
19
+ """
20
+
21
+ import os
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+
24
+ import cv2
25
+ import gradio as gr
26
+ import numpy as np
27
+
28
+
29
+ class VisualizationHandler:
30
+ """
31
+ Handles visualization updates and navigation for the Gradio app.
32
+ """
33
+
34
+ def __init__(self):
35
+ """Initialize the visualization handler."""
36
+
37
+ def update_view_selectors(
38
+ self, processed_data: Optional[dict]
39
+ ) -> Tuple[gr.Dropdown, gr.Dropdown]:
40
+ """
41
+ Update view selector dropdowns based on available views.
42
+
43
+ Args:
44
+ processed_data: Processed data dictionary
45
+
46
+ Returns:
47
+ Tuple of (depth_view_selector, measure_view_selector)
48
+ """
49
+ if processed_data is None or len(processed_data) == 0:
50
+ choices = ["View 1"]
51
+ else:
52
+ num_views = len(processed_data)
53
+ choices = [f"View {i + 1}" for i in range(num_views)]
54
+
55
+ return (
56
+ gr.Dropdown(choices=choices, value=choices[0]), # depth_view_selector
57
+ gr.Dropdown(choices=choices, value=choices[0]), # measure_view_selector
58
+ )
59
+
60
+ def get_view_data_by_index(
61
+ self, processed_data: Optional[dict], view_index: int
62
+ ) -> Optional[Dict[str, Any]]:
63
+ """
64
+ Get view data by index, handling bounds.
65
+
66
+ Args:
67
+ processed_data: Processed data dictionary
68
+ view_index: Index of the view to get
69
+
70
+ Returns:
71
+ View data dictionary or None
72
+ """
73
+ if processed_data is None or len(processed_data) == 0:
74
+ return None
75
+
76
+ view_keys = list(processed_data.keys())
77
+ if view_index < 0 or view_index >= len(view_keys):
78
+ view_index = 0
79
+
80
+ return processed_data[view_keys[view_index]]
81
+
82
+ def update_depth_view(
83
+ self, processed_data: Optional[dict], view_index: int
84
+ ) -> Optional[str]:
85
+ """
86
+ Update depth view for a specific view index.
87
+
88
+ Args:
89
+ processed_data: Processed data dictionary
90
+ view_index: Index of the view to update
91
+
92
+ Returns:
93
+ Path to depth visualization image or None
94
+ """
95
+ view_data = self.get_view_data_by_index(processed_data, view_index)
96
+ if view_data is None or view_data.get("depth_image") is None:
97
+ return None
98
+
99
+ # Return the depth visualization image directly
100
+ return view_data["depth_image"]
101
+
102
+ def navigate_depth_view(
103
+ self,
104
+ processed_data: Optional[dict],
105
+ current_selector_value: str,
106
+ direction: int,
107
+ ) -> Tuple[str, Optional[str]]:
108
+ """
109
+ Navigate depth view (direction: -1 for previous, +1 for next).
110
+
111
+ Args:
112
+ processed_data: Processed data dictionary
113
+ current_selector_value: Current selector value
114
+ direction: Direction to navigate (-1 for previous, +1 for next)
115
+
116
+ Returns:
117
+ Tuple of (new_selector_value, depth_vis)
118
+ """
119
+ if processed_data is None or len(processed_data) == 0:
120
+ return "View 1", None
121
+
122
+ # Parse current view number
123
+ try:
124
+ current_view = int(current_selector_value.split()[1]) - 1
125
+ except: # noqa
126
+ current_view = 0
127
+
128
+ num_views = len(processed_data)
129
+ new_view = (current_view + direction) % num_views
130
+
131
+ new_selector_value = f"View {new_view + 1}"
132
+ depth_vis = self.update_depth_view(processed_data, new_view)
133
+
134
+ return new_selector_value, depth_vis
135
+
136
+ def update_measure_view(
137
+ self, processed_data: Optional[dict], view_index: int
138
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
139
+ """
140
+ Update measure view for a specific view index.
141
+
142
+ Args:
143
+ processed_data: Processed data dictionary
144
+ view_index: Index of the view to update
145
+
146
+ Returns:
147
+ Tuple of (measure_image, depth_right_half, measure_points)
148
+ """
149
+ view_data = self.get_view_data_by_index(processed_data, view_index)
150
+ if view_data is None:
151
+ return None, None, [] # image, depth_right_half, measure_points
152
+
153
+ # Get the processed (resized) image
154
+ if "image" in view_data and view_data["image"] is not None:
155
+ image = view_data["image"].copy()
156
+ else:
157
+ return None, None, []
158
+
159
+ # Ensure image is in uint8 format
160
+ if image.dtype != np.uint8:
161
+ if image.max() <= 1.0:
162
+ image = (image * 255).astype(np.uint8)
163
+ else:
164
+ image = image.astype(np.uint8)
165
+
166
+ # Extract right half of the depth visualization (pure depth part)
167
+ depth_image_path = view_data.get("depth_image", None)
168
+ depth_right_half = None
169
+
170
+ if depth_image_path and os.path.exists(depth_image_path):
171
+ try:
172
+ # Load the combined depth visualization image
173
+ depth_combined = cv2.imread(depth_image_path)
174
+ depth_combined = cv2.cvtColor(depth_combined, cv2.COLOR_BGR2RGB)
175
+ if depth_combined is not None:
176
+ height, width = depth_combined.shape[:2]
177
+ # Extract right half (depth visualization part)
178
+ depth_right_half = depth_combined[:, width // 2 :]
179
+ except Exception as e:
180
+ print(f"Error extracting depth right half: {e}")
181
+
182
+ return image, depth_right_half, []
183
+
184
+ def navigate_measure_view(
185
+ self,
186
+ processed_data: Optional[dict],
187
+ current_selector_value: str,
188
+ direction: int,
189
+ ) -> Tuple[str, Optional[np.ndarray], Optional[str], List]:
190
+ """
191
+ Navigate measure view (direction: -1 for previous, +1 for next).
192
+
193
+ Args:
194
+ processed_data: Processed data dictionary
195
+ current_selector_value: Current selector value
196
+ direction: Direction to navigate (-1 for previous, +1 for next)
197
+
198
+ Returns:
199
+ Tuple of (new_selector_value, measure_image, depth_image_path, measure_points)
200
+ """
201
+ if processed_data is None or len(processed_data) == 0:
202
+ return "View 1", None, None, []
203
+
204
+ # Parse current view number
205
+ try:
206
+ current_view = int(current_selector_value.split()[1]) - 1
207
+ except: # noqa
208
+ current_view = 0
209
+
210
+ num_views = len(processed_data)
211
+ new_view = (current_view + direction) % num_views
212
+
213
+ new_selector_value = f"View {new_view + 1}"
214
+ measure_image, depth_right_half, measure_points = self.update_measure_view(
215
+ processed_data, new_view
216
+ )
217
+
218
+ return new_selector_value, measure_image, depth_right_half, measure_points
219
+
220
+ def populate_visualization_tabs(
221
+ self, processed_data: Optional[dict]
222
+ ) -> Tuple[Optional[str], Optional[np.ndarray], Optional[str], List]:
223
+ """
224
+ Populate the depth and measure tabs with processed data.
225
+
226
+ Args:
227
+ processed_data: Processed data dictionary
228
+
229
+ Returns:
230
+ Tuple of (depth_vis, measure_img, depth_image_path, measure_points)
231
+ """
232
+ if processed_data is None or len(processed_data) == 0:
233
+ return None, None, None, []
234
+
235
+ # Use update function to get depth visualization
236
+ depth_vis = self.update_depth_view(processed_data, 0)
237
+ measure_img, depth_right_half, _ = self.update_measure_view(processed_data, 0)
238
+
239
+ return depth_vis, measure_img, depth_right_half, []
240
+
241
+ def reset_measure(
242
+ self, processed_data: Optional[dict]
243
+ ) -> Tuple[Optional[np.ndarray], List, str]:
244
+ """
245
+ Reset measure points.
246
+
247
+ Args:
248
+ processed_data: Processed data dictionary
249
+
250
+ Returns:
251
+ Tuple of (image, measure_points, text)
252
+ """
253
+ if processed_data is None or len(processed_data) == 0:
254
+ return None, [], ""
255
+
256
+ # Return the first view image
257
+ first_view = list(processed_data.values())[0]
258
+ return first_view["image"], [], ""
259
+
260
+ def measure(
261
+ self,
262
+ processed_data: Optional[dict],
263
+ measure_points: List,
264
+ current_view_selector: str,
265
+ event: gr.SelectData,
266
+ ) -> List:
267
+ """
268
+ Handle measurement on images.
269
+
270
+ Args:
271
+ processed_data: Processed data dictionary
272
+ measure_points: List of current measure points
273
+ current_view_selector: Current view selector value
274
+ event: Gradio select event
275
+
276
+ Returns:
277
+ List of [image, depth_right_half, measure_points, text]
278
+ """
279
+ try:
280
+ print(f"Measure function called with selector: {current_view_selector}")
281
+
282
+ if processed_data is None or len(processed_data) == 0:
283
+ return [None, [], "No data available"]
284
+
285
+ # Use the currently selected view instead of always using the first view
286
+ try:
287
+ current_view_index = int(current_view_selector.split()[1]) - 1
288
+ except: # noqa
289
+ current_view_index = 0
290
+
291
+ print(f"Using view index: {current_view_index}")
292
+
293
+ # Get view data safely
294
+ if current_view_index < 0 or current_view_index >= len(processed_data):
295
+ current_view_index = 0
296
+
297
+ view_keys = list(processed_data.keys())
298
+ current_view = processed_data[view_keys[current_view_index]]
299
+
300
+ if current_view is None:
301
+ return [None, [], "No view data available"]
302
+
303
+ point2d = event.index[0], event.index[1]
304
+ print(f"Clicked point: {point2d}")
305
+
306
+ measure_points.append(point2d)
307
+
308
+ # Get image and depth visualization
309
+ image, depth_right_half, _ = self.update_measure_view(
310
+ processed_data, current_view_index
311
+ )
312
+ if image is None:
313
+ return [None, [], "No image available"]
314
+
315
+ image = image.copy()
316
+
317
+ # Ensure image is in uint8 format for proper cv2 operations
318
+ try:
319
+ if image.dtype != np.uint8:
320
+ if image.max() <= 1.0:
321
+ # Image is in [0, 1] range, convert to [0, 255]
322
+ image = (image * 255).astype(np.uint8)
323
+ else:
324
+ # Image is already in [0, 255] range
325
+ image = image.astype(np.uint8)
326
+ except Exception as e:
327
+ print(f"Image conversion error: {e}")
328
+ return [None, [], f"Image conversion error: {e}"]
329
+
330
+ # Draw circles for points
331
+ try:
332
+ for p in measure_points:
333
+ if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
334
+ image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
335
+ except Exception as e:
336
+ print(f"Drawing error: {e}")
337
+ return [None, [], f"Drawing error: {e}"]
338
+
339
+ # Get depth information from processed_data
340
+ depth_text = ""
341
+ try:
342
+ for i, p in enumerate(measure_points):
343
+ if (
344
+ current_view["depth"] is not None
345
+ and 0 <= p[1] < current_view["depth"].shape[0]
346
+ and 0 <= p[0] < current_view["depth"].shape[1]
347
+ ):
348
+ d = current_view["depth"][p[1], p[0]]
349
+ depth_text += f"- **P{i + 1} depth: {d:.2f}m**\n"
350
+ else:
351
+ depth_text += f"- **P{i + 1}: Click position ({p[0]}, {p[1]}) - No depth information**\n" # noqa: E501
352
+ except Exception as e:
353
+ print(f"Depth text error: {e}")
354
+ depth_text = f"Error computing depth: {e}\n"
355
+
356
+ if len(measure_points) == 2:
357
+ try:
358
+ point1, point2 = measure_points
359
+ # Draw line
360
+ if (
361
+ 0 <= point1[0] < image.shape[1]
362
+ and 0 <= point1[1] < image.shape[0]
363
+ and 0 <= point2[0] < image.shape[1]
364
+ and 0 <= point2[1] < image.shape[0]
365
+ ):
366
+ image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)
367
+
368
+ # Compute 3D distance using depth information and camera intrinsics
369
+ distance_text = "- **Distance: Unable to calculate 3D distance**"
370
+ if (
371
+ current_view["depth"] is not None
372
+ and 0 <= point1[1] < current_view["depth"].shape[0]
373
+ and 0 <= point1[0] < current_view["depth"].shape[1]
374
+ and 0 <= point2[1] < current_view["depth"].shape[0]
375
+ and 0 <= point2[0] < current_view["depth"].shape[1]
376
+ ):
377
+ try:
378
+ # Get depth values at the two points
379
+ d1 = current_view["depth"][point1[1], point1[0]]
380
+ d2 = current_view["depth"][point2[1], point2[0]]
381
+
382
+ # Convert 2D pixel coordinates to 3D world coordinates
383
+ if current_view["intrinsics"] is not None:
384
+ # Get camera intrinsics
385
+ K = current_view["intrinsics"] # 3x3 intrinsic matrix
386
+ fx, fy = K[0, 0], K[1, 1] # focal lengths
387
+ cx, cy = K[0, 2], K[1, 2] # principal point
388
+
389
+ # Convert pixel coordinates to normalized camera coordinates
390
+ # Point 1: (u1, v1) -> (x1, y1, z1)
391
+ u1, v1 = point1[0], point1[1]
392
+ x1 = (u1 - cx) * d1 / fx
393
+ y1 = (v1 - cy) * d1 / fy
394
+ z1 = d1
395
+
396
+ # Point 2: (u2, v2) -> (x2, y2, z2)
397
+ u2, v2 = point2[0], point2[1]
398
+ x2 = (u2 - cx) * d2 / fx
399
+ y2 = (v2 - cy) * d2 / fy
400
+ z2 = d2
401
+
402
+ # Calculate 3D Euclidean distance
403
+ p1_3d = np.array([x1, y1, z1])
404
+ p2_3d = np.array([x2, y2, z2])
405
+ distance_3d = np.linalg.norm(p1_3d - p2_3d)
406
+
407
+ distance_text = f"- **Distance: {distance_3d:.2f}m**"
408
+ else:
409
+ # Fallback to simplified calculation if no intrinsics
410
+ pixel_distance = np.sqrt(
411
+ (point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2
412
+ )
413
+ avg_depth = (d1 + d2) / 2
414
+ scale_factor = avg_depth / 1000 # Rough scaling factor
415
+ estimated_3d_distance = pixel_distance * scale_factor
416
+ distance_text = f"- **Distance: {estimated_3d_distance:.2f}m (estimated, no intrinsics)**" # noqa: E501
417
+
418
+ except Exception as e:
419
+ print(f"Distance computation error: {e}")
420
+ distance_text = f"- **Distance computation error: {e}**"
421
+
422
+ measure_points = []
423
+ text = depth_text + distance_text
424
+ print(f"Measurement complete: {text}")
425
+ return [image, depth_right_half, measure_points, text]
426
+ except Exception as e:
427
+ print(f"Final measurement error: {e}")
428
+ return [None, [], f"Measurement error: {e}"]
429
+ else:
430
+ print(f"Single point measurement: {depth_text}")
431
+ return [image, depth_right_half, measure_points, depth_text]
432
+
433
+ except Exception as e:
434
+ print(f"Overall measure function error: {e}")
435
+ return [None, [], f"Measure function error: {e}"]
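The 3D distance in `measure` comes from standard pinhole back-projection: each clicked pixel (u, v) with metric depth d is lifted to camera coordinates via x = (u - cx) * d / fx, y = (v - cy) * d / fy, z = d, after which the distance is an ordinary Euclidean norm. Here is a self-contained sketch of the same math, independent of the Gradio handler; the intrinsic values are made up purely for illustration.

```python
import numpy as np

def backproject(u: int, v: int, depth: float, K: np.ndarray) -> np.ndarray:
    """Lift pixel (u, v) with metric depth into 3D camera coordinates.

    Pinhole model: x = (u - cx) * d / fx, y = (v - cy) * d / fy, z = d.
    """
    fx, fy = K[0, 0], K[1, 1]  # focal lengths
    cx, cy = K[0, 2], K[1, 2]  # principal point
    return np.array([(u - cx) * depth / fx, (v - cy) * depth / fy, depth])

# Illustrative intrinsics: fx = fy = 500, principal point at (320, 240)
K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])
p1 = backproject(100, 120, 2.0, K)
p2 = backproject(400, 300, 2.5, K)
print(f"3D distance: {np.linalg.norm(p1 - p2):.2f} m")
```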
src/depth_anything_3/cache.py ADDED
@@ -0,0 +1,190 @@
+"""
+Model caching utilities for Depth Anything 3.
+
+Provides model caching functionality to avoid reloading model weights on every instantiation.
+This significantly reduces latency for repeated model creation (2-5s gain).
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import Any, Callable, Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from depth_anything_3.utils.logger import logger
+
+
+class ModelCache:
+    """
+    Thread-safe singleton cache for Depth Anything 3 models.
+
+    Caches loaded model weights to avoid reloading from disk on every instantiation.
+    Each unique combination of (model_name, device) is cached separately.
+
+    Usage:
+        cache = ModelCache()
+        model = cache.get(model_name, device, loader_fn)
+        # loader_fn is only called on a cache miss
+
+    Thread Safety:
+        Uses threading.Lock to ensure thread-safe access to the cache.
+
+    Memory Management:
+        - Models are kept in cache until explicitly cleared
+        - Use clear() to free memory when needed
+        - Use clear_device() to clear models on a specific device
+    """
+
+    _instance: Optional["ModelCache"] = None
+    _lock = threading.Lock()
+
+    def __new__(cls):
+        """Singleton pattern to ensure a single cache instance (double-checked locking)."""
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        """Initialize cache storage."""
+        if self._initialized:
+            return
+
+        self._cache: Dict[Tuple[str, str], nn.Module] = {}
+        self._cache_lock = threading.Lock()
+        self._initialized = True
+        logger.info("ModelCache initialized")
+
+    def get(
+        self,
+        model_name: str,
+        device: torch.device | str,
+        loader_fn: Callable[[], nn.Module],
+    ) -> nn.Module:
+        """
+        Get cached model or load if not in cache.
+
+        Args:
+            model_name: Name of the model (e.g., "da3-large")
+            device: Target device (cuda, mps, cpu)
+            loader_fn: Function called to load the model on a cache miss.
+                Should return nn.Module
+
+        Returns:
+            Cached or freshly loaded model on the specified device
+
+        Example:
+            >>> cache = ModelCache()
+            >>> model = cache.get(
+            ...     "da3-large",
+            ...     "cuda",
+            ...     lambda: create_model()
+            ... )
+        """
+        device_str = str(device)
+        cache_key = (model_name, device_str)
+
+        with self._cache_lock:
+            if cache_key in self._cache:
+                logger.debug(f"Model cache HIT: {model_name} on {device_str}")
+                return self._cache[cache_key]
+
+            logger.info(f"Model cache MISS: {model_name} on {device_str}. Loading...")
+            model = loader_fn()
+            self._cache[cache_key] = model
+            logger.info(f"Model cached: {model_name} on {device_str}")
+
+            return model
+
+    def clear(self) -> None:
+        """
+        Clear entire cache and free memory.
+
+        Removes all cached models and forces garbage collection.
+        Useful when switching between many different models.
+        """
+        with self._cache_lock:
+            num_cached = len(self._cache)
+            self._cache.clear()
+
+            # Force garbage collection to free GPU memory
+            import gc
+
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            if hasattr(torch, "mps") and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+
+            logger.info(f"Model cache cleared ({num_cached} models removed)")
+
+    def clear_device(self, device: torch.device | str) -> None:
+        """
+        Clear all models on a specific device.
+
+        Args:
+            device: Device to clear (e.g., "cuda", "mps", "cpu")
+
+        Example:
+            >>> cache = ModelCache()
+            >>> cache.clear_device("cuda")  # Clear all CUDA models
+        """
+        device_str = str(device)
+
+        with self._cache_lock:
+            keys_to_remove = [key for key in self._cache if key[1] == device_str]
+            for key in keys_to_remove:
+                del self._cache[key]
+
+            # Free device memory
+            if "cuda" in device_str and torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            elif "mps" in device_str and hasattr(torch, "mps") and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
+
+            logger.info(
+                f"Model cache cleared for device {device_str} ({len(keys_to_remove)} models removed)"
+            )
+
+    def get_cache_info(self) -> Dict[str, Any]:
+        """
+        Get cache statistics.
+
+        Returns:
+            Dictionary with cache info:
+                - total: Total number of cached models
+                - by_device: Number of models per device
+        """
+        with self._cache_lock:
+            info = {
+                "total": len(self._cache),
+                "by_device": {},
+            }
+
+            for _model_name, device_str in self._cache.keys():
+                if device_str not in info["by_device"]:
+                    info["by_device"][device_str] = 0
+                info["by_device"][device_str] += 1
+
+            return info
+
+
+# Global singleton instance
+_global_cache = ModelCache()
+
+
+def get_model_cache() -> ModelCache:
+    """
+    Get the global model cache instance.
+
+    Returns:
+        Singleton ModelCache instance
+
+    Example:
+        >>> from depth_anything_3.cache import get_model_cache
+        >>> cache = get_model_cache()
+        >>> cache.clear()
+    """
+    return _global_cache
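A quick end-to-end sketch of the caching behavior above. `nn.Identity` stands in for a real loader here; an actual loader would build the Depth Anything 3 network and load its checkpoint.

```python
import torch.nn as nn

from depth_anything_3.cache import get_model_cache

def load_model() -> nn.Module:
    # Stand-in loader for illustration; a real one would load model weights.
    return nn.Identity()

cache = get_model_cache()
first = cache.get("da3-large", "cpu", load_model)   # miss: load_model() runs
second = cache.get("da3-large", "cpu", load_model)  # hit: same object returned
assert first is second
print(cache.get_cache_info())  # e.g. {'total': 1, 'by_device': {'cpu': 1}}
```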
src/depth_anything_3/cfg.py ADDED
@@ -0,0 +1,145 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Configuration utility functions
+"""
+
+import importlib
+from pathlib import Path
+from typing import Any, Callable, List, Optional, Union
+
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+try:
+    OmegaConf.register_new_resolver("eval", eval)
+except Exception as e:
+    # The resolver may already be registered; safe to continue.
+    print(f"Error registering eval resolver: {e}")
+
+
+def load_config(path: str, argv: Optional[List[str]] = None) -> Union[DictConfig, ListConfig]:
+    """
+    Load a configuration, resolving inheritance.
+    Supports both file paths and module paths (e.g., depth_anything_3.configs.giant).
+    """
+    # Check if path is a module path (contains dots but no slashes and doesn't end with .yaml)
+    if "." in path and "/" not in path and not path.endswith(".yaml"):
+        # It's a module path; resolve it relative to this package's directory
+        path_parts = path.split(".")[1:]
+        config_path = Path(__file__).resolve().parent
+        for part in path_parts:
+            config_path = config_path.joinpath(part)
+        config_path = config_path.with_suffix(".yaml")
+        config = OmegaConf.load(str(config_path))
+    else:
+        # It's a file path (absolute, relative, or with .yaml extension)
+        config = OmegaConf.load(path)
+
+    if argv is not None:
+        config_argv = OmegaConf.from_dotlist(argv)
+        config = OmegaConf.merge(config, config_argv)
+    config = resolve_recursive(config, resolve_inheritance)
+    return config
+
+
+def resolve_recursive(
+    config: Any,
+    resolver: Callable[[Union[DictConfig, ListConfig]], Union[DictConfig, ListConfig]],
+) -> Any:
+    config = resolver(config)
+    if isinstance(config, DictConfig):
+        for k in config.keys():
+            v = config.get(k)
+            if isinstance(v, (DictConfig, ListConfig)):
+                config[k] = resolve_recursive(v, resolver)
+    if isinstance(config, ListConfig):
+        for i in range(len(config)):
+            v = config.get(i)
+            if isinstance(v, (DictConfig, ListConfig)):
+                config[i] = resolve_recursive(v, resolver)
+    return config
+
+
+def resolve_inheritance(config: Union[DictConfig, ListConfig]) -> Any:
+    """
+    Recursively resolve inheritance if the config contains:
+    __inherit__: path/to/parent.yaml or a ListConfig of such paths.
+    """
+    if isinstance(config, DictConfig):
+        inherit = config.pop("__inherit__", None)
+
+        if inherit:
+            inherit_list = inherit if isinstance(inherit, ListConfig) else [inherit]
+
+            parent_config = None
+            for parent_path in inherit_list:
+                assert isinstance(parent_path, str)
+                parent_config = (
+                    load_config(parent_path)
+                    if parent_config is None
+                    else OmegaConf.merge(parent_config, load_config(parent_path))
+                )
+
+            if len(config.keys()) > 0:
+                config = OmegaConf.merge(parent_config, config)
+            else:
+                config = parent_config
+    return config
+
+
+def import_item(path: str, name: str) -> Any:
+    """
+    Import a python item. Example: import_item("path.to.file", "MyClass") -> MyClass
+    """
+    return getattr(importlib.import_module(path), name)
+
+
+def create_object(config: DictConfig) -> Any:
+    """
+    Create an object from config.
+    The config is expected to contain the following:
+    __object__:
+        path: path.to.module
+        name: MyClass
+        args: as_config | as_params (defaults to as_config)
+    """
+    config = DictConfig(config)
+    item = import_item(
+        path=config.__object__.path,
+        name=config.__object__.name,
+    )
+    args = config.__object__.get("args", "as_config")
+    if args == "as_config":
+        return item(config)
+    if args == "as_params":
+        config = OmegaConf.to_object(config)
+        config.pop("__object__")
+        return item(**config)
+    raise NotImplementedError(f"Unknown args type: {args}")
+
+
+def create_dataset(path: str, *args, **kwargs) -> Any:
+    """
+    Create a dataset. Requires the file to contain a "create_dataset" function.
+    """
+    return import_item(path, "create_dataset")(*args, **kwargs)
+
+
+def to_dict_recursive(config_obj):
+    """Convert OmegaConf containers to plain Python dicts/lists recursively."""
+    if isinstance(config_obj, DictConfig):
+        return {k: to_dict_recursive(v) for k, v in config_obj.items()}
+    elif isinstance(config_obj, ListConfig):
+        return [to_dict_recursive(item) for item in config_obj]
+    return config_obj
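To illustrate the `__object__` convention that `create_object` documents, here is a minimal sketch using an in-memory config; `torch.nn.Identity` is used only so the example is self-contained and does not depend on any project config file.

```python
from omegaconf import OmegaConf

from depth_anything_3.cfg import create_object

# Equivalent YAML:
#   __object__:
#     path: torch.nn
#     name: Identity
#     args: as_params
cfg = OmegaConf.create(
    {"__object__": {"path": "torch.nn", "name": "Identity", "args": "as_params"}}
)

# With args == "as_params", create_object pops __object__ and passes the
# remaining keys (none here) as keyword arguments to the imported class.
obj = create_object(cfg)
print(type(obj).__name__)  # Identity
```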