Delanoe Pirard committed
Commit 18b382b · 0 parent(s)

Deploy to HuggingFace Spaces

This view is limited to 50 files because it contains too many changes. See raw diff.
- .flake8 +3 -0
- .gitattributes +6 -0
- .github/ISSUE_TEMPLATE/bug_report.yml +81 -0
- .github/ISSUE_TEMPLATE/config.yml +8 -0
- .github/ISSUE_TEMPLATE/feature_request.yml +61 -0
- .github/workflows/ci.yml +71 -0
- .github/workflows/publish.yml +34 -0
- .gitignore +63 -0
- .pre-commit-config.yaml +59 -0
- BENCHMARKS.md +217 -0
- CHANGELOG.md +36 -0
- CONTRIBUTING.md +114 -0
- LICENSE +201 -0
- README.md +418 -0
- README.md.original +405 -0
- app.py +59 -0
- assets/examples/SOH/000.png +3 -0
- assets/examples/SOH/010.png +3 -0
- assets/examples/robot_unitree.mp4 +3 -0
- assets/images/da3_radar.png +3 -0
- assets/images/demo320-2.gif +3 -0
- benchmarks/__init__.py +3 -0
- benchmarks/comparative_benchmark.py +436 -0
- benchmarks/flash_attention_benchmark.py +488 -0
- benchmarks/full_benchmark.py +696 -0
- benchmarks/gpu_preprocessing_benchmark.py +363 -0
- benchmarks/results/temp_images/test_image_0000.jpg +3 -0
- benchmarks/results/temp_images/test_image_0001.jpg +3 -0
- benchmarks/results/temp_images/test_image_0002.jpg +3 -0
- benchmarks/results/temp_images/test_image_0003.jpg +3 -0
- docs/API.md +465 -0
- docs/CLI.md +654 -0
- docs/funcs/ref_view_strategy.md +183 -0
- notebooks/da3.ipynb +0 -0
- notebooks/da3_tutorial.ipynb +667 -0
- pyproject.toml +144 -0
- requirements.txt +38 -0
- scripts/deploy_hf.sh +97 -0
- src/depth_anything_3/api.py +718 -0
- src/depth_anything_3/app/css_and_html.py +623 -0
- src/depth_anything_3/app/gradio_app.py +743 -0
- src/depth_anything_3/app/modules/__init__.py +43 -0
- src/depth_anything_3/app/modules/event_handlers.py +624 -0
- src/depth_anything_3/app/modules/file_handlers.py +327 -0
- src/depth_anything_3/app/modules/model_inference.py +454 -0
- src/depth_anything_3/app/modules/ui_components.py +497 -0
- src/depth_anything_3/app/modules/utils.py +269 -0
- src/depth_anything_3/app/modules/visualization.py +435 -0
- src/depth_anything_3/cache.py +190 -0
- src/depth_anything_3/cfg.py +145 -0
.flake8 ADDED
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 100
ignore = E203 E741 W503 E731

.gitattributes ADDED
@@ -0,0 +1,6 @@
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text

.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,81 @@
# Copyright (c) Delanoe Pirard / Aedelon
# Licensed under the Apache License, Version 2.0

name: Bug Report
description: Report a bug in awesome-depth-anything-3
labels: ["bug"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for reporting! Please fill out the form below.

        **Note**: For issues with the model itself (accuracy, artifacts), please report to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3/issues).

  - type: textarea
    id: description
    attributes:
      label: Bug Description
      description: What happened? What did you expect to happen?
      placeholder: Describe the bug...
    validations:
      required: true

  - type: textarea
    id: reproduction
    attributes:
      label: Steps to Reproduce
      description: Minimal code or steps to reproduce the issue
      placeholder: |
        ```python
        from depth_anything_3.api import DepthAnything3
        model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
        # ...
        ```
    validations:
      required: true

  - type: textarea
    id: traceback
    attributes:
      label: Error Traceback
      description: Full error message and traceback
      render: shell
      placeholder: Paste the full traceback here...

  - type: input
    id: version
    attributes:
      label: Package Version
      placeholder: "0.1.0"
    validations:
      required: true

  - type: dropdown
    id: device
    attributes:
      label: Device
      options:
        - CUDA (NVIDIA GPU)
        - MPS (Apple Silicon)
        - CPU
    validations:
      required: true

  - type: input
    id: pytorch
    attributes:
      label: PyTorch Version
      placeholder: "2.9.0"

  - type: input
    id: python
    attributes:
      label: Python Version
      placeholder: "3.11"

  - type: input
    id: os
    attributes:
      label: Operating System
      placeholder: "macOS 14.0 / Ubuntu 22.04 / Windows 11"

.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,8 @@
blank_issues_enabled: false
contact_links:
  - name: Upstream Repository
    url: https://github.com/ByteDance-Seed/Depth-Anything-3/issues
    about: For issues with the model architecture, accuracy, or training
  - name: Discussions
    url: https://github.com/Aedelon/awesome-depth-anything-3/discussions
    about: Ask questions or share ideas

.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,61 @@
# Copyright (c) Delanoe Pirard / Aedelon
# Licensed under the Apache License, Version 2.0

name: Feature Request
description: Suggest a new feature or improvement
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for your suggestion!

        **Note**: For model/architecture changes, please suggest to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3/issues).
        This fork focuses on optimization, deployment, and developer experience.

  - type: textarea
    id: problem
    attributes:
      label: Problem / Use Case
      description: What problem does this solve? What are you trying to do?
      placeholder: I'm trying to...
    validations:
      required: true

  - type: textarea
    id: solution
    attributes:
      label: Proposed Solution
      description: How would you like it to work?
      placeholder: It would be great if...
    validations:
      required: true

  - type: textarea
    id: alternatives
    attributes:
      label: Alternatives Considered
      description: Any other approaches you've considered?
      placeholder: I also thought about...

  - type: dropdown
    id: category
    attributes:
      label: Category
      options:
        - Performance optimization
        - CLI improvement
        - API enhancement
        - Documentation
        - Testing
        - CI/CD
        - Other
    validations:
      required: true

  - type: checkboxes
    id: contribution
    attributes:
      label: Contribution
      options:
        - label: I would be willing to submit a PR for this feature

.github/workflows/ci.yml ADDED
@@ -0,0 +1,71 @@
# Copyright (c) Delanoe Pirard / Aedelon
# Licensed under the Apache License, Version 2.0

name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.11

      - name: Install dependencies
        run: uv sync --extra dev

      - name: Lint with ruff
        run: uv run ruff check src/

  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python ${{ matrix.python-version }}
        run: uv python install ${{ matrix.python-version }}

      - name: Install dependencies
        run: uv sync --extra dev

      - name: Run tests
        run: uv run pytest tests/ -v --tb=short -x
        env:
          PYTORCH_ENABLE_MPS_FALLBACK: "1"

  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Required for hatch-vcs

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.11

      - name: Build package
        run: uv build

      - name: Check package
        run: uvx twine check dist/*

.github/workflows/publish.yml ADDED
@@ -0,0 +1,34 @@
# Copyright (c) Delanoe Pirard / Aedelon
# Licensed under the Apache License, Version 2.0

name: Publish to PyPI

on:
  release:
    types: [published]

jobs:
  publish:
    runs-on: ubuntu-latest
    environment: pypi
    permissions:
      id-token: write  # Required for trusted publishing

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Required for hatch-vcs version

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.11

      - name: Build package
        run: uv build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        # Uses trusted publishing (OIDC) - no API token needed
        # Configure at: https://pypi.org/manage/project/awesome-depth-anything-3/settings/publishing/

.gitignore ADDED
@@ -0,0 +1,63 @@
# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so

# Distribution / packaging
workspace/
build/
dist/
*.egg-info/
.eggs/
*.egg

# Virtual environments
.venv/
venv/
ENV/

# Test/coverage
.coverage
.pytest_cache/
htmlcov/
.tox/
.nox/
coverage.xml
*.cover

# Jupyter notebooks
.ipynb_checkpoints/

# IDE
.vscode/
.idea/

# OS files
.DS_Store
Thumbs.db

# Project-specific
gallery*/
debug*/
DA3HF*/
gradio_workspace/
eval_workspace/
FILTER*/
input_images*/
*.gradio/
.gradio/
src/debug_main.py
temp*.png
/outputs

# Model weights and large files
*.pt
*.pth
*.ckpt
*.safetensors
!assets/**/*.pt

# Logs
*.log
logs/

.pre-commit-config.yaml ADDED
@@ -0,0 +1,59 @@
repos:
  - repo: 'https://github.com/pre-commit/pre-commit-hooks'
    rev: v4.5.0
    hooks:
      - id: check-added-large-files
        args:
          - '--maxkb=125'
      - id: check-ast
      - id: check-executables-have-shebangs
      - id: check-merge-conflict
      - id: check-symlinks
      - id: check-toml
      - id: check-yaml
      - id: debug-statements
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: no-commit-to-branch
        args:
          - '--branch'
          - 'master'
      - id: pretty-format-json
        exclude: '.*\.ipynb$'
        args:
          - '--autofix'
          - '--indent'
          - '4'
      - id: trailing-whitespace
        args:
          - '--markdown-linebreak-ext=md'
  - repo: 'https://github.com/pycqa/isort'
    rev: 5.13.2
    hooks:
      - id: isort
        args:
          - '--settings-file'
          - 'pyproject.toml'
          - '--filter-files'
  - repo: 'https://github.com/asottile/pyupgrade'
    rev: v3.15.2
    hooks:
      - id: pyupgrade
        args: [--py38-plus, --keep-runtime-typing]
  - repo: 'https://github.com/psf/black.git'
    rev: 24.3.0
    hooks:
      - id: black
        args:
          - '--config=pyproject.toml'
  - repo: 'https://github.com/PyCQA/flake8'
    rev: 7.0.0
    hooks:
      - id: flake8
        args:
          - '--config=.flake8'
  - repo: 'https://github.com/myint/autoflake'
    rev: v1.4
    hooks:
      - id: autoflake
        args: ['--remove-all-unused-imports', '--recursive', '--remove-unused-variables', '--in-place']

BENCHMARKS.md ADDED
@@ -0,0 +1,217 @@
# Benchmark Results

Performance benchmarks comparing **awesome-depth-anything-3** (optimized fork) against the vanilla upstream implementation.

> **Test Environment**: Apple Silicon (M-series), PyTorch 2.9.0
> **Models**: da3-small, da3-base, da3-large, da3-giant

---

## Quick Summary

| Feature | Improvement |
|---------|-------------|
| Model Loading (cached) | **200x faster** (0.8s → 0.005s) |
| Inference (MPS, batch 4) | **1.14x faster** |
| Cold Load Time | **1.7x faster** |
| Memory Efficiency | Adaptive batching prevents OOM |

---

## 1. Awesome vs Upstream Comparison

Direct comparison between this optimized fork and the original upstream repository.

### MPS (Apple Silicon GPU)

| Batch Size | Upstream | Awesome | Speedup | Notes |
|------------|----------|---------|---------|-------|
| 1 | 3.47 img/s | 3.50 img/s | 1.01x | Minimal overhead |
| 2 | 3.64 img/s | 3.83 img/s | 1.05x | Batching benefits |
| **4** | 3.32 img/s | 3.78 img/s | **1.14x** | Best improvement |

#### Model Loading Performance

| Metric | Upstream | Awesome | Speedup |
|--------|----------|---------|---------|
| Cold Load | 1.28s | 0.77s | **1.7x** |
| Cached Load | N/A | 0.005s | **~200x** |

The model caching system is the standout feature - after the first load, subsequent loads are essentially instant.
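
As a rough illustration of how a load-once cache produces that behavior, here is a minimal sketch. The real `ModelCache` lives in `src/depth_anything_3/cache.py` (see the file list above) and may differ; the `_load` placeholder below stands in for the actual checkpoint loading.

```python
# Sketch of a load-once model cache; the loader here is a placeholder,
# not the fork's real checkpoint loading code.
from __future__ import annotations

import torch


class ModelCache:
    """Process-wide cache: each (name, device) pair is loaded exactly once."""

    _instances: dict[tuple[str, str], torch.nn.Module] = {}

    @classmethod
    def get(cls, name: str, device: str = "cpu") -> torch.nn.Module:
        key = (name, device)
        if key not in cls._instances:
            # Cold path: disk read + weight init (~0.8s for da3-large above).
            cls._instances[key] = cls._load(name).to(device).eval()
        # Warm path: a dict lookup, which is why cached loads are ~5 ms.
        return cls._instances[key]

    @staticmethod
    def _load(name: str) -> torch.nn.Module:
        return torch.nn.Linear(8, 8)  # placeholder for the real model


model = ModelCache.get("da3-large")
assert model is ModelCache.get("da3-large")  # second call returns the same object
```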

### CPU

| Batch Size | Upstream | Awesome | Speedup |
|------------|----------|---------|---------|
| 1 | 0.27 img/s | 0.31 img/s | 1.13x |
| 2 | 0.24 img/s | 0.24 img/s | 1.00x |
| 4 | 0.17 img/s | 0.16 img/s | 0.95x |

> **Note**: CPU performance is similar between versions since GPU-specific optimizations don't apply. The slight regression at batch 4 is within measurement noise.

---

## 2. Model Performance by Size

Throughput benchmarks on MPS (Apple Silicon) with 1280x720 input images.

| Model | Parameters | Batch 1 | Batch 4 | Best Config |
|-------|------------|---------|---------|-------------|
| **da3-small** | ~25M | 22.2 img/s | 27.2 img/s | B=4 SDPA |
| **da3-base** | ~100M | 10.7 img/s | 11.6 img/s | B=4 SDPA |
| **da3-large** | ~335M | 3.8 img/s | 3.8 img/s | B=1-2 |
| **da3-giant** | ~1.1B | 1.6 img/s | 1.2 img/s | B=1 |

### Latency (single image)

| Model | MPS | CPU | MPS Speedup |
|-------|-----|-----|-------------|
| da3-small | 45 ms | ~3,500 ms | ~78x |
| da3-base | 94 ms | ~7,000 ms | ~74x |
| da3-large | 265 ms | ~3,900 ms | ~15x |
| da3-giant | 618 ms | N/A | - |

---

## 3. Preprocessing Pipeline

### Strategy: Hybrid CPU/GPU

On Apple Silicon, **CPU preprocessing is faster** than GPU (Kornia) due to optimized OpenCV/Accelerate routines. The overhead of MPS kernel launches exceeds the benefit for image transforms.

| Resolution | CPU Time | GPU Time | Winner |
|------------|----------|----------|--------|
| 640x480 | 6.0 ms | N/A | CPU |
| 1920x1080 | 18.7 ms | N/A | CPU |
| 3840x2160 | 57.0 ms | N/A | CPU |

> **Design Decision**: GPU preprocessing is automatically disabled on MPS. The GPU is reserved for model inference where it provides 15-78x speedup.

### CUDA (NVIDIA)

On CUDA, GPU preprocessing with NVJPEG provides significant benefits for JPEG decoding directly to GPU memory, eliminating CPU→GPU transfer overhead.
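
The dispatch described above boils down to a small device check; the following is an illustrative sketch with an assumed function name, not the fork's actual preprocessing module.

```python
# Sketch of device-aware preprocessing dispatch under the measurements above;
# the function name is illustrative, not part of the package API.
import torch


def preprocessing_device(model_device: torch.device) -> str:
    """Pick where image transforms should run.

    MPS: kernel-launch overhead outweighs the gain, so stay on CPU.
    CUDA: decoding/resizing on GPU (NVJPEG/Kornia) avoids host-to-device copies.
    """
    if model_device.type == "cuda":
        return "cuda"
    return "cpu"  # covers both "mps" and "cpu" model devices


print(preprocessing_device(torch.device("mps")))   # -> cpu
print(preprocessing_device(torch.device("cuda")))  # -> cuda
```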

---

## 4. Attention Mechanisms

Comparison between SDPA (Scaled Dot-Product Attention / Flash Attention) and manual attention implementation.

### Per-Layer Performance

| Config | SDPA | Manual | Speedup |
|--------|------|--------|---------|
| ViT-L 518px (MPS) | 2.21 ms | 1.86 ms | 0.8x |
| ViT-L 1024px (MPS) | 9.91 ms | 5.87 ms | 0.6x |
| ViT-L 518px (CPU) | 3.75 ms | 4.96 ms | 1.3x |
| ViT-L 1024px (CPU) | 11.73 ms | 16.85 ms | 1.4x |

> **Insight**: On MPS, manual attention is faster for ViT due to MPS's SDPA implementation overhead. On CPU, SDPA benefits from optimized BLAS operations.
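
For reference, the two paths being compared look roughly like this in plain PyTorch; a generic sketch, not the fork's exact layer code.

```python
# Minimal forms of the two attention paths benchmarked above.
import math

import torch
import torch.nn.functional as F


def sdpa_attention(q, k, v):
    # Fused kernel path (Flash Attention where the backend supports it).
    return F.scaled_dot_product_attention(q, k, v)


def manual_attention(q, k, v):
    # Explicit matmul + softmax: more memory traffic, but the simpler
    # kernels can win on MPS for ViT-sized sequences (see table above).
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.shape[-1])
    return scores.softmax(dim=-1) @ v


# ~ViT-L at 518px: 37x37 = 1369 patches + 1 CLS token, 16 heads of dim 64.
q = k = v = torch.randn(1, 16, 1370, 64)
assert torch.allclose(sdpa_attention(q, k, v), manual_attention(q, k, v), atol=1e-4)
```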

### End-to-End Impact

| Model | SDPA | Manual | Best |
|-------|------|--------|------|
| da3-small | 21.8 img/s | 22.2 img/s | Manual |
| da3-base | 9.8 img/s | 10.7 img/s | Manual |
| da3-large | 3.8 img/s | 3.7 img/s | SDPA |
| da3-giant | 1.6 img/s | 1.6 img/s | Tie |

---

## 5. Adaptive Batching

The adaptive batching system dynamically adjusts batch size based on available GPU memory.

### Test: 20 images with da3-large on MPS

| Strategy | Total Time | Throughput | Batches Used |
|----------|------------|------------|--------------|
| Fixed B=1 | 5,612 ms | 3.6 img/s | [1,1,1...] |
| Fixed B=2 | 5,514 ms | **3.6 img/s** | [2,2,2...] |
| Fixed B=4 | 8,305 ms | 2.4 img/s | [4,4,4,4,4] |
| Adaptive 85% | 5,637 ms | 3.5 img/s | [4,4,4...] |

> **Recommendation**: For MPS with da3-large, a fixed batch size of 2 provides optimal throughput. Adaptive batching is more valuable for:
> - Variable input sizes
> - Unknown GPU memory constraints
> - Preventing OOM errors on smaller GPUs
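
In the spirit of the package's `get_optimal_batch_size()`, memory-aware sizing can be sketched as below; the per-image cost is a made-up placeholder, and only CUDA exposes a free-memory query in PyTorch, so other devices fall back to a guess.

```python
# Sketch of memory-aware batch sizing; not the fork's actual implementation.
import torch


def optimal_batch_size(
    bytes_per_image: int,
    max_batch_size: int = 8,
    target_utilization: float = 0.85,  # mirrors the "Adaptive 85%" row above
) -> int:
    if torch.cuda.is_available():
        free_bytes, _total = torch.cuda.mem_get_info()
    else:
        free_bytes = 4 * 1024**3  # fallback guess: no free-memory query on MPS/CPU
    budget = int(free_bytes * target_utilization)
    fit = max(1, budget // bytes_per_image)
    return min(fit, max_batch_size)


# E.g., assuming ~1.5 GiB of activations per 1280x720 image through da3-large.
print(optimal_batch_size(bytes_per_image=int(1.5 * 1024**3)))
```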

---

## 6. Cross-Device Comparison

### Inference Throughput (da3-large, batch=1)

```
MPS (Apple Silicon) ████████████████████████████████████████ 3.7 img/s
CPU                 ███                                      0.3 img/s
```

**MPS provides ~12x speedup over CPU** for da3-large inference.

### Attention Layer (ViT-L 518px, SDPA)

```
MPS ████████████████████████                2.40 ms
CPU ███████████████████████████████████████ 3.75 ms
```

---

## 7. Optimization Recommendations

### For Apple Silicon (MPS)

1. **Use model caching** - 200x faster subsequent loads
2. **Batch size 2-4** for da3-small/base, **batch 1-2** for da3-large/giant
3. **Let CPU handle preprocessing** - it's faster than MPS for image transforms
4. **SDPA vs Manual**: Both are similar; SDPA slightly better for larger models

### For NVIDIA CUDA

1. **Enable GPU preprocessing** with NVJPEG for JPEG inputs
2. **Use SDPA** (Flash Attention) - significant speedup
3. **Larger batch sizes** benefit more from GPU parallelism
4. **Adaptive batching** to maximize VRAM utilization

### For CPU-only

1. **Use smallest viable model** (da3-small: 22x faster than da3-giant)
2. **Batch size 1** is optimal (memory bandwidth limited)
3. **SDPA provides 1.3-1.4x speedup** on CPU

---

## Running Benchmarks

```bash
# Quick benchmark (fewer iterations)
uv run python benchmarks/full_benchmark.py --quick

# Full benchmark on specific device
uv run python benchmarks/full_benchmark.py --device mps
uv run python benchmarks/full_benchmark.py --device cuda
uv run python benchmarks/full_benchmark.py --device cpu

# Compare against upstream (requires upstream repo)
uv run python benchmarks/comparative_benchmark.py --device all

# Skip specific tests
uv run python benchmarks/full_benchmark.py --skip-batching
```

---

## Methodology

- **Warmup**: 2 inference passes before timing
- **Runs**: 3-5 iterations per configuration
- **Synchronization**: `torch.mps.synchronize()` / `torch.cuda.synchronize()` for accurate GPU timing
- **Memory cleanup**: `gc.collect()` + cache clearing between tests
- **Input**: Synthetic 1280x720 RGB images (consistent across tests)
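
A timing harness following this methodology could look like the sketch below; `infer` is a placeholder callable, and the structure (warmup, synchronize, repeated runs, cleanup) matches the bullet points above.

```python
# Sketch of the benchmark loop described above; not the benchmark scripts' code.
import gc
import time

import torch


def sync(device: str) -> None:
    # GPU work is queued asynchronously; synchronize before reading the clock.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "mps":
        torch.mps.synchronize()


def benchmark(infer, device: str, warmup: int = 2, runs: int = 5) -> float:
    for _ in range(warmup):  # untimed passes to warm caches / compile kernels
        infer()
    sync(device)
    start = time.perf_counter()
    for _ in range(runs):
        infer()
    sync(device)
    elapsed = (time.perf_counter() - start) / runs
    gc.collect()  # cleanup between configurations
    return elapsed
```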

---

*Benchmarks last updated: December 2024*
*Hardware: Apple Silicon (M-series) | Software: PyTorch 2.9.0*

CHANGELOG.md ADDED
@@ -0,0 +1,36 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.1.0] - 2024-12-03

### Added

- **Model Caching**: ~200x faster model loading after first use via `ModelCache` singleton
- **Adaptive Batching**: Automatic batch size optimization based on available GPU memory
  - `batch_inference()` method with `batch_size="auto"` option
  - `get_optimal_batch_size()` for memory-aware batch sizing
- **CLI Batching Options**: `--batch-size`, `--max-batch-size`, `--target-memory-utilization`
- **Apple Silicon Optimizations**: Smart CPU/GPU preprocessing selection for MPS
- **GPU Preprocessing**: Kornia-based GPU preprocessing with NVJPEG support on CUDA
- **Comprehensive Benchmarks**: Performance comparison scripts and documentation
- **PyPI Package**: Published as `awesome-depth-anything-3`
- **CI/CD**: GitHub Actions for testing, linting, and PyPI publishing
- **HF Spaces Demo**: Interactive Gradio demo on Hugging Face
- **Colab Tutorial**: Interactive notebook with examples
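
In use, the batching API named above would be called roughly as follows; only `batch_inference()` and `batch_size="auto"` come from this changelog (and the model id from the repo's issue template), so treat the rest as illustrative.

```python
# Usage sketch of the adaptive-batching API listed above; exact
# signatures in the package may differ.
from depth_anything_3.api import DepthAnything3

model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")

images = ["frame_000.png", "frame_001.png", "frame_002.png"]  # illustrative inputs
# "auto" defers sizing to get_optimal_batch_size(), mirroring the
# --batch-size / --max-batch-size / --target-memory-utilization CLI flags.
results = model.batch_inference(images, batch_size="auto")
```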

### Changed

- Package renamed from `depth-anything-3` to `awesome-depth-anything-3`
- Improved error handling in CLI commands
- Better logging with configurable levels

### Credits

This package is an optimized fork of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3)
by ByteDance. All model architecture and weights are their work. See README for full attribution.

CONTRIBUTING.md ADDED
@@ -0,0 +1,114 @@
# Contributing to awesome-depth-anything-3

Thank you for your interest in contributing! This document provides guidelines for contributing to this project.

## Important Note

This is an **optimized fork** of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3) by ByteDance.

- **Model/architecture changes** should be proposed to the [upstream repository](https://github.com/ByteDance-Seed/Depth-Anything-3)
- **Optimization/deployment improvements** are welcome here

## Development Setup

```bash
# Clone the repository
git clone https://github.com/Aedelon/awesome-depth-anything-3.git
cd awesome-depth-anything-3

# Install with development dependencies (using uv)
uv sync --extra dev

# Or with pip
pip install -e ".[dev]"
```

## Running Tests

```bash
# Run all tests
uv run pytest tests/ -v

# Run specific test file
uv run pytest tests/test_adaptive_batching.py -v

# Run with coverage
uv run pytest tests/ --cov=src/depth_anything_3
```

## Code Style

We use `ruff` for linting and formatting:

```bash
# Check for issues
uv run ruff check src/

# Auto-fix issues
uv run ruff check src/ --fix

# Format code
uv run ruff format src/
```

## Pre-commit Hooks

We recommend using pre-commit hooks:

```bash
uv run pre-commit install
uv run pre-commit run --all-files
```

## Pull Request Process

1. **Fork** the repository
2. **Create a branch** for your feature (`git checkout -b feature/amazing-feature`)
3. **Make your changes** with clear, descriptive commits
4. **Run tests** and linting
5. **Update documentation** if needed
6. **Push** to your fork and **open a Pull Request**

### PR Guidelines

- Keep PRs focused on a single change
- Include tests for new functionality
- Update CHANGELOG.md for user-facing changes
- Ensure CI passes before requesting review

## Types of Contributions Welcome

### Highly Welcome

- Performance optimizations
- Bug fixes
- Documentation improvements
- Test coverage improvements
- CI/CD improvements
- Device compatibility (CUDA, MPS, CPU)

### Discuss First

- New CLI commands
- API changes
- New dependencies

### Redirect to Upstream

- Model architecture changes
- Training code changes
- New model variants

## Reporting Issues

When reporting bugs, please include:

- Python version
- PyTorch version
- Device type (CUDA/MPS/CPU)
- Minimal reproduction code
- Full error traceback

## License

By contributing, you agree that your contributions will be licensed under the Apache License 2.0.

LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on
      the same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2025 The Depth Anything 3 Team

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

README.md ADDED
@@ -0,0 +1,418 @@
---
title: Awesome Depth Anything 3
emoji: 🌊
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 5.50.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Metric 3D reconstruction from images/video
---

<div align="center">

# Awesome Depth Anything 3

**Optimized fork of Depth Anything 3 with production-ready features**

[PyPI](https://pypi.org/project/awesome-depth-anything-3/)
[Python](https://www.python.org/)
[License](LICENSE)
[CI](https://github.com/Aedelon/awesome-depth-anything-3/actions)
[Open in Colab](https://colab.research.google.com/github/Aedelon/awesome-depth-anything-3/blob/main/notebooks/da3_tutorial.ipynb)
[HF Space](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3)

[Demo](https://huggingface.co/spaces/Aedelon/awesome-depth-anything-3) · [Tutorial](notebooks/da3_tutorial.ipynb) · [Benchmarks](BENCHMARKS.md) · [Original Paper](https://arxiv.org/abs/2511.10647)

</div>

---

> **This is an optimized fork** of [Depth Anything 3](https://github.com/ByteDance-Seed/Depth-Anything-3) by ByteDance.
> All credit for the model architecture, training, and research goes to the original authors (see [Credits](#-credits) below).
> This fork focuses on **production optimization, developer experience, and ease of deployment**.

## 🚀 What's New in This Fork

| Feature | Description |
|---------|-------------|
| **Model Caching** | ~200x faster model loading after first use |
| **Adaptive Batching** | Automatic batch size optimization based on GPU memory |
| **PyPI Package** | `pip install awesome-depth-anything-3` |
| **CLI Improvements** | Batch processing options, better error handling |
| **Apple Silicon Optimized** | Smart CPU/GPU preprocessing for best MPS performance |
| **Comprehensive Benchmarks** | Detailed performance analysis across devices |

### Performance Improvements

| Metric | Upstream | This Fork | Improvement |
|--------|----------|-----------|-------------|
| Cached model load | ~1s | ~5ms | **200x faster** |
| Batch 4 inference (MPS) | 3.32 img/s | 3.78 img/s | **1.14x faster** |
| Cold model load | 1.28s | 0.77s | **1.7x faster** |
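
The cached-load gain shows up on the second load in the same process; a quick check is sketched below, assuming `from_pretrained` goes through the fork's model cache as the changelog describes (timings will vary by machine).

```python
# Timing sketch; the second call should hit the cache and return in milliseconds.
import time

from depth_anything_3.api import DepthAnything3

for label in ("cold", "cached"):
    start = time.perf_counter()
    DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
    print(f"{label} load: {time.perf_counter() - start:.3f}s")
```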
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
<div align="center">
|
| 59 |
+
|
| 60 |
+
## Original Depth Anything 3
|
| 61 |
+
|
| 62 |
+
<h3>Recovering the Visual Space from Any Views</h3>
|
| 63 |
+
|
| 64 |
+
[**Haotong Lin**](https://haotongl.github.io/)<sup>*</sup> · [**Sili Chen**](https://github.com/SiliChen321)<sup>*</sup> · [**Jun Hao Liew**](https://liewjunhao.github.io/)<sup>*</sup> · [**Donny Y. Chen**](https://donydchen.github.io)<sup>*</sup> · [**Zhenyu Li**](https://zhyever.github.io/) · [**Guang Shi**](https://scholar.google.com/citations?user=MjXxWbUAAAAJ&hl=en) · [**Jiashi Feng**](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en)
|
| 65 |
+
<br>
|
| 66 |
+
[**Bingyi Kang**](https://bingykang.github.io/)<sup>*†</sup>
|
| 67 |
+
|
| 68 |
+
†project lead *Equal Contribution
|
| 69 |
+
|
| 70 |
+
<a href="https://arxiv.org/abs/2511.10647"><img src='https://img.shields.io/badge/arXiv-Depth Anything 3-red' alt='Paper PDF'></a>
|
| 71 |
+
<a href='https://depth-anything-3.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything 3-green' alt='Project Page'></a>
|
| 72 |
+
<a href='https://huggingface.co/spaces/depth-anything/Depth-Anything-3'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Official Demo-blue'></a>
|
| 73 |
+
|
| 74 |
+
</div>
|
| 75 |
+
|
| 76 |
+
This work presents **Depth Anything 3 (DA3)**, a model that predicts spatially consistent geometry from
|
| 77 |
+
arbitrary visual inputs, with or without known camera poses.
|
| 78 |
+
In pursuit of minimal modeling, DA3 yields two key insights:
|
| 79 |
+
- 💎 A **single plain transformer** (e.g., vanilla DINO encoder) is sufficient as a backbone without architectural specialization,
|
| 80 |
+
- ✨ A singular **depth-ray representation** obviates the need for complex multi-task learning.
|
| 81 |
+
|
| 82 |
+
🏆 DA3 significantly outperforms
|
| 83 |
+
[DA2](https://github.com/DepthAnything/Depth-Anything-V2) for monocular depth estimation,
|
| 84 |
+
and [VGGT](https://github.com/facebookresearch/vggt) for multi-view depth estimation and pose estimation.
|
| 85 |
+
All models are trained exclusively on **public academic datasets**.
|
| 86 |
+
|
| 87 |
+
<!-- <p align="center">
|
| 88 |
+
<img src="assets/images/da3_teaser.png" alt="Depth Anything 3" width="100%">
|
| 89 |
+
</p> -->
|
| 90 |
+
<p align="center">
|
| 91 |
+
<img src="assets/images/demo320-2.gif" alt="Depth Anything 3 - Left" width="70%">
|
| 92 |
+
</p>
|
| 93 |
+
<p align="center">
|
| 94 |
+
<img src="assets/images/da3_radar.png" alt="Depth Anything 3" width="100%">
|
| 95 |
+
</p>
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
## 📰 News
|
| 99 |
+
- **30-11-2025:** Add [`use_ray_pose`](#use-ray-pose) and [`ref_view_strategy`](docs/funcs/ref_view_strategy.md) (reference view selection for multi-view inputs).
|
| 100 |
+
- **25-11-2025:** Add [Awesome DA3 Projects](#-awesome-da3-projects), a community-driven section featuring DA3-based applications.
|
| 101 |
+
- **14-11-2025:** Paper, project page, code and models are all released.
|
| 102 |
+
|
| 103 |
+
## ✨ Highlights
|
| 104 |
+
|
| 105 |
+
### 🏆 Model Zoo
|
| 106 |
+
We release three series of models, each tailored for specific use cases in visual geometry.
|
| 107 |
+
|
| 108 |
+
- 🌟 **DA3 Main Series** (`DA3-Giant`, `DA3-Large`, `DA3-Base`, `DA3-Small`) These are our flagship foundation models, trained with a unified depth-ray representation. By varying the input configuration, a single model can perform a wide range of tasks:
|
| 109 |
+
+ 🌊 **Monocular Depth Estimation**: Predicts a depth map from a single RGB image.
|
| 110 |
+
+ 🌊 **Multi-View Depth Estimation**: Generates consistent depth maps from multiple images for high-quality fusion.
|
| 111 |
+
+ 🎯 **Pose-Conditioned Depth Estimation**: Achieves superior depth consistency when camera poses are provided as input.
|
| 112 |
+
+ 📷 **Camera Pose Estimation**: Estimates camera extrinsics and intrinsics from one or more images.
|
| 113 |
+
+ 🟡 **3D Gaussian Estimation**: Directly predicts 3D Gaussians, enabling high-fidelity novel view synthesis.
|
| 114 |
+
|
| 115 |
+
- 📐 **DA3 Metric Series** (`DA3Metric-Large`). A specialized model fine-tuned for metric depth estimation in monocular settings, ideal for applications requiring real-world scale.
|
| 116 |
+
|
| 117 |
+
- 🔍 **DA3 Monocular Series** (`DA3Mono-Large`). A dedicated model for high-quality relative monocular depth estimation. Unlike disparity-based models (e.g., [Depth Anything 2](https://github.com/DepthAnything/Depth-Anything-V2)), it directly predicts depth, resulting in superior geometric accuracy.
|
| 118 |
+
|
| 119 |
+
🔗 Leveraging these models, we also developed a **nested series** (`DA3Nested-Giant-Large`), which combines an any-view giant model with a metric model to reconstruct visual geometry at real-world metric scale.
|
| 120 |
+
|
| 121 |
+
### 🛠️ Codebase Features
|
| 122 |
+
Our repository is designed to be a powerful and user-friendly toolkit for both practical application and future research.
|
| 123 |
+
- 🎨 **Interactive Web UI & Gallery**: Visualize model outputs and compare results with an easy-to-use Gradio-based web interface.
|
| 124 |
+
- ⚡ **Flexible Command-Line Interface (CLI)**: Powerful and scriptable CLI for batch processing and integration into custom workflows.
|
| 125 |
+
- 💾 **Multiple Export Formats**: Save your results in various formats, including `glb`, `npz`, depth images, `ply`, 3DGS videos, etc., to seamlessly connect with other tools.
|
| 126 |
+
- 🔧 **Extensible and Modular Design**: The codebase is structured to facilitate future research and the integration of new models or functionalities.
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
<!-- ### 🎯 Visual Geometry Benchmark
|
| 130 |
+
We introduce a new benchmark to rigorously evaluate geometry prediction models on three key tasks: pose estimation, 3D reconstruction, and visual rendering (novel view synthesis) quality.
|
| 131 |
+
|
| 132 |
+
- 🔄 **Broad Model Compatibility**: Our benchmark is designed to be versatile, supporting the evaluation of various models, including both monocular and multi-view depth estimation approaches.
|
| 133 |
+
- 🔬 **Robust Evaluation Pipeline**: We provide a standardized pipeline featuring RANSAC-based pose alignment, TSDF fusion for dense reconstruction, and a principled view selection strategy for novel view synthesis.
|
| 134 |
+
- 📊 **Standardized Metrics**: Performance is measured using established metrics: AUC for pose accuracy, F1-score and Chamfer Distance for reconstruction, and PSNR/SSIM/LPIPS for rendering quality.
|
| 135 |
+
- 🌍 **Diverse and Challenging Datasets**: The benchmark spans a wide range of scenes from datasets like HiRoom, ETH3D, DTU, 7Scenes, ScanNet++, DL3DV, Tanks and Temples, and MegaDepth. -->
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
## 🚀 Quick Start
|
| 139 |
+
|
| 140 |
+
### 📦 Installation
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
# From PyPI (recommended)
|
| 144 |
+
pip install awesome-depth-anything-3
|
| 145 |
+
|
| 146 |
+
# With Gradio web UI
|
| 147 |
+
pip install awesome-depth-anything-3[app]
|
| 148 |
+
|
| 149 |
+
# With CUDA optimizations (xformers + gsplat)
|
| 150 |
+
pip install awesome-depth-anything-3[cuda]
|
| 151 |
+
|
| 152 |
+
# Everything
|
| 153 |
+
pip install awesome-depth-anything-3[all]
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
<details>
|
| 157 |
+
<summary><b>Development installation</b></summary>
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
git clone https://github.com/Aedelon/awesome-depth-anything-3.git
|
| 161 |
+
cd awesome-depth-anything-3
|
| 162 |
+
pip install -e ".[dev]"
|
| 163 |
+
|
| 164 |
+
# Optional: 3D Gaussian Splatting head
|
| 165 |
+
pip install --no-build-isolation git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf
|
| 166 |
+
```
|
| 167 |
+
</details>
|
| 168 |
+
|
| 169 |
+
For detailed model information, please refer to the [Model Cards](#-model-cards) section below.
|
| 170 |
+
|
| 171 |
+
### 💻 Basic Usage
|
| 172 |
+
|
| 173 |
+
```python
|
| 174 |
+
import glob, os, torch
|
| 175 |
+
from depth_anything_3.api import DepthAnything3
|
| 176 |
+
device = torch.device("cuda")
|
| 177 |
+
model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
|
| 178 |
+
model = model.to(device=device)
|
| 179 |
+
example_path = "assets/examples/SOH"
|
| 180 |
+
images = sorted(glob.glob(os.path.join(example_path, "*.png")))
|
| 181 |
+
prediction = model.inference(
|
| 182 |
+
images,
|
| 183 |
+
)
|
| 184 |
+
# prediction.processed_images : [N, H, W, 3] uint8 array
|
| 185 |
+
print(prediction.processed_images.shape)
|
| 186 |
+
# prediction.depth : [N, H, W] float32 array
|
| 187 |
+
print(prediction.depth.shape)
|
| 188 |
+
# prediction.conf : [N, H, W] float32 array
|
| 189 |
+
print(prediction.conf.shape)
|
| 190 |
+
# prediction.extrinsics : [N, 3, 4] float32 array  # OpenCV world-to-camera (w2c), i.e. COLMAP convention
|
| 191 |
+
print(prediction.extrinsics.shape)
|
| 192 |
+
# prediction.intrinsics : [N, 3, 3] float32 array
|
| 193 |
+
print(prediction.intrinsics.shape)
|
| 194 |
+
```
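As an illustration (a minimal sketch, not part of the API), these fields are enough to back-project a depth map into a world-space point cloud, assuming the OpenCV world-to-camera convention noted above:

```python
import numpy as np

depth = prediction.depth[0]            # [H, W] depth for the first view
K = prediction.intrinsics[0]           # [3, 3] pinhole intrinsics
w2c = prediction.extrinsics[0]         # [3, 4] OpenCV world-to-camera

H, W = depth.shape
u, v = np.meshgrid(np.arange(W), np.arange(H))                 # pixel grid
pix = np.stack([u, v, np.ones_like(u)], -1).reshape(-1, 3).T   # [3, H*W] homogeneous pixels
cam = np.linalg.inv(K) @ pix * depth.reshape(1, -1)            # camera rays scaled by depth
R, t = w2c[:, :3], w2c[:, 3:]                                  # split rotation / translation
world = (R.T @ (cam - t)).T                                    # [H*W, 3] world-space points
```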
|
| 195 |
+
|
| 196 |
+
```bash
|
| 197 |
+
|
| 198 |
+
export MODEL_DIR=depth-anything/DA3NESTED-GIANT-LARGE
|
| 199 |
+
# This can be a Hugging Face repository or a local directory
|
| 200 |
+
# If you encounter network issues, consider using the following mirror: export HF_ENDPOINT=https://hf-mirror.com
|
| 201 |
+
# Alternatively, you can download the model directly from Hugging Face
|
| 202 |
+
export GALLERY_DIR=workspace/gallery
|
| 203 |
+
mkdir -p $GALLERY_DIR
|
| 204 |
+
|
| 205 |
+
# CLI auto mode with backend reuse
|
| 206 |
+
da3 backend --model-dir ${MODEL_DIR} --gallery-dir ${GALLERY_DIR} # Cache the model on the GPU
|
| 207 |
+
da3 auto assets/examples/SOH \
|
| 208 |
+
--export-format glb \
|
| 209 |
+
--export-dir ${GALLERY_DIR}/TEST_BACKEND/SOH \
|
| 210 |
+
--use-backend
|
| 211 |
+
|
| 212 |
+
# CLI video processing with feature visualization
|
| 213 |
+
da3 video assets/examples/robot_unitree.mp4 \
|
| 214 |
+
--fps 15 \
|
| 215 |
+
--use-backend \
|
| 216 |
+
--export-dir ${GALLERY_DIR}/TEST_BACKEND/robo \
|
| 217 |
+
--export-format glb-feat_vis \
|
| 218 |
+
--feat-vis-fps 15 \
|
| 219 |
+
--process-res-method lower_bound_resize \
|
| 220 |
+
--export-feat "11,21,31"
|
| 221 |
+
|
| 222 |
+
# CLI auto mode without backend reuse
|
| 223 |
+
da3 auto assets/examples/SOH \
|
| 224 |
+
--export-format glb \
|
| 225 |
+
--export-dir ${GALLERY_DIR}/TEST_CLI/SOH \
|
| 226 |
+
--model-dir ${MODEL_DIR}
|
| 227 |
+
|
| 228 |
+
```
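Running `da3 backend` once keeps the model loaded and cached on the GPU, so subsequent commands invoked with `--use-backend` skip model loading; the last example instead reloads the model on every invocation.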
|
| 229 |
+
|
| 230 |
+
The model architecture is defined in [`DepthAnything3Net`](src/depth_anything_3/model/da3.py) and specified with a YAML config file located at [`src/depth_anything_3/configs`](src/depth_anything_3/configs). Input and output processing are handled by [`DepthAnything3`](src/depth_anything_3/api.py). To customize the model architecture, simply create a new config file (*e.g.*, `path/to/new/config`) such as:
|
| 231 |
+
|
| 232 |
+
```yaml
|
| 233 |
+
__object__:
|
| 234 |
+
path: depth_anything_3.model.da3
|
| 235 |
+
name: DepthAnything3Net
|
| 236 |
+
args: as_params
|
| 237 |
+
|
| 238 |
+
net:
|
| 239 |
+
__object__:
|
| 240 |
+
path: depth_anything_3.model.dinov2.dinov2
|
| 241 |
+
name: DinoV2
|
| 242 |
+
args: as_params
|
| 243 |
+
|
| 244 |
+
name: vitb
|
| 245 |
+
out_layers: [5, 7, 9, 11]
|
| 246 |
+
alt_start: 4
|
| 247 |
+
qknorm_start: 4
|
| 248 |
+
rope_start: 4
|
| 249 |
+
cat_token: True
|
| 250 |
+
|
| 251 |
+
head:
|
| 252 |
+
__object__:
|
| 253 |
+
path: depth_anything_3.model.dualdpt
|
| 254 |
+
name: DualDPT
|
| 255 |
+
args: as_params
|
| 256 |
+
|
| 257 |
+
dim_in: &head_dim_in 1536
|
| 258 |
+
output_dim: 2
|
| 259 |
+
features: &head_features 128
|
| 260 |
+
out_channels: &head_out_channels [96, 192, 384, 768]
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
Then, the model can be created with the following code snippet.
|
| 264 |
+
```python
|
| 265 |
+
from depth_anything_3.cfg import create_object, load_config
|
| 266 |
+
|
| 267 |
+
Model = create_object(load_config("path/to/new/config"))
|
| 268 |
+
```
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
## 📚 Useful Documentation
|
| 273 |
+
|
| 274 |
+
- 🖥️ [Command Line Interface](docs/CLI.md)
|
| 275 |
+
- 📑 [Python API](docs/API.md)
|
| 276 |
+
<!-- - 🏁 [Visual Geometry Benchmark](docs/BENCHMARK.md) -->
|
| 277 |
+
|
| 278 |
+
## 🗂️ Model Cards
|
| 279 |
+
|
| 280 |
+
In general, DA3-LARGE achieves results comparable to VGGT.
|
| 281 |
+
|
| 282 |
+
The Nested series uses an any-view model to estimate pose and depth, and a monocular metric depth estimator for scaling.
|
| 283 |
+
|
| 284 |
+
| 🗃️ Model Name | 📏 Params | 📊 Rel. Depth | 📷 Pose Est. | 🧭 Pose Cond. | 🎨 GS | 📐 Met. Depth | ☁️ Sky Seg | 📄 License |
|
| 285 |
+
|-------------------------------|-----------|---------------|--------------|---------------|-------|---------------|-----------|----------------|
|
| 286 |
+
| **Nested** | | | | | | | | |
|
| 287 |
+
| [DA3NESTED-GIANT-LARGE](https://huggingface.co/depth-anything/DA3NESTED-GIANT-LARGE) | 1.40B | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | CC BY-NC 4.0 |
|
| 288 |
+
| **Any-view Model** | | | | | | | | |
|
| 289 |
+
| [DA3-GIANT](https://huggingface.co/depth-anything/DA3-GIANT) | 1.15B | ✅ | ✅ | ✅ | ✅ | | | CC BY-NC 4.0 |
|
| 290 |
+
| [DA3-LARGE](https://huggingface.co/depth-anything/DA3-LARGE) | 0.35B | ✅ | ✅ | ✅ | | | | CC BY-NC 4.0 |
|
| 291 |
+
| [DA3-BASE](https://huggingface.co/depth-anything/DA3-BASE) | 0.12B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
|
| 292 |
+
| [DA3-SMALL](https://huggingface.co/depth-anything/DA3-SMALL) | 0.08B | ✅ | ✅ | ✅ | | | | Apache 2.0 |
|
| 293 |
+
| | | | | | | | | |
|
| 294 |
+
| **Monocular Metric Depth** | | | | | | | | |
|
| 295 |
+
| [DA3METRIC-LARGE](https://huggingface.co/depth-anything/DA3METRIC-LARGE) | 0.35B | ✅ | | | | ✅ | ✅ | Apache 2.0 |
|
| 296 |
+
| | | | | | | | | |
|
| 297 |
+
| **Monocular Depth** | | | | | | | | |
|
| 298 |
+
| [DA3MONO-LARGE](https://huggingface.co/depth-anything/DA3MONO-LARGE) | 0.35B | ✅ | | | | | ✅ | Apache 2.0 |
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
## ⚡ Performance Benchmarks
|
| 302 |
+
|
| 303 |
+
Inference throughput measured on Apple Silicon (MPS) with PyTorch 2.9.0. For detailed benchmarks, see [BENCHMARKS.md](BENCHMARKS.md).
|
| 304 |
+
|
| 305 |
+
### Apple Silicon (MPS) - Batch Size 1
|
| 306 |
+
|
| 307 |
+
| Model | Latency | Throughput |
|
| 308 |
+
|-------|---------|------------|
|
| 309 |
+
| DA3-Small | 46 ms | **22 img/s** |
|
| 310 |
+
| DA3-Base | 93 ms | **11 img/s** |
|
| 311 |
+
| DA3-Large | 265 ms | **3.8 img/s** |
|
| 312 |
+
| DA3-Giant | 618 ms | **1.6 img/s** |
|
| 313 |
+
|
| 314 |
+
### Cross-Device Comparison (DA3-Large)
|
| 315 |
+
|
| 316 |
+
| Device | Throughput | vs CPU |
|
| 317 |
+
|--------|------------|--------|
|
| 318 |
+
| CPU | 0.3 img/s | 1.0x |
|
| 319 |
+
| Apple Silicon (MPS) | 3.8 img/s | **13x** |
|
| 320 |
+
| NVIDIA L4 (CUDA) | 10.3 img/s | **34x** |
|
| 321 |
+
|
| 322 |
+
### Batch Processing
|
| 323 |
+
|
| 324 |
+
```python
|
| 325 |
+
from depth_anything_3.api import DepthAnything3
|
| 326 |
+
|
| 327 |
+
model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
|
| 328 |
+
|
| 329 |
+
# Adaptive batching (recommended for large image sets)
|
| 330 |
+
results = model.batch_inference(
|
| 331 |
+
images=image_paths,
|
| 332 |
+
batch_size="auto", # Automatically selects optimal batch size
|
| 333 |
+
target_memory_utilization=0.85,
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
# Fixed batch size
|
| 337 |
+
results = model.batch_inference(
|
| 338 |
+
images=image_paths,
|
| 339 |
+
batch_size=4,
|
| 340 |
+
)
|
| 341 |
+
```
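With `batch_size="auto"`, the batch size is selected automatically from available device memory (the fork's adaptive batching feature); `target_memory_utilization` sets the fraction of memory the batcher aims to use, 85% in the example above.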
|
| 342 |
+
|
| 343 |
+
> See [BENCHMARKS.md](BENCHMARKS.md) for comprehensive benchmarks including preprocessing, attention mechanisms, and adaptive batching strategies.
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
## ❓ FAQ
|
| 347 |
+
|
| 348 |
+
- **Monocular Metric Depth**: To obtain metric depth in meters from `DA3METRIC-LARGE`, use `metric_depth = focal * net_output / 300.`, where `focal` is the focal length in pixels (typically the average of fx and fy from the camera intrinsic matrix K). Note that the output from `DA3NESTED-GIANT-LARGE` is already in meters.
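  For instance, a minimal sketch of this conversion (assuming `net_output` is the raw `DA3METRIC-LARGE` prediction and `K` is the 3×3 intrinsic matrix):

  ```python
  focal = (K[0, 0] + K[1, 1]) / 2.0          # average of fx and fy, in pixels
  metric_depth = focal * net_output / 300.0  # depth in meters
  ```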
|
| 349 |
+
|
| 350 |
+
- <a id="use-ray-pose"></a>**Ray Head (`use_ray_pose`)**: Our API and CLI support the `use_ray_pose` argument. When enabled, the model derives the camera pose from the ray head, which is generally slightly slower but more accurate. The default is `False` for faster inference; a minimal usage sketch follows the results below.
|
| 351 |
+
<details>
|
| 352 |
+
<summary>AUC3 Results for DA3NESTED-GIANT-LARGE</summary>
|
| 353 |
+
|
| 354 |
+
| Pose head | HiRoom | ETH3D | DTU | 7Scenes | ScanNet++ |
|
| 355 |
+
|-------|------|-------|-----|---------|-----------|
|
| 356 |
+
| `ray_head` | 84.4 | 52.6 | 93.9 | 29.5 | 89.4 |
|
| 357 |
+
| `cam_head` | 80.3 | 48.4 | 94.1 | 28.5 | 85.0 |
|
| 358 |
+
|
| 359 |
+
</details>
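  A minimal usage sketch (assuming the Python API accepts the flag as a keyword argument, mirroring the CLI):

  ```python
  prediction = model.inference(images, use_ray_pose=True)  # slightly slower, more accurate poses
  ```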
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
- **Older GPUs without XFormers support**: See [Issue #11](https://github.com/ByteDance-Seed/Depth-Anything-3/issues/11). Thanks to [@S-Mahoney](https://github.com/S-Mahoney) for the solution!
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
## 🏢 Awesome DA3 Projects
|
| 368 |
+
|
| 369 |
+
A community-curated list of Depth Anything 3 integrations across 3D tools, creative pipelines, robotics, and web/VR viewers. You are welcome to submit your DA3-based project via PR; we will review and feature it where applicable.
|
| 370 |
+
|
| 371 |
+
- [DA3-blender](https://github.com/xy-gao/DA3-blender): Blender addon for DA3-based 3D reconstruction from a set of images.
|
| 372 |
+
|
| 373 |
+
- [ComfyUI-DepthAnythingV3](https://github.com/PozzettiAndrea/ComfyUI-DepthAnythingV3): ComfyUI nodes for Depth Anything 3, supporting single/multi-view and video-consistent depth with optional point‑cloud export.
|
| 374 |
+
|
| 375 |
+
- [DA3-ROS2-Wrapper](https://github.com/GerdsenAI/GerdsenAI-Depth-Anything-3-ROS2-Wrapper): Real-time DA3 depth in ROS2 with multi-camera support.
|
| 376 |
+
|
| 377 |
+
- [VideoDepthViewer3D](https://github.com/amariichi/VideoDepthViewer3D): Streaming videos with DA3 metric depth to a Three.js/WebXR 3D viewer for VR/stereo playback.
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
## 📝 Credits
|
| 381 |
+
|
| 382 |
+
### Original Authors
|
| 383 |
+
|
| 384 |
+
This package is built on top of **Depth Anything 3**, created by the ByteDance Seed team:
|
| 385 |
+
|
| 386 |
+
- [Haotong Lin](https://haotongl.github.io/), [Sili Chen](https://github.com/SiliChen321), [Jun Hao Liew](https://liewjunhao.github.io/), [Donny Y. Chen](https://donydchen.github.io), [Zhenyu Li](https://zhyever.github.io/), [Guang Shi](https://scholar.google.com/citations?user=MjXxWbUAAAAJ), [Jiashi Feng](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ), [Bingyi Kang](https://bingykang.github.io/)
|
| 387 |
+
|
| 388 |
+
All model weights, architecture, and core algorithms are their work. This fork only adds production optimizations and deployment tooling.
|
| 389 |
+
|
| 390 |
+
### Fork Maintainer
|
| 391 |
+
|
| 392 |
+
This optimized fork is maintained by [Delanoe Pirard (Aedelon)](https://github.com/Aedelon).
|
| 393 |
+
|
| 394 |
+
Contributions:
|
| 395 |
+
- Model caching system
|
| 396 |
+
- Adaptive batching
|
| 397 |
+
- Apple Silicon (MPS) optimizations
|
| 398 |
+
- PyPI packaging and CI/CD
|
| 399 |
+
- Comprehensive benchmarking
|
| 400 |
+
|
| 401 |
+
### Citation
|
| 402 |
+
|
| 403 |
+
If you use Depth Anything 3 in your research, please cite the original paper:
|
| 404 |
+
|
| 405 |
+
```bibtex
|
| 406 |
+
@article{depthanything3,
|
| 407 |
+
title={Depth Anything 3: Recovering the visual space from any views},
|
| 408 |
+
author={Haotong Lin and Sili Chen and Jun Hao Liew and Donny Y. Chen and Zhenyu Li and Guang Shi and Jiashi Feng and Bingyi Kang},
|
| 409 |
+
journal={arXiv preprint arXiv:2511.10647},
|
| 410 |
+
year={2025}
|
| 411 |
+
}
|
| 412 |
+
```
|
| 413 |
+
|
| 414 |
+
If you specifically use features from this fork (caching, batching, MPS optimizations), you may additionally reference:
|
| 415 |
+
|
| 416 |
+
```
|
| 417 |
+
awesome-depth-anything-3: https://github.com/Aedelon/awesome-depth-anything-3
|
| 418 |
+
```
|
README.md.original
ADDED
|
@@ -0,0 +1,405 @@
app.py
ADDED
|
@@ -0,0 +1,59 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) Delanoe Pirard / Aedelon
|
| 3 |
+
# Licensed under the Apache License, Version 2.0
|
| 4 |
+
"""
|
| 5 |
+
Hugging Face Spaces entry point for awesome-depth-anything-3.
|
| 6 |
+
|
| 7 |
+
This file is the main entry point for the HF Spaces deployment.
|
| 8 |
+
It launches the Gradio web interface with optimized settings for cloud deployment.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import tempfile
|
| 13 |
+
|
| 14 |
+
# Disable analytics and configure for HF Spaces
|
| 15 |
+
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
|
| 16 |
+
os.environ["DA3_LOG_LEVEL"] = "WARNING"
|
| 17 |
+
|
| 18 |
+
from depth_anything_3.app.gradio_app import DepthAnything3App
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main():
|
| 22 |
+
"""Launch the Gradio app for HF Spaces."""
|
| 23 |
+
# Use DA3-LARGE for good balance of quality and speed
|
| 24 |
+
workspace_dir = "/tmp/workspace"
|
| 25 |
+
gallery_dir = "/tmp/gallery"
|
| 26 |
+
|
| 27 |
+
# Create directories
|
| 28 |
+
os.makedirs(workspace_dir, exist_ok=True)
|
| 29 |
+
os.makedirs(gallery_dir, exist_ok=True)
|
| 30 |
+
|
| 31 |
+
app = DepthAnything3App(
|
| 32 |
+
model_dir="depth-anything/DA3-LARGE",
|
| 33 |
+
workspace_dir=workspace_dir,
|
| 34 |
+
gallery_dir=gallery_dir,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
demo = app.create_app()
|
| 38 |
+
|
| 39 |
+
# Build allowed paths for Gradio file access
|
| 40 |
+
allowed_paths = [
|
| 41 |
+
os.getcwd(),
|
| 42 |
+
tempfile.gettempdir(),
|
| 43 |
+
workspace_dir,
|
| 44 |
+
gallery_dir,
|
| 45 |
+
"/tmp",
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
# Launch for HF Spaces (theme/css already set in create_app via gr.Blocks())
|
| 49 |
+
demo.queue(max_size=10).launch(
|
| 50 |
+
server_name="0.0.0.0",
|
| 51 |
+
server_port=7860,
|
| 52 |
+
share=True,
|
| 53 |
+
show_error=True,
|
| 54 |
+
allowed_paths=allowed_paths,
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
|
| 59 |
+
main()
|
assets/examples/SOH/000.png
ADDED
|
Git LFS Details
|
assets/examples/SOH/010.png
ADDED
|
Git LFS Details
|
assets/examples/robot_unitree.mp4
ADDED
|
@@ -0,0 +1,3 @@
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99bc274f7613a665c6135085fe01691ebfaa9033101319071f37c550ab21d1ea
|
| 3 |
+
size 1964268
|
assets/images/da3_radar.png
ADDED
|
Git LFS Details
|
assets/images/demo320-2.gif
ADDED
|
Git LFS Details
|
benchmarks/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
| 1 |
+
# Copyright (c) Delanoe Pirard / Aedelon
|
| 2 |
+
# Licensed under the Apache License, Version 2.0
|
| 3 |
+
"""Benchmark scripts for Depth Anything 3."""
|
benchmarks/comparative_benchmark.py
ADDED
|
@@ -0,0 +1,436 @@
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) Delanoe Pirard / Aedelon
|
| 3 |
+
# Licensed under the Apache License, Version 2.0
|
| 4 |
+
"""
|
| 5 |
+
Comparative Benchmark: awesome-depth-anything-3 vs upstream (vanilla)
|
| 6 |
+
|
| 7 |
+
Compares performance between the optimized fork and the original upstream.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python benchmarks/comparative_benchmark.py --device mps
|
| 11 |
+
python benchmarks/comparative_benchmark.py --device cuda
|
| 12 |
+
python benchmarks/comparative_benchmark.py --device all
|
| 13 |
+
python benchmarks/comparative_benchmark.py --quick
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import contextlib
|
| 18 |
+
import gc
|
| 19 |
+
import io
|
| 20 |
+
import logging
|
| 21 |
+
import os
|
| 22 |
+
import shutil
|
| 23 |
+
import sys
|
| 24 |
+
import time
|
| 25 |
+
import warnings
|
| 26 |
+
|
| 27 |
+
# Suppress ALL logging before any imports
|
| 28 |
+
logging.disable(logging.CRITICAL)
|
| 29 |
+
os.environ["DA3_LOG_LEVEL"] = "CRITICAL"
|
| 30 |
+
os.environ["PYTHONWARNINGS"] = "ignore"
|
| 31 |
+
warnings.filterwarnings("ignore")
|
| 32 |
+
|
| 33 |
+
import numpy as np
|
| 34 |
+
import torch
|
| 35 |
+
from PIL import Image
|
| 36 |
+
|
| 37 |
+
# Suppress all loggers
|
| 38 |
+
logging.getLogger("depth_anything_3").disabled = True
|
| 39 |
+
logging.getLogger("dinov2").disabled = True
|
| 40 |
+
logging.getLogger().setLevel(logging.CRITICAL)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@contextlib.contextmanager
|
| 44 |
+
def suppress_output():
|
| 45 |
+
"""Context manager to suppress stdout and stderr."""
|
| 46 |
+
with contextlib.redirect_stdout(io.StringIO()), \
|
| 47 |
+
contextlib.redirect_stderr(io.StringIO()):
|
| 48 |
+
# Also suppress all loggers again
|
| 49 |
+
logging.disable(logging.CRITICAL)
|
| 50 |
+
yield
|
| 51 |
+
|
| 52 |
+
# ============================================================================
|
| 53 |
+
# CONFIGURATION
|
| 54 |
+
# ============================================================================
|
| 55 |
+
|
| 56 |
+
AWESOME_REPO = "/Users/aedelon/Workspace/awesome-depth-anything-3"
|
| 57 |
+
UPSTREAM_REPO = "/Users/aedelon/Workspace/depth-anything-3-upstream"
|
| 58 |
+
MODEL_NAME = "da3-large"
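# NOTE: the absolute paths above are machine-specific; point them at your local checkouts.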
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ============================================================================
|
| 62 |
+
# UTILITIES
|
| 63 |
+
# ============================================================================
|
| 64 |
+
|
| 65 |
+
def cleanup():
|
| 66 |
+
gc.collect()
|
| 67 |
+
if torch.cuda.is_available():
|
| 68 |
+
torch.cuda.empty_cache()
|
| 69 |
+
torch.cuda.reset_peak_memory_stats()
|
| 70 |
+
if torch.backends.mps.is_available():
|
| 71 |
+
torch.mps.empty_cache()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def sync_device(device):
|
| 75 |
+
if device.type == "cuda":
|
| 76 |
+
torch.cuda.synchronize()
|
| 77 |
+
elif device.type == "mps":
|
| 78 |
+
torch.mps.synchronize()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def clear_modules():
|
| 82 |
+
"""Clear depth_anything_3 from sys.modules."""
|
| 83 |
+
to_remove = [k for k in sys.modules if "depth_anything_3" in k]
|
| 84 |
+
for k in to_remove:
|
| 85 |
+
del sys.modules[k]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def suppress_logging():
|
| 89 |
+
"""Suppress all logging after module import."""
|
| 90 |
+
logging.disable(logging.CRITICAL)
|
| 91 |
+
try:
|
| 92 |
+
from depth_anything_3.utils.logger import logger
|
| 93 |
+
logger.level = 100
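# 100 is above CRITICAL (50), so this logger emits nothing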
|
| 94 |
+
except Exception:
|
| 95 |
+
pass
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def get_available_devices():
|
| 99 |
+
"""Get available devices."""
|
| 100 |
+
devices = [torch.device("cpu")]
|
| 101 |
+
if torch.backends.mps.is_available():
|
| 102 |
+
devices.append(torch.device("mps"))
|
| 103 |
+
if torch.cuda.is_available():
|
| 104 |
+
devices.append(torch.device("cuda"))
|
| 105 |
+
return devices
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def get_device_name(device):
|
| 109 |
+
if device.type == "cuda":
|
| 110 |
+
return torch.cuda.get_device_name(device)
|
| 111 |
+
elif device.type == "mps":
|
| 112 |
+
return "Apple Silicon (MPS)"
|
| 113 |
+
return "CPU"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ============================================================================
|
| 117 |
+
# BENCHMARK: UPSTREAM (VANILLA)
|
| 118 |
+
# ============================================================================
|
| 119 |
+
|
| 120 |
+
def benchmark_upstream(device, pil_images, process_res=504, runs=3):
|
| 121 |
+
"""Benchmark upstream/vanilla depth-anything-3."""
|
| 122 |
+
|
| 123 |
+
# Setup path
|
| 124 |
+
clear_modules()
|
| 125 |
+
upstream_src = os.path.join(UPSTREAM_REPO, "src")
|
| 126 |
+
if upstream_src in sys.path:
|
| 127 |
+
sys.path.remove(upstream_src)
|
| 128 |
+
sys.path.insert(0, upstream_src)
|
| 129 |
+
|
| 130 |
+
with suppress_output():
|
| 131 |
+
from depth_anything_3.api import DepthAnything3
|
| 132 |
+
suppress_logging()
|
| 133 |
+
|
| 134 |
+
cleanup()
|
| 135 |
+
|
| 136 |
+
# Cold load
|
| 137 |
+
start = time.perf_counter()
|
| 138 |
+
model = DepthAnything3(model_name=MODEL_NAME)
|
| 139 |
+
model = model.to(device)
|
| 140 |
+
model.eval()
|
| 141 |
+
cold_load_time = time.perf_counter() - start
|
| 142 |
+
|
| 143 |
+
# Warmup
|
| 144 |
+
for _ in range(2):
|
| 145 |
+
model.inference(pil_images[:1], process_res=process_res)
|
| 146 |
+
sync_device(device)
|
| 147 |
+
cleanup()
|
| 148 |
+
|
| 149 |
+
# Benchmark inference
|
| 150 |
+
times = []
|
| 151 |
+
for _ in range(runs):
|
| 152 |
+
cleanup()
|
| 153 |
+
sync_device(device)
|
| 154 |
+
start = time.perf_counter()
|
| 155 |
+
model.inference(pil_images, process_res=process_res)
|
| 156 |
+
sync_device(device)
|
| 157 |
+
times.append(time.perf_counter() - start)
|
| 158 |
+
|
| 159 |
+
avg_time = np.mean(times)
|
| 160 |
+
std_time = np.std(times)
|
| 161 |
+
throughput = len(pil_images) / avg_time
|
| 162 |
+
|
| 163 |
+
del model
|
| 164 |
+
cleanup()
|
| 165 |
+
|
| 166 |
+
# Cleanup path
|
| 167 |
+
sys.path.remove(upstream_src)
|
| 168 |
+
clear_modules()
|
| 169 |
+
|
| 170 |
+
return {
|
| 171 |
+
"cold_load": cold_load_time,
|
| 172 |
+
"inference_time": avg_time,
|
| 173 |
+
"inference_std": std_time,
|
| 174 |
+
"throughput": throughput,
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ============================================================================
|
| 179 |
+
# BENCHMARK: AWESOME (OPTIMIZED)
|
| 180 |
+
# ============================================================================
|
| 181 |
+
|
| 182 |
+
def benchmark_awesome(device, pil_images, process_res=504, runs=3, use_cache=True):
|
| 183 |
+
"""Benchmark awesome (optimized) depth-anything-3."""
|
| 184 |
+
|
| 185 |
+
# Setup path
|
| 186 |
+
clear_modules()
|
| 187 |
+
awesome_src = os.path.join(AWESOME_REPO, "src")
|
| 188 |
+
if awesome_src in sys.path:
|
| 189 |
+
sys.path.remove(awesome_src)
|
| 190 |
+
sys.path.insert(0, awesome_src)
|
| 191 |
+
|
| 192 |
+
with suppress_output():
|
| 193 |
+
from depth_anything_3.api import DepthAnything3
|
| 194 |
+
from depth_anything_3.cache import get_model_cache
|
| 195 |
+
suppress_logging()
|
| 196 |
+
|
| 197 |
+
# Clear cache if testing cold load
|
| 198 |
+
if not use_cache:
|
| 199 |
+
cache = get_model_cache()
|
| 200 |
+
cache.clear()
|
| 201 |
+
|
| 202 |
+
cleanup()
|
| 203 |
+
|
| 204 |
+
# Cold/warm load
|
| 205 |
+
start = time.perf_counter()
|
| 206 |
+
model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=use_cache)
|
| 207 |
+
load_time = time.perf_counter() - start
|
| 208 |
+
|
| 209 |
+
# For cache test, do a second load
|
| 210 |
+
cached_load_time = None
|
| 211 |
+
if use_cache:
|
| 212 |
+
del model
|
| 213 |
+
cleanup()
|
| 214 |
+
start = time.perf_counter()
|
| 215 |
+
model = DepthAnything3(model_name=MODEL_NAME, device=device, use_cache=True)
|
| 216 |
+
cached_load_time = time.perf_counter() - start
|
| 217 |
+
|
| 218 |
+
# Warmup
|
| 219 |
+
for _ in range(2):
|
| 220 |
+
model.inference(pil_images[:1], process_res=process_res)
|
| 221 |
+
sync_device(device)
|
| 222 |
+
cleanup()
|
| 223 |
+
|
| 224 |
+
# Benchmark inference
|
| 225 |
+
times = []
|
| 226 |
+
for _ in range(runs):
|
| 227 |
+
cleanup()
|
| 228 |
+
sync_device(device)
|
| 229 |
+
start = time.perf_counter()
|
| 230 |
+
model.inference(pil_images, process_res=process_res)
|
| 231 |
+
sync_device(device)
|
| 232 |
+
times.append(time.perf_counter() - start)
|
| 233 |
+
|
| 234 |
+
avg_time = np.mean(times)
|
| 235 |
+
std_time = np.std(times)
|
| 236 |
+
throughput = len(pil_images) / avg_time
|
| 237 |
+
|
| 238 |
+
del model
|
| 239 |
+
cleanup()
|
| 240 |
+
|
| 241 |
+
# Cleanup path
|
| 242 |
+
sys.path.remove(awesome_src)
|
| 243 |
+
clear_modules()
|
| 244 |
+
|
| 245 |
+
return {
|
| 246 |
+
"cold_load": load_time,
|
| 247 |
+
"cached_load": cached_load_time,
|
| 248 |
+
"inference_time": avg_time,
|
| 249 |
+
"inference_std": std_time,
|
| 250 |
+
"throughput": throughput,
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ============================================================================
|
| 255 |
+
# MAIN
|
| 256 |
+
# ============================================================================
|
| 257 |
+
|
| 258 |
+
def run_comparison(device, batch_sizes, process_res=504, runs=3):
|
| 259 |
+
"""Run comparison for a specific device."""
|
| 260 |
+
|
| 261 |
+
results = {}
|
| 262 |
+
temp_dir = "temp_compare"
|
| 263 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 264 |
+
|
| 265 |
+
try:
|
| 266 |
+
# Create test images
|
| 267 |
+
max_batch = max(batch_sizes)
|
| 268 |
+
pil_images = []
|
| 269 |
+
for i in range(max_batch):
|
| 270 |
+
img = Image.new("RGB", (1280, 720), color=(100 + i*10, 150, 200))
|
| 271 |
+
pil_images.append(img)
|
| 272 |
+
|
| 273 |
+
for batch_size in batch_sizes:
|
| 274 |
+
test_images = pil_images[:batch_size]
|
| 275 |
+
results[batch_size] = {}
|
| 276 |
+
|
| 277 |
+
print(f"\n Batch size: {batch_size}")
|
| 278 |
+
print(f" {'-'*50}")
|
| 279 |
+
|
| 280 |
+
# Upstream
|
| 281 |
+
print(f" Testing UPSTREAM (vanilla)...", end=" ", flush=True)
|
| 282 |
+
try:
|
| 283 |
+
upstream = benchmark_upstream(device, test_images, process_res, runs)
|
| 284 |
+
results[batch_size]["upstream"] = upstream
|
| 285 |
+
print(f"{upstream['throughput']:.2f} img/s")
|
| 286 |
+
except Exception as e:
|
| 287 |
+
print(f"ERROR: {e}")
|
| 288 |
+
results[batch_size]["upstream"] = None
|
| 289 |
+
|
| 290 |
+
# Awesome (no cache - fair comparison)
|
| 291 |
+
print(f" Testing AWESOME (no cache)...", end=" ", flush=True)
|
| 292 |
+
try:
|
| 293 |
+
awesome_nc = benchmark_awesome(device, test_images, process_res, runs, use_cache=False)
|
| 294 |
+
results[batch_size]["awesome_nocache"] = awesome_nc
|
| 295 |
+
print(f"{awesome_nc['throughput']:.2f} img/s")
|
| 296 |
+
except Exception as e:
|
| 297 |
+
print(f"ERROR: {e}")
|
| 298 |
+
results[batch_size]["awesome_nocache"] = None
|
| 299 |
+
|
| 300 |
+
# Awesome (with cache)
|
| 301 |
+
print(f" Testing AWESOME (cached)...", end=" ", flush=True)
|
| 302 |
+
try:
|
| 303 |
+
awesome_c = benchmark_awesome(device, test_images, process_res, runs, use_cache=True)
|
| 304 |
+
results[batch_size]["awesome_cached"] = awesome_c
|
| 305 |
+
print(f"{awesome_c['throughput']:.2f} img/s")
|
| 306 |
+
except Exception as e:
|
| 307 |
+
print(f"ERROR: {e}")
|
| 308 |
+
results[batch_size]["awesome_cached"] = None
|
| 309 |
+
|
| 310 |
+
finally:
|
| 311 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 312 |
+
|
| 313 |
+
return results
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def print_results_table(results, device):
|
| 317 |
+
"""Print formatted results table."""
|
| 318 |
+
|
| 319 |
+
print(f"\n{'='*70}")
|
| 320 |
+
print(f" RESULTS: {device.type.upper()}")
|
| 321 |
+
print(f"{'='*70}")
|
| 322 |
+
|
| 323 |
+
# Header
|
| 324 |
+
print(f"\n{'Batch':<8} {'Metric':<18} {'Upstream':<12} {'Awesome':<12} {'Speedup':<10}")
|
| 325 |
+
print("-" * 60)
|
| 326 |
+
|
| 327 |
+
for batch_size, data in sorted(results.items()):
|
| 328 |
+
upstream = data.get("upstream")
|
| 329 |
+
awesome = data.get("awesome_nocache") or data.get("awesome_cached")
|
| 330 |
+
|
| 331 |
+
if not upstream or not awesome:
|
| 332 |
+
continue
|
| 333 |
+
|
| 334 |
+
# Inference throughput
|
| 335 |
+
u_thr = upstream["throughput"]
|
| 336 |
+
a_thr = awesome["throughput"]
|
| 337 |
+
speedup = a_thr / u_thr if u_thr > 0 else 0
|
| 338 |
+
print(f"{batch_size:<8} {'Throughput (img/s)':<18} {u_thr:<12.2f} {a_thr:<12.2f} {speedup:<10.2f}x")
|
| 339 |
+
|
| 340 |
+
# Inference time
|
| 341 |
+
u_time = upstream["inference_time"] * 1000
|
| 342 |
+
a_time = awesome["inference_time"] * 1000
|
| 343 |
+
speedup = u_time / a_time if a_time > 0 else 0
|
| 344 |
+
print(f"{'':<8} {'Latency (ms)':<18} {u_time:<12.1f} {a_time:<12.1f} {speedup:<10.2f}x")
|
| 345 |
+
|
| 346 |
+
# Cold load time
|
| 347 |
+
u_load = upstream["cold_load"]
|
| 348 |
+
a_load = awesome["cold_load"]
|
| 349 |
+
speedup = u_load / a_load if a_load > 0 else 0
|
| 350 |
+
print(f"{'':<8} {'Cold load (s)':<18} {u_load:<12.2f} {a_load:<12.2f} {speedup:<10.2f}x")
|
| 351 |
+
|
| 352 |
+
# Cached load (awesome only)
|
| 353 |
+
cached = data.get("awesome_cached")
|
| 354 |
+
if cached and cached.get("cached_load"):
|
| 355 |
+
c_load = cached["cached_load"]
|
| 356 |
+
speedup = u_load / c_load if c_load > 0 else 0
|
| 357 |
+
print(f"{'':<8} {'Cached load (s)':<18} {'-':<12} {c_load:<12.3f} {speedup:<10.1f}x")
|
| 358 |
+
|
| 359 |
+
print()
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def main():
|
| 363 |
+
parser = argparse.ArgumentParser(description="Comparative Benchmark: Awesome vs Upstream")
|
| 364 |
+
parser.add_argument("--device", "-d", type=str, default="auto",
|
| 365 |
+
choices=["auto", "cpu", "mps", "cuda", "all"],
|
| 366 |
+
help="Device to benchmark")
|
| 367 |
+
parser.add_argument("--batch-sizes", type=int, nargs="+", default=[1, 2, 4],
|
| 368 |
+
help="Batch sizes to test")
|
| 369 |
+
parser.add_argument("--runs", type=int, default=3, help="Number of runs per test")
|
| 370 |
+
parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
|
| 371 |
+
args = parser.parse_args()
|
| 372 |
+
|
| 373 |
+
if args.quick:
|
| 374 |
+
args.batch_sizes = [1, 2]
|
| 375 |
+
args.runs = 2
|
| 376 |
+
|
| 377 |
+
# Determine devices
|
| 378 |
+
available = get_available_devices()
|
| 379 |
+
if args.device == "auto":
|
| 380 |
+
devices = [available[-1]]
|
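+
# Assumes get_available_devices() lists CPU first and the fastest accelerator last
|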
| 381 |
+
elif args.device == "all":
|
| 382 |
+
devices = available
|
| 383 |
+
else:
|
| 384 |
+
requested = torch.device(args.device)
|
| 385 |
+
if requested in available:
|
| 386 |
+
devices = [requested]
|
| 387 |
+
else:
|
| 388 |
+
print(f"Device '{args.device}' not available. Available: {[d.type for d in available]}")
|
| 389 |
+
return
|
| 390 |
+
|
| 391 |
+
# Header
|
| 392 |
+
print("\n" + "=" * 70)
|
| 393 |
+
print(" COMPARATIVE BENCHMARK: AWESOME vs UPSTREAM (VANILLA)")
|
| 394 |
+
print("=" * 70)
|
| 395 |
+
print(f" Model: {MODEL_NAME}")
|
| 396 |
+
print(f" PyTorch: {torch.__version__}")
|
| 397 |
+
print(f" Batch sizes: {args.batch_sizes}")
|
| 398 |
+
print(f" Runs per test: {args.runs}")
|
| 399 |
+
print(f" Devices: {[d.type.upper() for d in devices]}")
|
| 400 |
+
for d in available:
|
| 401 |
+
status = "✓" if d in devices else "○"
|
| 402 |
+
print(f" {status} {d.type.upper()}: {get_device_name(d)}")
|
| 403 |
+
print("=" * 70)
|
| 404 |
+
|
| 405 |
+
all_results = {}
|
| 406 |
+
|
| 407 |
+
for device in devices:
|
| 408 |
+
print(f"\n{'#'*70}")
|
| 409 |
+
print(f" DEVICE: {device.type.upper()} ({get_device_name(device)})")
|
| 410 |
+
print(f"{'#'*70}")
|
| 411 |
+
|
| 412 |
+
results = run_comparison(device, args.batch_sizes, runs=args.runs)
|
| 413 |
+
all_results[device.type] = results
|
| 414 |
+
print_results_table(results, device)
|
| 415 |
+
|
| 416 |
+
# Final summary
|
| 417 |
+
print("\n" + "=" * 70)
|
| 418 |
+
print(" SUMMARY")
|
| 419 |
+
print("=" * 70)
|
| 420 |
+
|
| 421 |
+
for device_type, results in all_results.items():
|
| 422 |
+
print(f"\n {device_type.upper()}:")
|
| 423 |
+
|
| 424 |
+
for batch_size, data in sorted(results.items()):
|
| 425 |
+
upstream = data.get("upstream")
|
| 426 |
+
awesome = data.get("awesome_nocache")
|
| 427 |
+
|
| 428 |
+
if upstream and awesome:
|
| 429 |
+
speedup = awesome["throughput"] / upstream["throughput"]
|
| 430 |
+
print(f" Batch {batch_size}: {speedup:.2f}x faster inference")
|
| 431 |
+
|
| 432 |
+
print("\n" + "=" * 70 + "\n")
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
if __name__ == "__main__":
|
| 436 |
+
main()
|
benchmarks/flash_attention_benchmark.py
ADDED
|
@@ -0,0 +1,488 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) Delanoe Pirard / Aedelon - Apache 2.0
|
| 3 |
+
"""
|
| 4 |
+
Flash Attention Benchmark for Depth Anything 3.
|
| 5 |
+
|
| 6 |
+
Provides clear performance comparison with tables and analysis.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python benchmarks/flash_attention_benchmark.py
|
| 10 |
+
python benchmarks/flash_attention_benchmark.py --detailed
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import gc
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import time
|
| 18 |
+
from dataclasses import dataclass
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 23 |
+
|
| 24 |
+
from depth_anything_3.model.dinov2.layers import (
|
| 25 |
+
FLASH_ATTN_AVAILABLE,
|
| 26 |
+
FLASH_ATTN_VERSION,
|
| 27 |
+
Attention,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
|
| 32 |
+
class BenchmarkConfig:
|
| 33 |
+
"""Configuration for a benchmark test case."""
|
| 34 |
+
|
| 35 |
+
name: str
|
| 36 |
+
seq_len: int
|
| 37 |
+
batch_size: int
|
| 38 |
+
embed_dim: int
|
| 39 |
+
num_heads: int
|
| 40 |
+
image_size: str # Description of corresponding image size
|
| 41 |
+
|
| 42 |
+
@property
|
| 43 |
+
def description(self):
|
| 44 |
+
return f"{self.name} ({self.image_size})"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# Depth Anything 3 model configurations
|
| 48 |
+
DA3_CONFIGS = {
|
| 49 |
+
"vitb": {"embed_dim": 768, "num_heads": 12, "depth": 12},
|
| 50 |
+
"vitl": {"embed_dim": 1024, "num_heads": 16, "depth": 24},
|
| 51 |
+
"vitg": {"embed_dim": 1536, "num_heads": 24, "depth": 40},
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def get_device_info():
|
| 56 |
+
"""Get device information."""
|
| 57 |
+
if torch.cuda.is_available():
|
| 58 |
+
device = torch.device("cuda")
|
| 59 |
+
device_name = torch.cuda.get_device_name()
|
| 60 |
+
memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
|
| 61 |
+
compute_cap = torch.cuda.get_device_capability()
|
| 62 |
+
return {
|
| 63 |
+
"type": "cuda",
|
| 64 |
+
"device": device,
|
| 65 |
+
"name": device_name,
|
| 66 |
+
"memory_gb": memory_gb,
|
| 67 |
+
"compute_capability": f"{compute_cap[0]}.{compute_cap[1]}",
|
| 68 |
+
}
|
| 69 |
+
elif torch.backends.mps.is_available():
|
| 70 |
+
return {
|
| 71 |
+
"type": "mps",
|
| 72 |
+
"device": torch.device("mps"),
|
| 73 |
+
"name": "Apple Silicon",
|
| 74 |
+
"memory_gb": None,
|
| 75 |
+
"compute_capability": None,
|
| 76 |
+
}
|
| 77 |
+
else:
|
| 78 |
+
return {
|
| 79 |
+
"type": "cpu",
|
| 80 |
+
"device": torch.device("cpu"),
|
| 81 |
+
"name": "CPU",
|
| 82 |
+
"memory_gb": None,
|
| 83 |
+
"compute_capability": None,
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def benchmark_attention(attn_module, x, warmup=5, runs=20):
|
| 88 |
+
"""Run benchmark for a single attention module."""
|
| 89 |
+
device = x.device
|
| 90 |
+
|
| 91 |
+
# Warmup
|
| 92 |
+
with torch.no_grad():
|
| 93 |
+
for _ in range(warmup):
|
| 94 |
+
_ = attn_module(x)
|
| 95 |
+
if device.type == "cuda":
|
| 96 |
+
torch.cuda.synchronize()
|
| 97 |
+
|
| 98 |
+
# Reset memory tracking
|
| 99 |
+
if device.type == "cuda":
|
| 100 |
+
torch.cuda.reset_peak_memory_stats()
|
| 101 |
+
|
| 102 |
+
# Benchmark
|
| 103 |
+
times = []
|
| 104 |
+
with torch.no_grad():
|
| 105 |
+
for _ in range(runs):
|
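+
# CUDA kernels launch asynchronously; synchronize so perf_counter brackets real GPU work
|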
| 106 |
+
if device.type == "cuda":
|
| 107 |
+
torch.cuda.synchronize()
|
| 108 |
+
start = time.perf_counter()
|
| 109 |
+
_ = attn_module(x)
|
| 110 |
+
if device.type == "cuda":
|
| 111 |
+
torch.cuda.synchronize()
|
| 112 |
+
times.append((time.perf_counter() - start) * 1000)
|
| 113 |
+
|
| 114 |
+
# Memory
|
| 115 |
+
peak_mem_mb = 0
|
| 116 |
+
if device.type == "cuda":
|
| 117 |
+
peak_mem_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
|
| 118 |
+
|
| 119 |
+
times_tensor = torch.tensor(times)
|
| 120 |
+
return {
|
| 121 |
+
"mean_ms": times_tensor.mean().item(),
|
| 122 |
+
"std_ms": times_tensor.std().item(),
|
| 123 |
+
"min_ms": times_tensor.min().item(),
|
| 124 |
+
"peak_mem_mb": peak_mem_mb,
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def print_header():
|
| 129 |
+
"""Print benchmark header."""
|
| 130 |
+
print("\n" + "=" * 80)
|
| 131 |
+
print(" " * 20 + "FLASH ATTENTION BENCHMARK - DEPTH ANYTHING 3")
|
| 132 |
+
print("=" * 80 + "\n")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def get_sdpa_backend_info():
|
| 136 |
+
"""Get info about which SDPA backend is being used."""
|
| 137 |
+
info = {}
|
| 138 |
+
if torch.cuda.is_available():
|
| 139 |
+
from torch.backends.cuda import (
|
| 140 |
+
flash_sdp_enabled,
|
| 141 |
+
mem_efficient_sdp_enabled,
|
| 142 |
+
math_sdp_enabled,
|
| 143 |
+
)
|
| 144 |
+
info["flash_sdp"] = flash_sdp_enabled()
|
| 145 |
+
info["mem_efficient_sdp"] = mem_efficient_sdp_enabled()
|
| 146 |
+
info["math_sdp"] = math_sdp_enabled()
|
| 147 |
+
return info
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def print_device_info(device_info):
|
| 151 |
+
"""Print device information."""
|
| 152 |
+
print("📊 HARDWARE CONFIGURATION")
|
| 153 |
+
print("─" * 80)
|
| 154 |
+
print(f" Device Type : {device_info['type'].upper()}")
|
| 155 |
+
print(f" Device Name : {device_info['name']}")
|
| 156 |
+
if device_info["memory_gb"]:
|
| 157 |
+
print(f" Memory : {device_info['memory_gb']:.1f} GB")
|
| 158 |
+
if device_info["compute_capability"]:
|
| 159 |
+
print(f" Compute Cap. : {device_info['compute_capability']}")
|
| 160 |
+
cc = float(device_info["compute_capability"])
|
| 161 |
+
if cc >= 7.5:
|
| 162 |
+
print(f" ✅ Flash Attention supported (≥7.5)")
|
| 163 |
+
else:
|
| 164 |
+
print(f" ❌ Flash Attention requires ≥7.5")
|
| 165 |
+
|
| 166 |
+
# SDPA backend info
|
| 167 |
+
sdpa_info = get_sdpa_backend_info()
|
| 168 |
+
if sdpa_info:
|
| 169 |
+
print(f"\n PyTorch SDPA Backends:")
|
| 170 |
+
print(f" Flash SDP : {'✅ Enabled' if sdpa_info.get('flash_sdp') else '❌ Disabled'}")
|
| 171 |
+
print(f" MemEfficient : {'✅ Enabled' if sdpa_info.get('mem_efficient_sdp') else '❌ Disabled'}")
|
| 172 |
+
print(f" Math SDP : {'✅ Enabled' if sdpa_info.get('math_sdp') else '❌ Disabled'}")
|
| 173 |
+
|
| 174 |
+
if sdpa_info.get('flash_sdp'):
|
| 175 |
+
print(f"\n ⚡ PyTorch SDPA uses Flash Attention internally!")
|
| 176 |
+
print(f" (No need for flash-attn package with PyTorch >= 2.2)")
|
| 177 |
+
|
| 178 |
+
print(f"\n flash-attn pkg : {'✅ Installed v' + FLASH_ATTN_VERSION if FLASH_ATTN_AVAILABLE else '❌ Not installed (optional)'}")
|
| 179 |
+
print()
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def print_table_header():
|
| 183 |
+
"""Print benchmark table header."""
|
| 184 |
+
print(
|
| 185 |
+
"┌──────────────────────────┬──────────────┬──────────────┬──────────────┬────────────┐"
|
| 186 |
+
)
|
| 187 |
+
print(
|
| 188 |
+
"│ Configuration │ flash_attn │ sdpa │ manual │ Speedup │"
|
| 189 |
+
)
|
| 190 |
+
print(
|
| 191 |
+
"├──────────────────────────┼──────────────┼──────────────┼──────────────┼────────────┤"
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def print_table_row(config_desc, results, baseline="sdpa"):
|
| 196 |
+
"""Print a benchmark result row."""
|
| 197 |
+
backends = ["flash_attn", "sdpa", "manual"]
|
| 198 |
+
|
| 199 |
+
# Format times
|
| 200 |
+
time_strs = []
|
| 201 |
+
for backend in backends:
|
| 202 |
+
if backend in results and results[backend]:
|
| 203 |
+
time_ms = results[backend]["mean_ms"]
|
| 204 |
+
time_strs.append(f"{time_ms:6.2f} ms")
|
| 205 |
+
else:
|
| 206 |
+
time_strs.append(" N/A")
|
| 207 |
+
|
| 208 |
+
# Calculate speedup
|
| 209 |
+
speedup_str = " -"
|
| 210 |
+
if "flash_attn" in results and results["flash_attn"] and baseline in results:
|
| 211 |
+
if results[baseline]:
|
| 212 |
+
speedup = results[baseline]["mean_ms"] / results["flash_attn"]["mean_ms"]
|
| 213 |
+
speedup_str = f" {speedup:.2f}x ⚡" if speedup > 1.1 else f" {speedup:.2f}x"
|
| 214 |
+
|
| 215 |
+
print(
|
| 216 |
+
f"│ {config_desc:24s} │ {time_strs[0]:12s} │ {time_strs[1]:12s} │ {time_strs[2]:12s} │ {speedup_str:10s} │"
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def print_table_footer():
|
| 221 |
+
"""Print benchmark table footer."""
|
| 222 |
+
print(
|
| 223 |
+
"└──────────────────────────┴──────────────┴──────────────┴──────────────┴────────────┘"
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def print_model_analysis(model_name, config, results, num_layers):
|
| 228 |
+
"""Print detailed analysis for a specific model."""
|
| 229 |
+
if "flash_attn" not in results or not results["flash_attn"]:
|
| 230 |
+
return
|
| 231 |
+
|
| 232 |
+
flash_time = results["flash_attn"]["mean_ms"]
|
| 233 |
+
sdpa_time = results["sdpa"]["mean_ms"] if "sdpa" in results else flash_time
|
| 234 |
+
|
| 235 |
+
speedup = sdpa_time / flash_time
|
| 236 |
+
time_saved_per_layer = sdpa_time - flash_time  # the benchmark times one layer, so this diff is already per-layer
|
| 237 |
+
total_time_saved = time_saved_per_layer * num_layers
|
| 238 |
+
|
| 239 |
+
print(f"\n 📈 {model_name} Analysis:")
|
| 240 |
+
print(f" • Attention time per layer: {flash_time:.2f} ms (flash) vs {sdpa_time:.2f} ms (sdpa)")
|
| 241 |
+
print(f" • Time saved per layer: {time_saved_per_layer:.2f} ms")
|
| 242 |
+
print(f" • Total time saved ({num_layers} layers): {total_time_saved:.1f} ms")
|
| 243 |
+
print(f" • Speedup: {speedup:.2f}x on attention")
|
| 244 |
+
|
| 245 |
+
# Estimate full inference impact
|
| 246 |
+
# Attention is ~15-20% of total inference time
|
| 247 |
+
attn_fraction = 0.175
|
| 248 |
+
overall_speedup = 1 / (1 - attn_fraction + attn_fraction / speedup)
|
| 249 |
+
overall_improvement = (1 - 1 / overall_speedup) * 100
|
| 250 |
+
|
| 251 |
+
print(
|
| 252 |
+
f" • Estimated full inference speedup: {overall_speedup:.2f}x (~{overall_improvement:.1f}% faster)"
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def run_benchmark(test_configs, backends, warmup=5, runs=20, detailed=False):
|
| 257 |
+
"""Run complete benchmark suite."""
|
| 258 |
+
device_info = get_device_info()
|
| 259 |
+
device = device_info["device"]
|
| 260 |
+
dtype = torch.float16 if device.type == "cuda" else torch.float32
|
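+
# flash-attn kernels support only fp16/bf16, so CUDA benchmarks run in half precision
|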
| 261 |
+
|
| 262 |
+
print_header()
|
| 263 |
+
print_device_info(device_info)
|
| 264 |
+
|
| 265 |
+
# Filter backends based on availability
|
| 266 |
+
available_backends = []
|
| 267 |
+
if FLASH_ATTN_AVAILABLE and device.type == "cuda":
|
| 268 |
+
available_backends.append("flash_attn")
|
| 269 |
+
available_backends.append("sdpa")
|
| 270 |
+
if detailed:
|
| 271 |
+
available_backends.append("manual")
|
| 272 |
+
|
| 273 |
+
all_results = {}
|
| 274 |
+
|
| 275 |
+
# Run benchmarks by model
|
| 276 |
+
for model_name, model_config in DA3_CONFIGS.items():
|
| 277 |
+
print(f"\n🔬 MODEL: {model_name.upper()} (dim={model_config['embed_dim']}, heads={model_config['num_heads']}, depth={model_config['depth']})")
|
| 278 |
+
print("─" * 80)
|
| 279 |
+
print_table_header()
|
| 280 |
+
|
| 281 |
+
model_results = {}
|
| 282 |
+
|
| 283 |
+
for test_config in test_configs:
|
| 284 |
+
# Adjust config for this model
|
| 285 |
+
config = BenchmarkConfig(
|
| 286 |
+
name=test_config.name,
|
| 287 |
+
seq_len=test_config.seq_len,
|
| 288 |
+
batch_size=test_config.batch_size,
|
| 289 |
+
embed_dim=model_config["embed_dim"],
|
| 290 |
+
num_heads=model_config["num_heads"],
|
| 291 |
+
image_size=test_config.image_size,
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
x = torch.randn(
|
| 295 |
+
config.batch_size, config.seq_len, config.embed_dim, device=device, dtype=dtype
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
results = {}
|
| 299 |
+
for backend in available_backends:
|
| 300 |
+
gc.collect()
|
| 301 |
+
if device.type == "cuda":
|
| 302 |
+
torch.cuda.empty_cache()
|
| 303 |
+
|
| 304 |
+
try:
|
| 305 |
+
attn = Attention(
|
| 306 |
+
dim=config.embed_dim,
|
| 307 |
+
num_heads=config.num_heads,
|
| 308 |
+
attn_backend=backend,
|
| 309 |
+
).to(device, dtype)
|
| 310 |
+
attn.eval()
|
| 311 |
+
|
| 312 |
+
result = benchmark_attention(attn, x, warmup=warmup, runs=runs)
|
| 313 |
+
results[backend] = result
|
| 314 |
+
|
| 315 |
+
del attn
|
| 316 |
+
except Exception as e:
|
| 317 |
+
results[backend] = None
|
| 318 |
+
if detailed:
|
| 319 |
+
print(f" {backend} failed: {e}")
|
| 320 |
+
|
| 321 |
+
model_results[config.name] = results
|
| 322 |
+
print_table_row(config.description, results)
|
| 323 |
+
|
| 324 |
+
print_table_footer()
|
| 325 |
+
|
| 326 |
+
# Analysis for this model
|
| 327 |
+
if detailed and model_results:
|
| 328 |
+
# Use medium config for analysis
|
| 329 |
+
medium_key = next(
|
| 330 |
+
(k for k in model_results.keys() if "1024" in k.lower() or "medium" in k.lower()),
|
| 331 |
+
list(model_results.keys())[0],
|
| 332 |
+
)
|
| 333 |
+
print_model_analysis(
|
| 334 |
+
model_name.upper(),
|
| 335 |
+
test_configs[0],
|
| 336 |
+
model_results[medium_key],
|
| 337 |
+
model_config["depth"],
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
all_results[model_name] = model_results
|
| 341 |
+
|
| 342 |
+
# Final summary
|
| 343 |
+
print("\n" + "=" * 80)
|
| 344 |
+
print("📋 SUMMARY & RECOMMENDATIONS")
|
| 345 |
+
print("=" * 80)
|
| 346 |
+
|
| 347 |
+
sdpa_info = get_sdpa_backend_info()
|
| 348 |
+
|
| 349 |
+
if device.type == "cuda":
|
| 350 |
+
# Check if PyTorch SDPA has Flash enabled
|
| 351 |
+
if sdpa_info.get('flash_sdp'):
|
| 352 |
+
print("\n✅ Flash Attention is ACTIVE via PyTorch SDPA!")
|
| 353 |
+
print("\n Your setup:")
|
| 354 |
+
print(f" • PyTorch {torch.__version__} with native Flash Attention")
|
| 355 |
+
print(" • SDPA backend: Flash SDP ⚡")
|
| 356 |
+
print(" • No additional packages needed!")
|
| 357 |
+
print("\n Benefits you're already getting:")
|
| 358 |
+
print(" • 2-4x faster attention vs manual implementation")
|
| 359 |
+
print(" • Memory-efficient attention computation")
|
| 360 |
+
print(" • Automatic kernel selection per input size")
|
| 361 |
+
|
| 362 |
+
if FLASH_ATTN_AVAILABLE:
|
| 363 |
+
print(f"\n ℹ️ flash-attn v{FLASH_ATTN_VERSION} also installed")
|
| 364 |
+
print(" (May provide slight additional optimization in some cases)")
|
| 365 |
+
else:
|
| 366 |
+
print("\n ℹ️ flash-attn package: Not needed!")
|
| 367 |
+
print(" PyTorch >= 2.2 includes Flash Attention natively.")
|
| 368 |
+
|
| 369 |
+
elif FLASH_ATTN_AVAILABLE:
|
| 370 |
+
print("\n✅ Flash Attention is ACTIVE via flash-attn package")
|
| 371 |
+
print(f"\n Using flash-attn v{FLASH_ATTN_VERSION}")
|
| 372 |
+
print("\n Benefits:")
|
| 373 |
+
print(" • 2-3x faster attention computation")
|
| 374 |
+
print(" • ~15-25% overall inference speedup")
|
| 375 |
+
print(" • Lower memory usage")
|
| 376 |
+
|
| 377 |
+
else:
|
| 378 |
+
print("\n⚠️ Flash Attention not available")
|
| 379 |
+
print("\n Options to enable:")
|
| 380 |
+
print(" 1. Upgrade PyTorch to >= 2.2 (recommended)")
|
| 381 |
+
print(" 2. Install flash-attn: pip install flash-attn --no-build-isolation")
|
| 382 |
+
|
| 383 |
+
elif device.type == "mps":
|
| 384 |
+
print("\n📱 Apple Silicon (MPS) detected")
|
| 385 |
+
print("\n • Flash Attention not available for MPS")
|
| 386 |
+
print(" • PyTorch SDPA uses optimized Metal kernels")
|
| 387 |
+
print(" • Already running at optimal speed for your hardware")
|
| 388 |
+
|
| 389 |
+
else:
|
| 390 |
+
print("\n💻 CPU detected")
|
| 391 |
+
print("\n • Consider using GPU for faster inference")
|
| 392 |
+
print(" • Flash Attention is CUDA-only")
|
| 393 |
+
|
| 394 |
+
# Print SDPA vs Manual speedup summary
|
| 395 |
+
print("\n" + "─" * 80)
|
| 396 |
+
print("⚡ PERFORMANCE COMPARISON")
|
| 397 |
+
print("─" * 80)
|
| 398 |
+
print("\n SDPA vs Manual attention speedup (per layer):")
|
| 399 |
+
|
| 400 |
+
for model_name, model_results in all_results.items():
|
| 401 |
+
if model_results:
|
| 402 |
+
# Get XLarge config results for most impact
|
| 403 |
+
xlarge_key = next((k for k in model_results.keys() if "xlarge" in k.lower()), list(model_results.keys())[-1])
|
| 404 |
+
if xlarge_key in model_results:
|
| 405 |
+
res = model_results[xlarge_key]
|
| 406 |
+
if res.get("sdpa") and res.get("manual"):
|
| 407 |
+
speedup = res["manual"]["mean_ms"] / res["sdpa"]["mean_ms"]
|
| 408 |
+
print(f" • {model_name.upper():6s}: {speedup:.1f}x faster (sdpa: {res['sdpa']['mean_ms']:.2f}ms vs manual: {res['manual']['mean_ms']:.2f}ms)")
|
| 409 |
+
|
| 410 |
+
print("\n" + "=" * 80)
|
| 411 |
+
print()
|
| 412 |
+
|
| 413 |
+
return all_results
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def main():
|
| 417 |
+
parser = argparse.ArgumentParser(description="Flash Attention benchmark for DA3")
|
| 418 |
+
parser.add_argument(
|
| 419 |
+
"--detailed",
|
| 420 |
+
action="store_true",
|
| 421 |
+
help="Show detailed analysis and include manual backend",
|
| 422 |
+
)
|
| 423 |
+
parser.add_argument(
|
| 424 |
+
"--warmup",
|
| 425 |
+
type=int,
|
| 426 |
+
default=5,
|
| 427 |
+
help="Warmup iterations (default: 5)",
|
| 428 |
+
)
|
| 429 |
+
parser.add_argument(
|
| 430 |
+
"--runs",
|
| 431 |
+
type=int,
|
| 432 |
+
default=20,
|
| 433 |
+
help="Benchmark runs (default: 20)",
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
args = parser.parse_args()
|
| 437 |
+
|
| 438 |
+
# Test configurations based on common image sizes
|
| 439 |
+
test_configs = [
|
| 440 |
+
BenchmarkConfig(
|
| 441 |
+
name="Small",
|
| 442 |
+
seq_len=256,
|
| 443 |
+
batch_size=1,
|
| 444 |
+
embed_dim=768, # Will be overridden per model
|
| 445 |
+
num_heads=12, # Will be overridden per model
|
| 446 |
+
image_size="392px image",
|
| 447 |
+
),
|
| 448 |
+
BenchmarkConfig(
|
| 449 |
+
name="Medium",
|
| 450 |
+
seq_len=529,
|
| 451 |
+
batch_size=1,
|
| 452 |
+
embed_dim=768,
|
| 453 |
+
num_heads=12,
|
| 454 |
+
image_size="518px image",
|
| 455 |
+
),
|
| 456 |
+
BenchmarkConfig(
|
| 457 |
+
name="Large",
|
| 458 |
+
seq_len=1024,
|
| 459 |
+
batch_size=1,
|
| 460 |
+
embed_dim=768,
|
| 461 |
+
num_heads=12,
|
| 462 |
+
image_size="742px image",
|
| 463 |
+
),
|
| 464 |
+
BenchmarkConfig(
|
| 465 |
+
name="XLarge",
|
| 466 |
+
seq_len=1369,
|
| 467 |
+
batch_size=1,
|
| 468 |
+
embed_dim=768,
|
| 469 |
+
num_heads=12,
|
| 470 |
+
image_size="1024px image",
|
| 471 |
+
),
|
| 472 |
+
]
|
| 473 |
+
|
| 474 |
+
backends = ["flash_attn", "sdpa"]
|
| 475 |
+
if args.detailed:
|
| 476 |
+
backends.append("manual")
|
| 477 |
+
|
| 478 |
+
run_benchmark(
|
| 479 |
+
test_configs=test_configs,
|
| 480 |
+
backends=backends,
|
| 481 |
+
warmup=args.warmup,
|
| 482 |
+
runs=args.runs,
|
| 483 |
+
detailed=args.detailed,
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
if __name__ == "__main__":
|
| 488 |
+
main()
|
benchmarks/full_benchmark.py
ADDED
|
@@ -0,0 +1,696 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) 2025 Delanoe Pirard / Aedelon - Apache 2.0
|
| 3 |
+
"""
|
| 4 |
+
Full Benchmark Suite for Depth Anything 3
|
| 5 |
+
|
| 6 |
+
Tests ALL optimization combinations for each device (CPU, MPS, CUDA).
|
| 7 |
+
|
| 8 |
+
Optimizations tested:
|
| 9 |
+
- Preprocessing: CPU (PIL) vs GPU (NVJPEG on CUDA)
|
| 10 |
+
- Attention: SDPA (Flash Attention) vs Manual
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python benchmarks/full_benchmark.py # Best device only
|
| 14 |
+
python benchmarks/full_benchmark.py -d all # All devices
|
| 15 |
+
python benchmarks/full_benchmark.py -d cuda # CUDA only
|
| 16 |
+
python benchmarks/full_benchmark.py --quick # Quick mode
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import argparse
|
| 20 |
+
import gc
|
| 21 |
+
import logging
|
| 22 |
+
import os
|
| 23 |
+
import shutil
|
| 24 |
+
import sys
|
| 25 |
+
import time
|
| 26 |
+
import warnings
|
| 27 |
+
from dataclasses import dataclass
|
| 28 |
+
from typing import Dict, List, Optional
|
| 29 |
+
|
| 30 |
+
# Suppress ALL logging before any imports
|
| 31 |
+
logging.disable(logging.CRITICAL)
|
| 32 |
+
os.environ["DA3_LOG_LEVEL"] = "ERROR"
|
| 33 |
+
warnings.filterwarnings("ignore")
|
| 34 |
+
|
| 35 |
+
import numpy as np
|
| 36 |
+
import torch
|
| 37 |
+
from PIL import Image
|
| 38 |
+
|
| 39 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 40 |
+
|
| 41 |
+
# Suppress depth_anything_3 logger specifically
|
| 42 |
+
logging.getLogger("depth_anything_3").disabled = True
|
| 43 |
+
logging.getLogger("dinov2").disabled = True
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ============================================================================
|
| 47 |
+
# STYLES
|
| 48 |
+
# ============================================================================
|
| 49 |
+
|
| 50 |
+
class Style:
|
| 51 |
+
CYAN = "\033[96m"
|
| 52 |
+
GREEN = "\033[92m"
|
| 53 |
+
YELLOW = "\033[93m"
|
| 54 |
+
RED = "\033[91m"
|
| 55 |
+
BOLD = "\033[1m"
|
| 56 |
+
DIM = "\033[2m"
|
| 57 |
+
RESET = "\033[0m"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def colored(text, color, bold=False):
|
| 61 |
+
prefix = Style.BOLD if bold else ""
|
| 62 |
+
return f"{prefix}{color}{text}{Style.RESET}"
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ============================================================================
|
| 66 |
+
# UTILITIES
|
| 67 |
+
# ============================================================================
|
| 68 |
+
|
| 69 |
+
def cleanup():
|
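+
# Drop Python references first, then return cached blocks to each device allocator
|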
| 70 |
+
gc.collect()
|
| 71 |
+
if torch.cuda.is_available():
|
| 72 |
+
torch.cuda.empty_cache()
|
| 73 |
+
torch.cuda.reset_peak_memory_stats()
|
| 74 |
+
if torch.backends.mps.is_available():
|
| 75 |
+
torch.mps.empty_cache()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def sync_device(device):
|
| 79 |
+
if device.type == "cuda":
|
| 80 |
+
torch.cuda.synchronize()
|
| 81 |
+
elif device.type == "mps":
|
| 82 |
+
torch.mps.synchronize()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def get_available_devices() -> List[torch.device]:
|
| 86 |
+
"""Get all available devices for benchmarking."""
|
| 87 |
+
devices = [torch.device("cpu")]
|
| 88 |
+
if torch.backends.mps.is_available():
|
| 89 |
+
devices.append(torch.device("mps"))
|
| 90 |
+
if torch.cuda.is_available():
|
| 91 |
+
devices.append(torch.device("cuda"))
|
| 92 |
+
return devices
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def get_device_name(device: torch.device) -> str:
|
| 96 |
+
"""Get human-readable device name."""
|
| 97 |
+
if device.type == "cuda":
|
| 98 |
+
return torch.cuda.get_device_name(device)
|
| 99 |
+
elif device.type == "mps":
|
| 100 |
+
return "Apple Silicon (MPS)"
|
| 101 |
+
else:
|
| 102 |
+
import platform
|
| 103 |
+
return f"CPU ({platform.processor() or 'Unknown'})"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ============================================================================
|
| 107 |
+
# DATA CLASSES
|
| 108 |
+
# ============================================================================
|
| 109 |
+
|
| 110 |
+
@dataclass
|
| 111 |
+
class BenchmarkResult:
|
| 112 |
+
"""Single benchmark result."""
|
| 113 |
+
mean_ms: float
|
| 114 |
+
std_ms: float
|
| 115 |
+
fps: float
|
| 116 |
+
|
| 117 |
+
@classmethod
|
| 118 |
+
def from_times(cls, times: List[float], batch_size: int = 1):
|
| 119 |
+
mean_ms = np.mean(times)
|
| 120 |
+
std_ms = np.std(times)
|
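+
# times are per-batch latencies in ms, so throughput = batch_size / (mean_ms / 1000)
|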
| 121 |
+
fps = 1000 / mean_ms * batch_size
|
| 122 |
+
return cls(mean_ms=mean_ms, std_ms=std_ms, fps=fps)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@dataclass
|
| 126 |
+
class OptimizationConfig:
|
| 127 |
+
"""Configuration for a specific optimization combination."""
|
| 128 |
+
name: str
|
| 129 |
+
preprocessing: str # "cpu" or "gpu"
|
| 130 |
+
attention: str # "sdpa" or "manual"
|
| 131 |
+
description: str
|
| 132 |
+
|
| 133 |
+
@property
|
| 134 |
+
def short_name(self) -> str:
|
| 135 |
+
prep = "GPU" if self.preprocessing == "gpu" else "CPU"
|
| 136 |
+
attn = "SDPA" if self.attention == "sdpa" else "Manual"
|
| 137 |
+
return f"{prep}+{attn}"
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ============================================================================
|
| 141 |
+
# BENCHMARK FUNCTIONS
|
| 142 |
+
# ============================================================================
|
| 143 |
+
|
| 144 |
+
def get_optimization_configs(device: torch.device) -> List[OptimizationConfig]:
|
| 145 |
+
"""Get all valid optimization configurations for a device."""
|
| 146 |
+
configs = []
|
| 147 |
+
|
| 148 |
+
if device.type == "cuda":
|
| 149 |
+
# CUDA: All 4 combinations
|
| 150 |
+
configs = [
|
| 151 |
+
OptimizationConfig("gpu_sdpa", "gpu", "sdpa", "GPU Decode (NVJPEG) + SDPA (Flash)"),
|
| 152 |
+
OptimizationConfig("gpu_manual", "gpu", "manual", "GPU Decode (NVJPEG) + Manual Attn"),
|
| 153 |
+
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA (Flash)"),
|
| 154 |
+
OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
|
| 155 |
+
]
|
| 156 |
+
elif device.type == "mps":
|
| 157 |
+
# MPS: CPU preprocessing is better, 2 combinations
|
| 158 |
+
configs = [
|
| 159 |
+
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "CPU Decode (PIL) + SDPA"),
|
| 160 |
+
OptimizationConfig("cpu_manual", "cpu", "manual", "CPU Decode (PIL) + Manual Attn"),
|
| 161 |
+
]
|
| 162 |
+
else:
|
| 163 |
+
# CPU: 2 combinations
|
| 164 |
+
configs = [
|
| 165 |
+
OptimizationConfig("cpu_sdpa", "cpu", "sdpa", "SDPA Attention"),
|
| 166 |
+
OptimizationConfig("cpu_manual", "cpu", "manual", "Manual Attention"),
|
| 167 |
+
]
|
| 168 |
+
|
| 169 |
+
return configs
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def benchmark_preprocessing_detailed(device: torch.device, runs: int = 5) -> Dict:
|
| 173 |
+
"""Benchmark preprocessing in detail."""
|
| 174 |
+
from depth_anything_3.utils.io.input_processor import InputProcessor
|
| 175 |
+
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
|
| 176 |
+
|
| 177 |
+
results = {}
|
| 178 |
+
temp_dir = "temp_bench_preproc"
|
| 179 |
+
|
| 180 |
+
sizes = [
|
| 181 |
+
("720p", 1280, 720),
|
| 182 |
+
("1080p", 1920, 1080),
|
| 183 |
+
("4K", 3840, 2160),
|
| 184 |
+
]
|
| 185 |
+
|
| 186 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 187 |
+
|
| 188 |
+
try:
|
| 189 |
+
cpu_proc = InputProcessor()
|
| 190 |
+
gpu_proc = None
|
| 191 |
+
if device.type == "cuda":
|
| 192 |
+
gpu_proc = GPUInputProcessor(device=device)
|
| 193 |
+
|
| 194 |
+
for name, w, h in sizes:
|
| 195 |
+
results[name] = {}
|
| 196 |
+
|
| 197 |
+
# Create test files
|
| 198 |
+
files = []
|
| 199 |
+
pil_imgs = []
|
| 200 |
+
for i in range(4):
|
| 201 |
+
img = Image.new("RGB", (w, h), color=(100 + i*10, 150, 200))
|
| 202 |
+
fpath = f"{temp_dir}/{name}_{i}.jpg"
|
| 203 |
+
img.save(fpath, quality=95)
|
| 204 |
+
files.append(fpath)
|
| 205 |
+
pil_imgs.append(img.copy())
|
| 206 |
+
|
| 207 |
+
# CPU benchmark
|
| 208 |
+
cleanup()
|
| 209 |
+
for _ in range(2):
|
| 210 |
+
cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
|
| 211 |
+
|
| 212 |
+
times = []
|
| 213 |
+
for _ in range(runs):
|
| 214 |
+
start = time.perf_counter()
|
| 215 |
+
cpu_proc(image=pil_imgs, process_res=518, num_workers=8)
|
| 216 |
+
times.append((time.perf_counter() - start) * 1000)
|
| 217 |
+
results[name]["cpu"] = BenchmarkResult.from_times(times, batch_size=4)
|
| 218 |
+
|
| 219 |
+
# GPU benchmark (NVJPEG for CUDA)
|
| 220 |
+
if gpu_proc and gpu_proc.use_gpu:
|
| 221 |
+
cleanup()
|
| 222 |
+
for _ in range(2):
|
| 223 |
+
gpu_proc(image=files, process_res=518, num_workers=1)
|
| 224 |
+
sync_device(device)
|
| 225 |
+
|
| 226 |
+
times = []
|
| 227 |
+
for _ in range(runs):
|
| 228 |
+
sync_device(device)
|
| 229 |
+
start = time.perf_counter()
|
| 230 |
+
gpu_proc(image=files, process_res=518, num_workers=1)
|
| 231 |
+
sync_device(device)
|
| 232 |
+
times.append((time.perf_counter() - start) * 1000)
|
| 233 |
+
results[name]["gpu"] = BenchmarkResult.from_times(times, batch_size=4)
|
| 234 |
+
|
| 235 |
+
finally:
|
| 236 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 237 |
+
|
| 238 |
+
return results
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def benchmark_attention_detailed(device: torch.device, runs: int = 10) -> Dict:
|
| 242 |
+
"""Benchmark attention backends in detail."""
|
| 243 |
+
from depth_anything_3.model.dinov2.layers import Attention
|
| 244 |
+
|
| 245 |
+
results = {}
|
| 246 |
+
dtype = torch.float16 if device.type == "cuda" else torch.float32
|
| 247 |
+
|
| 248 |
+
configs = [
|
| 249 |
+
("ViT-S (518px)", 384, 6, 529),
|
| 250 |
+
("ViT-L (518px)", 1024, 16, 529),
|
| 251 |
+
("ViT-L (770px)", 1024, 16, 1156),
|
| 252 |
+
]
|
| 253 |
+
|
| 254 |
+
for name, dim, heads, seq_len in configs:
|
| 255 |
+
results[name] = {}
|
| 256 |
+
x = torch.randn(1, seq_len, dim, device=device, dtype=dtype)
|
| 257 |
+
|
| 258 |
+
for backend in ["sdpa", "manual"]:
|
| 259 |
+
cleanup()
|
| 260 |
+
attn = Attention(dim=dim, num_heads=heads, attn_backend=backend).to(device, dtype)
|
| 261 |
+
attn.eval()
|
| 262 |
+
|
| 263 |
+
# Warmup
|
| 264 |
+
with torch.no_grad():
|
| 265 |
+
for _ in range(3):
|
| 266 |
+
attn(x)
|
| 267 |
+
sync_device(device)
|
| 268 |
+
|
| 269 |
+
# Benchmark
|
| 270 |
+
times = []
|
| 271 |
+
with torch.no_grad():
|
| 272 |
+
for _ in range(runs):
|
| 273 |
+
sync_device(device)
|
| 274 |
+
start = time.perf_counter()
|
| 275 |
+
attn(x)
|
| 276 |
+
sync_device(device)
|
| 277 |
+
times.append((time.perf_counter() - start) * 1000)
|
| 278 |
+
|
| 279 |
+
results[name][backend] = BenchmarkResult.from_times(times)
|
| 280 |
+
del attn
|
| 281 |
+
|
| 282 |
+
return results
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def benchmark_inference_matrix(
|
| 286 |
+
device: torch.device,
|
| 287 |
+
models: List[str],
|
| 288 |
+
runs: int = 3,
|
| 289 |
+
) -> Dict:
|
| 290 |
+
"""Benchmark all optimization combinations for inference."""
|
| 291 |
+
from depth_anything_3.api import DepthAnything3
|
| 292 |
+
|
| 293 |
+
results = {}
|
| 294 |
+
temp_dir = "temp_bench_infer"
|
| 295 |
+
configs = get_optimization_configs(device)
|
| 296 |
+
|
| 297 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 298 |
+
|
| 299 |
+
# Create test images (720p)
|
| 300 |
+
img_paths = []
|
| 301 |
+
pil_imgs = []
|
| 302 |
+
for i in range(4):
|
| 303 |
+
img = Image.new("RGB", (1280, 720), color=(100 + i*20, 150, 200))
|
| 304 |
+
path = f"{temp_dir}/test_{i}.jpg"
|
| 305 |
+
img.save(path, quality=95)
|
| 306 |
+
img_paths.append(path)
|
| 307 |
+
pil_imgs.append(img.copy())
|
| 308 |
+
|
| 309 |
+
try:
|
| 310 |
+
for model_name in models:
|
| 311 |
+
results[model_name] = {}
|
| 312 |
+
|
| 313 |
+
for config in configs:
|
| 314 |
+
cleanup()
|
| 315 |
+
|
| 316 |
+
# Set attention backend
|
| 317 |
+
os.environ["DA3_ATTENTION_BACKEND"] = config.attention
|
| 318 |
+
|
| 319 |
+
# Load model fresh (to apply attention backend)
|
| 320 |
+
model = DepthAnything3(
|
| 321 |
+
model_name=model_name,
|
| 322 |
+
device=device,
|
| 323 |
+
use_cache=False,
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
# Choose input based on preprocessing
|
| 327 |
+
if config.preprocessing == "gpu" and device.type == "cuda":
|
| 328 |
+
test_input = img_paths[:1] # File paths for NVJPEG
|
| 329 |
+
else:
|
| 330 |
+
test_input = pil_imgs[:1] # PIL for CPU preprocessing
|
| 331 |
+
|
| 332 |
+
# Warmup
|
| 333 |
+
for _ in range(3):
|
| 334 |
+
model.inference(test_input, process_res=518)
|
| 335 |
+
sync_device(device)
|
| 336 |
+
|
| 337 |
+
# Benchmark
|
| 338 |
+
times = []
|
| 339 |
+
for _ in range(runs):
|
| 340 |
+
sync_device(device)
|
| 341 |
+
start = time.perf_counter()
|
| 342 |
+
model.inference(test_input, process_res=518)
|
| 343 |
+
sync_device(device)
|
| 344 |
+
times.append((time.perf_counter() - start) * 1000)
|
| 345 |
+
|
| 346 |
+
results[model_name][config.name] = {
|
| 347 |
+
"result": BenchmarkResult.from_times(times, batch_size=1),
|
| 348 |
+
"config": config,
|
| 349 |
+
}
|
| 350 |
+
|
| 351 |
+
del model
|
| 352 |
+
cleanup()
|
| 353 |
+
|
| 354 |
+
finally:
|
| 355 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 356 |
+
|
| 357 |
+
return results
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
# ============================================================================
|
| 361 |
+
# DISPLAY FUNCTIONS
|
| 362 |
+
# ============================================================================
|
| 363 |
+
|
| 364 |
+
def print_header(title: str):
|
| 365 |
+
"""Print section header."""
|
| 366 |
+
print()
|
| 367 |
+
print(colored("═" * 70, Style.CYAN))
|
| 368 |
+
print(colored("║", Style.CYAN) + colored(f" {title}", Style.BOLD).center(77) + colored("║", Style.CYAN))
|
| 369 |
+
print(colored("═" * 70, Style.CYAN))
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def print_subheader(title: str):
|
| 373 |
+
"""Print subsection header."""
|
| 374 |
+
print()
|
| 375 |
+
print(colored(f"▶ {title}", Style.YELLOW, bold=True))
|
| 376 |
+
print(colored("─" * 70, Style.DIM))
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def format_speedup(speedup: float) -> str:
|
| 380 |
+
"""Format speedup with color."""
|
| 381 |
+
if speedup >= 1.5:
|
| 382 |
+
return colored(f"{speedup:.2f}x", Style.GREEN, bold=True)
|
| 383 |
+
elif speedup >= 1.1:
|
| 384 |
+
return colored(f"{speedup:.2f}x", Style.GREEN)
|
| 385 |
+
elif speedup >= 0.95:
|
| 386 |
+
return f"{speedup:.2f}x"
|
| 387 |
+
else:
|
| 388 |
+
return colored(f"{speedup:.2f}x", Style.RED)
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def print_preprocessing_results(results: Dict, device: torch.device):
|
| 392 |
+
"""Print preprocessing benchmark results."""
|
| 393 |
+
print_subheader("PREPROCESSING (4 images batch)")
|
| 394 |
+
|
| 395 |
+
has_gpu = any("gpu" in r for r in results.values())
|
| 396 |
+
|
| 397 |
+
if has_gpu:
|
| 398 |
+
print(f" {'Resolution':<12} {'CPU (PIL)':<14} {'GPU (NVJPEG)':<14} {'Speedup':<10}")
|
| 399 |
+
print(f" {'-'*50}")
|
| 400 |
+
|
| 401 |
+
for name, data in results.items():
|
| 402 |
+
cpu_ms = data["cpu"].mean_ms
|
| 403 |
+
if "gpu" in data:
|
| 404 |
+
gpu_ms = data["gpu"].mean_ms
|
| 405 |
+
speedup = cpu_ms / gpu_ms
|
| 406 |
+
print(f" {name:<12} {cpu_ms:>8.1f} ms {gpu_ms:>8.1f} ms {format_speedup(speedup)}")
|
| 407 |
+
else:
|
| 408 |
+
print(f" {name:<12} {cpu_ms:>8.1f} ms {'N/A':<14}")
|
| 409 |
+
else:
|
| 410 |
+
print(f" {'Resolution':<12} {'CPU (PIL)':<14}")
|
| 411 |
+
print(f" {'-'*30}")
|
| 412 |
+
for name, data in results.items():
|
| 413 |
+
cpu_ms = data["cpu"].mean_ms
|
| 414 |
+
print(f" {name:<12} {cpu_ms:>8.1f} ms")
|
| 415 |
+
|
| 416 |
+
# Summary
|
| 417 |
+
if has_gpu:
|
| 418 |
+
speedups = []
|
| 419 |
+
for data in results.values():
|
| 420 |
+
if "gpu" in data:
|
| 421 |
+
speedups.append(data["cpu"].mean_ms / data["gpu"].mean_ms)
|
| 422 |
+
if speedups:
|
| 423 |
+
avg = np.mean(speedups)
|
| 424 |
+
print()
|
| 425 |
+
print(f" {colored('→', Style.GREEN)} GPU preprocessing avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster")
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def print_attention_results(results: Dict, device: torch.device):
|
| 429 |
+
"""Print attention benchmark results."""
|
| 430 |
+
print_subheader("ATTENTION (per layer forward pass)")
|
| 431 |
+
|
| 432 |
+
print(f" {'Config':<18} {'SDPA':<12} {'Manual':<12} {'Speedup':<10}")
|
| 433 |
+
print(f" {'-'*52}")
|
| 434 |
+
|
| 435 |
+
for name, data in results.items():
|
| 436 |
+
sdpa_ms = data["sdpa"].mean_ms
|
| 437 |
+
manual_ms = data["manual"].mean_ms
|
| 438 |
+
speedup = manual_ms / sdpa_ms
|
| 439 |
+
print(f" {name:<18} {sdpa_ms:>6.3f} ms {manual_ms:>6.3f} ms {format_speedup(speedup)}")
|
| 440 |
+
|
| 441 |
+
# Summary
|
| 442 |
+
speedups = [d["manual"].mean_ms / d["sdpa"].mean_ms for d in results.values()]
|
| 443 |
+
avg = np.mean(speedups)
|
| 444 |
+
print()
|
| 445 |
+
print(f" {colored('→', Style.GREEN)} SDPA avg {colored(f'{avg:.1f}x', Style.GREEN, bold=True)} faster than manual")
|
| 446 |
+
|
| 447 |
+
# Check Flash SDP
|
| 448 |
+
if device.type == "cuda":
|
| 449 |
+
from torch.backends.cuda import flash_sdp_enabled
|
| 450 |
+
if flash_sdp_enabled():
|
| 451 |
+
print(f" {colored('→', Style.GREEN)} Flash Attention: {colored('ENABLED', Style.GREEN, bold=True)} (PyTorch native)")
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def print_inference_matrix(results: Dict, device: torch.device):
|
| 455 |
+
"""Print inference benchmark matrix."""
|
| 456 |
+
print_subheader("END-TO-END INFERENCE (720p input, batch=1)")
|
| 457 |
+
|
| 458 |
+
configs = get_optimization_configs(device)
|
| 459 |
+
|
| 460 |
+
# Header
|
| 461 |
+
header = f" {'Model':<12}"
|
| 462 |
+
for cfg in configs:
|
| 463 |
+
header += f" {cfg.short_name:<14}"
|
| 464 |
+
header += " Best"
|
| 465 |
+
print(header)
|
| 466 |
+
print(f" {'-'*(14 + 15*len(configs) + 6)}")
|
| 467 |
+
|
| 468 |
+
# Results per model
|
| 469 |
+
for model_name, model_results in results.items():
|
| 470 |
+
row = f" {model_name:<12}"
|
| 471 |
+
|
| 472 |
+
best_fps = 0
|
| 473 |
+
best_config = None
|
| 474 |
+
worst_fps = float('inf')
|
| 475 |
+
|
| 476 |
+
for cfg in configs:
|
| 477 |
+
if cfg.name in model_results:
|
| 478 |
+
result = model_results[cfg.name]["result"]
|
| 479 |
+
fps = result.fps
|
| 480 |
+
row += f" {fps:>6.1f} img/s "
|
| 481 |
+
|
| 482 |
+
if fps > best_fps:
|
| 483 |
+
best_fps = fps
|
| 484 |
+
best_config = cfg
|
| 485 |
+
if fps < worst_fps:
|
| 486 |
+
worst_fps = fps
|
| 487 |
+
else:
|
| 488 |
+
row += f" {'N/A':<14}"
|
| 489 |
+
|
| 490 |
+
# Best indicator
|
| 491 |
+
if best_config:
|
| 492 |
+
row += f" {colored(best_config.short_name, Style.GREEN, bold=True)}"
|
| 493 |
+
|
| 494 |
+
print(row)
|
| 495 |
+
|
| 496 |
+
# Summary
|
| 497 |
+
print()
|
| 498 |
+
print(f" {Style.DIM}Legend: GPU=NVJPEG decode, CPU=PIL decode, SDPA=Flash Attention{Style.RESET}")
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def print_device_summary(
|
| 502 |
+
device: torch.device,
|
| 503 |
+
preproc_results: Dict,
|
| 504 |
+
attn_results: Dict,
|
| 505 |
+
infer_results: Dict,
|
| 506 |
+
):
|
| 507 |
+
"""Print summary for a device."""
|
| 508 |
+
print()
|
| 509 |
+
print(colored("─" * 70, Style.CYAN))
|
| 510 |
+
print(colored(f" {device.type.upper()} - OPTIMIZATION SUMMARY", Style.BOLD))
|
| 511 |
+
print(colored("─" * 70, Style.CYAN))
|
| 512 |
+
|
| 513 |
+
# Best configuration
|
| 514 |
+
if infer_results:
|
| 515 |
+
print()
|
| 516 |
+
print(f" {colored('Best configuration per model:', Style.CYAN)}")
|
| 517 |
+
|
| 518 |
+
for model_name, model_results in infer_results.items():
|
| 519 |
+
if not model_results:
|
| 520 |
+
continue
|
| 521 |
+
|
| 522 |
+
best_name = max(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
|
| 523 |
+
best = model_results[best_name]
|
| 524 |
+
worst_name = min(model_results.keys(), key=lambda k: model_results[k]["result"].fps)
|
| 525 |
+
worst = model_results[worst_name]
|
| 526 |
+
|
| 527 |
+
speedup = best["result"].fps / worst["result"].fps if worst["result"].fps > 0 else 1
|
| 528 |
+
|
| 529 |
+
print(f" {model_name:<12} {colored(best['config'].description, Style.GREEN)}")
|
| 530 |
+
print(f" {'':<12} {best['result'].fps:.1f} img/s ({speedup:.1f}x vs worst)")
|
| 531 |
+
|
| 532 |
+
# Recommendations
|
| 533 |
+
print()
|
| 534 |
+
print(f" {colored('Recommendations:', Style.CYAN)}")
|
| 535 |
+
|
| 536 |
+
if device.type == "cuda":
|
| 537 |
+
print(f" ✓ Use {colored('GPU preprocessing (NVJPEG)', Style.GREEN)} for file inputs")
|
| 538 |
+
print(f" ✓ {colored('SDPA (Flash Attention)', Style.GREEN)} is enabled by default")
|
| 539 |
+
print(f" ✓ Pass file paths (not PIL images) to leverage NVJPEG")
|
| 540 |
+
elif device.type == "mps":
|
| 541 |
+
print(f" ✓ Use {colored('CPU preprocessing', Style.GREEN)} (faster than GPU on MPS)")
|
| 542 |
+
print(f" ✓ {colored('SDPA', Style.GREEN)} provides moderate speedup")
|
| 543 |
+
else:
|
| 544 |
+
print(f" ✓ {colored('SDPA', Style.GREEN)} provides speedup over manual attention")
|
| 545 |
+
print(f" ○ Consider using GPU (CUDA/MPS) for better performance")
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
# ============================================================================
|
| 549 |
+
# MAIN
|
| 550 |
+
# ============================================================================
|
| 551 |
+
|
| 552 |
+
def main():
|
| 553 |
+
parser = argparse.ArgumentParser(
|
| 554 |
+
description="DA3 Full Benchmark - Test all optimization combinations",
|
| 555 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 556 |
+
epilog="""
|
| 557 |
+
Examples:
|
| 558 |
+
python benchmarks/full_benchmark.py # Best device only
|
| 559 |
+
python benchmarks/full_benchmark.py -d all # All devices
|
| 560 |
+
python benchmarks/full_benchmark.py -d cuda # CUDA only
|
| 561 |
+
python benchmarks/full_benchmark.py --quick # Quick mode (fewer runs)
|
| 562 |
+
python benchmarks/full_benchmark.py --models da3-small da3-large
|
| 563 |
+
"""
|
| 564 |
+
)
|
| 565 |
+
parser.add_argument("--quick", action="store_true", help="Quick mode (fewer runs)")
|
| 566 |
+
parser.add_argument("--skip-preprocessing", action="store_true", help="Skip preprocessing benchmark")
|
| 567 |
+
parser.add_argument("--skip-attention", action="store_true", help="Skip attention benchmark")
|
| 568 |
+
parser.add_argument("--skip-inference", action="store_true", help="Skip inference benchmark")
|
| 569 |
+
parser.add_argument("-d", "--device", type=str, default="auto",
|
| 570 |
+
choices=["auto", "cpu", "mps", "cuda", "all"],
|
| 571 |
+
help="Device to benchmark (default: auto)")
|
| 572 |
+
parser.add_argument("--models", nargs="+", default=None,
|
| 573 |
+
help="Models to benchmark (default: all)")
|
| 574 |
+
args = parser.parse_args()
|
| 575 |
+
|
| 576 |
+
# Configure runs
|
| 577 |
+
runs_preproc = 3 if args.quick else 5
|
| 578 |
+
runs_attn = 5 if args.quick else 10
|
| 579 |
+
runs_infer = 2 if args.quick else 4
|
| 580 |
+
|
| 581 |
+
# Determine models
|
| 582 |
+
if args.models:
|
| 583 |
+
models = args.models
|
| 584 |
+
elif args.quick:
|
| 585 |
+
models = ["da3-small", "da3-large"]
|
| 586 |
+
else:
|
| 587 |
+
models = ["da3-small", "da3-base", "da3-large"]
|
| 588 |
+
|
| 589 |
+
# Determine devices
|
| 590 |
+
available_devices = get_available_devices()
|
| 591 |
+
if args.device == "auto":
|
| 592 |
+
devices_to_test = [available_devices[-1]] # Best available
|
| 593 |
+
elif args.device == "all":
|
| 594 |
+
devices_to_test = available_devices
|
| 595 |
+
else:
|
| 596 |
+
requested = torch.device(args.device)
|
| 597 |
+
if requested in available_devices:
|
| 598 |
+
devices_to_test = [requested]
|
| 599 |
+
else:
|
| 600 |
+
print(f"Error: Device '{args.device}' not available.")
|
| 601 |
+
print(f"Available: {[d.type for d in available_devices]}")
|
| 602 |
+
return
|
| 603 |
+
|
| 604 |
+
# Main header
|
| 605 |
+
print()
|
| 606 |
+
print(colored("╔" + "═" * 68 + "╗", Style.CYAN))
|
| 607 |
+
print(colored("║", Style.CYAN) + colored(" DEPTH ANYTHING 3 - FULL BENCHMARK", Style.BOLD).center(77) + colored("║", Style.CYAN))
|
| 608 |
+
print(colored("║", Style.CYAN) + colored(" All Optimization Combinations", Style.DIM).center(77) + colored("║", Style.CYAN))
|
| 609 |
+
print(colored("╚" + "═" * 68 + "╝", Style.CYAN))
|
| 610 |
+
|
| 611 |
+
print(f"\n {Style.DIM}PyTorch{Style.RESET} : {colored(torch.__version__, Style.CYAN)}")
|
| 612 |
+
print(f" {Style.DIM}Models{Style.RESET} : {colored(', '.join(models), Style.CYAN)}")
|
| 613 |
+
print(f" {Style.DIM}Mode{Style.RESET} : {colored('Quick' if args.quick else 'Full', Style.CYAN)}")
|
| 614 |
+
|
| 615 |
+
print(f"\n {Style.DIM}Available devices:{Style.RESET}")
|
| 616 |
+
for d in available_devices:
|
| 617 |
+
status = colored("●", Style.GREEN) if d in devices_to_test else colored("○", Style.DIM)
|
| 618 |
+
print(f" {status} {d.type.upper():<6} {get_device_name(d)}")
|
| 619 |
+
|
| 620 |
+
all_results = {}
|
| 621 |
+
|
| 622 |
+
# Run benchmarks for each device
|
| 623 |
+
for device in devices_to_test:
|
| 624 |
+
device_name = get_device_name(device)
|
| 625 |
+
all_results[device.type] = {}
|
| 626 |
+
|
| 627 |
+
print_header(f"{device.type.upper()} - {device_name}")
|
| 628 |
+
|
| 629 |
+
# 1. Preprocessing
|
| 630 |
+
preproc_results = {}
|
| 631 |
+
if not args.skip_preprocessing and device.type != "cpu":
|
| 632 |
+
preproc_results = benchmark_preprocessing_detailed(device, runs=runs_preproc)
|
| 633 |
+
all_results[device.type]["preprocessing"] = preproc_results
|
| 634 |
+
print_preprocessing_results(preproc_results, device)
|
| 635 |
+
elif device.type == "cpu":
|
| 636 |
+
print_subheader("PREPROCESSING")
|
| 637 |
+
print(f" {Style.DIM}Skipped (CPU only - no GPU comparison){Style.RESET}")
|
| 638 |
+
|
| 639 |
+
# 2. Attention
|
| 640 |
+
attn_results = {}
|
| 641 |
+
if not args.skip_attention:
|
| 642 |
+
attn_results = benchmark_attention_detailed(device, runs=runs_attn)
|
| 643 |
+
all_results[device.type]["attention"] = attn_results
|
| 644 |
+
print_attention_results(attn_results, device)
|
| 645 |
+
|
| 646 |
+
# 3. Inference Matrix
|
| 647 |
+
infer_results = {}
|
| 648 |
+
if not args.skip_inference:
|
| 649 |
+
infer_results = benchmark_inference_matrix(device, models, runs=runs_infer)
|
| 650 |
+
all_results[device.type]["inference"] = infer_results
|
| 651 |
+
print_inference_matrix(infer_results, device)
|
| 652 |
+
|
| 653 |
+
# Device Summary
|
| 654 |
+
print_device_summary(device, preproc_results, attn_results, infer_results)
|
| 655 |
+
|
| 656 |
+
cleanup()
|
| 657 |
+
|
| 658 |
+
# Cross-device comparison
|
| 659 |
+
if len(devices_to_test) > 1 and not args.skip_inference:
|
| 660 |
+
print_header("CROSS-DEVICE COMPARISON")
|
| 661 |
+
|
| 662 |
+
# Find common model
|
| 663 |
+
common_model = models[-1] # Usually largest tested
|
| 664 |
+
|
| 665 |
+
print()
|
| 666 |
+
print(f" {colored(f'{common_model} (best config per device):', Style.CYAN)}")
|
| 667 |
+
print(f" {'Device':<10} {'Config':<30} {'Performance':<15}")
|
| 668 |
+
print(f" {'-'*55}")
|
| 669 |
+
|
| 670 |
+
base_fps = None
|
| 671 |
+
for device in devices_to_test:
|
| 672 |
+
if device.type in all_results and "inference" in all_results[device.type]:
|
| 673 |
+
infer = all_results[device.type]["inference"].get(common_model, {})
|
| 674 |
+
if infer:
|
| 675 |
+
best_name = max(infer.keys(), key=lambda k: infer[k]["result"].fps)
|
| 676 |
+
best = infer[best_name]
|
| 677 |
+
fps = best["result"].fps
|
| 678 |
+
|
| 679 |
+
if base_fps is None:
|
| 680 |
+
base_fps = fps
|
| 681 |
+
|
| 682 |
+
speedup = fps / base_fps if base_fps else 1
|
| 683 |
+
speedup_str = f"({speedup:.1f}x)" if device != devices_to_test[0] else "(baseline)"
|
| 684 |
+
|
| 685 |
+
print(f" {device.type.upper():<10} {best['config'].description:<30} {fps:>5.1f} img/s {speedup_str}")
|
| 686 |
+
|
| 687 |
+
# Final summary
|
| 688 |
+
print()
|
| 689 |
+
print(colored("═" * 70, Style.CYAN))
|
| 690 |
+
print(colored("║", Style.CYAN) + colored(" BENCHMARK COMPLETE", Style.BOLD).center(77) + colored("║", Style.CYAN))
|
| 691 |
+
print(colored("═" * 70, Style.CYAN))
|
| 692 |
+
print()
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
if __name__ == "__main__":
|
| 696 |
+
main()
|
benchmarks/gpu_preprocessing_benchmark.py
ADDED
@@ -0,0 +1,363 @@
#!/usr/bin/env python3
# Copyright (c) 2025 Delanoe Pirard
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
GPU Preprocessing Benchmark

Compares CPU vs GPU preprocessing performance across different image sizes.
Measures:
- Preprocessing time only
- Total inference time (preprocessing + model forward)
- Memory usage
- Speedup percentages
"""

import os
import shutil
import time
from typing import List, Tuple

import numpy as np
import torch
from PIL import Image

from depth_anything_3.utils.io.input_processor import InputProcessor
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor


def create_test_files(
    sizes: List[Tuple[int, int, str]], count: int = 4, temp_dir: str = "temp_bench_imgs"
) -> Tuple[List[List[str]], str]:
    """Create test image files on disk.

    Args:
        sizes: List of (width, height, description) tuples
        count: Number of images per size
        temp_dir: Directory to save images

    Returns:
        List of image path batches (one per size) and the path to the temp directory.
    """
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)

    batches = []
    for w, h, _ in sizes:
        batch = []
        for i in range(count):
            img = Image.new("RGB", (w, h), color=(i * 50, 100, 150))
            fname = f"{temp_dir}/{w}x{h}_{i}.jpg"
            img.save(fname, quality=95, subsampling=0)
            batch.append(fname)
        batches.append(batch)
    return batches, temp_dir


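# Note (added for readability): the GPU-decode path below hands file paths
# straight to GPUInputProcessor, which, per the "NVJPEG + Kornia" note in
# main(), decodes JPEGs on-device and resizes with Kornia. This skips the
# CPU-side PIL decode, so only the small compressed bytes cross the
# host-to-device link instead of full-resolution pixel buffers.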
def benchmark_gpu_decode_files(
    processor,
    image_paths: List[str],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
) -> float:
    """Benchmark GPU decoding (from file path)."""
    # Warmup
    for _ in range(warmup_runs):
        processor(
            image=image_paths,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

    # Benchmark
    times = []
    for _ in range(benchmark_runs):
        if hasattr(processor, 'device') and processor.device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()
        # Pass file paths directly to GPUInputProcessor
        tensor, _, _ = processor(
            image=image_paths,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

        if hasattr(processor, 'device') and processor.device.type == "cuda":
            torch.cuda.synchronize()

        elapsed = time.perf_counter() - start
        times.append(elapsed)

    return np.mean(times)


def create_test_images(sizes: List[Tuple[int, int]], count: int = 4) -> List[List[Image.Image]]:
    """Create test images for each size.

    Args:
        sizes: List of (width, height) tuples
        count: Number of images per size

    Returns:
        List of image batches, one per size
    """
    batches = []
    for w, h in sizes:
        batch = [Image.new("RGB", (w, h), color=(i * 50, 100, 150)) for i in range(count)]
        batches.append(batch)
    return batches


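# Note (added for readability): the "hybrid" pipeline below keeps the resize on
# the CPU (uint8 output) and only offloads normalization to the GPU. Moving
# uint8 instead of float32 transfers 4x less data host-to-device; the /255
# scaling and mean/std normalization then run on-device.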
def benchmark_hybrid(
    processor,
    images: List[Image.Image],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
    device=torch.device("cuda"),
) -> float:
    """Benchmark hybrid preprocessing (CPU resize -> GPU normalize)."""
    # Warmup
    for _ in range(warmup_runs):
        imgs_cpu, _, _ = processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
            perform_normalization=False,
        )
        imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
        _ = InputProcessor.normalize_tensor(imgs_gpu, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # Benchmark
    times = []
    for _ in range(benchmark_runs):
        if device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()

        # 1. CPU preprocessing (uint8)
        imgs_cpu, _, _ = processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
            perform_normalization=False,
        )

        # 2. Transfer + normalize
        imgs_gpu = imgs_cpu.to(device, non_blocking=True).float() / 255.0
        _ = InputProcessor.normalize_tensor(imgs_gpu, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        if device.type == "cuda":
            torch.cuda.synchronize()

        elapsed = time.perf_counter() - start
        times.append(elapsed)

    return np.mean(times)


def benchmark_preprocessing(
    processor,
    images: List[Image.Image],
    process_res: int = 504,
    warmup_runs: int = 2,
    benchmark_runs: int = 10,
    num_workers: int = 8,
) -> float:
    """Benchmark preprocessing performance.

    Args:
        processor: InputProcessor or GPUInputProcessor instance
        images: List of test images
        process_res: Processing resolution
        warmup_runs: Number of warmup runs to discard
        benchmark_runs: Number of benchmark runs to average
        num_workers: Number of parallel workers (for CPU processor)

    Returns:
        Average preprocessing time in seconds
    """
    # Warmup
    for _ in range(warmup_runs):
        processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

    # Benchmark
    times = []
    for _ in range(benchmark_runs):
        if hasattr(processor, 'device') and processor.device.type == "cuda":
            torch.cuda.synchronize()

        start = time.perf_counter()
        tensor, _, _ = processor(
            image=images,
            process_res=process_res,
            process_res_method="upper_bound_resize",
            num_workers=num_workers,
        )

        if hasattr(processor, 'device') and processor.device.type == "cuda":
            torch.cuda.synchronize()

        elapsed = time.perf_counter() - start
        times.append(elapsed)

    return np.mean(times)


def print_results_table(results: List[dict]):
    """Pretty print benchmark results as table."""
    print("\n" + "=" * 140)
    print("GPU PREPROCESSING BENCHMARK RESULTS")
    print("=" * 140)
    print(f"{'Image Size':<15} {'CPU Time':<12} {'GPU Time':<12} {'Hybrid Time':<12} {'GPU Decode':<12} {'Best Method':<15}")
    print("-" * 140)

    for result in results:
        size_str = f"{result['width']}x{result['height']}"
        cpu_time = f"{result['cpu_time']*1000:.2f} ms"
        gpu_time = f"{result['gpu_time']*1000:.2f} ms"
        hybrid_time = f"{result['hybrid_time']*1000:.2f} ms"
        gpu_decode_time = f"{result['gpu_decode_time']*1000:.2f} ms"

        times = [result['cpu_time'], result['gpu_time'], result['hybrid_time'], result['gpu_decode_time']]
        labels = ["CPU", "GPU", "Hybrid", "GPU Decode"]
        best_idx = np.argmin(times)
        best = labels[best_idx]

        print(f"{size_str:<15} {cpu_time:<12} {gpu_time:<12} {hybrid_time:<12} {gpu_decode_time:<12} {best:<15}")

    print("=" * 140 + "\n")


def main():
    """Run comprehensive benchmark."""
    print("\n" + "=" * 100)
    print("INITIALIZING GPU PREPROCESSING BENCHMARK")
    print("=" * 100)

    # Check GPU availability
    if torch.cuda.is_available():
        device_name = "cuda"
        device_info = torch.cuda.get_device_name(0)
        print(f"✓ GPU Device: {device_info}")
        print("✓ GPU preprocessing: ENABLED (NVJPEG + Kornia)")
    elif torch.backends.mps.is_available():
        device_name = "mps"
        device_info = "Apple MPS"
        print(f"✓ GPU Device: {device_info}")
        print("ℹ GPU preprocessing: DISABLED on MPS (CPU is faster on Apple Silicon)")
        print(" → GPUInputProcessor will use CPU path automatically")
        print(" → GPU reserved for model inference (5-10x speedup there)")
    else:
        print("✗ No GPU available - benchmark will show CPU vs CPU (no speedup expected)")
        device_name = "cpu"
        device_info = "CPU only"

    device = torch.device(device_name)

    # Create processors
    cpu_proc = InputProcessor()
    gpu_proc = GPUInputProcessor(device=device_name)
    print(f"✓ Processors initialized: CPU vs {device_name.upper()}")

    # Test configurations
    # Format: (width, height, description)
    test_sizes = [
        (640, 480, "Small (VGA)"),
        (1280, 720, "Medium (HD)"),
        (1920, 1080, "Large (Full HD)"),
        (3840, 2160, "XLarge (4K)"),
    ]

    process_res = 504
    num_images = 4
    num_workers = 8

    print(f"✓ Test config: {num_images} images per batch, process_res={process_res}, num_workers={num_workers}")
    print(f"✓ Testing {len(test_sizes)} image sizes: {', '.join([desc for _, _, desc in test_sizes])}")

    # Create test images
    print("\nGenerating test images (PIL & Files)...")
    image_batches_pil = create_test_images([(w, h) for w, h, _ in test_sizes], count=num_images)
    image_batches_files, temp_dir = create_test_files(test_sizes, count=num_images)
    print("✓ Test images generated")

    # Run benchmarks
    print("\nRunning benchmarks (this may take a minute)...\n")
    results = []

    try:
        for (w, h, desc), imgs_pil, imgs_files in zip(test_sizes, image_batches_pil, image_batches_files):
            print(f"Benchmarking {desc} ({w}x{h})...", end=" ", flush=True)

            cpu_time = benchmark_preprocessing(cpu_proc, imgs_pil, process_res, num_workers=num_workers)
            gpu_time = benchmark_preprocessing(gpu_proc, imgs_pil, process_res, num_workers=num_workers)
            hybrid_time = benchmark_hybrid(cpu_proc, imgs_pil, process_res, num_workers=num_workers, device=device)

            # GPU Decode uses file paths
            gpu_decode_time = benchmark_gpu_decode_files(gpu_proc, imgs_files, process_res, num_workers=num_workers)

            results.append({
                'width': w,
                'height': h,
                'description': desc,
                'cpu_time': cpu_time,
                'gpu_time': gpu_time,
                'hybrid_time': hybrid_time,
                'gpu_decode_time': gpu_decode_time,
            })

            best_time = min(cpu_time, gpu_time, hybrid_time, gpu_decode_time)
            if best_time == gpu_decode_time:
                win = "GPU Decode"
            elif best_time == hybrid_time:
                win = "Hybrid"
            elif best_time == gpu_time:
                win = "GPU"
            else:
                win = "CPU"

            print(f"✓ Best: {win}")

        # Print results table
        print_results_table(results)

        # Memory info (CUDA only)
        if device_name == "cuda":
            print("\nGPU Memory Usage:")
            print(f" Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.1f} MB")
            print(f" Cached: {torch.cuda.memory_reserved(0) / 1024**2:.1f} MB")

    finally:
        # Cleanup
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            print(f"\n✓ Cleaned up temp directory: {temp_dir}")


if __name__ == "__main__":
    main()
benchmarks/results/temp_images/test_image_0000.jpg
ADDED
Git LFS Details
benchmarks/results/temp_images/test_image_0001.jpg
ADDED
Git LFS Details
benchmarks/results/temp_images/test_image_0002.jpg
ADDED
Git LFS Details
benchmarks/results/temp_images/test_image_0003.jpg
ADDED
Git LFS Details
docs/API.md
ADDED
@@ -0,0 +1,465 @@
# 📚 DepthAnything3 API Documentation

## 📑 Table of Contents

1. [📖 Overview](#overview)
2. [💡 Usage Examples](#usage-examples)
3. [🔧 Core API](#core-api)
   - [DepthAnything3 Class](#depthanything3-class)
   - [inference() Method](#inference-method)
4. [⚙️ Parameters](#parameters)
   - [Input Parameters](#input-parameters)
   - [Pose Alignment Parameters](#pose-alignment-parameters)
   - [Feature Export Parameters](#feature-export-parameters)
   - [Rendering Parameters](#rendering-parameters)
   - [Processing Parameters](#processing-parameters)
   - [Export Parameters](#export-parameters)
5. [📤 Export Formats](#export-formats)
6. [↩️ Return Value](#return-value)

## 📖 Overview

This documentation provides a comprehensive API reference for DepthAnything3, including usage examples, parameter specifications, export formats, and advanced features. It covers both basic pose and depth estimation workflows and advanced pose-conditioned processing with multiple export capabilities.

## 💡 Usage Examples

Here are quick examples to get you started:

### 🚀 Basic Depth Estimation
```python
from depth_anything_3.api import DepthAnything3

# Initialize and run inference
model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE").to("cuda")
prediction = model.inference(["image1.jpg", "image2.jpg"])
```

### 📷 Pose-Conditioned Depth Estimation
```python
import numpy as np

# With camera parameters for better consistency
prediction = model.inference(
    image=["image1.jpg", "image2.jpg"],
    extrinsics=extrinsics_array,  # (N, 4, 4)
    intrinsics=intrinsics_array,  # (N, 3, 3)
)
```

### 📤 Export Results
```python
# Export depth data and 3D visualization
prediction = model.inference(
    image=image_paths,
    export_dir="./output",
    export_format="mini_npz-glb",
)
```

### 🔍 Feature Extraction
```python
# Export intermediate features from specific layers
prediction = model.inference(
    image=image_paths,
    export_dir="./output",
    export_format="feat_vis",
    export_feat_layers=[0, 1, 2],  # Export features from layers 0, 1, 2
)
```

### ✨ Advanced Export with Gaussian Splatting
```python
# Export multiple formats including Gaussian Splatting
# Note: infer_gs=True requires the da3-giant or da3nested-giant-large model
model = DepthAnything3(model_name="da3-giant").to("cuda")

prediction = model.inference(
    image=image_paths,
    extrinsics=extrinsics_array,
    intrinsics=intrinsics_array,
    export_dir="./output",
    export_format="npz-glb-gs_ply-gs_video",
    align_to_input_ext_scale=True,
    infer_gs=True,  # Required for gs_ply and gs_video exports
)
```

### 🎨 Advanced Export with Feature Visualization
```python
# Export with intermediate feature visualization
prediction = model.inference(
    image=image_paths,
    export_dir="./output",
    export_format="mini_npz-glb-depth_vis-feat_vis",
    export_feat_layers=[0, 5, 10, 15, 20],
    feat_vis_fps=30,
)
```

### 📐 Using Ray-Based Pose Estimation
```python
# Use ray-based pose estimation instead of the camera decoder
prediction = model.inference(
    image=image_paths,
    export_dir="./output",
    export_format="glb",
    use_ray_pose=True,  # Enable ray-based pose estimation
)
```

### 🎯 Reference View Selection
```python
# For multi-view inputs, automatically select the best reference view
prediction = model.inference(
    image=image_paths,
    ref_view_strategy="saddle_balanced",  # Default: balanced selection
)

# For video sequences, use the middle frame as reference
prediction = model.inference(
    image=video_frames,
    ref_view_strategy="middle",  # Good for temporally ordered inputs
)
```

## 🔧 Core API

### 🔨 DepthAnything3 Class

The main API class that provides depth estimation capabilities with optional pose conditioning.

#### 🎯 Initialization

```python
from depth_anything_3 import DepthAnything3

# Initialize the model with a model name
model = DepthAnything3(model_name="da3-large")
model = model.to("cuda")  # Move to GPU
```

**Parameters:**
- `model_name` (str, default: "da3-large"): The name of the model preset to use.
- **Available models:**
  - 🦾 `"da3-giant"` - 1.15B params, any-view model with GS support
  - ⭐ `"da3-large"` - 0.35B params, any-view model (recommended for most use cases)
  - 📦 `"da3-base"` - 0.12B params, any-view model
  - 🪶 `"da3-small"` - 0.08B params, any-view model
  - 👁️ `"da3mono-large"` - 0.35B params, monocular depth only
  - 📏 `"da3metric-large"` - 0.35B params, metric depth with sky segmentation
  - 🎯 `"da3nested-giant-large"` - 1.40B params, nested model with all features

### 🚀 inference() Method

The primary inference method that processes images and returns depth predictions.

```python
prediction = model.inference(
    image=image_list,
    extrinsics=extrinsics_array,  # Optional
    intrinsics=intrinsics_array,  # Optional
    align_to_input_ext_scale=True,  # Whether to align predicted poses to input scale
    infer_gs=True,  # Enable Gaussian branch for gs exports
    use_ray_pose=False,  # Use ray-based pose estimation instead of camera decoder
    ref_view_strategy="saddle_balanced",  # Reference view selection strategy
    render_exts=render_extrinsics,  # Optional renders for gs_video
    render_ixts=render_intrinsics,  # Optional renders for gs_video
    render_hw=(height, width),  # Optional renders for gs_video
    process_res=504,
    process_res_method="upper_bound_resize",
    export_dir="output_directory",  # Optional
    export_format="mini_npz",
    export_feat_layers=[],  # List of layer indices to export features from
    conf_thresh_percentile=40.0,  # Confidence threshold percentile for depth map in GLB export
    num_max_points=1_000_000,  # Maximum number of points to export in GLB export
    show_cameras=True,  # Whether to show cameras in GLB export
    feat_vis_fps=15,  # Frames per second for feature visualization in feat_vis export
    export_kwargs={},  # Optional extra arguments for export functions, keyed export_format -> key -> value; see 'Parameters / Export Parameters'
)
```

## ⚙️ Parameters

### 📸 Input Parameters

#### `image` (required)
- **Type**: `List[Union[np.ndarray, Image.Image, str]]`
- **Description**: List of input images. Can be numpy arrays, PIL Images, or file paths.
- **Example**:
  ```python
  # From file paths
  image = ["image1.jpg", "image2.jpg", "image3.jpg"]

  # From numpy arrays
  image = [np.array(img1), np.array(img2)]

  # From PIL Images
  image = [Image.open("image1.jpg"), Image.open("image2.jpg")]
  ```

#### `extrinsics` (optional)
- **Type**: `Optional[np.ndarray]`
- **Shape**: `(N, 4, 4)` where N is the number of input images
- **Description**: Camera extrinsic matrices (world-to-camera transformation). When provided, enables pose-conditioned depth estimation mode.
- **Note**: If not provided, the model operates in standard depth estimation mode.

#### `intrinsics` (optional)
- **Type**: `Optional[np.ndarray]`
- **Shape**: `(N, 3, 3)` where N is the number of input images
- **Description**: Camera intrinsic matrices containing focal length and principal point information. When provided, enables pose-conditioned depth estimation mode.

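For orientation, here is a minimal sketch of how such arrays can be assembled with NumPy. The focal lengths, principal point, and identity poses below are illustrative placeholders, not values from the library, and `model` is assumed from the earlier examples:

```python
import numpy as np

N = 2  # number of input views

# Intrinsics: one 3x3 matrix per view, K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
fx, fy, cx, cy = 500.0, 500.0, 320.0, 240.0  # illustrative values
intrinsics = np.tile(
    np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float64), (N, 1, 1)
)

# Extrinsics: one 4x4 world-to-camera matrix per view (identity = camera at origin)
extrinsics = np.tile(np.eye(4, dtype=np.float64), (N, 1, 1))

prediction = model.inference(
    image=["image1.jpg", "image2.jpg"],
    extrinsics=extrinsics,  # (N, 4, 4)
    intrinsics=intrinsics,  # (N, 3, 3)
)
```
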
### 🎯 Pose Alignment Parameters

#### `align_to_input_ext_scale` (default: True)
- **Type**: `bool`
- **Description**: When True, the predicted extrinsics are replaced with the input ones and the depth maps are rescaled to match their metric scale. When False, the function returns the internally aligned poses computed via Umeyama alignment.

#### `infer_gs` (default: False)
- **Type**: `bool`
- **Description**: Enable the Gaussian Splatting branch for Gaussian Splatting exports. Required when using the `gs_ply` or `gs_video` export formats.

#### `use_ray_pose` (default: False)
- **Type**: `bool`
- **Description**: Use ray-based pose estimation instead of the camera decoder for pose prediction. When True, the model uses ray prediction heads to estimate camera poses; when False, it uses the camera decoder approach.

#### `ref_view_strategy` (default: "saddle_balanced")
- **Type**: `str`
- **Description**: Strategy for selecting the reference view from multiple input views. Options: `"first"`, `"middle"`, `"saddle_balanced"`, `"saddle_sim_range"`. Only applied when the number of views ≥ 3. See the [detailed documentation](funcs/ref_view_strategy.md) for strategy comparisons.
- **Available strategies**:
  - `"saddle_balanced"`: Selects the view with balanced features across multiple metrics (recommended default)
  - `"saddle_sim_range"`: Selects the view with the largest similarity range
  - `"first"`: Always uses the first view (not recommended; equivalent to no reordering, as happens anyway for fewer than 3 views)
  - `"middle"`: Uses the middle view (recommended for video sequences)

### 🔍 Feature Export Parameters

#### `export_feat_layers` (default: [])
- **Type**: `List[int]`
- **Description**: List of layer indices to export intermediate features from. Features are stored in the `aux` dictionary of the Prediction object with keys like `feat_layer_0`, `feat_layer_1`, etc.

### 🎥 Rendering Parameters

These arguments are only used when exporting Gaussian-splatting videos (include `"gs_video"` in `export_format`). They describe an auxiliary camera trajectory with `M` views.

#### `render_exts` (optional)
- **Type**: `Optional[np.ndarray]`
- **Shape**: `(M, 4, 4)`
- **Description**: Camera extrinsics for the synthesized trajectory. If omitted, the exporter falls back to the predicted poses.

#### `render_ixts` (optional)
- **Type**: `Optional[np.ndarray]`
- **Shape**: `(M, 3, 3)`
- **Description**: Camera intrinsics for each rendered frame. Leave `None` to reuse the input intrinsics.

#### `render_hw` (optional)
- **Type**: `Optional[Tuple[int, int]]`
- **Description**: Explicit output resolution `(height, width)` for the rendered frames. Defaults to the input resolution when not provided.

### ⚡ Processing Parameters

#### `process_res` (default: 504)
- **Type**: `int`
- **Description**: Base resolution for processing. The model will resize images to this resolution for inference.

#### `process_res_method` (default: "upper_bound_resize")
- **Type**: `str`
- **Description**: Method for resizing images to the target resolution.
- **Options**:
  - `"upper_bound_resize"`: Resize so that the specified dimension (504) becomes the longer side
  - `"lower_bound_resize"`: Resize so that the specified dimension (504) becomes the shorter side
- **Example**:
  - Input: 1200×1600 → Output: 378×504 (with `process_res=504`, `process_res_method="upper_bound_resize"`)
  - Input: 504×672 → Output: 504×672 (no change needed)

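The arithmetic behind `upper_bound_resize` can be sketched as follows. This is an illustrative reimplementation consistent with the example above, not the library's internal code, which may apply additional constraints:

```python
def upper_bound_resize_hw(width: int, height: int, process_res: int = 504) -> tuple:
    """Scale so the longer side becomes process_res (no-op if already within bounds)."""
    longer = max(width, height)
    if longer <= process_res:
        return width, height
    scale = process_res / longer
    return round(width * scale), round(height * scale)

print(upper_bound_resize_hw(1200, 1600))  # (378, 504), matching the example above
```
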
### 📦 Export Parameters

#### `export_dir` (optional)
- **Type**: `Optional[str]`
- **Description**: Directory path where exported files will be saved. If not provided, no files will be exported.

#### `export_format` (default: "mini_npz")
- **Type**: `str`
- **Description**: Format for exporting results. Supports multiple formats separated by `-`.
- **Example**: `"mini_npz-glb"` exports both mini_npz and glb formats.

#### 🌐 GLB Export Parameters

These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"glb"`.

##### `conf_thresh_percentile` (default: 40.0)
- **Type**: `float`
- **Description**: Lower percentile for the adaptive confidence threshold. Points below this confidence percentile will be filtered out of the point cloud.

##### `num_max_points` (default: 1,000,000)
- **Type**: `int`
- **Description**: Maximum number of points in the exported point cloud. If the point cloud exceeds this limit, it will be downsampled.

##### `show_cameras` (default: True)
- **Type**: `bool`
- **Description**: Whether to include camera wireframes in the exported GLB file for visualization.

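To make `conf_thresh_percentile` concrete, here is a self-contained NumPy sketch of percentile-based filtering. It illustrates the idea only; random values stand in for a real confidence map, and this is not the exporter's actual code:

```python
import numpy as np

rng = np.random.default_rng(0)
conf = rng.random((2, 504, 378))          # stand-in for prediction.conf, shape (N, H, W)

threshold = np.percentile(conf, 40.0)     # conf_thresh_percentile=40.0
keep = conf >= threshold                  # boolean mask: lowest-confidence ~40% dropped
print(keep.mean())                        # ~0.6 of points survive
```
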
#### 🎨 Feature Visualization Parameters

These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"feat_vis"`.

##### `feat_vis_fps` (default: 15)
- **Type**: `int`
- **Description**: Frame rate for the output video when visualizing features across multiple images.

#### ✨🎥 3DGS and 3DGS Video Parameters

These parameters are passed directly to the `inference()` method and only apply when `export_format` includes `"gs_ply"` or `"gs_video"`.

##### `export_kwargs` (default: `{}`)
- Type: `dict[str, dict[str, Any]]`
- Description: Per-format extra arguments passed to export functions, mainly for `"gs_ply"` and `"gs_video"`.
- Access pattern: `export_kwargs[export_format][key] = value`
- Example:
  ```python
  {
      "gs_ply": {
          "gs_views_interval": 1,
      },
      "gs_video": {
          "trj_mode": "interpolate_smooth",
          "chunk_size": 1,
          "vis_depth": None,
      },
  }
  ```

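Putting it together, a dictionary like the one above is passed straight to `inference()`. A minimal sketch, using only keys and values documented on this page (paths are illustrative):

```python
prediction = model.inference(
    image=image_paths,
    export_dir="./output",
    export_format="gs_ply-gs_video",
    infer_gs=True,  # required for gs_ply / gs_video
    export_kwargs={
        "gs_ply": {"gs_views_interval": 1},
        "gs_video": {"trj_mode": "interpolate_smooth", "video_quality": "medium"},
    },
)
```
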
## 📤 Export Formats

The API supports multiple export formats for different use cases:

### 📊 `mini_npz`
- **Description**: Minimal NPZ format containing essential data
- **Contents**: `depth`, `conf`, `exts`, `ixts`
- **Use case**: Lightweight storage for depth data with camera parameters

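Since the archive is a standard NumPy `.npz`, it can be inspected directly. A small sketch; the exact file name written inside `export_dir` may differ:

```python
import numpy as np

data = np.load("output/scene.npz")         # illustrative path
depth = data["depth"]                      # (N, H, W) depth maps
conf = data["conf"]                        # (N, H, W) confidence
exts, ixts = data["exts"], data["ixts"]    # camera extrinsics / intrinsics
```
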
### 📦 `npz`
- **Description**: Full NPZ format with comprehensive data
- **Contents**: `depth`, `conf`, `exts`, `ixts`, `image`, etc.
- **Use case**: Complete data export for advanced processing

### 🌐 `glb`
- **Description**: 3D visualization format with point cloud and camera poses
- **Contents**:
  - Point cloud with colors from original images
  - Camera wireframes for visualization
  - Confidence-based filtering and downsampling
- **Use case**: 3D visualization, inspection, and analysis
- **Features**:
  - Automatic sky depth handling
  - Confidence threshold filtering
  - Background filtering (black/white)
  - Scene scale normalization
- **Parameters** (passed via the `inference()` method directly):
  - `conf_thresh_percentile` (float, default: 40.0): Lower percentile for the adaptive confidence threshold. Points below this confidence percentile will be filtered out.
  - `num_max_points` (int, default: 1,000,000): Maximum number of points in the exported point cloud. If exceeded, points will be downsampled.
  - `show_cameras` (bool, default: True): Whether to include camera wireframes in the exported GLB file for visualization.

### ✨ `gs_ply`
- **Description**: Gaussian Splatting point cloud format
- **Contents**: 3DGS data in PLY format. Compatible with standard 3DGS viewers such as [SuperSplat](https://superspl.at/editor) (recommended) and [SPARK](https://sparkjs.dev/viewer/).
- **Use case**: Gaussian Splatting reconstruction
- **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by the `da3-giant` and `da3nested-giant-large` models.
- **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
  - `gs_views_interval`: Export to 3DGS every N views. Default: `1`.

### 🎥 `gs_video`
- **Description**: Rasterized 3DGS to obtain videos
- **Contents**: A video of 3DGS-rasterized views using either provided viewpoints or a predefined camera trajectory.
- **Use case**: Video rendering for Gaussian Splatting
- **Requirements**: Must set `infer_gs=True` when calling `inference()`. Only supported by the `da3-giant` and `da3nested-giant-large` models.
- **Note**: Can optionally use the `render_exts`, `render_ixts`, and `render_hw` parameters of the `inference()` method to specify novel viewpoints.
- **Additional configs**, provided via `export_kwargs` (see [Export Parameters](#export-parameters)):
  - `extrinsics`: Optional world-to-camera poses for novel views. Falls back to the predicted poses of input views if not provided. (Alternatively, use the `render_exts` parameter in `inference()`.)
  - `intrinsics`: Optional camera intrinsics for novel views. Falls back to the predicted intrinsics of input views if not provided. (Alternatively, use the `render_ixts` parameter in `inference()`.)
  - `out_image_hw`: Optional output resolution `H x W`. Falls back to the input resolution if not provided. (Alternatively, use the `render_hw` parameter in `inference()`.)
  - `chunk_size`: Number of views rasterized per batch. Default: `8`.
  - `trj_mode`: Predefined camera trajectory for novel-view rendering.
  - `color_mode`: Same as `render_mode` in [gsplat](https://docs.gsplat.studio/main/apis/rasterization.html#gsplat.rasterization).
  - `vis_depth`: How depth is combined with RGB. Default: `hcat` (horizontal concatenation).
  - `enable_tqdm`: Whether to display a tqdm progress bar during rendering.
  - `output_name`: File name of the rendered video.
  - `video_quality`: Video quality to save. Default: `high`.
    - `high`: High quality video (default)
    - `medium`: Medium quality video (balances storage space and quality)
    - `low`: Low quality video (less storage space)

### 🔍 `feat_vis`
- **Description**: Feature visualization format
- **Contents**: PCA-visualized intermediate features from specified layers
- **Use case**: Model interpretability and feature analysis
- **Note**: Requires `export_feat_layers` to be specified
- **Parameters** (passed via the `inference()` method directly):
  - `feat_vis_fps` (int, default: 15): Frame rate for the output video when visualizing features across multiple images.

### 🎨 `depth_vis`
- **Description**: Depth visualization format
- **Contents**: Color-coded depth maps alongside original images
- **Use case**: Visual inspection of depth estimation quality

### 🔗 Multiple Format Export
You can export multiple formats simultaneously by separating them with `-`:

```python
# Export both mini_npz and glb formats
export_format = "mini_npz-glb"

# Export multiple formats
export_format = "npz-glb-gs_ply"
```

## ↩️ Return Value

The `inference()` method returns a `Prediction` object with the following attributes:

### 📊 Core Outputs

- **depth**: `np.ndarray` - Estimated depth maps with shape `(N, H, W)` where N is the number of images, H is height, and W is width.
- **conf**: `np.ndarray` - Confidence maps with shape `(N, H, W)` indicating prediction reliability (optional, depends on model).

### 📷 Camera Parameters

- **extrinsics**: `np.ndarray` - Camera extrinsic matrices with shape `(N, 3, 4)` representing world-to-camera transformations. Only present if camera poses were estimated or provided as input.
- **intrinsics**: `np.ndarray` - Camera intrinsic matrices with shape `(N, 3, 3)` containing focal length and principal point information. Only present if poses were estimated or provided as input.

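Together, `depth`, `intrinsics`, and `extrinsics` are enough to lift each pixel into 3D with the standard pinhole model. A compact NumPy sketch for one view, assuming the `(N, 3, 4)` world-to-camera convention documented above and a `prediction` from `model.inference(...)`:

```python
import numpy as np

def unproject(depth: np.ndarray, K: np.ndarray, w2c: np.ndarray) -> np.ndarray:
    """Lift one (H, W) depth map to world-space points via the pinhole model."""
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3)  # homogeneous pixels
    cam = (np.linalg.inv(K) @ pix.T) * depth.reshape(1, -1)          # camera-space rays * depth
    # w2c is (3, 4) world-to-camera: X_c = R X_w + t, so X_w = R^T (X_c - t)
    R, t = w2c[:, :3], w2c[:, 3]
    world = R.T @ (cam - t[:, None])
    return world.T.reshape(H, W, 3)

pts = unproject(prediction.depth[0], prediction.intrinsics[0], prediction.extrinsics[0])
```
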
### 🎁 Additional Outputs

- **processed_images**: `np.ndarray` - Preprocessed input images with shape `(N, H, W, 3)` in RGB format (0-255 uint8).
- **aux**: `dict` - Auxiliary outputs including:
  - `feat_layer_X`: Intermediate features from layer X (if `export_feat_layers` was specified)
  - `gaussians`: 3D Gaussian Splats data (if `infer_gs=True`)

### 💻 Usage Example

```python
prediction = model.inference(image=["img1.jpg", "img2.jpg"])

# Access depth maps
depth_maps = prediction.depth  # shape: (2, H, W)

# Access confidence
if hasattr(prediction, 'conf'):
    confidence = prediction.conf

# Access camera parameters (if available)
if hasattr(prediction, 'extrinsics'):
    camera_poses = prediction.extrinsics  # shape: (2, 3, 4)

if hasattr(prediction, 'intrinsics'):
    camera_intrinsics = prediction.intrinsics  # shape: (2, 3, 3)

# Access intermediate features (if export_feat_layers was set)
if hasattr(prediction, 'aux') and 'feat_layer_0' in prediction.aux:
    features = prediction.aux['feat_layer_0']
```
docs/CLI.md
ADDED
@@ -0,0 +1,654 @@
# 🚀 Depth Anything 3 Command Line Interface

## 📋 Table of Contents

- [📖 Overview](#overview)
- [⚡ Quick Start](#quick-start)
- [📚 Command Reference](#command-reference)
  - [🤖 auto - Auto Mode](#auto---auto-mode)
  - [🖼️ image - Single Image Processing](#image---single-image-processing)
  - [🗂️ images - Image Directory Processing](#images---image-directory-processing)
  - [🎬 video - Video Processing](#video---video-processing)
  - [📐 colmap - COLMAP Dataset Processing](#colmap---colmap-dataset-processing)
  - [🔧 backend - Backend Service](#backend---backend-service)
  - [🎨 gradio - Gradio Application](#gradio---gradio-application)
  - [🖼️ gallery - Gallery Server](#gallery---gallery-server)
- [⚙️ Parameter Details](#parameter-details)
- [💡 Usage Examples](#usage-examples)

## 📖 Overview

The Depth Anything 3 CLI provides a comprehensive command-line toolkit supporting image depth estimation, video processing, COLMAP dataset handling, and web applications.

The backend service keeps the model cached on the GPU, so it does not need to be reloaded for every command.

## ⚡ Quick Start

The CLI can run fully offline or connect to the backend for cached weights and task scheduling:

```bash
# 🔧 Start backend service (optional, keeps model resident in GPU memory)
da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE

# 🚀 Use auto mode to process input
da3 auto path/to/input --export-dir ./workspace/scene001

# ♻️ Reuse backend for next job
da3 auto path/to/video.mp4 \
    --export-dir ./workspace/scene002 \
    --use-backend \
    --backend-url http://localhost:8008
```

Each export directory contains `scene.glb`, `scene.jpg`, and optional extras such as `depth_vis/` or `gs_video/` depending on the requested format.

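An illustrative layout of one such directory (only `scene.glb` and `scene.jpg` are always present; the extras track the requested export format):

```
workspace/scene001/
├── scene.glb      # 3D scene (point cloud, camera wireframes)
├── scene.jpg      # preview image
└── depth_vis/     # only when the export format includes depth_vis
```
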
## 📚 Command Reference

### 🤖 auto - Auto Mode

Automatically detect input type and dispatch to the appropriate handler.

**Usage:**

```bash
da3 auto INPUT_PATH [OPTIONS]
```

**Input Type Detection:**
- 🖼️ Single image file (.jpg, .png, .jpeg, .webp, .bmp, .tiff, .tif)
- 📁 Image directory
- 🎬 Video file (.mp4, .avi, .mov, .mkv, .flv, .wmv, .webm, .m4v)
- 📐 COLMAP directory (containing `images/` and `sparse/` subdirectories)

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `INPUT_PATH` | str | Required | Input path (image, directory, video, or COLMAP) |
| `--model-dir` | str | Default model | Model directory path |
| `--export-dir` | str | `debug` | Export directory |
| `--export-format` | str | `glb` | Export format (supports `mini_npz`, `glb`, `feat_vis`, etc.; formats can be combined with hyphens) |
| `--device` | str | `cuda` | Device to use |
| `--use-backend` | bool | `False` | Use backend service for inference |
| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
| `--process-res` | int | `504` | Processing resolution |
| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
| `--export-feat` | str | `""` | Export features from specified layers, comma-separated (e.g., `"0,1,2"`) |
| `--auto-cleanup` | bool | `False` | Automatically clean export directory without confirmation |
| `--fps` | float | `1.0` | [Video] Frame sampling FPS |
| `--sparse-subdir` | str | `""` | [COLMAP] Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
| `--align-to-input-ext-scale` | bool | `True` | [COLMAP] Align prediction to input extrinsics scale |
| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy: `first`, `middle`, `saddle_balanced`, `saddle_sim_range`. See [docs](funcs/ref_view_strategy.md) |
| `--conf-thresh-percentile` | float | `40.0` | [GLB] Lower percentile for adaptive confidence threshold |
| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points in the point cloud |
| `--show-cameras` | bool | `True` | [GLB] Show camera wireframes in the exported scene |
| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Frame rate for output video |

**Examples:**

```bash
# 🖼️ Auto-process an image
da3 auto path/to/image.jpg --export-dir ./output

# 🎬 Auto-process a video
da3 auto path/to/video.mp4 --fps 2.0 --export-dir ./output

# 🔧 Use backend service
da3 auto path/to/input \
    --export-format mini_npz-glb \
    --use-backend \
    --backend-url http://localhost:8008 \
    --export-dir ./output
```

---

### 🖼️ image - Single Image Processing

Process a single image for camera pose and depth estimation.

**Usage:**

```bash
da3 image IMAGE_PATH [OPTIONS]
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `IMAGE_PATH` | str | Required | Input image file path |
| `--model-dir` | str | Default model | Model directory path |
| `--export-dir` | str | `debug` | Export directory |
| `--export-format` | str | `glb` | Export format |
| `--device` | str | `cuda` | Device to use |
| `--use-backend` | bool | `False` | Use backend service for inference |
| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
| `--process-res` | int | `504` | Processing resolution |
| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
| `--export-feat` | str | `""` | Export feature layer indices (comma-separated) |
| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
| `--show-cameras` | bool | `True` | [GLB] Show cameras |
| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |

**Examples:**

```bash
# ✨ Basic usage
da3 image path/to/image.png --export-dir ./output

# ⚡ With backend acceleration
da3 image path/to/image.png \
    --use-backend \
    --backend-url http://localhost:8008 \
    --export-dir ./output

# 🔍 Export feature visualization
da3 image image.jpg \
    --export-format feat_vis \
    --export-feat "9,19,29,39" \
    --export-dir ./results
```

---

### 🗂️ images - Image Directory Processing

Process a directory of images for batch depth estimation.

**Usage:**

```bash
da3 images IMAGES_DIR [OPTIONS]
```

**Parameters:**

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `IMAGES_DIR` | str | Required | Directory path containing images |
| `--image-extensions` | str | `png,jpg,jpeg` | Image file extensions to process (comma-separated) |
| `--model-dir` | str | Default model | Model directory path |
| `--export-dir` | str | `debug` | Export directory |
+
| `--export-format` | str | `glb` | Export format |
|
| 179 |
+
| `--device` | str | `cuda` | Device to use |
|
| 180 |
+
| `--use-backend` | bool | `False` | Use backend service for inference |
|
| 181 |
+
| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
|
| 182 |
+
| `--process-res` | int | `504` | Processing resolution |
|
| 183 |
+
| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
|
| 184 |
+
| `--export-feat` | str | `""` | Export feature layer indices |
|
| 185 |
+
| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
|
| 186 |
+
| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
|
| 187 |
+
| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
|
| 188 |
+
| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
|
| 189 |
+
| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
|
| 190 |
+
| `--show-cameras` | bool | `True` | [GLB] Show cameras |
|
| 191 |
+
| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
|
| 192 |
+
|
| 193 |
+
**Examples:**
|
| 194 |
+
|
| 195 |
+
```bash
|
| 196 |
+
# 📁 Process directory (defaults to png/jpg/jpeg)
|
| 197 |
+
da3 images ./image_folder --export-dir ./output
|
| 198 |
+
|
| 199 |
+
# 🎯 Custom extensions
|
| 200 |
+
da3 images ./dataset --image-extensions "png,jpg,webp" --export-dir ./output
|
| 201 |
+
|
| 202 |
+
# 🔧 Use backend service
|
| 203 |
+
da3 images ./dataset \
|
| 204 |
+
--use-backend \
|
| 205 |
+
--backend-url http://localhost:8008 \
|
| 206 |
+
--export-dir ./output
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
---
|
| 210 |
+
|
| 211 |
+
### 🎬 video - Video Processing
|
| 212 |
+
|
| 213 |
+
Process video by extracting frames for depth estimation.
|
| 214 |
+
|
| 215 |
+
**Usage:**
|
| 216 |
+
|
| 217 |
+
```bash
|
| 218 |
+
da3 video VIDEO_PATH [OPTIONS]
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
**Parameters:**
|
| 222 |
+
|
| 223 |
+
| Parameter | Type | Default | Description |
|
| 224 |
+
|-----------|------|---------|-------------|
|
| 225 |
+
| `VIDEO_PATH` | str | Required | Input video file path |
|
| 226 |
+
| `--fps` | float | `1.0` | Frame extraction sampling FPS |
|
| 227 |
+
| `--model-dir` | str | Default model | Model directory path |
|
| 228 |
+
| `--export-dir` | str | `debug` | Export directory |
|
| 229 |
+
| `--export-format` | str | `glb` | Export format |
|
| 230 |
+
| `--device` | str | `cuda` | Device to use |
|
| 231 |
+
| `--use-backend` | bool | `False` | Use backend service for inference |
|
| 232 |
+
| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
|
| 233 |
+
| `--process-res` | int | `504` | Processing resolution |
|
| 234 |
+
| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
|
| 235 |
+
| `--export-feat` | str | `""` | Export feature layer indices |
|
| 236 |
+
| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
|
| 237 |
+
| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
|
| 238 |
+
| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
|
| 239 |
+
| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
|
| 240 |
+
| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
|
| 241 |
+
| `--show-cameras` | bool | `True` | [GLB] Show cameras |
|
| 242 |
+
| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
|
| 243 |
+
|
| 244 |
+
**Examples:**
|
| 245 |
+
|
| 246 |
+
```bash
|
| 247 |
+
# 🎬 Basic video processing
|
| 248 |
+
da3 video path/to/video.mp4 --export-dir ./output
|
| 249 |
+
|
| 250 |
+
# ⚙️ Control frame sampling and resolution
|
| 251 |
+
da3 video path/to/video.mp4 \
|
| 252 |
+
--fps 2.0 \
|
| 253 |
+
--process-res 1024 \
|
| 254 |
+
--export-dir ./output
|
| 255 |
+
|
| 256 |
+
# 🔧 Use backend service
|
| 257 |
+
da3 video path/to/video.mp4 \
|
| 258 |
+
--use-backend \
|
| 259 |
+
--backend-url http://localhost:8008 \
|
| 260 |
+
--export-dir ./output
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
---
|
| 264 |
+
|
| 265 |
+
### 📐 colmap - COLMAP Dataset Processing
|
| 266 |
+
|
| 267 |
+
Run pose-conditioned depth estimation on COLMAP data.
|
| 268 |
+
|
| 269 |
+
**Usage:**
|
| 270 |
+
|
| 271 |
+
```bash
|
| 272 |
+
da3 colmap COLMAP_DIR [OPTIONS]
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
**Parameters:**
|
| 276 |
+
|
| 277 |
+
| Parameter | Type | Default | Description |
|
| 278 |
+
|-----------|------|---------|-------------|
|
| 279 |
+
| `COLMAP_DIR` | str | Required | COLMAP directory containing `images/` and `sparse/` subdirectories |
|
| 280 |
+
| `--sparse-subdir` | str | `""` | Sparse reconstruction subdirectory (e.g., `"0"` for `sparse/0/`) |
|
| 281 |
+
| `--align-to-input-ext-scale` | bool | `True` | Align prediction to input extrinsics scale |
|
| 282 |
+
| `--model-dir` | str | Default model | Model directory path |
|
| 283 |
+
| `--export-dir` | str | `debug` | Export directory |
|
| 284 |
+
| `--export-format` | str | `glb` | Export format |
|
| 285 |
+
| `--device` | str | `cuda` | Device to use |
|
| 286 |
+
| `--use-backend` | bool | `False` | Use backend service for inference |
|
| 287 |
+
| `--backend-url` | str | `http://localhost:8008` | Backend service URL |
|
| 288 |
+
| `--process-res` | int | `504` | Processing resolution |
|
| 289 |
+
| `--process-res-method` | str | `upper_bound_resize` | Processing resolution method |
|
| 290 |
+
| `--export-feat` | str | `""` | Export feature layer indices |
|
| 291 |
+
| `--auto-cleanup` | bool | `False` | Automatically clean export directory |
|
| 292 |
+
| `--use-ray-pose` | bool | `False` | Use ray-based pose estimation instead of camera decoder |
|
| 293 |
+
| `--ref-view-strategy` | str | `saddle_balanced` | Reference view selection strategy. See [docs](funcs/ref_view_strategy.md) |
|
| 294 |
+
| `--conf-thresh-percentile` | float | `40.0` | [GLB] Confidence threshold percentile |
|
| 295 |
+
| `--num-max-points` | int | `1000000` | [GLB] Maximum number of points |
|
| 296 |
+
| `--show-cameras` | bool | `True` | [GLB] Show cameras |
|
| 297 |
+
| `--feat-vis-fps` | int | `15` | [FEAT_VIS] Video frame rate |
|
| 298 |
+
|
| 299 |
+
**Examples:**
|
| 300 |
+
|
| 301 |
+
```bash
|
| 302 |
+
# 📐 Process COLMAP dataset
|
| 303 |
+
da3 colmap ./colmap_dataset --export-dir ./output
|
| 304 |
+
|
| 305 |
+
# 🎯 Use specific sparse subdirectory and align scale
|
| 306 |
+
da3 colmap ./colmap_dataset \
|
| 307 |
+
--sparse-subdir 0 \
|
| 308 |
+
--align-to-input-ext-scale \
|
| 309 |
+
--export-dir ./output
|
| 310 |
+
|
| 311 |
+
# 🔧 Use backend service
|
| 312 |
+
da3 colmap ./colmap_dataset \
|
| 313 |
+
--use-backend \
|
| 314 |
+
--backend-url http://localhost:8008 \
|
| 315 |
+
--export-dir ./output
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
---
|
| 319 |
+
|
| 320 |
+
### 🔧 backend - Backend Service
|
| 321 |
+
|
| 322 |
+
Start the model backend service with an integrated gallery.
|
| 323 |
+
|
| 324 |
+
**Usage:**
|
| 325 |
+
|
| 326 |
+
```bash
|
| 327 |
+
da3 backend [OPTIONS]
|
| 328 |
+
```
|
| 329 |
+
|
| 330 |
+
**Parameters:**
|
| 331 |
+
|
| 332 |
+
| Parameter | Type | Default | Description |
|
| 333 |
+
|-----------|------|---------|-------------|
|
| 334 |
+
| `--model-dir` | str | Default model | Model directory path |
|
| 335 |
+
| `--device` | str | `cuda` | Device to use |
|
| 336 |
+
| `--host` | str | `127.0.0.1` | Host address to bind to |
|
| 337 |
+
| `--port` | int | `8008` | Port number to bind to |
|
| 338 |
+
| `--gallery-dir` | str | Default gallery dir | Gallery directory path (optional) |
|
| 339 |
+
|
| 340 |
+
**Features:**
|
| 341 |
+
- 🎯 Keeps model resident in GPU memory
|
| 342 |
+
- 🔌 Provides REST inference API
|
| 343 |
+
- 📊 Integrated dashboard and status monitoring
|
| 344 |
+
- 🖼️ Optional gallery browser (if `--gallery-dir` is provided)
|
| 345 |
+
|
| 346 |
+
**Available Endpoints:**
|
| 347 |
+
- 🏠 `/` - Home page
|
| 348 |
+
- 📊 `/dashboard` - Dashboard
|
| 349 |
+
- ✅ `/status` - API status (used in the health-check sketch below)
|
| 350 |
+
- 🖼️ `/gallery/` - Gallery browser (if enabled)
|
| 351 |
+
|
| 352 |
+
**Examples:**
|
| 353 |
+
|
| 354 |
+
```bash
|
| 355 |
+
# 🚀 Basic backend service
|
| 356 |
+
da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE
|
| 357 |
+
|
| 358 |
+
# 🖼️ Backend with gallery
|
| 359 |
+
da3 backend \
|
| 360 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 361 |
+
--device cuda \
|
| 362 |
+
--host 0.0.0.0 \
|
| 363 |
+
--port 8008 \
|
| 364 |
+
--gallery-dir ./workspace
|
| 365 |
+
|
| 366 |
+
# 💻 Use CPU
|
| 367 |
+
da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --device cpu
|
| 368 |
+
```
|
| 369 |
+
|
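Once the service is running, a client can probe the documented `/status` endpoint before submitting `--use-backend` jobs. A minimal sketch, assuming only that a ready backend answers HTTP 200 on `/status`:

```python
# Minimal health check against the documented /status endpoint.
import urllib.request

def backend_is_up(url: str = "http://localhost:8008") -> bool:
    try:
        with urllib.request.urlopen(f"{url}/status", timeout=5) as resp:
            return resp.status == 200
    except OSError:
        return False

if backend_is_up():
    print("Backend ready - submit jobs with --use-backend")
```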
| 370 |
+
---
|
| 371 |
+
|
| 372 |
+
### 🎨 gradio - Gradio Application
|
| 373 |
+
|
| 374 |
+
Launch the interactive Depth Anything 3 Gradio web application.
|
| 375 |
+
|
| 376 |
+
**Usage:**
|
| 377 |
+
|
| 378 |
+
```bash
|
| 379 |
+
da3 gradio [OPTIONS]
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
**Parameters:**
|
| 383 |
+
|
| 384 |
+
| Parameter | Type | Default | Description |
|
| 385 |
+
|-----------|------|---------|-------------|
|
| 386 |
+
| `--model-dir` | str | Required | Model directory path |
|
| 387 |
+
| `--workspace-dir` | str | Required | Workspace directory path |
|
| 388 |
+
| `--gallery-dir` | str | Required | Gallery directory path |
|
| 389 |
+
| `--host` | str | `127.0.0.1` | Host address to bind to |
|
| 390 |
+
| `--port` | int | `7860` | Port number to bind to |
|
| 391 |
+
| `--share` | bool | `False` | Create a public link |
|
| 392 |
+
| `--debug` | bool | `False` | Enable debug mode |
|
| 393 |
+
| `--cache-examples` | bool | `False` | Pre-cache all example scenes at startup |
|
| 394 |
+
| `--cache-gs-tag` | str | `""` | Tag to match scene names for high-res+3DGS caching |
|
| 395 |
+
|
| 396 |
+
**Examples:**
|
| 397 |
+
|
| 398 |
+
```bash
|
| 399 |
+
# 🎨 Basic Gradio application
|
| 400 |
+
da3 gradio \
|
| 401 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 402 |
+
--workspace-dir ./workspace \
|
| 403 |
+
--gallery-dir ./gallery
|
| 404 |
+
|
| 405 |
+
# 🌐 Enable sharing and debug
|
| 406 |
+
da3 gradio \
|
| 407 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 408 |
+
--workspace-dir ./workspace \
|
| 409 |
+
--gallery-dir ./gallery \
|
| 410 |
+
--share \
|
| 411 |
+
--debug
|
| 412 |
+
|
| 413 |
+
# ⚡ Pre-cache examples
|
| 414 |
+
da3 gradio \
|
| 415 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 416 |
+
--workspace-dir ./workspace \
|
| 417 |
+
--gallery-dir ./gallery \
|
| 418 |
+
--cache-examples \
|
| 419 |
+
--cache-gs-tag "dl3dv"
|
| 420 |
+
```
|
| 421 |
+
|
| 422 |
+
---
|
| 423 |
+
|
| 424 |
+
### 🖼️ gallery - Gallery Server
|
| 425 |
+
|
| 426 |
+
Launch the standalone Depth Anything 3 Gallery server.
|
| 427 |
+
|
| 428 |
+
**Usage:**
|
| 429 |
+
|
| 430 |
+
```bash
|
| 431 |
+
da3 gallery [OPTIONS]
|
| 432 |
+
```
|
| 433 |
+
|
| 434 |
+
**Parameters:**
|
| 435 |
+
|
| 436 |
+
| Parameter | Type | Default | Description |
|
| 437 |
+
|-----------|------|---------|-------------|
|
| 438 |
+
| `--gallery-dir` | str | Default gallery dir | Gallery root directory |
|
| 439 |
+
| `--host` | str | `127.0.0.1` | Host address to bind to |
|
| 440 |
+
| `--port` | int | `8007` | Port number to bind to |
|
| 441 |
+
| `--open-browser` | bool | `False` | Open browser after launch |
|
| 442 |
+
|
| 443 |
+
**Note:**
|
| 444 |
+
The gallery expects each scene folder to contain at least `scene.glb` and `scene.jpg`, with optional subfolders such as `depth_vis/` or `gs_video/`.
|
| 445 |
+
|
| 446 |
+
**Examples:**
|
| 447 |
+
|
| 448 |
+
```bash
|
| 449 |
+
# 🖼️ Basic gallery server
|
| 450 |
+
da3 gallery --gallery-dir ./workspace
|
| 451 |
+
|
| 452 |
+
# 🌐 Custom host and port
|
| 453 |
+
da3 gallery \
|
| 454 |
+
--gallery-dir ./workspace \
|
| 455 |
+
--host 0.0.0.0 \
|
| 456 |
+
--port 8007
|
| 457 |
+
|
| 458 |
+
# 🚀 Auto-open browser
|
| 459 |
+
da3 gallery --gallery-dir ./workspace --open-browser
|
| 460 |
+
```
|
| 461 |
+
|
| 462 |
+
---
|
| 463 |
+
|
| 464 |
+
## ⚙️ Parameter Details
|
| 465 |
+
|
| 466 |
+
### 🔧 Common Parameters
|
| 467 |
+
|
| 468 |
+
- **`--export-dir`**: Output directory, defaults to `debug`
|
| 469 |
+
- **`--export-format`**: Export format, supports combining multiple formats with hyphens:
|
| 470 |
+
- 📦 `mini_npz`: Compressed NumPy format
|
| 471 |
+
- 🎨 `glb`: glTF binary format (3D scene)
|
| 472 |
+
- 🔍 `feat_vis`: Feature visualization
|
| 473 |
+
- Example: `mini_npz-glb` exports both formats
|
| 474 |
+
|
| 475 |
+
- **`--process-res`** / **`--process-res-method`**: Control preprocessing resolution strategy
|
| 476 |
+
- `process-res`: Target resolution (default 504)
|
| 477 |
+
- `process-res-method`: Resize method (default `upper_bound_resize`)
|
| 478 |
+
|
| 479 |
+
- **`--auto-cleanup`**: Remove existing export directory without confirmation
|
| 480 |
+
|
| 481 |
+
- **`--use-backend`** / **`--backend-url`**: Reuse running backend service
|
| 482 |
+
- ⚡ Reduces model loading time
|
| 483 |
+
- 🌐 Supports distributed processing
|
| 484 |
+
|
| 485 |
+
- **`--export-feat`**: Layer indices for exporting intermediate features (comma-separated)
|
| 486 |
+
- Example: `"9,19,29,39"` (see the parsing sketch below)
|
| 487 |
+
|
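Both list-valued options above decompose into simple delimited lists. The snippet below is purely illustrative, not the actual CLI parsing code:

```python
# Illustrative parsing of the two list-valued options (not the real CLI code).
formats = "mini_npz-glb-feat_vis".split("-")        # --export-format
layers = [int(i) for i in "9,19,29,39".split(",")]  # --export-feat
assert formats == ["mini_npz", "glb", "feat_vis"]
assert layers == [9, 19, 29, 39]
```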
| 488 |
+
### 🎨 GLB Export Parameters
|
| 489 |
+
|
| 490 |
+
- **`--conf-thresh-percentile`**: Lower percentile for adaptive confidence threshold (default 40.0)
|
| 491 |
+
- Used to filter low-confidence points (see the filtering sketch after this list)
|
| 492 |
+
|
| 493 |
+
- **`--num-max-points`**: Maximum number of points in point cloud (default 1,000,000)
|
| 494 |
+
- Controls output file size and performance
|
| 495 |
+
|
| 496 |
+
- **`--show-cameras`**: Show camera wireframes in exported scene (default True)
|
| 497 |
+
|
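A sketch of how these two filters might combine, assuming NumPy arrays of per-point confidences; the random subsampling step is an assumption for illustration, not necessarily how the exporter downsamples:

```python
# Illustrative GLB point filtering: percentile confidence threshold,
# then a cap on total point count (random subsampling is an assumption).
import numpy as np

conf = np.random.rand(2_000_000)        # per-point confidence
points = np.random.rand(2_000_000, 3)   # xyz point cloud

thresh = np.percentile(conf, 40.0)      # --conf-thresh-percentile default
idx = np.flatnonzero(conf >= thresh)

max_points = 1_000_000                  # --num-max-points default
if idx.size > max_points:
    idx = np.random.choice(idx, max_points, replace=False)
filtered = points[idx]
```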
| 498 |
+
### 🔍 Feature Visualization Parameters
|
| 499 |
+
|
| 500 |
+
- **`--feat-vis-fps`**: Frame rate for feature visualization output video (default 15)
|
| 501 |
+
|
| 502 |
+
### 🎬 Video-Specific Parameters
|
| 503 |
+
|
| 504 |
+
- **`--fps`**: Video frame extraction sampling rate (default 1.0 FPS)
|
| 505 |
+
- Higher values extract more frames
|
| 506 |
+
|
| 507 |
+
### 📐 COLMAP-Specific Parameters
|
| 508 |
+
|
| 509 |
+
- **`--sparse-subdir`**: Sparse reconstruction subdirectory
|
| 510 |
+
- Empty string uses `sparse/` directory
|
| 511 |
+
- `"0"` uses `sparse/0/` directory
|
| 512 |
+
|
| 513 |
+
- **`--align-to-input-ext-scale`**: Align prediction to input extrinsics scale (default True)
|
| 514 |
+
- Ensures depth estimation is consistent with COLMAP scale
|
| 515 |
+
|
| 516 |
+
---
|
| 517 |
+
|
| 518 |
+
## 💡 Usage Examples
|
| 519 |
+
|
| 520 |
+
### 1️⃣ Basic Workflow
|
| 521 |
+
|
| 522 |
+
```bash
|
| 523 |
+
# 🔧 Start backend service
|
| 524 |
+
da3 backend --model-dir depth-anything/DA3NESTED-GIANT-LARGE --host 0.0.0.0 --port 8008
|
| 525 |
+
|
| 526 |
+
# 🖼️ Process single image
|
| 527 |
+
da3 image image.jpg --export-dir ./output1 --use-backend
|
| 528 |
+
|
| 529 |
+
# 🎬 Process video
|
| 530 |
+
da3 video video.mp4 --fps 2.0 --export-dir ./output2 --use-backend
|
| 531 |
+
|
| 532 |
+
# 📐 Process COLMAP dataset
|
| 533 |
+
da3 colmap ./colmap_data --export-dir ./output3 --use-backend
|
| 534 |
+
```
|
| 535 |
+
|
| 536 |
+
### 2️⃣ Using Auto Mode
|
| 537 |
+
|
| 538 |
+
```bash
|
| 539 |
+
# 🤖 Auto-detect and process
|
| 540 |
+
da3 auto ./unknown_input --export-dir ./output
|
| 541 |
+
|
| 542 |
+
# ⚡ With backend acceleration
|
| 543 |
+
da3 auto ./unknown_input \
|
| 544 |
+
--use-backend \
|
| 545 |
+
--backend-url http://localhost:8008 \
|
| 546 |
+
--export-dir ./output
|
| 547 |
+
```
|
| 548 |
+
|
| 549 |
+
### 3️⃣ Multi-Format Export
|
| 550 |
+
|
| 551 |
+
```bash
|
| 552 |
+
# 📦 Export both NPZ and GLB formats
|
| 553 |
+
da3 auto assets/examples/SOH \
|
| 554 |
+
--export-format mini_npz-glb \
|
| 555 |
+
--export-dir ./workspace/soh
|
| 556 |
+
|
| 557 |
+
# 🔍 Export feature visualization
|
| 558 |
+
da3 image image.jpg \
|
| 559 |
+
--export-format feat_vis \
|
| 560 |
+
--export-feat "9,19,29,39" \
|
| 561 |
+
--export-dir ./results
|
| 562 |
+
```
|
| 563 |
+
|
| 564 |
+
### 4️⃣ Advanced Configuration
|
| 565 |
+
|
| 566 |
+
```bash
|
| 567 |
+
# ⚙️ Custom resolution and point cloud density
|
| 568 |
+
da3 image image.jpg \
|
| 569 |
+
--process-res 1024 \
|
| 570 |
+
--num-max-points 2000000 \
|
| 571 |
+
--conf-thresh-percentile 30.0 \
|
| 572 |
+
--export-dir ./output
|
| 573 |
+
|
| 574 |
+
# 📐 COLMAP advanced options
|
| 575 |
+
da3 colmap ./colmap_data \
|
| 576 |
+
--sparse-subdir 0 \
|
| 577 |
+
--align-to-input-ext-scale \
|
| 578 |
+
--process-res 756 \
|
| 579 |
+
--export-dir ./output
|
| 580 |
+
```
|
| 581 |
+
|
| 582 |
+
### 5️⃣ Batch Processing Workflow
|
| 583 |
+
|
| 584 |
+
```bash
|
| 585 |
+
# 🔧 Start backend
|
| 586 |
+
da3 backend \
|
| 587 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 588 |
+
--device cuda \
|
| 589 |
+
--host 0.0.0.0 \
|
| 590 |
+
--port 8008 \
|
| 591 |
+
--gallery-dir ./workspace
|
| 592 |
+
|
| 593 |
+
# 🔄 Batch process multiple scenes
|
| 594 |
+
for scene in scene1 scene2 scene3; do
|
| 595 |
+
da3 auto ./data/$scene \
|
| 596 |
+
--export-dir ./workspace/$scene \
|
| 597 |
+
--use-backend \
|
| 598 |
+
--auto-cleanup
|
| 599 |
+
done
|
| 600 |
+
|
| 601 |
+
# 🖼️ Launch gallery to view results
|
| 602 |
+
da3 gallery --gallery-dir ./workspace --open-browser
|
| 603 |
+
```
|
| 604 |
+
|
| 605 |
+
### 6️⃣ Web Applications
|
| 606 |
+
|
| 607 |
+
```bash
|
| 608 |
+
# 🎨 Launch Gradio application
|
| 609 |
+
da3 gradio \
|
| 610 |
+
--model-dir depth-anything/DA3NESTED-GIANT-LARGE \
|
| 611 |
+
--workspace-dir workspace/gradio \
|
| 612 |
+
--gallery-dir ./gallery \
|
| 613 |
+
--host 0.0.0.0 \
|
| 614 |
+
--port 7860 \
|
| 615 |
+
--share
|
| 616 |
+
```
|
| 617 |
+
|
| 618 |
+
### 7️⃣ Transformer Feature Visualization
|
| 619 |
+
|
| 620 |
+
```bash
|
| 621 |
+
# 🔍 Export Transformer features
|
| 622 |
+
# 📦 Combined with numerical output
|
| 623 |
+
da3 auto video.mp4 \
|
| 624 |
+
--export-format glb-feat_vis \
|
| 625 |
+
--export-feat "11,21,31" \
|
| 626 |
+
--export-dir ./debug \
|
| 627 |
+
--use-backend
|
| 628 |
+
```
|
| 629 |
+
|
| 630 |
+
---
|
| 631 |
+
|
| 632 |
+
## 📝 Notes
|
| 633 |
+
|
| 634 |
+
1. **🔧 Backend Service**: Recommended when running multiple jobs; the model stays loaded between runs, avoiding repeated startup cost
|
| 635 |
+
2. **💾 GPU Memory**: Be mindful of GPU memory usage when processing high-resolution inputs
|
| 636 |
+
3. **📁 Export Directory**: Use `--auto-cleanup` to avoid manual confirmation for deletion
|
| 637 |
+
4. **🔀 Format Combination**: Multiple export formats can be combined with hyphens (e.g., `mini_npz-glb-feat_vis`)
|
| 638 |
+
5. **📐 COLMAP Data**: Ensure COLMAP directory structure is correct (contains `images/` and `sparse/` subdirectories)
|
| 639 |
+
|
| 640 |
+
---
|
| 641 |
+
|
| 642 |
+
## ❓ Getting Help
|
| 643 |
+
|
| 644 |
+
View detailed help for any command:
|
| 645 |
+
|
| 646 |
+
```bash
|
| 647 |
+
# 📖 View main help
|
| 648 |
+
da3 --help
|
| 649 |
+
|
| 650 |
+
# 🔍 View specific command help
|
| 651 |
+
da3 auto --help
|
| 652 |
+
da3 image --help
|
| 653 |
+
da3 backend --help
|
| 654 |
+
```
|
docs/funcs/ref_view_strategy.md
ADDED
|
@@ -0,0 +1,183 @@
|
| 1 |
+
# 📐 Reference View Selection Strategy
|
| 2 |
+
|
| 3 |
+
## 📖 Overview
|
| 4 |
+
|
| 5 |
+
Reference view selection is a core step in multi-view depth estimation: when processing multiple input views, the model must determine which view serves as the primary reference frame for depth prediction, defining the world coordinate system.
|
| 6 |
+
|
| 7 |
+
Different reference views lead to different reconstruction results. This is a known consideration in multi-view geometry and was analyzed in [PI3](https://arxiv.org/abs/2507.13347). The choice of reference view can affect the quality and consistency of depth predictions across the scene.
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
## 🚀 Our Simple Solution: Automatic Reference View Selection
|
| 11 |
+
|
| 12 |
+
DA3 addresses this with a simple mechanism: **automatic reference view selection** based on **class tokens**. Instead of relying on heuristics or manual selection, the model analyzes the class token features from all input views and selects the most suitable reference frame.
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## 🎨 Available Strategies
|
| 17 |
+
|
| 18 |
+
### 1. ⚖️ `saddle_balanced` (Recommended, Default)
|
| 19 |
+
|
| 20 |
+
**Philosophy:**
|
| 21 |
+
Select a view that achieves balance across multiple feature metrics. This strategy looks for a "middle ground" view that is neither too similar nor too different from other views, making it a stable reference point.
|
| 22 |
+
|
| 23 |
+
**How it works:**
|
| 24 |
+
1. Extracts and normalizes class tokens from all views
|
| 25 |
+
2. Computes three complementary metrics for each view:
|
| 26 |
+
- **Similarity score**: Average cosine similarity with other views
|
| 27 |
+
- **Feature norm**: L2 norm of the original features
|
| 28 |
+
- **Feature variance**: Variance across feature dimensions
|
| 29 |
+
3. Normalizes each metric to [0, 1] range
|
| 30 |
+
4. Selects the view closest to 0.5 (median) across all three metrics (sketched below)
|
| 31 |
+
|
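A minimal NumPy sketch of these four steps; `cls_tokens` is assumed to be an `(S, D)` array of per-view class tokens, and the actual implementation may differ in detail:

```python
# Sketch of saddle_balanced selection (illustrative, not the real code).
import numpy as np

def select_saddle_balanced(cls_tokens: np.ndarray) -> int:
    S = cls_tokens.shape[0]                        # selection assumes S >= 3
    normed = cls_tokens / np.linalg.norm(cls_tokens, axis=1, keepdims=True)

    sim = normed @ normed.T                        # pairwise cosine similarity
    sim_score = (sim.sum(axis=1) - 1.0) / (S - 1)  # mean similarity to others
    feat_norm = np.linalg.norm(cls_tokens, axis=1) # L2 norm of raw features
    feat_var = cls_tokens.var(axis=1)              # variance across dimensions

    def to_unit(x):                                # normalize metric to [0, 1]
        return (x - x.min()) / (x.max() - x.min() + 1e-8)

    metrics = np.stack([to_unit(sim_score), to_unit(feat_norm), to_unit(feat_var)])
    return int(np.abs(metrics - 0.5).sum(axis=0).argmin())  # closest to median
```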
| 32 |
+
### 2. 🎢 `saddle_sim_range`
|
| 33 |
+
|
| 34 |
+
**Philosophy:**
|
| 35 |
+
Select a view with the largest similarity range to other views. This identifies "saddle point" views that are highly similar to some views but dissimilar to others, making them information-rich anchor points.
|
| 36 |
+
|
| 37 |
+
**How it works:**
|
| 38 |
+
1. Computes pairwise cosine similarity between all views
|
| 39 |
+
2. For each view, calculates the range (max - min) of similarities to other views
|
| 40 |
+
3. Selects the view with the maximum similarity range (sketched below)
|
| 41 |
+
|
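Under the same `(S, D)` class-token assumption, a sketch:

```python
# Sketch of saddle_sim_range: pick the view whose similarities to the
# others span the widest range (illustrative, not the real code).
import numpy as np

def select_saddle_sim_range(cls_tokens: np.ndarray) -> int:
    normed = cls_tokens / np.linalg.norm(cls_tokens, axis=1, keepdims=True)
    sim = normed @ normed.T
    np.fill_diagonal(sim, np.nan)                  # ignore self-similarity
    rng = np.nanmax(sim, axis=1) - np.nanmin(sim, axis=1)
    return int(np.argmax(rng))
```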
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
### 3. 1️⃣ `first` (Not Recommended)
|
| 45 |
+
|
| 46 |
+
**Philosophy:**
|
| 47 |
+
Always use the first view in the input sequence as the reference.
|
| 48 |
+
|
| 49 |
+
**How it works:**
|
| 50 |
+
Simply returns index 0.
|
| 51 |
+
|
| 52 |
+
**When to use:**
|
| 53 |
+
- ⛔ **Not recommended** in general
|
| 54 |
+
- 🔧 Only use when you have manually pre-sorted your views and know the first view is optimal
|
| 55 |
+
- 🐛 Debugging or baseline comparisons
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
### 4. ⏸️ `middle`
|
| 60 |
+
|
| 61 |
+
**Philosophy:**
|
| 62 |
+
Select the view in the middle of the input sequence.
|
| 63 |
+
|
| 64 |
+
**How it works:**
|
| 65 |
+
Returns the view at index `S // 2` where S is the number of views.
|
| 66 |
+
|
| 67 |
+
**When to use:**
|
| 68 |
+
- ⏱️ **Only recommended when input images are temporally ordered**
|
| 69 |
+
- 🎬 Video sequences (e.g., **DA3-LONG** setting)
|
| 70 |
+
- 📹 Sequential captures where the middle frame likely has the most stable viewpoint
|
| 71 |
+
|
| 72 |
+
**Specific use case: DA3-LONG** 🎬
|
| 73 |
+
In video-based depth estimation scenarios (like DA3-LONG), where inputs are consecutive frames, `middle` is often the **optimal choice** because the middle frame has maximum overlap with all other frames.
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
## 💻 Usage
|
| 77 |
+
|
| 78 |
+
### 🐍 Python API
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
from depth_anything_3 import DepthAnything3
|
| 82 |
+
|
| 83 |
+
model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
|
| 84 |
+
|
| 85 |
+
# Use default (saddle_balanced)
|
| 86 |
+
prediction = model.inference(
|
| 87 |
+
images,
|
| 88 |
+
ref_view_strategy="saddle_balanced"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
# For video sequences, consider using middle
|
| 92 |
+
prediction = model.inference(
|
| 93 |
+
video_frames,
|
| 94 |
+
ref_view_strategy="middle" # Good for temporal sequences
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
# For complex scenes with wide baselines
|
| 98 |
+
prediction = model.inference(
|
| 99 |
+
images,
|
| 100 |
+
ref_view_strategy="saddle_sim_range"
|
| 101 |
+
)
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### 🖥️ Command Line Interface
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
# Default (saddle_balanced)
|
| 108 |
+
da3 auto input/ --export-dir output/
|
| 109 |
+
|
| 110 |
+
# Explicitly specify strategy
|
| 111 |
+
da3 auto input/ --ref-view-strategy saddle_balanced
|
| 112 |
+
|
| 113 |
+
# For video processing
|
| 114 |
+
da3 video input.mp4 --ref-view-strategy middle
|
| 115 |
+
|
| 116 |
+
# For wide-baseline multi-view
|
| 117 |
+
da3 images captures/ --ref-view-strategy saddle_sim_range
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
### 🎯 When Selection Is Applied
|
| 123 |
+
|
| 124 |
+
Reference view selection is applied when:
|
| 125 |
+
- 3️⃣ Number of views S ≥ 3
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## 💡 Recommendations
|
| 130 |
+
|
| 131 |
+
### 📋 Quick Guide
|
| 132 |
+
|
| 133 |
+
| Scenario | Recommended Strategy | Rationale |
|
| 134 |
+
|----------|---------------------|-----------|
|
| 135 |
+
| **Default / Unknown** | `saddle_balanced` | Robust, balanced, works well across diverse scenarios |
|
| 136 |
+
| **Video frames** | `middle` | Temporal coherence, stable middle frame |
|
| 137 |
+
| **Wide-baseline multi-view** | `saddle_sim_range` | Maximizes information coverage |
|
| 138 |
+
| **Pre-sorted inputs** | `first` | Use only if you've manually optimized ordering |
|
| 139 |
+
| **Single image** | `first` | Automatically used (no reordering needed for S ≤ 2) |
|
| 140 |
+
|
| 141 |
+
### ✨ Best Practices
|
| 142 |
+
|
| 143 |
+
1. 🎯 **Start with defaults**: `saddle_balanced` works well in most cases
|
| 144 |
+
2. 🎬 **Consider your input type**: Use `middle` for videos, `saddle_balanced` for photos
|
| 145 |
+
3. 🔬 **Experiment if needed**: Try different strategies if results are suboptimal
|
| 146 |
+
4. 📊 **Monitor performance**: Check `glb` quality and consistency across views
|
| 147 |
+
|
| 148 |
+
---
|
| 149 |
+
|
| 150 |
+
## 🔧 Technical Details
|
| 151 |
+
|
| 152 |
+
### 🎚️ Selection Threshold
|
| 153 |
+
|
| 154 |
+
The reference view selection is only triggered when:
|
| 155 |
+
```python
|
| 156 |
+
num_views >= 3 # At least 3 views required
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
For 1-2 views, no reordering is performed (equivalent to using `first`).
|
| 160 |
+
|
| 161 |
+
### ⚙️ Implementation
|
| 162 |
+
|
| 163 |
+
The selection happens at layer `alt_start - 1` in the vision transformer, before the first global attention layer. This ensures the selected reference view influences the entire depth prediction pipeline.
|
| 164 |
+
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
## ❓ FAQ
|
| 168 |
+
|
| 169 |
+
**Q: 🤔 Why is this feature provided?**
|
| 170 |
+
A: The model can handle any view order, but the choice of reference view still affects reconstruction quality. Automatic selection picks a suitable reference without manual tuning, which can improve depth predictions in multi-view scenarios.
|
| 171 |
+
|
| 172 |
+
**Q: ⏱️ Does this add computational cost?**
|
| 173 |
+
A: No. The selection operates only on the per-view class tokens, so the overhead is negligible.
|
| 174 |
+
|
| 175 |
+
**Q: 🎮 Can I manually specify which view to use as reference?**
|
| 176 |
+
A: Not directly through this parameter. You can pre-sort your input images to place your preferred reference view first and use `ref_view_strategy="first"`, as in the sketch below.
|
| 177 |
+
|
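For example (reusing the `model` from the Usage section; file names are placeholders):

```python
# Pin the reference manually: preferred view first, automatic selection off.
images = ["preferred_ref.jpg", "view_b.jpg", "view_c.jpg"]
prediction = model.inference(images, ref_view_strategy="first")
```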
| 178 |
+
**Q: ⚙️ What happens if I don't specify this parameter?**
|
| 179 |
+
A: The default `saddle_balanced` strategy is used automatically.
|
| 180 |
+
|
| 181 |
+
**Q: 📊 Is this feature used in the DA3 paper benchmarks?**
|
| 182 |
+
A: No, the paper used `first` as the default strategy for all multi-view experiments. The current default has been updated to `saddle_balanced` for better robustness.
|
| 183 |
+
|
notebooks/da3.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/da3_tutorial.ipynb
ADDED
|
@@ -0,0 +1,667 @@
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 🌊 Depth Anything 3 — From Images to 3D in Seconds\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"<div align=\"center\">\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"[](https://colab.research.google.com/github/Aedelon/awesome-depth-anything-3/blob/main/notebooks/da3_tutorial.ipynb)\n",
|
| 12 |
+
"[](https://github.com/Aedelon/awesome-depth-anything-3)\n",
|
| 13 |
+
"[](https://pypi.org/project/awesome-depth-anything-3/)\n",
|
| 14 |
+
"[](https://opensource.org/licenses/Apache-2.0)\n",
|
| 15 |
+
"\n",
|
| 16 |
+
"**State-of-the-art monocular depth estimation + 3D reconstruction**\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"</div>\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"---\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"### What you'll get:\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"| Input | Output |\n",
|
| 25 |
+
"|-------|--------|\n",
|
| 26 |
+
"| 📸 Single image | 🌊 Metric depth map |\n",
|
| 27 |
+
"| 🎬 Video / Multi-view | ☁️ 3D Point Cloud + Camera poses |\n",
|
| 28 |
+
"| 🖼️ Any scene | 📦 Downloadable GLB file |\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"---\n",
|
| 31 |
+
"\n",
|
| 32 |
+
"### ⚡ Quick Start\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"1. **Runtime → Change runtime type → T4 GPU** (free tier works!)\n",
|
| 35 |
+
"2. **Run all cells** (Ctrl+F9) or click ▶️ on each cell\n",
|
| 36 |
+
"3. **Upload your images** in Section 4\n",
|
| 37 |
+
"4. **Download your 3D model** (.glb file)\n",
|
| 38 |
+
"\n",
|
| 39 |
+
"⏱️ **Total time: ~5 minutes** (including model download)"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": null,
|
| 45 |
+
"metadata": {},
|
| 46 |
+
"outputs": [],
|
| 47 |
+
"source": [
|
| 48 |
+
"#@title 🚀 **1. Install** (run this first!) { display-mode: \"form\" }\n",
|
| 49 |
+
"#@markdown > ⏱️ Takes ~2 minutes on first run\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"%%capture\n",
|
| 52 |
+
"!pip install awesome-depth-anything-3\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"# Verify installation\n",
|
| 55 |
+
"import torch\n",
|
| 56 |
+
"from IPython.display import HTML, display\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 59 |
+
"gpu_name = torch.cuda.get_device_name(0) if device == \"cuda\" else \"None\"\n",
|
| 60 |
+
"vram = torch.cuda.get_device_properties(0).total_memory / 1e9 if device == \"cuda\" else 0\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"if device == \"cuda\":\n",
|
| 63 |
+
" status = f'''\n",
|
| 64 |
+
" <div style=\"background: linear-gradient(135deg, #10B981, #059669); padding: 20px; border-radius: 12px; color: white; font-family: system-ui;\">\n",
|
| 65 |
+
" <h3 style=\"margin: 0 0 10px 0;\">✅ Ready to go!</h3>\n",
|
| 66 |
+
" <p style=\"margin: 5px 0;\"><b>GPU:</b> {gpu_name}</p>\n",
|
| 67 |
+
" <p style=\"margin: 5px 0;\"><b>VRAM:</b> {vram:.1f} GB</p>\n",
|
| 68 |
+
" <p style=\"margin: 5px 0;\"><b>PyTorch:</b> {torch.__version__}</p>\n",
|
| 69 |
+
" </div>\n",
|
| 70 |
+
" '''\n",
|
| 71 |
+
"else:\n",
|
| 72 |
+
" status = '''\n",
|
| 73 |
+
" <div style=\"background: linear-gradient(135deg, #EF4444, #DC2626); padding: 20px; border-radius: 12px; color: white; font-family: system-ui;\">\n",
|
| 74 |
+
" <h3 style=\"margin: 0 0 10px 0;\">⚠️ No GPU detected!</h3>\n",
|
| 75 |
+
" <p style=\"margin: 5px 0;\">Go to <b>Runtime → Change runtime type → GPU</b></p>\n",
|
| 76 |
+
" <p style=\"margin: 5px 0;\">Then restart the notebook.</p>\n",
|
| 77 |
+
" </div>\n",
|
| 78 |
+
" '''\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"display(HTML(status))"
|
| 81 |
+
]
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"cell_type": "code",
|
| 85 |
+
"execution_count": null,
|
| 86 |
+
"metadata": {},
|
| 87 |
+
"outputs": [],
|
| 88 |
+
"source": [
|
| 89 |
+
"#@title 🧠 **2. Load Model** { display-mode: \"form\" }\n",
|
| 90 |
+
"#@markdown Choose model size:\n",
|
| 91 |
+
"model_size = \"DA3-LARGE\" #@param [\"DA3-SMALL\", \"DA3-BASE\", \"DA3-LARGE\", \"DA3-GIANT\", \"DA3NESTED-GIANT-LARGE\"]\n",
|
| 92 |
+
"#@markdown ---\n",
|
| 93 |
+
"#@markdown | Model | Speed | Quality | VRAM |\n",
|
| 94 |
+
"#@markdown |-------|-------|---------|------|\n",
|
| 95 |
+
"#@markdown | SMALL | ⚡⚡⚡ | ★★☆ | 4GB |\n",
|
| 96 |
+
"#@markdown | BASE | ⚡⚡ | ★★★ | 6GB |\n",
|
| 97 |
+
"#@markdown | LARGE | ⚡ | ★★★★ | 8GB |\n",
|
| 98 |
+
"#@markdown | GIANT | 🐢 | ★★★★★ | 12GB |\n",
|
| 99 |
+
"#@markdown | NESTED | 🐢 | ★★★★★+ | 16GB |\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"from depth_anything_3.api import DepthAnything3\n",
|
| 102 |
+
"import time\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"print(f\"📥 Loading {model_size}...\")\n",
|
| 105 |
+
"start = time.time()\n",
|
| 106 |
+
"\n",
|
| 107 |
+
"model = DepthAnything3.from_pretrained(f\"depth-anything/{model_size}\")\n",
|
| 108 |
+
"model = model.to(device).eval()\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"print(f\"✅ Model loaded in {time.time()-start:.1f}s\")"
|
| 111 |
+
]
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"cell_type": "code",
|
| 115 |
+
"execution_count": null,
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"outputs": [],
|
| 118 |
+
"source": [
|
| 119 |
+
"#@title 🖼️ **3. Try with Sample Image** { display-mode: \"form\" }\n",
|
| 120 |
+
"#@markdown Run depth estimation on a sample image\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"import matplotlib.pyplot as plt\n",
|
| 123 |
+
"import numpy as np\n",
|
| 124 |
+
"from PIL import Image\n",
|
| 125 |
+
"import urllib.request\n",
|
| 126 |
+
"import os\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"# Download sample\n",
|
| 129 |
+
"os.makedirs(\"samples\", exist_ok=True)\n",
|
| 130 |
+
"url = \"https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=1280\"\n",
|
| 131 |
+
"urllib.request.urlretrieve(url, \"samples/mountain.jpg\")\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"# Run inference\n",
|
| 134 |
+
"result = model.inference([\"samples/mountain.jpg\"])\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"# Visualize\n",
|
| 137 |
+
"fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"axes[0].imshow(result.processed_images[0])\n",
|
| 140 |
+
"axes[0].set_title(\"📸 Input\", fontsize=14, fontweight='bold')\n",
|
| 141 |
+
"axes[0].axis(\"off\")\n",
|
| 142 |
+
"\n",
|
| 143 |
+
"depth = result.depth[0]\n",
|
| 144 |
+
"im = axes[1].imshow(depth, cmap='Spectral_r')\n",
|
| 145 |
+
"axes[1].set_title(f\"🌊 Depth (range: {depth.min():.1f}m - {depth.max():.1f}m)\", fontsize=14, fontweight='bold')\n",
|
| 146 |
+
"axes[1].axis(\"off\")\n",
|
| 147 |
+
"plt.colorbar(im, ax=axes[1], fraction=0.046, pad=0.04, label='Depth (m)')\n",
|
| 148 |
+
"\n",
|
| 149 |
+
"plt.tight_layout()\n",
|
| 150 |
+
"plt.show()\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"print(f\"\\n📊 Output shapes:\")\n",
|
| 153 |
+
"print(f\" Depth: {result.depth.shape}\")\n",
|
| 154 |
+
"print(f\" Confidence: {result.conf.shape}\")\n",
|
| 155 |
+
"print(f\" Camera intrinsics: {result.intrinsics.shape}\")"
|
| 156 |
+
]
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"cell_type": "markdown",
|
| 160 |
+
"metadata": {},
|
| 161 |
+
"source": [
|
| 162 |
+
"---\n",
|
| 163 |
+
"\n",
|
| 164 |
+
"## 📤 4. Use Your Own Images\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"Upload your images and get a 3D point cloud!"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"cell_type": "code",
|
| 171 |
+
"execution_count": null,
|
| 172 |
+
"metadata": {},
|
| 173 |
+
"outputs": [],
|
| 174 |
+
"source": [
|
| 175 |
+
"#@title 📁 **Upload Images** { display-mode: \"form\" }\n",
|
| 176 |
+
"#@markdown Upload **2-50 images** of the same scene from different angles.\n",
|
| 177 |
+
"#@markdown \n",
|
| 178 |
+
"#@markdown 💡 **Tips for best results:**\n",
|
| 179 |
+
"#@markdown - Move the camera, not the objects\n",
|
| 180 |
+
"#@markdown - 30-50% overlap between consecutive images\n",
|
| 181 |
+
"#@markdown - Avoid motion blur\n",
|
| 182 |
+
"#@markdown - Good lighting helps!\n",
|
| 183 |
+
"\n",
|
| 184 |
+
"from google.colab import files\n",
|
| 185 |
+
"import shutil\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"# Clean up previous uploads\n",
|
| 188 |
+
"upload_dir = \"my_images\"\n",
|
| 189 |
+
"if os.path.exists(upload_dir):\n",
|
| 190 |
+
" shutil.rmtree(upload_dir)\n",
|
| 191 |
+
"os.makedirs(upload_dir, exist_ok=True)\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"print(\"📤 Select your images...\")\n",
|
| 194 |
+
"uploaded = files.upload()\n",
|
| 195 |
+
"\n",
|
| 196 |
+
"# Save uploaded files\n",
|
| 197 |
+
"for filename, data in uploaded.items():\n",
|
| 198 |
+
" with open(f\"{upload_dir}/{filename}\", 'wb') as f:\n",
|
| 199 |
+
" f.write(data)\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"image_files = sorted([f\"{upload_dir}/{f}\" for f in os.listdir(upload_dir) \n",
|
| 202 |
+
" if f.lower().endswith(('.jpg', '.jpeg', '.png', '.webp'))])\n",
|
| 203 |
+
"\n",
|
| 204 |
+
"print(f\"\\n✅ Uploaded {len(image_files)} images\")\n",
|
| 205 |
+
"\n",
|
| 206 |
+
"# Preview\n",
|
| 207 |
+
"n_preview = min(6, len(image_files))\n",
|
| 208 |
+
"fig, axes = plt.subplots(1, n_preview, figsize=(3*n_preview, 3))\n",
|
| 209 |
+
"if n_preview == 1:\n",
|
| 210 |
+
" axes = [axes]\n",
|
| 211 |
+
"for i, img_path in enumerate(image_files[:n_preview]):\n",
|
| 212 |
+
" img = Image.open(img_path)\n",
|
| 213 |
+
" axes[i].imshow(img)\n",
|
| 214 |
+
" axes[i].set_title(f\"#{i+1}\", fontsize=10)\n",
|
| 215 |
+
" axes[i].axis(\"off\")\n",
|
| 216 |
+
"if len(image_files) > n_preview:\n",
|
| 217 |
+
" print(f\" (showing first {n_preview} of {len(image_files)})\")\n",
|
| 218 |
+
"plt.tight_layout()\n",
|
| 219 |
+
"plt.show()"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"cell_type": "code",
|
| 224 |
+
"execution_count": null,
|
| 225 |
+
"metadata": {},
|
| 226 |
+
"outputs": [],
|
| 227 |
+
"source": [
|
| 228 |
+
"#@title ⚡ **Run 3D Reconstruction** { display-mode: \"form\" }\n",
|
| 229 |
+
"#@markdown This will:\n",
|
| 230 |
+
"#@markdown 1. Estimate depth for each image\n",
|
| 231 |
+
"#@markdown 2. Compute camera poses\n",
|
| 232 |
+
"#@markdown 3. Generate a 3D point cloud\n",
|
| 233 |
+
"#@markdown 4. Export to GLB format\n",
|
| 234 |
+
"\n",
|
| 235 |
+
"from depth_anything_3.utils.export.glb import export_to_glb\n",
|
| 236 |
+
"import time\n",
|
| 237 |
+
"\n",
|
| 238 |
+
"print(f\"🔄 Processing {len(image_files)} images...\")\n",
|
| 239 |
+
"start = time.time()\n",
|
| 240 |
+
"\n",
|
| 241 |
+
"# Run inference\n",
|
| 242 |
+
"result = model.inference(\n",
|
| 243 |
+
" image_files,\n",
|
| 244 |
+
" process_res_method=\"upper_bound_resize\",\n",
|
| 245 |
+
")\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"inference_time = time.time() - start\n",
|
| 248 |
+
"print(f\"✅ Inference done in {inference_time:.1f}s ({len(image_files)/inference_time:.1f} img/s)\")\n",
|
| 249 |
+
"\n",
|
| 250 |
+
"# Export to GLB\n",
|
| 251 |
+
"output_dir = \"output_3d\"\n",
|
| 252 |
+
"os.makedirs(output_dir, exist_ok=True)\n",
|
| 253 |
+
"\n",
|
| 254 |
+
"print(\"📦 Generating 3D point cloud...\")\n",
|
| 255 |
+
"export_to_glb(\n",
|
| 256 |
+
" result,\n",
|
| 257 |
+
" export_dir=output_dir,\n",
|
| 258 |
+
" show_cameras=True,\n",
|
| 259 |
+
" conf_thresh_percentile=20, # Filter low-confidence points\n",
|
| 260 |
+
" num_max_points=500_000,\n",
|
| 261 |
+
")\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"print(f\"\\n✅ 3D model saved to {output_dir}/\")\n",
|
| 264 |
+
"!ls -lh {output_dir}/"
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"cell_type": "code",
|
| 269 |
+
"execution_count": null,
|
| 270 |
+
"metadata": {},
|
| 271 |
+
"outputs": [],
|
| 272 |
+
"source": [
|
| 273 |
+
"#@title 📥 **Download Your 3D Model** { display-mode: \"form\" }\n",
|
| 274 |
+
"#@markdown Downloads a `.glb` file you can view in:\n",
|
| 275 |
+
"#@markdown - [glTF Viewer](https://gltf-viewer.donmccurdy.com/)\n",
|
| 276 |
+
"#@markdown - Blender\n",
|
| 277 |
+
"#@markdown - Windows 3D Viewer\n",
|
| 278 |
+
"#@markdown - Any 3D software\n",
|
| 279 |
+
"\n",
|
| 280 |
+
"from google.colab import files\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"glb_file = f\"{output_dir}/point_cloud.glb\"\n",
|
| 283 |
+
"if os.path.exists(glb_file):\n",
|
| 284 |
+
" files.download(glb_file)\n",
|
| 285 |
+
" print(\"\\n🎉 Download started!\")\n",
|
| 286 |
+
" print(\"\\n👉 View your model: https://gltf-viewer.donmccurdy.com/\")\n",
|
| 287 |
+
"else:\n",
|
| 288 |
+
" print(\"❌ GLB file not found. Run the previous cell first.\")"
|
| 289 |
+
]
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"cell_type": "markdown",
|
| 293 |
+
"metadata": {},
|
| 294 |
+
"source": [
|
| 295 |
+
"---\n",
|
| 296 |
+
"\n",
|
| 297 |
+
"## 📊 5. Visualize Results"
|
| 298 |
+
]
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"cell_type": "code",
|
| 302 |
+
"execution_count": null,
|
| 303 |
+
"metadata": {},
|
| 304 |
+
"outputs": [],
|
| 305 |
+
"source": [
|
| 306 |
+
"#@title 🌊 **View All Depth Maps** { display-mode: \"form\" }\n",
|
| 307 |
+
"\n",
|
| 308 |
+
"n_images = len(result.depth)\n",
|
| 309 |
+
"cols = min(4, n_images)\n",
|
| 310 |
+
"rows = (n_images + cols - 1) // cols\n",
|
| 311 |
+
"\n",
|
| 312 |
+
"fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))\n",
|
| 313 |
+
"axes = np.array(axes).flatten() if n_images > 1 else [axes]\n",
|
| 314 |
+
"\n",
|
| 315 |
+
"for i in range(n_images):\n",
|
| 316 |
+
" depth = result.depth[i]\n",
|
| 317 |
+
" axes[i].imshow(depth, cmap='Spectral_r')\n",
|
| 318 |
+
" axes[i].set_title(f\"Frame {i+1}\", fontsize=10)\n",
|
| 319 |
+
" axes[i].axis(\"off\")\n",
|
| 320 |
+
"\n",
|
| 321 |
+
"# Hide unused subplots\n",
|
| 322 |
+
"for i in range(n_images, len(axes)):\n",
|
| 323 |
+
" axes[i].axis(\"off\")\n",
|
| 324 |
+
"\n",
|
| 325 |
+
"plt.suptitle(\"🌊 Depth Maps\", fontsize=16, fontweight='bold')\n",
|
| 326 |
+
"plt.tight_layout()\n",
|
| 327 |
+
"plt.show()"
|
| 328 |
+
]
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"cell_type": "code",
|
| 332 |
+
"execution_count": null,
|
| 333 |
+
"metadata": {},
|
| 334 |
+
"outputs": [],
|
| 335 |
+
"source": [
|
| 336 |
+
"#@title 📷 **View Camera Poses** { display-mode: \"form\" }\n",
|
| 337 |
+
"#@markdown Visualize estimated camera positions in 3D\n",
|
| 338 |
+
"\n",
|
| 339 |
+
"from mpl_toolkits.mplot3d import Axes3D\n",
|
| 340 |
+
"\n",
|
| 341 |
+
"# Extract camera positions from extrinsics\n",
|
| 342 |
+
"positions = []\n",
|
| 343 |
+
"for ext in result.extrinsics:\n",
|
| 344 |
+
" # Extrinsic is world-to-camera, invert to get camera-to-world\n",
|
| 345 |
+
" R = ext[:3, :3]\n",
|
| 346 |
+
" t = ext[:3, 3]\n",
|
| 347 |
+
" cam_pos = -R.T @ t # Camera position in world coordinates\n",
|
| 348 |
+
" positions.append(cam_pos)\n",
|
| 349 |
+
"\n",
|
| 350 |
+
"positions = np.array(positions)\n",
|
| 351 |
+
"\n",
|
| 352 |
+
"fig = plt.figure(figsize=(10, 8))\n",
|
| 353 |
+
"ax = fig.add_subplot(111, projection='3d')\n",
|
| 354 |
+
"\n",
|
| 355 |
+
"# Plot camera positions\n",
|
| 356 |
+
"ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], \n",
|
| 357 |
+
" c=range(len(positions)), cmap='viridis', s=100, marker='o')\n",
|
| 358 |
+
"\n",
|
| 359 |
+
"# Connect cameras with lines\n",
|
| 360 |
+
"ax.plot(positions[:, 0], positions[:, 1], positions[:, 2], \n",
|
| 361 |
+
" 'b-', alpha=0.5, linewidth=1)\n",
|
| 362 |
+
"\n",
|
| 363 |
+
"# Mark first and last\n",
|
| 364 |
+
"ax.scatter(*positions[0], c='green', s=200, marker='^', label='First')\n",
|
| 365 |
+
"ax.scatter(*positions[-1], c='red', s=200, marker='v', label='Last')\n",
|
| 366 |
+
"\n",
|
| 367 |
+
"ax.set_xlabel('X')\n",
|
| 368 |
+
"ax.set_ylabel('Y')\n",
|
| 369 |
+
"ax.set_zlabel('Z')\n",
|
| 370 |
+
"ax.set_title('📷 Camera Trajectory', fontsize=14, fontweight='bold')\n",
|
| 371 |
+
"ax.legend()\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"plt.tight_layout()\n",
|
| 374 |
+
"plt.show()\n",
|
| 375 |
+
"\n",
|
| 376 |
+
"print(f\"📍 {len(positions)} camera poses estimated\")"
|
| 377 |
+
]
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"cell_type": "markdown",
|
| 381 |
+
"metadata": {},
|
| 382 |
+
"source": [
|
| 383 |
+
"---\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"## 🎬 6. Process Video"
|
| 386 |
+
]
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"cell_type": "code",
|
| 390 |
+
"execution_count": null,
|
| 391 |
+
"metadata": {},
|
| 392 |
+
"outputs": [],
|
| 393 |
+
"source": [
|
| 394 |
+
"#@title 🎬 **Upload Video** { display-mode: \"form\" }\n",
|
| 395 |
+
"#@markdown Upload a short video (< 30 seconds recommended)\n",
|
| 396 |
+
"\n",
|
| 397 |
+
"fps_extract = 2 #@param {type:\"slider\", min:1, max:10, step:1}\n",
|
| 398 |
+
"#@markdown ↑ Frames per second to extract (lower = faster, higher = more detail)\n",
|
| 399 |
+
"\n",
|
| 400 |
+
"from google.colab import files\n",
|
| 401 |
+
"import subprocess\n",
|
| 402 |
+
"\n",
|
| 403 |
+
"print(\"📤 Select a video file...\")\n",
|
| 404 |
+
"uploaded = files.upload()\n",
|
| 405 |
+
"\n",
|
| 406 |
+
"video_file = list(uploaded.keys())[0]\n",
|
| 407 |
+
"frames_dir = \"video_frames\"\n",
|
| 408 |
+
"\n",
|
| 409 |
+
"# Extract frames\n",
|
| 410 |
+
"if os.path.exists(frames_dir):\n",
|
| 411 |
+
" shutil.rmtree(frames_dir)\n",
|
| 412 |
+
"os.makedirs(frames_dir, exist_ok=True)\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"print(f\"🎞️ Extracting frames at {fps_extract} FPS...\")\n",
|
| 415 |
+
"subprocess.run([\n",
|
| 416 |
+
" \"ffmpeg\", \"-i\", video_file, \n",
|
| 417 |
+
" \"-vf\", f\"fps={fps_extract}\",\n",
|
| 418 |
+
" f\"{frames_dir}/frame_%04d.jpg\",\n",
|
| 419 |
+
" \"-hide_banner\", \"-loglevel\", \"error\"\n",
|
| 420 |
+
"])\n",
|
| 421 |
+
"\n",
|
| 422 |
+
"video_images = sorted([f\"{frames_dir}/{f}\" for f in os.listdir(frames_dir)])\n",
|
| 423 |
+
"print(f\"✅ Extracted {len(video_images)} frames\")\n",
|
| 424 |
+
"\n",
|
| 425 |
+
"# Preview\n",
|
| 426 |
+
"n_preview = min(8, len(video_images))\n",
|
| 427 |
+
"fig, axes = plt.subplots(1, n_preview, figsize=(2*n_preview, 2))\n",
|
| 428 |
+
"step = max(1, len(video_images) // n_preview)\n",
|
| 429 |
+
"for i, ax in enumerate(axes):\n",
|
| 430 |
+
" idx = i * step\n",
|
| 431 |
+
" if idx < len(video_images):\n",
|
| 432 |
+
" ax.imshow(Image.open(video_images[idx]))\n",
|
| 433 |
+
" ax.axis(\"off\")\n",
|
| 434 |
+
"plt.suptitle(f\"🎬 Video Frames ({len(video_images)} total)\", fontsize=12)\n",
|
| 435 |
+
"plt.tight_layout()\n",
|
| 436 |
+
"plt.show()"
|
| 437 |
+
]
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"cell_type": "code",
|
| 441 |
+
"execution_count": null,
|
| 442 |
+
"metadata": {},
|
| 443 |
+
"outputs": [],
|
| 444 |
+
"source": [
|
| 445 |
+
"#@title ⚡ **Process Video Frames** { display-mode: \"form\" }\n",
|
| 446 |
+
"\n",
|
| 447 |
+
"print(f\"🔄 Processing {len(video_images)} frames...\")\n",
|
| 448 |
+
"start = time.time()\n",
|
| 449 |
+
"\n",
|
| 450 |
+
"result_video = model.inference(\n",
|
| 451 |
+
" video_images,\n",
|
| 452 |
+
" process_res_method=\"upper_bound_resize\",\n",
|
| 453 |
+
")\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"elapsed = time.time() - start\n",
|
| 456 |
+
"print(f\"✅ Done in {elapsed:.1f}s ({len(video_images)/elapsed:.1f} FPS)\")\n",
|
| 457 |
+
"\n",
|
| 458 |
+
"# Export\n",
|
| 459 |
+
"video_output = \"video_3d\"\n",
|
| 460 |
+
"os.makedirs(video_output, exist_ok=True)\n",
|
| 461 |
+
"\n",
|
| 462 |
+
"export_to_glb(\n",
|
| 463 |
+
" result_video,\n",
|
| 464 |
+
" export_dir=video_output,\n",
|
| 465 |
+
" show_cameras=True,\n",
|
| 466 |
+
" conf_thresh_percentile=15,\n",
|
| 467 |
+
" num_max_points=1_000_000,\n",
|
| 468 |
+
")\n",
|
| 469 |
+
"\n",
|
| 470 |
+
"print(f\"\\n📦 3D model saved!\")\n",
|
| 471 |
+
"!ls -lh {video_output}/"
|
| 472 |
+
]
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"cell_type": "code",
|
| 476 |
+
"execution_count": null,
|
| 477 |
+
"metadata": {},
|
| 478 |
+
"outputs": [],
|
| 479 |
+
"source": [
|
| 480 |
+
"#@title 📥 **Download Video 3D Model** { display-mode: \"form\" }\n",
|
| 481 |
+
"\n",
|
| 482 |
+
"glb_file = f\"{video_output}/point_cloud.glb\"\n",
|
| 483 |
+
"if os.path.exists(glb_file):\n",
|
| 484 |
+
" files.download(glb_file)\n",
|
| 485 |
+
" print(\"🎉 Download started!\")\n",
|
| 486 |
+
"else:\n",
|
| 487 |
+
" print(\"❌ Run the previous cell first.\")"
|
| 488 |
+
]
|
| 489 |
+
},
|
| 490 |
+
{
|
| 491 |
+
"cell_type": "markdown",
|
| 492 |
+
"metadata": {},
|
| 493 |
+
"source": [
|
| 494 |
+
"---\n",
|
| 495 |
+
"\n",
|
| 496 |
+
"## 🔧 7. Advanced: Python API"
|
| 497 |
+
]
|
| 498 |
+
},
|
| 499 |
+
{
|
| 500 |
+
"cell_type": "code",
|
| 501 |
+
"execution_count": null,
|
| 502 |
+
"metadata": {},
|
| 503 |
+
"outputs": [],
|
| 504 |
+
"source": [
|
| 505 |
+
"#@title 💻 **API Reference** { display-mode: \"form\" }\n",
|
| 506 |
+
"#@markdown Quick code snippets for common tasks\n",
|
| 507 |
+
"\n",
|
| 508 |
+
"from IPython.display import Markdown\n",
|
| 509 |
+
"\n",
|
| 510 |
+
"api_docs = '''\n",
|
| 511 |
+
"### Basic Usage\n",
|
| 512 |
+
"\n",
|
| 513 |
+
"```python\n",
|
| 514 |
+
"from depth_anything_3.api import DepthAnything3\n",
|
| 515 |
+
"\n",
|
| 516 |
+
"# Load model\n",
|
| 517 |
+
"model = DepthAnything3.from_pretrained(\"depth-anything/DA3-LARGE\")\n",
|
| 518 |
+
"model = model.to(\"cuda\").eval()\n",
|
| 519 |
+
"\n",
|
| 520 |
+
"# Single image\n",
|
| 521 |
+
"result = model.inference([\"image.jpg\"])\n",
|
| 522 |
+
"depth = result.depth[0] # Shape: (H, W)\n",
|
| 523 |
+
"\n",
|
| 524 |
+
"# Multiple images\n",
|
| 525 |
+
"result = model.inference([\"img1.jpg\", \"img2.jpg\", \"img3.jpg\"])\n",
|
| 526 |
+
"depths = result.depth # Shape: (N, H, W)\n",
|
| 527 |
+
"```\n",
|
| 528 |
+
"\n",
|
| 529 |
+
"### Output Attributes\n",
|
| 530 |
+
"\n",
|
| 531 |
+
"| Attribute | Shape | Description |\n",
|
| 532 |
+
"|-----------|-------|-------------|\n",
|
| 533 |
+
"| `depth` | `(N, H, W)` | Metric depth in meters |\n",
|
| 534 |
+
"| `conf` | `(N, H, W)` | Confidence [0-1] |\n",
|
| 535 |
+
"| `extrinsics` | `(N, 3, 4)` | Camera poses (world-to-cam) |\n",
|
| 536 |
+
"| `intrinsics` | `(N, 3, 3)` | Camera K matrix |\n",
|
| 537 |
+
"| `processed_images` | `(N, H, W, 3)` | Resized inputs (uint8) |\n",
|
| 538 |
+
"\n",
|
| 539 |
+
"### Export to 3D\n",
|
| 540 |
+
"\n",
|
| 541 |
+
"```python\n",
|
| 542 |
+
"from depth_anything_3.utils.export.glb import export_to_glb\n",
|
| 543 |
+
"\n",
|
| 544 |
+
"export_to_glb(\n",
|
| 545 |
+
" result,\n",
|
| 546 |
+
" export_dir=\"output\",\n",
|
| 547 |
+
" show_cameras=True, # Show camera frustums\n",
|
| 548 |
+
" conf_thresh_percentile=20, # Filter low confidence\n",
|
| 549 |
+
" num_max_points=500_000, # Max points in cloud\n",
|
| 550 |
+
")\n",
|
| 551 |
+
"```\n",
|
| 552 |
+
"\n",
|
| 553 |
+
"### CLI Usage\n",
|
| 554 |
+
"\n",
|
| 555 |
+
"```bash\n",
|
| 556 |
+
"# Single image\n",
|
| 557 |
+
"da3 infer image.jpg -o output/\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"# Directory of images\n",
|
| 560 |
+
"da3 infer images/ -o output/ --model DA3-LARGE\n",
|
| 561 |
+
"\n",
|
| 562 |
+
"# Video\n",
|
| 563 |
+
"da3 infer video.mp4 -o output/ --fps 2\n",
|
| 564 |
+
"```\n",
|
| 565 |
+
"'''\n",
|
| 566 |
+
"\n",
|
| 567 |
+
"display(Markdown(api_docs))"
|
| 568 |
+
]
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"cell_type": "markdown",
|
| 572 |
+
"metadata": {},
|
| 573 |
+
"source": [
|
| 574 |
+
"---\n",
|
| 575 |
+
"\n",
|
| 576 |
+
"## 💾 8. Save to Google Drive"
|
| 577 |
+
]
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"cell_type": "code",
|
| 581 |
+
"execution_count": null,
|
| 582 |
+
"metadata": {},
|
| 583 |
+
"outputs": [],
|
| 584 |
+
"source": [
|
| 585 |
+
"#@title 💾 **Mount Google Drive** { display-mode: \"form\" }\n",
|
| 586 |
+
"\n",
|
| 587 |
+
"from google.colab import drive\n",
|
| 588 |
+
"drive.mount('/content/drive')\n",
|
| 589 |
+
"\n",
|
| 590 |
+
"drive_output = \"/content/drive/MyDrive/DepthAnything3_Results\"\n",
|
| 591 |
+
"os.makedirs(drive_output, exist_ok=True)\n",
|
| 592 |
+
"print(f\"✅ Drive mounted at: {drive_output}\")"
|
| 593 |
+
]
|
| 594 |
+
},
|
| 595 |
+
{
|
| 596 |
+
"cell_type": "code",
|
| 597 |
+
"execution_count": null,
|
| 598 |
+
"metadata": {},
|
| 599 |
+
"outputs": [],
|
| 600 |
+
"source": [
|
| 601 |
+
"#@title 💾 **Save Results to Drive** { display-mode: \"form\" }\n",
|
| 602 |
+
"\n",
|
| 603 |
+
"import shutil\n",
|
| 604 |
+
"from datetime import datetime\n",
|
| 605 |
+
"\n",
|
| 606 |
+
"# Create timestamped folder\n",
|
| 607 |
+
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
|
| 608 |
+
"save_dir = f\"{drive_output}/{timestamp}\"\n",
|
| 609 |
+
"os.makedirs(save_dir, exist_ok=True)\n",
|
| 610 |
+
"\n",
|
| 611 |
+
"# Copy all outputs\n",
|
| 612 |
+
"for folder in [\"output_3d\", \"video_3d\"]:\n",
|
| 613 |
+
" if os.path.exists(folder):\n",
|
| 614 |
+
" for f in os.listdir(folder):\n",
|
| 615 |
+
" shutil.copy(f\"{folder}/{f}\", save_dir)\n",
|
| 616 |
+
" print(f\" ✓ {f}\")\n",
|
| 617 |
+
"\n",
|
| 618 |
+
"print(f\"\\n✅ Saved to: {save_dir}\")"
|
| 619 |
+
]
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"cell_type": "markdown",
|
| 623 |
+
"metadata": {},
|
| 624 |
+
"source": [
|
| 625 |
+
"---\n",
|
| 626 |
+
"\n",
|
| 627 |
+
"## 🙏 Credits & Links\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"<div align=\"center\">\n",
|
| 630 |
+
"\n",
|
| 631 |
+
"**Depth Anything 3** by ByteDance Research\n",
|
| 632 |
+
"\n",
|
| 633 |
+
"[📄 Paper](https://arxiv.org/abs/2511.10647) • [🌐 Project](https://depth-anything-3.github.io) • [🤗 Models](https://huggingface.co/collections/depth-anything/depth-anything-3)\n",
|
| 634 |
+
"\n",
|
| 635 |
+
"---\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"**awesome-depth-anything-3** — Optimized fork with batching, caching & CLI\n",
|
| 638 |
+
"\n",
|
| 639 |
+
"[⭐ GitHub](https://github.com/Aedelon/awesome-depth-anything-3) • [📦 PyPI](https://pypi.org/project/awesome-depth-anything-3/)\n",
|
| 640 |
+
"\n",
|
| 641 |
+
"---\n",
|
| 642 |
+
"\n",
|
| 643 |
+
"Made with ❤️ by [Delanoe Pirard](https://github.com/Aedelon)\n",
|
| 644 |
+
"\n",
|
| 645 |
+
"</div>"
|
| 646 |
+
]
|
| 647 |
+
}
|
| 648 |
+
],
|
| 649 |
+
"metadata": {
|
| 650 |
+
"accelerator": "GPU",
|
| 651 |
+
"colab": {
|
| 652 |
+
"gpuType": "T4",
|
| 653 |
+
"provenance": [],
|
| 654 |
+
"toc_visible": true
|
| 655 |
+
},
|
| 656 |
+
"kernelspec": {
|
| 657 |
+
"display_name": "Python 3",
|
| 658 |
+
"name": "python3"
|
| 659 |
+
},
|
| 660 |
+
"language_info": {
|
| 661 |
+
"name": "python",
|
| 662 |
+
"version": "3.10.0"
|
| 663 |
+
}
|
| 664 |
+
},
|
| 665 |
+
"nbformat": 4,
|
| 666 |
+
"nbformat_minor": 0
|
| 667 |
+
}
|
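The notebook's API Reference cell above lists `depth (N, H, W)` and `intrinsics (N, 3, 3)` among the outputs. As a worked illustration of how those two fields fit together, here is a minimal back-projection sketch; the helper name `unproject_to_points` and the commented usage are hypothetical conveniences, not part of the package API.

```python
import numpy as np

def unproject_to_points(depth: np.ndarray, K: np.ndarray) -> np.ndarray:
    """Back-project an (H, W) depth map into an (H*W, 3) camera-space point cloud."""
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    # Pinhole model: x = (u - cx) / fx * z, y = (v - cy) / fy * z
    x = (u - K[0, 2]) / K[0, 0] * depth
    y = (v - K[1, 2]) / K[1, 1] * depth
    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)

# Hypothetical usage with an inference result:
#   pts = unproject_to_points(result.depth[0], result.intrinsics[0])
```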
pyproject.toml
ADDED
@@ -0,0 +1,144 @@
[build-system]
requires = ["hatchling>=1.25", "hatch-vcs>=0.4"]
build-backend = "hatchling.build"

[project]
name = "awesome-depth-anything-3"
version = "0.0.0"
description = "Optimized wrapper for Depth Anything 3 - Metric depth, point clouds, camera poses and novel views from any images"
readme = "README.md"
requires-python = ">=3.10, <3.14"
license = { text = "Apache-2.0" }
authors = [{ name = "Delanoe Pirard", email = "delanoe.pirard.pro@gmail.com" }]
keywords = [
    "depth-estimation",
    "3d-reconstruction",
    "computer-vision",
    "pytorch",
    "monocular-depth",
    "multi-view",
    "pose-estimation",
    "point-cloud",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Image Processing",
]

dependencies = [
    "torch>=2",
    "torchvision",
    "kornia>=0.7.0",
    "einops",
    "huggingface_hub",
    "imageio",
    "numpy<2",
    "opencv-python",
    "open3d",
    "fastapi",
    "uvicorn",
    "requests",
    "typer>=0.9.0,<0.13.0",
    "pillow",
    "omegaconf",
    "evo",
    "e3nn",
    "moviepy==1.0.3",
    "trimesh",
    "plyfile",
    "pillow_heif",
    "safetensors",
    "pycolmap",
    "twine>=6.2.0",
]

[project.optional-dependencies]
app = ["gradio==4.44.1", "huggingface_hub>=0.19,<1.0", "pillow>=9.0"]
dev = ["pre-commit", "pytest", "ruff"]
# CUDA acceleration packages (may require manual install steps)
xformers = ["xformers; platform_system!='Darwin'"]
gs = ["gsplat>=1.0.0; platform_system!='Darwin'"]
# Note: the flash-attn package is optional. PyTorch >= 2.2 includes Flash Attention
# natively via F.scaled_dot_product_attention(). Only install flash-attn if you
# need the absolute latest optimizations:
#   pip install flash-attn --no-build-isolation   (requires the CUDA toolkit)
# Convenience bundles
cuda = ["awesome-depth-anything-3[xformers,gs]"]
all = ["awesome-depth-anything-3[app,cuda]"]


[project.scripts]
da3 = "depth_anything_3.cli:app"

[project.urls]
Homepage = "https://github.com/Aedelon/awesome-depth-anything-3"
Repository = "https://github.com/Aedelon/awesome-depth-anything-3"
Documentation = "https://github.com/Aedelon/awesome-depth-anything-3#readme"
Issues = "https://github.com/Aedelon/awesome-depth-anything-3/issues"
Changelog = "https://github.com/Aedelon/awesome-depth-anything-3/blob/main/CHANGELOG.md"
Upstream = "https://github.com/ByteDance-Seed/Depth-Anything-3"

[tool.hatch.version]
source = "vcs"

[tool.hatch.build.targets.wheel]
packages = ["src/depth_anything_3"]

[tool.hatch.build.targets.sdist]
include = [
    "/README.md",
    "/pyproject.toml",
    "/src/depth_anything_3",
]

[tool.hatch.metadata]
allow-direct-references = true

[tool.mypy]
plugins = ["jaxtyping.mypy_plugin"]

[tool.black]
line-length = 99
target-version = ['py310', 'py311']
include = '\.pyi?$'
exclude = '''
/(
  | \.git
)/
'''

[tool.isort]
profile = "black"
multi_line_output = 3
include_trailing_comma = true
known_third_party = ["bson", "cruise", "cv2", "dataloader", "diffusers", "omegaconf", "tensorflow", "torch", "torchvision", "transformers", "gsplat"]
known_first_party = ["common", "data", "models", "projects", "depth_anything_3"]
sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"]
skip_gitignore = true
line_length = 99
no_lines_before = "THIRDPARTY"

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
addopts = "-v --tb=short"
filterwarnings = [
    "ignore::DeprecationWarning",
    "ignore::UserWarning",
]

[tool.ruff]
line-length = 99
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "W", "I"]
ignore = ["E501"]  # Line too long (handled by formatter)
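The flash-attn note in the optional dependencies above relies on recent PyTorch shipping fused attention kernels behind a stable API. A minimal sketch of that built-in path (illustrative shapes only, no project code involved):

```python
import torch
import torch.nn.functional as F

# Tensors are (batch, heads, seq_len, head_dim). On supported CUDA GPUs this
# call dispatches to a fused Flash-Attention-style kernel automatically,
# which is why the flash-attn wheel is optional in pyproject.toml.
q = k = v = torch.randn(1, 8, 128, 64)
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```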
requirements.txt
ADDED
@@ -0,0 +1,38 @@
# Install this package from GitHub
git+https://github.com/Aedelon/awesome-depth-anything-3.git

# Core dependencies - torch MUST be first for xformers
torch>=2
torchvision
numpy<2

# ML/Vision libraries
einops
kornia>=0.7.0
safetensors

# Image/Video processing
pillow
pillow_heif
imageio
opencv-python
moviepy==1.0.3

# 3D and geometry
trimesh
plyfile
open3d
e3nn
evo
pycolmap

# API and config
fastapi
uvicorn
requests
typer>=0.9.0,<0.13.0
omegaconf

# Gradio app
gradio>=5.50.0,<6.0
huggingface_hub>=0.33.5,<2.0
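Since the file pins `numpy<2` and expects `torch` to be installed ahead of `xformers`, a quick post-install sanity check can catch a broken stack early. A minimal sketch, assuming only the pins above (`xformers` is an optional extra from pyproject.toml):

```python
import numpy as np
import torch

assert np.__version__.startswith("1."), "requirements pin numpy<2"
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
try:
    import xformers  # optional CUDA extra, see pyproject.toml
    print("xformers:", xformers.__version__)
except ImportError:
    print("xformers not installed (optional)")
```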
scripts/deploy_hf.sh
ADDED
@@ -0,0 +1,97 @@
#!/bin/bash
# Copyright (c) Delanoe Pirard / Aedelon
# Licensed under the Apache License, Version 2.0
#
# Deploy to HuggingFace Spaces with LFS for binary files
# This script temporarily enables LFS, pushes to HF, then restores normal state

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_ROOT"

# HuggingFace Spaces YAML front matter
HF_YAML='---
title: Awesome Depth Anything 3
emoji: 🌊
colorFrom: blue
colorTo: purple
sdk: gradio
sdk_version: 5.50.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Metric 3D reconstruction from images/video
---

'

echo "=== HuggingFace Deployment Script ==="

# Save current HEAD
CURRENT_SHA=$(git rev-parse HEAD)
echo "Current commit: $CURRENT_SHA"

# Step 1: Configure LFS for binary files
echo ""
echo "Step 1: Configuring Git LFS..."
cat > .gitattributes << 'EOF'
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text
EOF

# Step 2: Create deployment branch
echo ""
echo "Step 2: Creating deployment branch..."
git checkout --orphan hf-deploy-temp 2>/dev/null || git checkout hf-deploy-temp

# Reset to get a clean state
git reset

# Step 3: Add YAML to README
echo ""
echo "Step 3: Adding YAML front matter to README..."
cp README.md README.md.original
echo "$HF_YAML$(cat README.md.original)" > README.md

# Step 4: Stage all files (LFS will handle binaries)
echo ""
echo "Step 4: Staging files with LFS..."
git add .gitattributes
git add -A

# Step 5: Commit
echo ""
echo "Step 5: Committing..."
git commit -m "Deploy to HuggingFace Spaces" --no-verify || true

# Step 6: Push to HuggingFace
echo ""
echo "Step 6: Pushing to HuggingFace Spaces..."
git push huggingface hf-deploy-temp:main --force

# Step 7: Cleanup - return to main branch
echo ""
echo "Step 7: Cleaning up..."
git checkout main --force
git branch -D hf-deploy-temp 2>/dev/null || true

# Restore original .gitattributes (no LFS)
cat > .gitattributes << 'EOF'
*.png !text !filter !merge !diff
*.jpg !text !filter !merge !diff
*.jpeg !text !filter !merge !diff
*.gif !text !filter !merge !diff
*.mp4 !text !filter !merge !diff
*.webm !text !filter !merge !diff
EOF

echo ""
echo "=== Done! ==="
echo "HuggingFace updated with YAML metadata and LFS binaries."
echo "Local repo restored to normal state (no LFS)."
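The script above deploys by force-pushing an orphan git branch with LFS enabled, then restoring the local checkout. For comparison, the same upload can be done without touching local git state via the `huggingface_hub` client; this is only a sketch, and the Space id below is a placeholder:

```python
from huggingface_hub import HfApi

api = HfApi()  # requires a write token, e.g. from `huggingface-cli login`
api.upload_folder(
    folder_path=".",             # project root, prepared as in the script above
    repo_id="user/space-name",   # placeholder Space id
    repo_type="space",
)
```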
src/depth_anything_3/api.py
ADDED
@@ -0,0 +1,718 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Depth Anything 3 API module.

This module provides the main API for Depth Anything 3, including model loading,
inference, and export capabilities. It supports both single and nested model architectures.
"""

from __future__ import annotations

import time
from typing import Optional, Sequence

import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from PIL import Image

from depth_anything_3.cache import get_model_cache
from depth_anything_3.cfg import create_object, load_config
from depth_anything_3.registry import MODEL_REGISTRY
from depth_anything_3.specs import Prediction
from depth_anything_3.utils.adaptive_batching import (
    AdaptiveBatchConfig,
    AdaptiveBatchSizeCalculator,
    adaptive_batch_iterator,
    estimate_max_batch_size,
)
from depth_anything_3.utils.export import export
from depth_anything_3.utils.geometry import affine_inverse
from depth_anything_3.utils.io.gpu_input_processor import GPUInputProcessor
from depth_anything_3.utils.io.input_processor import InputProcessor
from depth_anything_3.utils.io.output_processor import OutputProcessor
from depth_anything_3.utils.logger import logger
from depth_anything_3.utils.pose_align import align_poses_umeyama

torch.backends.cudnn.benchmark = False
# logger.info("CUDNN Benchmark Disabled")

SAFETENSORS_NAME = "model.safetensors"
CONFIG_NAME = "config.json"


class DepthAnything3(nn.Module, PyTorchModelHubMixin):
    """
    Depth Anything 3 main API class.

    This class provides a high-level interface for depth estimation using Depth Anything 3.
    It supports both single and nested model architectures with metric scaling capabilities.

    Features:
        - Hugging Face Hub integration via PyTorchModelHubMixin
        - Support for multiple model presets (vitb, vitg, nested variants)
        - Automatic mixed precision inference
        - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
        - Camera pose estimation and metric depth scaling

    Usage:
        # Load from Hugging Face Hub
        model = DepthAnything3.from_pretrained("huggingface/model-name")

        # Or create with a specific preset
        model = DepthAnything3(model_name="da3-giant")

        # Run inference
        prediction = model.inference(images, export_dir="output", export_format="glb")
    """

    _commit_hash: str | None = None  # Set by mixin when loading from Hub

    def __init__(
        self,
        model_name: str = "da3-large",
        device: str | torch.device | None = None,
        use_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize DepthAnything3 with the specified preset.

        Args:
            model_name: The name of the model preset to use.
                Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
            device: Target device ('cuda', 'mps', 'cpu'). If None, auto-detect.
            use_cache: Whether to use model caching (default: True).
                Set to False to force reloading the model from disk.
            **kwargs: Additional keyword arguments (currently unused).
        """
        super().__init__()
        self.model_name = model_name
        self.use_cache = use_cache

        # Determine device
        if device is None:
            device = self._auto_detect_device()
        self.device = torch.device(device) if isinstance(device, str) else device

        # Load model configuration
        self.config = load_config(MODEL_REGISTRY[self.model_name])

        # Build or retrieve model from cache
        if use_cache:
            cache = get_model_cache()
            self.model = cache.get(
                model_name=self.model_name,
                device=self.device,
                loader_fn=lambda: self._create_model(),
            )
        else:
            logger.info(f"Model cache disabled, loading {self.model_name} from disk")
            self.model = self._create_model()

        # Ensure model is on the correct device and in eval mode
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize processors
        # Use GPUInputProcessor for CUDA/MPS devices to enable GPU ops
        # Note: NVJPEG decoding is specific to CUDA; MPS uses optimized CPU decoding + GPU resize
        if self.device.type in ("cuda", "mps"):
            self.input_processor = GPUInputProcessor(device=self.device)
            decoding_info = (
                "NVJPEG support enabled" if self.device.type == "cuda" else "TorchVision decoding"
            )
            logger.info(f"Using GPUInputProcessor ({decoding_info} on {self.device})")
        else:
            self.input_processor = InputProcessor()
            logger.info("Using standard InputProcessor (optimized CPU pipeline)")

        self.output_processor = OutputProcessor()

    def _auto_detect_device(self) -> torch.device:
        """Auto-detect the best available device."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        else:
            return torch.device("cpu")

    def _create_model(self) -> nn.Module:
        """Create and return a new model instance on the correct device."""
        model = create_object(self.config)
        model = model.to(self.device)  # Move to device before caching
        model.eval()
        return model

    @torch.inference_mode()
    def forward(
        self,
        image: torch.Tensor,
        extrinsics: torch.Tensor | None = None,
        intrinsics: torch.Tensor | None = None,
        export_feat_layers: list[int] | None = None,
        infer_gs: bool = False,
        use_ray_pose: bool = False,
        ref_view_strategy: str = "saddle_balanced",
    ) -> dict[str, torch.Tensor]:
        """
        Forward pass through the model.

        Args:
            image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
            extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
            intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
            export_feat_layers: Layer indices to return intermediate features for.
            infer_gs: Enable Gaussian Splatting branch.
            use_ray_pose: Use ray-based pose estimation instead of camera decoder.
            ref_view_strategy: Strategy for selecting the reference view from multiple views.

        Returns:
            Dictionary containing model predictions.
        """
        with torch.no_grad():
            # MPS doesn't support autocast well - use float32 for stability
            if image.device.type == "mps":
                return self.model(
                    image, extrinsics, intrinsics, export_feat_layers,
                    infer_gs, use_ray_pose, ref_view_strategy,
                )
            else:
                # CUDA: use autocast for performance
                autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
                    return self.model(
                        image, extrinsics, intrinsics, export_feat_layers,
                        infer_gs, use_ray_pose, ref_view_strategy,
                    )

    def inference(
        self,
        image: list[np.ndarray | Image.Image | str],
        extrinsics: np.ndarray | None = None,
        intrinsics: np.ndarray | None = None,
        align_to_input_ext_scale: bool = True,
        infer_gs: bool = False,
        use_ray_pose: bool = False,
        ref_view_strategy: str = "saddle_balanced",
        render_exts: np.ndarray | None = None,
        render_ixts: np.ndarray | None = None,
        render_hw: tuple[int, int] | None = None,
        process_res: int = 504,
        process_res_method: str = "upper_bound_resize",
        export_dir: str | None = None,
        export_format: str = "mini_npz",
        export_feat_layers: Sequence[int] | None = None,
        # GLB export parameters
        conf_thresh_percentile: float = 40.0,
        num_max_points: int = 1_000_000,
        show_cameras: bool = True,
        # Feat_vis export parameters
        feat_vis_fps: int = 15,
        # Other export parameters, e.g., gs_ply, gs_video
        export_kwargs: Optional[dict] = None,
    ) -> Prediction:
        """
        Run inference on input images.

        Args:
            image: List of input images (numpy arrays, PIL Images, or file paths)
            extrinsics: Camera extrinsics (N, 4, 4)
            intrinsics: Camera intrinsics (N, 3, 3)
            align_to_input_ext_scale: Whether to align the input pose scale to the prediction
            infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
            use_ray_pose: Use ray-based pose estimation instead of camera decoder (default: False)
            ref_view_strategy: Strategy for selecting the reference view from multiple views.
                Options: "first", "middle", "saddle_balanced", "saddle_sim_range".
                Default: "saddle_balanced". For single-view input (S <= 2), no reordering is performed.
            render_exts: Optional render extrinsics for Gaussian video export
            render_ixts: Optional render intrinsics for Gaussian video export
            render_hw: Optional render resolution for Gaussian video export
            process_res: Processing resolution
            process_res_method: Resize method for processing
            export_dir: Directory to export results
            export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
            export_feat_layers: Layer indices to export intermediate features from
            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0)  # noqa: E501
            num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
            show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
            feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
            export_kwargs: Additional arguments passed to export functions.

        Returns:
            Prediction object containing depth maps and camera parameters
        """
        if "gs" in export_format:
            assert infer_gs, "must set `infer_gs=True` to perform gs-related export."

        if "colmap" in export_format:
            assert isinstance(image[0], str), "`image` must be image paths for COLMAP export."

        # Avoid a shared mutable default: use a fresh dict per call
        if export_kwargs is None:
            export_kwargs = {}

        # Preprocess images
        imgs_cpu, extrinsics, intrinsics = self._preprocess_inputs(
            image, extrinsics, intrinsics, process_res, process_res_method
        )

        # Prepare tensors for model
        imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)

        # Normalize extrinsics
        ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)

        # Run model forward pass
        export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []

        raw_output = self._run_model_forward(
            imgs, ex_t_norm, in_t, export_feat_layers, infer_gs, use_ray_pose, ref_view_strategy
        )

        # Convert raw output to prediction
        prediction = self._convert_to_prediction(raw_output)

        # Align prediction to input extrinsics
        prediction = self._align_to_input_extrinsics_intrinsics(
            extrinsics, intrinsics, prediction, align_to_input_ext_scale
        )

        # Add processed images for visualization
        prediction = self._add_processed_images(prediction, imgs_cpu)

        # Export if requested
        if export_dir is not None:
            if "gs" in export_format:
                if infer_gs and "gs_video" not in export_format:
                    export_format = f"{export_format}-gs_video"
            if "gs_video" in export_format:
                if "gs_video" not in export_kwargs:
                    export_kwargs["gs_video"] = {}
                export_kwargs["gs_video"].update(
                    {
                        "extrinsics": render_exts,
                        "intrinsics": render_ixts,
                        "out_image_hw": render_hw,
                    }
                )
            # Add GLB export parameters
            if "glb" in export_format:
                if "glb" not in export_kwargs:
                    export_kwargs["glb"] = {}
                export_kwargs["glb"].update(
                    {
                        "conf_thresh_percentile": conf_thresh_percentile,
                        "num_max_points": num_max_points,
                        "show_cameras": show_cameras,
                    }
                )
            # Add Feat_vis export parameters
            if "feat_vis" in export_format:
                if "feat_vis" not in export_kwargs:
                    export_kwargs["feat_vis"] = {}
                export_kwargs["feat_vis"].update(
                    {
                        "fps": feat_vis_fps,
                    }
                )
            # Add COLMAP export parameters
            if "colmap" in export_format:
                if "colmap" not in export_kwargs:
                    export_kwargs["colmap"] = {}
                export_kwargs["colmap"].update(
                    {
                        "image_paths": image,
                        "conf_thresh_percentile": conf_thresh_percentile,
                        "process_res_method": process_res_method,
                    }
                )
            self._export_results(prediction, export_format, export_dir, **export_kwargs)

        return prediction

    def _preprocess_inputs(
        self,
        image: list[np.ndarray | Image.Image | str],
        extrinsics: np.ndarray | None = None,
        intrinsics: np.ndarray | None = None,
        process_res: int = 504,
        process_res_method: str = "upper_bound_resize",
    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
        """Preprocess input images using the input processor."""
        start_time = time.time()

        # Determine normalization strategy:
        # 1. Hybrid (CPU Proc + GPU Device): Skip CPU norm (return uint8), norm on GPU later.
        # 2. GPU Proc (NVJPEG/Kornia): Perform norm on GPU immediately.
        # 3. Standard CPU: Perform norm on CPU.
        perform_norm = True
        if self.device.type in ("cuda", "mps") and not isinstance(self.input_processor, GPUInputProcessor):
            perform_norm = False

        imgs_cpu, extrinsics, intrinsics = self.input_processor(
            image,
            extrinsics.copy() if extrinsics is not None else None,
            intrinsics.copy() if intrinsics is not None else None,
            process_res,
            process_res_method,
            perform_normalization=perform_norm,
        )
        end_time = time.time()
        logger.info(
            f"Processed Images Done, taking {end_time - start_time:.3f} seconds. "
            f"Shape: {imgs_cpu.shape}"
        )
        return imgs_cpu, extrinsics, intrinsics

    def _prepare_model_inputs(
        self,
        imgs_cpu: torch.Tensor,
        extrinsics: torch.Tensor | None,
        intrinsics: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
        """
        Prepare tensors for model input with optimized device transfer.
        """
        device = self._get_model_device()

        # 1. Handle Image Tensor
        # Compare device types (handles cuda:0 vs cuda comparison)
        imgs_on_target_device = (imgs_cpu.device.type == device.type)
        if imgs_on_target_device:
            # Case A: Already on correct device (GPUInputProcessor)
            # Ensure correct shape: (B, S, C, H, W) where B=1
            imgs = imgs_cpu
            if imgs.dim() == 3:
                # Single image (C, H, W) -> (1, 1, C, H, W)
                imgs = imgs.unsqueeze(0).unsqueeze(0)
            elif imgs.dim() == 4:
                # Batch of images (N, C, H, W) -> (1, N, C, H, W)
                imgs = imgs.unsqueeze(0)
            # dim() == 5 means already correct shape
            if imgs.dtype == torch.uint8:
                # Should not happen with GPUInputProcessor default, but safety fallback
                imgs = imgs.float() / 255.0
                imgs = InputProcessor.normalize_tensor(
                    imgs,
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                )
        else:
            # Case B & C: Needs transfer from CPU
            if imgs_cpu.dtype == torch.uint8:
                # Hybrid mode: uint8 -> GPU -> float -> normalize
                if device.type == "cuda":
                    imgs_cpu = imgs_cpu.pin_memory()

                imgs = imgs_cpu.to(device, non_blocking=True).float() / 255.0
                imgs = InputProcessor.normalize_tensor(
                    imgs,
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225],
                )
                imgs = imgs[None]  # Add batch dimension (1, N, 3, H, W)
            else:
                # Standard mode: float -> GPU
                if device.type == "cuda":
                    imgs_cpu = imgs_cpu.pin_memory()
                imgs = imgs_cpu.to(device, non_blocking=True)[None].float()

        # Convert camera parameters to tensors with non-blocking transfer
        ex_t = (
            extrinsics.pin_memory().to(device, non_blocking=True)[None].float()
            if extrinsics is not None and device.type == "cuda" and extrinsics.device.type == "cpu"
            else extrinsics.to(device, non_blocking=True)[None].float()
            if extrinsics is not None and extrinsics.device != device
            else extrinsics[None].float()
            if extrinsics is not None
            else None
        )
        in_t = (
            intrinsics.pin_memory().to(device, non_blocking=True)[None].float()
            if intrinsics is not None and device.type == "cuda" and intrinsics.device.type == "cpu"
            else intrinsics.to(device, non_blocking=True)[None].float()
            if intrinsics is not None and intrinsics.device != device
            else intrinsics[None].float()
            if intrinsics is not None
            else None
        )

        return imgs, ex_t, in_t

    def _normalize_extrinsics(self, ex_t: torch.Tensor | None) -> torch.Tensor | None:
        """Normalize extrinsics relative to the first camera and median camera distance."""
        if ex_t is None:
            return None
        transform = affine_inverse(ex_t[:, :1])
        ex_t_norm = ex_t @ transform
        c2ws = affine_inverse(ex_t_norm)
        translations = c2ws[..., :3, 3]
        dists = translations.norm(dim=-1)
        median_dist = torch.median(dists)
        median_dist = torch.clamp(median_dist, min=1e-1)
        ex_t_norm[..., :3, 3] = ex_t_norm[..., :3, 3] / median_dist
        return ex_t_norm

    def _align_to_input_extrinsics_intrinsics(
        self,
        extrinsics: torch.Tensor | None,
        intrinsics: torch.Tensor | None,
        prediction: Prediction,
        align_to_input_ext_scale: bool = True,
        ransac_view_thresh: int = 10,
    ) -> Prediction:
        """Align the predicted depth maps and poses to the input extrinsics."""
        if extrinsics is None:
            return prediction
        prediction.intrinsics = intrinsics.numpy()
        _, _, scale, aligned_extrinsics = align_poses_umeyama(
            prediction.extrinsics,
            extrinsics.numpy(),
            ransac=len(extrinsics) >= ransac_view_thresh,
            return_aligned=True,
            random_state=42,
        )
        if align_to_input_ext_scale:
            prediction.extrinsics = extrinsics[..., :3, :].numpy()
            prediction.depth /= scale
        else:
            prediction.extrinsics = aligned_extrinsics
        return prediction

    def _run_model_forward(
        self,
        imgs: torch.Tensor,
        ex_t: torch.Tensor | None,
        in_t: torch.Tensor | None,
        export_feat_layers: Sequence[int] | None = None,
        infer_gs: bool = False,
        use_ray_pose: bool = False,
        ref_view_strategy: str = "saddle_balanced",
    ) -> dict[str, torch.Tensor]:
        """Run the model forward pass."""
        device = imgs.device
        need_sync = device.type == "cuda"
        if need_sync:
            torch.cuda.synchronize(device)
        start_time = time.time()
        feat_layers = list(export_feat_layers) if export_feat_layers is not None else None
        output = self.forward(imgs, ex_t, in_t, feat_layers, infer_gs, use_ray_pose, ref_view_strategy)
        if need_sync:
            torch.cuda.synchronize(device)
        end_time = time.time()
        logger.info(f"Model Forward Pass Done. Time: {end_time - start_time} seconds")
        return output

    def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
        """Convert raw model output to a Prediction object."""
        start_time = time.time()
        output = self.output_processor(raw_output)
        end_time = time.time()
        logger.info(f"Conversion to Prediction Done. Time: {end_time - start_time} seconds")
        return output

    def _add_processed_images(self, prediction: Prediction, imgs_cpu: torch.Tensor) -> Prediction:
        """Add processed images to the prediction for visualization."""
        # Convert from (N, 3, H, W) to (N, H, W, 3)
        processed_imgs = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy()  # (N, H, W, 3)

        if imgs_cpu.dtype != torch.uint8:
            # Denormalize from ImageNet normalization
            mean = np.array([0.485, 0.456, 0.406])
            std = np.array([0.229, 0.224, 0.225])
            processed_imgs = processed_imgs * std + mean
            processed_imgs = np.clip(processed_imgs, 0, 1)
            processed_imgs = (processed_imgs * 255).astype(np.uint8)
        # uint8 inputs are already display-ready and need no denormalization

        prediction.processed_images = processed_imgs
        return prediction

    def _export_results(
        self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
    ) -> None:
        """Export results to the specified format and directory."""
        start_time = time.time()
        export(prediction, export_format, export_dir, **kwargs)
        end_time = time.time()
        logger.info(f"Export Results Done. Time: {end_time - start_time} seconds")

    def _get_model_device(self) -> torch.device:
        """
        Get the device where the model is located.

        Returns:
            Device where the model parameters are located

        Raises:
            ValueError: If no tensors are found in the model
        """
        if self.device is not None:
            return self.device

        # Find device from parameters
        for param in self.parameters():
            self.device = param.device
            return param.device

        # Find device from buffers
        for buffer in self.buffers():
            self.device = buffer.device
            return buffer.device

        raise ValueError("No tensor found in model")

    # =========================================================================
    # Adaptive Batching Methods
    # =========================================================================

    def batch_inference(
        self,
        images: list[np.ndarray | Image.Image | str],
        process_res: int = 504,
        batch_size: int | str = "auto",
        max_batch_size: int = 64,
        target_memory_utilization: float = 0.85,
        progress_callback: callable | None = None,
    ) -> list[Prediction]:
        """
        Run inference on multiple images with adaptive batching.

        This method automatically determines optimal batch sizes based on
        available GPU memory, maximizing throughput while preventing OOM errors.

        Args:
            images: List of input images (numpy arrays, PIL Images, or file paths)
            process_res: Processing resolution (default: 504)
            batch_size: Batch size or "auto" for adaptive batching (default: "auto")
            max_batch_size: Maximum batch size when using adaptive batching (default: 64)
            target_memory_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)
            progress_callback: Optional callback(processed, total) for progress updates

        Returns:
            List of Prediction objects, one per batch

        Example:
            >>> model = DepthAnything3(model_name="da3-large")
            >>> images = ["img1.jpg", "img2.jpg", ..., "img100.jpg"]
            >>>
            >>> # Adaptive batching (recommended)
            >>> results = model.batch_inference(images, process_res=518)
            >>>
            >>> # Fixed batch size
            >>> results = model.batch_inference(images, batch_size=4)
            >>>
            >>> # With progress callback
            >>> def on_progress(done, total):
            ...     print(f"Processed {done}/{total}")
            >>> results = model.batch_inference(images, progress_callback=on_progress)
        """
        import gc

        num_images = len(images)
        if num_images == 0:
            return []

        results: list[Prediction] = []

        # Determine batch size
        if batch_size == "auto":
            config = AdaptiveBatchConfig(
                max_batch_size=max_batch_size,
                target_memory_utilization=target_memory_utilization,
            )
            calculator = AdaptiveBatchSizeCalculator(
                model_name=self.model_name,
                device=self.device,
                config=config,
            )

            for batch_info in adaptive_batch_iterator(images, calculator, process_res):
                # Run inference on this batch
                prediction = self.inference(
                    image=batch_info.items,
                    process_res=process_res,
                )
                results.append(prediction)

                # Progress callback
                if progress_callback:
                    progress_callback(batch_info.end_idx, num_images)

                # Memory cleanup between batches
                if not batch_info.is_last:
                    gc.collect()
                    if self.device.type == "cuda":
                        torch.cuda.empty_cache()
                    elif self.device.type == "mps":
                        torch.mps.empty_cache()

                # Update profiling data for better estimates
                if calculator.config.enable_profiling and self.device.type == "cuda":
                    memory_used = torch.cuda.max_memory_allocated(self.device) / (1024 * 1024)
                    calculator.update_from_profiling(
                        batch_size=batch_info.batch_size,
                        memory_used_mb=memory_used,
                        process_res=process_res,
                    )
                    torch.cuda.reset_peak_memory_stats(self.device)

        else:
            # Fixed batch size
            fixed_batch_size = int(batch_size)
            for i in range(0, num_images, fixed_batch_size):
                end_idx = min(i + fixed_batch_size, num_images)
                batch_images = images[i:end_idx]

                prediction = self.inference(
                    image=batch_images,
                    process_res=process_res,
                )
                results.append(prediction)

                if progress_callback:
                    progress_callback(end_idx, num_images)

                # Memory cleanup
                if end_idx < num_images:
                    gc.collect()
                    if self.device.type == "cuda":
                        torch.cuda.empty_cache()
                    elif self.device.type == "mps":
                        torch.mps.empty_cache()

        return results

    def get_optimal_batch_size(
        self,
        process_res: int = 504,
        target_utilization: float = 0.85,
    ) -> int:
        """
        Get the optimal batch size for the current GPU memory state.

        Args:
            process_res: Processing resolution (default: 504)
            target_utilization: Target GPU memory usage 0.0-1.0 (default: 0.85)

        Returns:
            Recommended batch size

        Example:
            >>> model = DepthAnything3(model_name="da3-large")
            >>> batch_size = model.get_optimal_batch_size(process_res=518)
            >>> print(f"Optimal batch size: {batch_size}")
        """
        return estimate_max_batch_size(
            model_name=self.model_name,
            device=self.device,
            process_res=process_res,
            target_utilization=target_utilization,
        )
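`_prepare_model_inputs` above pins host memory and issues non-blocking copies before normalizing on the GPU. A self-contained sketch of that transfer pattern with synthetic data (values and shapes are illustrative only, not taken from the package):

```python
import torch

frames = torch.randint(0, 256, (4, 3, 504, 504), dtype=torch.uint8)  # (N, C, H, W)

if torch.cuda.is_available():
    frames = frames.pin_memory()              # page-locked host memory
    x = frames.to("cuda", non_blocking=True)  # asynchronous host-to-device copy
else:
    x = frames
x = x.float() / 255.0

# Same ImageNet statistics that InputProcessor.normalize_tensor applies in api.py
mean = torch.tensor([0.485, 0.456, 0.406], device=x.device).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225], device=x.device).view(1, 3, 1, 1)
x = ((x - mean) / std).unsqueeze(0)  # (1, N, C, H, W), the layout forward() expects
print(x.shape)
```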
src/depth_anything_3/app/css_and_html.py
ADDED
@@ -0,0 +1,623 @@
# flake8: noqa: E501

# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
CSS and HTML content for the Depth Anything 3 Gradio application.
This module contains all the CSS styles and HTML content blocks
used in the Gradio interface.
"""

# CSS Styles for the Gradio interface
# Color palette:
# - Primary: #2563EB (Modern Blue)
# - Secondary: #14B8A6 (Vibrant Teal)
# - Accent: #F97316 (Electric Orange)
# - Neutrals: #F9FAFB to #111827
GRADIO_CSS = """
/* Add Font Awesome CDN with all styles including brands and colors */
@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');

/* Force light mode */
html, body, .gradio-container {
    color-scheme: light !important;
}

/* CSS Custom Properties for theming */
:root {
    --primary: #2563EB;
    --primary-light: #3B82F6;
    --primary-dark: #1D4ED8;
    --secondary: #14B8A6;
    --secondary-light: #2DD4BF;
    --secondary-dark: #0D9488;
    --accent: #F97316;
    --accent-light: #FB923C;
    --accent-dark: #EA580C;
    --neutral-50: #F9FAFB;
    --neutral-100: #F3F4F6;
    --neutral-200: #E5E7EB;
    --neutral-300: #D1D5DB;
    --neutral-400: #9CA3AF;
    --neutral-500: #6B7280;
    --neutral-600: #4B5563;
    --neutral-700: #374151;
    --neutral-800: #1F2937;
    --neutral-900: #111827;
}

/* Add custom styles for colored icons */
.fa-color-blue {
    color: var(--primary);
}

.fa-color-purple {
    color: #8B5CF6;
}

.fa-color-cyan {
    color: var(--secondary);
}

.fa-color-green {
    color: #10B981;
color: #10B981;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.fa-color-yellow {
|
| 79 |
+
color: var(--accent);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
.fa-color-red {
|
| 83 |
+
color: #EF4444;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.link-btn {
|
| 87 |
+
display: inline-flex;
|
| 88 |
+
align-items: center;
|
| 89 |
+
gap: 8px;
|
| 90 |
+
text-decoration: none;
|
| 91 |
+
padding: 12px 24px;
|
| 92 |
+
border-radius: 50px;
|
| 93 |
+
font-weight: 500;
|
| 94 |
+
transition: all 0.3s ease;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
/* Dark mode theme */
|
| 98 |
+
@media (prefers-color-scheme: dark) {
|
| 99 |
+
html, body {
|
| 100 |
+
background: var(--neutral-800);
|
| 101 |
+
color: var(--neutral-50);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
.gradio-container {
|
| 105 |
+
background: var(--neutral-800);
|
| 106 |
+
color: var(--neutral-50);
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
.link-btn {
|
| 110 |
+
background: rgba(20, 184, 166, 0.2);
|
| 111 |
+
color: white;
|
| 112 |
+
border: 1px solid rgba(20, 184, 166, 0.4);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.link-btn:hover {
|
| 116 |
+
background: rgba(20, 184, 166, 0.35);
|
| 117 |
+
transform: translateY(-2px);
|
| 118 |
+
box-shadow: 0 8px 25px rgba(20, 184, 166, 0.25);
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
.tech-bg {
|
| 122 |
+
background: linear-gradient(135deg, var(--neutral-900), var(--neutral-800));
|
| 123 |
+
position: relative;
|
| 124 |
+
overflow: hidden;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
.tech-bg::before {
|
| 128 |
+
content: '';
|
| 129 |
+
position: absolute;
|
| 130 |
+
top: 0;
|
| 131 |
+
left: 0;
|
| 132 |
+
right: 0;
|
| 133 |
+
bottom: 0;
|
| 134 |
+
background:
|
| 135 |
+
radial-gradient(circle at 20% 80%, rgba(37, 99, 235, 0.15) 0%, transparent 50%),
|
| 136 |
+
radial-gradient(circle at 80% 20%, rgba(20, 184, 166, 0.15) 0%, transparent 50%),
|
| 137 |
+
radial-gradient(circle at 40% 40%, rgba(249, 115, 22, 0.1) 0%, transparent 50%);
|
| 138 |
+
animation: techPulse 8s ease-in-out infinite;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
.gradio-container .panel,
|
| 142 |
+
.gradio-container .block,
|
| 143 |
+
.gradio-container .form {
|
| 144 |
+
background: rgba(0, 0, 0, 0.3);
|
| 145 |
+
border: 1px solid rgba(20, 184, 166, 0.2);
|
| 146 |
+
border-radius: 10px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.gradio-container * {
|
| 150 |
+
color: var(--neutral-50);
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
.gradio-container label {
|
| 154 |
+
color: var(--neutral-200);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
.gradio-container .markdown {
|
| 158 |
+
color: var(--neutral-200);
|
| 159 |
+
}
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
/* Light mode theme */
|
| 163 |
+
@media (prefers-color-scheme: light) {
|
| 164 |
+
html, body {
|
| 165 |
+
background: var(--neutral-50);
|
| 166 |
+
color: var(--neutral-800);
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.gradio-container {
|
| 170 |
+
background: var(--neutral-50);
|
| 171 |
+
color: var(--neutral-800);
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.tech-bg {
|
| 175 |
+
background: linear-gradient(135deg, var(--neutral-50), var(--neutral-100));
|
| 176 |
+
position: relative;
|
| 177 |
+
overflow: hidden;
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
.link-btn {
|
| 181 |
+
background: rgba(20, 184, 166, 0.12);
|
| 182 |
+
color: var(--neutral-700);
|
| 183 |
+
border: 1px solid rgba(20, 184, 166, 0.3);
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
.link-btn:hover {
|
| 187 |
+
background: rgba(20, 184, 166, 0.2);
|
| 188 |
+
transform: translateY(-2px);
|
| 189 |
+
box-shadow: 0 8px 25px rgba(20, 184, 166, 0.2);
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
.tech-bg::before {
|
| 193 |
+
content: '';
|
| 194 |
+
position: absolute;
|
| 195 |
+
top: 0;
|
| 196 |
+
left: 0;
|
| 197 |
+
right: 0;
|
| 198 |
+
bottom: 0;
|
| 199 |
+
background:
|
| 200 |
+
radial-gradient(circle at 20% 80%, rgba(37, 99, 235, 0.08) 0%, transparent 50%),
|
| 201 |
+
radial-gradient(circle at 80% 20%, rgba(20, 184, 166, 0.08) 0%, transparent 50%),
|
| 202 |
+
radial-gradient(circle at 40% 40%, rgba(249, 115, 22, 0.06) 0%, transparent 50%);
|
| 203 |
+
animation: techPulse 8s ease-in-out infinite;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.gradio-container .panel,
|
| 207 |
+
.gradio-container .block,
|
| 208 |
+
.gradio-container .form {
|
| 209 |
+
background: rgba(255, 255, 255, 0.9);
|
| 210 |
+
border: 1px solid rgba(20, 184, 166, 0.2);
|
| 211 |
+
border-radius: 10px;
|
| 212 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.gradio-container * {
|
| 216 |
+
color: var(--neutral-800);
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.gradio-container label {
|
| 220 |
+
color: var(--neutral-600);
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.gradio-container .markdown {
|
| 224 |
+
color: var(--neutral-600);
|
| 225 |
+
}
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
@keyframes techPulse {
|
| 232 |
+
0%, 100% { opacity: 0.5; }
|
| 233 |
+
50% { opacity: 0.8; }
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
/* Custom log with tech gradient */
|
| 237 |
+
.custom-log * {
|
| 238 |
+
font-style: italic;
|
| 239 |
+
font-size: 22px !important;
|
| 240 |
+
background: linear-gradient(135deg, var(--primary), var(--secondary));
|
| 241 |
+
background-size: 400% 400%;
|
| 242 |
+
-webkit-background-clip: text;
|
| 243 |
+
background-clip: text;
|
| 244 |
+
font-weight: bold !important;
|
| 245 |
+
color: transparent !important;
|
| 246 |
+
text-align: center !important;
|
| 247 |
+
animation: techGradient 3s ease infinite;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
@keyframes techGradient {
|
| 251 |
+
0% { background-position: 0% 50%; }
|
| 252 |
+
50% { background-position: 100% 50%; }
|
| 253 |
+
100% { background-position: 0% 50%; }
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
@keyframes metricPulse {
|
| 257 |
+
0%, 100% { background-position: 0% 50%; }
|
| 258 |
+
50% { background-position: 100% 50%; }
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
@keyframes pointcloudPulse {
|
| 262 |
+
0%, 100% { background-position: 0% 50%; }
|
| 263 |
+
50% { background-position: 100% 50%; }
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
@keyframes camerasPulse {
|
| 267 |
+
0%, 100% { background-position: 0% 50%; }
|
| 268 |
+
50% { background-position: 100% 50%; }
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
@keyframes gaussiansPulse {
|
| 272 |
+
0%, 100% { background-position: 0% 50%; }
|
| 273 |
+
50% { background-position: 100% 50%; }
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
/* Special colors for key terms - Global styles with gradient animations */
|
| 277 |
+
.metric-text {
|
| 278 |
+
background: linear-gradient(135deg, #10B981, #059669);
|
| 279 |
+
background-size: 200% 200%;
|
| 280 |
+
-webkit-background-clip: text;
|
| 281 |
+
background-clip: text;
|
| 282 |
+
color: transparent !important;
|
| 283 |
+
font-weight: 700;
|
| 284 |
+
animation: metricPulse 3s ease infinite;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.pointcloud-text {
|
| 288 |
+
background: linear-gradient(135deg, #10B981, #059669);
|
| 289 |
+
background-size: 200% 200%;
|
| 290 |
+
-webkit-background-clip: text;
|
| 291 |
+
background-clip: text;
|
| 292 |
+
color: transparent !important;
|
| 293 |
+
font-weight: 700;
|
| 294 |
+
animation: pointcloudPulse 3s ease infinite;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
.cameras-text {
|
| 298 |
+
background: linear-gradient(135deg, #F97316, #EA580C);
|
| 299 |
+
background-size: 200% 200%;
|
| 300 |
+
-webkit-background-clip: text;
|
| 301 |
+
background-clip: text;
|
| 302 |
+
color: transparent !important;
|
| 303 |
+
font-weight: 700;
|
| 304 |
+
animation: camerasPulse 3s ease infinite;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.gaussians-text {
|
| 308 |
+
background: linear-gradient(135deg, #2563EB, #1D4ED8);
|
| 309 |
+
background-size: 200% 200%;
|
| 310 |
+
-webkit-background-clip: text;
|
| 311 |
+
background-clip: text;
|
| 312 |
+
color: transparent !important;
|
| 313 |
+
font-weight: 700;
|
| 314 |
+
animation: gaussiansPulse 3s ease infinite;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.example-log * {
|
| 318 |
+
font-style: italic;
|
| 319 |
+
font-size: 16px !important;
|
| 320 |
+
background: linear-gradient(135deg, var(--primary), var(--secondary));
|
| 321 |
+
-webkit-background-clip: text;
|
| 322 |
+
background-clip: text;
|
| 323 |
+
color: transparent !important;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
#my_radio .wrap {
|
| 327 |
+
display: flex;
|
| 328 |
+
flex-wrap: nowrap;
|
| 329 |
+
justify-content: center;
|
| 330 |
+
align-items: center;
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
#my_radio .wrap label {
|
| 334 |
+
display: flex;
|
| 335 |
+
width: 50%;
|
| 336 |
+
justify-content: center;
|
| 337 |
+
align-items: center;
|
| 338 |
+
margin: 0;
|
| 339 |
+
padding: 10px 0;
|
| 340 |
+
box-sizing: border-box;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
/* Align navigation buttons with dropdown bottom */
|
| 344 |
+
.navigation-row {
|
| 345 |
+
display: flex !important;
|
| 346 |
+
align-items: flex-end !important;
|
| 347 |
+
gap: 8px !important;
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
.navigation-row > div:nth-child(1),
|
| 351 |
+
.navigation-row > div:nth-child(3) {
|
| 352 |
+
align-self: flex-end !important;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
.navigation-row > div:nth-child(2) {
|
| 356 |
+
flex: 1 !important;
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
/* Make thumbnails clickable with pointer cursor */
|
| 360 |
+
.clickable-thumbnail img {
|
| 361 |
+
cursor: pointer !important;
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
.clickable-thumbnail:hover img {
|
| 365 |
+
cursor: pointer !important;
|
| 366 |
+
opacity: 0.8;
|
| 367 |
+
transition: opacity 0.3s ease;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
/* Make thumbnail containers narrower horizontally */
|
| 371 |
+
.clickable-thumbnail {
|
| 372 |
+
padding: 5px 2px !important;
|
| 373 |
+
margin: 0 2px !important;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.clickable-thumbnail .image-container {
|
| 377 |
+
margin: 0 !important;
|
| 378 |
+
padding: 0 !important;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
.scene-info {
|
| 382 |
+
text-align: center !important;
|
| 383 |
+
padding: 5px 2px !important;
|
| 384 |
+
margin: 0 !important;
|
| 385 |
+
}
|
| 386 |
+
"""
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def get_header_html(logo_base64=None):
|
| 390 |
+
"""
|
| 391 |
+
Generate the main header HTML with logo and title.
|
| 392 |
+
|
| 393 |
+
Args:
|
| 394 |
+
logo_base64 (str, optional): Base64 encoded logo image
|
| 395 |
+
|
| 396 |
+
Returns:
|
| 397 |
+
str: HTML string for the header
|
| 398 |
+
"""
|
| 399 |
+
return """
|
| 400 |
+
<div class="tech-bg" style="text-align: center; margin-bottom: 5px; padding: 40px 20px; border-radius: 15px; position: relative; overflow: hidden;">
|
| 401 |
+
<div style="position: relative; z-index: 2;">
|
| 402 |
+
<h1 style="margin: 0; font-size: 3.5em; font-weight: 700;
|
| 403 |
+
background: linear-gradient(135deg, #2563EB, #14B8A6);
|
| 404 |
+
background-size: 400% 400%;
|
| 405 |
+
-webkit-background-clip: text;
|
| 406 |
+
background-clip: text;
|
| 407 |
+
color: transparent;
|
| 408 |
+
animation: techGradient 3s ease infinite;
|
| 409 |
+
text-shadow: 0 0 30px rgba(20, 184, 166, 0.4);
|
| 410 |
+
letter-spacing: 2px;">
|
| 411 |
+
Depth Anything 3
|
| 412 |
+
</h1>
|
| 413 |
+
<p style="margin: 15px 0 0 0; font-size: 2.16em; font-weight: 300;" class="header-subtitle">
|
| 414 |
+
Recovering the Visual Space from Any Views
|
| 415 |
+
</p>
|
| 416 |
+
<div style="margin-top: 20px;">
|
| 417 |
+
<a href="https://depth-anything-3.github.io" target="_blank" class="link-btn" style="margin: 0.5em;">
|
| 418 |
+
<i class="fas fa-globe" style="margin-right: 8px;"></i> Project Page
|
| 419 |
+
</a>
|
| 420 |
+
<a href="https://arxiv.org/abs/2406.09414" target="_blank" class="link-btn" style="margin: 0.5em;">
|
| 421 |
+
<i class="fas fa-file-pdf" style="margin-right: 8px;"></i> Paper
|
| 422 |
+
</a>
|
| 423 |
+
<a href="https://github.com/Aedelon/awesome-depth-anything-3" target="_blank" class="link-btn" style="margin: 0.5em; background: var(--secondary); color: white; border: none; font-weight: 600;">
|
| 424 |
+
<i class="fab fa-github" style="margin-right: 8px;"></i> Awesome Optimized Fork
|
| 425 |
+
</a>
|
| 426 |
+
<a href="https://github.com/ByteDance-Seed/Depth-Anything-3" target="_blank" class="link-btn" style="margin: 0.5em;">
|
| 427 |
+
<i class="fab fa-github" style="margin-right: 8px;"></i> Original
|
| 428 |
+
</a>
|
| 429 |
+
</div>
|
| 430 |
+
</div>
|
| 431 |
+
</div>
|
| 432 |
+
|
| 433 |
+
<style>
|
| 434 |
+
.header-subtitle {
|
| 435 |
+
color: #4B5563;
|
| 436 |
+
}
|
| 437 |
+
.tech-bg {
|
| 438 |
+
background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%) !important;
|
| 439 |
+
}
|
| 440 |
+
</style>
|
| 441 |
+
<script>
|
| 442 |
+
document.body.classList.add('light');
|
| 443 |
+
document.documentElement.classList.add('light');
|
| 444 |
+
</script>
|
| 445 |
+
"""
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def get_description_html():
|
| 449 |
+
"""
|
| 450 |
+
Generate the main description and getting started HTML.
|
| 451 |
+
|
| 452 |
+
Returns:
|
| 453 |
+
str: HTML string for the description
|
| 454 |
+
"""
|
| 455 |
+
return """
|
| 456 |
+
<div class="description-container" style="padding: 25px; border-radius: 15px; margin: 0 0 20px 0;">
|
| 457 |
+
<h2 class="description-title" style="margin-top: 0; font-size: 1.6em; text-align: center;">
|
| 458 |
+
<i class="fas fa-bullseye fa-color-red" style="margin-right: 8px;"></i> What This Demo Does
|
| 459 |
+
</h2>
|
| 460 |
+
<div class="description-content" style="padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
|
| 461 |
+
<p class="description-main" style="line-height: 1.6; margin: 0; font-size: 1.45em;">
|
| 462 |
+
<strong>Upload images or videos</strong> → <strong>Get <span class="metric-text">Metric</span> <span class="pointcloud-text">Point Clouds</span>, <span class="cameras-text">Cameras</span> and <span class="gaussians-text">Novel Views</span></strong> → <strong>Explore in 3D</strong>
|
| 463 |
+
</p>
|
| 464 |
+
</div>
|
| 465 |
+
|
| 466 |
+
<div style="text-align: center; margin-top: 15px;">
|
| 467 |
+
<p class="description-tip" style="font-style: italic; margin: 0;">
|
| 468 |
+
<i class="fas fa-lightbulb fa-color-yellow" style="margin-right: 8px;"></i> <strong>Tip:</strong> Landscape-oriented images or videos are preferred for best 3D recovering.
|
| 469 |
+
</p>
|
| 470 |
+
</div>
|
| 471 |
+
</div>
|
| 472 |
+
|
| 473 |
+
<style>
|
| 474 |
+
@media (prefers-color-scheme: dark) {
|
| 475 |
+
.description-container {
|
| 476 |
+
background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%);
|
| 477 |
+
border: 1px solid rgba(20, 184, 166, 0.2);
|
| 478 |
+
}
|
| 479 |
+
.description-title { color: #14B8A6; }
|
| 480 |
+
.description-content { background: rgba(0, 0, 0, 0.3); }
|
| 481 |
+
.description-main { color: #E5E7EB; }
|
| 482 |
+
.description-text { color: #D1D5DB; }
|
| 483 |
+
.description-tip { color: #D1D5DB; }
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
@media (prefers-color-scheme: light) {
|
| 487 |
+
.description-container {
|
| 488 |
+
background: linear-gradient(135deg, rgba(20, 184, 166, 0.05) 0%, rgba(37, 99, 235, 0.05) 100%);
|
| 489 |
+
border: 1px solid rgba(20, 184, 166, 0.2);
|
| 490 |
+
}
|
| 491 |
+
.description-title { color: #14B8A6; }
|
| 492 |
+
.description-content { background: transparent; }
|
| 493 |
+
.description-main { color: #1F2937; }
|
| 494 |
+
.description-text { color: #4B5563; }
|
| 495 |
+
.description-tip { color: #4B5563; }
|
| 496 |
+
}
|
| 497 |
+
</style>
|
| 498 |
+
"""
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
def get_acknowledgements_html():
|
| 502 |
+
"""
|
| 503 |
+
Generate the acknowledgements section HTML.
|
| 504 |
+
|
| 505 |
+
Returns:
|
| 506 |
+
str: HTML string for the acknowledgements
|
| 507 |
+
"""
|
| 508 |
+
return """
|
| 509 |
+
<div style="background: linear-gradient(135deg, rgba(20, 184, 166, 0.08) 0%, rgba(37, 99, 235, 0.08) 100%);
|
| 510 |
+
padding: 25px; border-radius: 15px; margin: 20px 0; border: 1px solid rgba(20, 184, 166, 0.2);">
|
| 511 |
+
<h3 style="color: #14B8A6; margin-top: 0; text-align: center; font-size: 1.4em;">
|
| 512 |
+
<i class="fas fa-trophy fa-color-yellow" style="margin-right: 8px;"></i> Research Credits & Acknowledgments
|
| 513 |
+
</h3>
|
| 514 |
+
|
| 515 |
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 15px 0;">
|
| 516 |
+
<!-- Original Research Section (Left) -->
|
| 517 |
+
<div style="text-align: center;">
|
| 518 |
+
<h4 style="color: #2563EB; margin: 10px 0;"><i class="fas fa-flask fa-color-green" style="margin-right: 8px;"></i> Original Research</h4>
|
| 519 |
+
<p style="color: #9CA3AF; margin: 5px 0;">
|
| 520 |
+
<a href="https://depth-anything-3.github.io" target="_blank"
|
| 521 |
+
style="color: #14B8A6; text-decoration: none; font-weight: 600;">
|
| 522 |
+
Depth Anything 3
|
| 523 |
+
</a>
|
| 524 |
+
</p>
|
| 525 |
+
</div>
|
| 526 |
+
|
| 527 |
+
<!-- Previous Versions Section (Right) -->
|
| 528 |
+
<div style="text-align: center;">
|
| 529 |
+
<h4 style="color: #2563EB; margin: 10px 0;"><i class="fas fa-history fa-color-blue" style="margin-right: 8px;"></i> Previous Versions</h4>
|
| 530 |
+
<div style="display: flex; flex-direction: row; gap: 15px; justify-content: center; align-items: center;">
|
| 531 |
+
<p style="color: #9CA3AF; margin: 0;">
|
| 532 |
+
<a href="https://huggingface.co/spaces/LiheYoung/Depth-Anything" target="_blank"
|
| 533 |
+
style="color: #14B8A6; text-decoration: none; font-weight: 600;">
|
| 534 |
+
Depth-Anything
|
| 535 |
+
</a>
|
| 536 |
+
</p>
|
| 537 |
+
<span style="color: #9CA3AF;">•</span>
|
| 538 |
+
<p style="color: #9CA3AF; margin: 0;">
|
| 539 |
+
<a href="https://huggingface.co/spaces/depth-anything/Depth-Anything-V2" target="_blank"
|
| 540 |
+
style="color: #14B8A6; text-decoration: none; font-weight: 600;">
|
| 541 |
+
Depth-Anything-V2
|
| 542 |
+
</a>
|
| 543 |
+
</p>
|
| 544 |
+
</div>
|
| 545 |
+
</div>
|
| 546 |
+
</div>
|
| 547 |
+
|
| 548 |
+
<!-- HF Demo Adapted from - Centered at the bottom of the whole block -->
|
| 549 |
+
<div style="margin-top: 20px; padding-top: 15px; border-top: 1px solid rgba(20, 184, 166, 0.2); text-align: center;">
|
| 550 |
+
<p style="color: #6B7280; font-size: 0.9em; margin: 0;">
|
| 551 |
+
<i class="fas fa-code-branch" style="margin-right: 5px; color: #9CA3AF;"></i> HF demo adapted from <a href="https://huggingface.co/spaces/facebook/map-anything" target="_blank" style="color: inherit; text-decoration: none;">Map Anything</a>
|
| 552 |
+
</p>
|
| 553 |
+
</div>
|
| 554 |
+
</div>
|
| 555 |
+
"""
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
def get_gradio_theme():
|
| 559 |
+
"""
|
| 560 |
+
Get the configured Gradio theme with modern teal/blue colors.
|
| 561 |
+
|
| 562 |
+
Color palette:
|
| 563 |
+
- Primary: Teal (#14B8A6)
|
| 564 |
+
- Secondary: Blue (#2563EB)
|
| 565 |
+
- Accent: Orange (#F97316)
|
| 566 |
+
- Neutrals: Clean grays (#F9FAFB to #111827)
|
| 567 |
+
|
| 568 |
+
Returns:
|
| 569 |
+
gr.themes.Base: Configured Gradio theme
|
| 570 |
+
"""
|
| 571 |
+
import gradio as gr
|
| 572 |
+
|
| 573 |
+
return gr.themes.Base(
|
| 574 |
+
# Primary hue: Teal
|
| 575 |
+
primary_hue=gr.themes.Color(
|
| 576 |
+
c50="#F0FDFA",
|
| 577 |
+
c100="#CCFBF1",
|
| 578 |
+
c200="#99F6E4",
|
| 579 |
+
c300="#5EEAD4",
|
| 580 |
+
c400="#2DD4BF",
|
| 581 |
+
c500="#14B8A6",
|
| 582 |
+
c600="#0D9488",
|
| 583 |
+
c700="#0F766E",
|
| 584 |
+
c800="#115E59",
|
| 585 |
+
c900="#134E4A",
|
| 586 |
+
c950="#042F2E",
|
| 587 |
+
),
|
| 588 |
+
# Secondary hue: Blue
|
| 589 |
+
secondary_hue=gr.themes.Color(
|
| 590 |
+
c50="#EFF6FF",
|
| 591 |
+
c100="#DBEAFE",
|
| 592 |
+
c200="#BFDBFE",
|
| 593 |
+
c300="#93C5FD",
|
| 594 |
+
c400="#60A5FA",
|
| 595 |
+
c500="#3B82F6",
|
| 596 |
+
c600="#2563EB",
|
| 597 |
+
c700="#1D4ED8",
|
| 598 |
+
c800="#1E40AF",
|
| 599 |
+
c900="#1E3A8A",
|
| 600 |
+
c950="#172554",
|
| 601 |
+
),
|
| 602 |
+
# Neutral hue: Clean grays
|
| 603 |
+
neutral_hue=gr.themes.Color(
|
| 604 |
+
c50="#F9FAFB",
|
| 605 |
+
c100="#F3F4F6",
|
| 606 |
+
c200="#E5E7EB",
|
| 607 |
+
c300="#D1D5DB",
|
| 608 |
+
c400="#9CA3AF",
|
| 609 |
+
c500="#6B7280",
|
| 610 |
+
c600="#4B5563",
|
| 611 |
+
c700="#374151",
|
| 612 |
+
c800="#1F2937",
|
| 613 |
+
c900="#111827",
|
| 614 |
+
c950="#030712",
|
| 615 |
+
),
|
| 616 |
+
)
|
| 617 |
+
|
| 618 |
+
|
| 619 |
+
# Measure tab instructions HTML
|
| 620 |
+
MEASURE_INSTRUCTIONS_HTML = """
|
| 621 |
+
### Click points on the image to compute distance.
|
| 622 |
+
> <i class="fas fa-triangle-exclamation fa-color-red" style="margin-right: 5px;"></i> Metric scale estimation is difficult on aerial/drone images.
|
| 623 |
+
"""
|
src/depth_anything_3/app/gradio_app.py
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Refactored Gradio App for Depth Anything 3.
|
| 17 |
+
|
| 18 |
+
This is the main application file that orchestrates all components.
|
| 19 |
+
The original functionality has been split into modular components for better maintainability.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import os
|
| 24 |
+
from typing import Any, Dict, List
|
| 25 |
+
|
| 26 |
+
import gradio as gr
|
| 27 |
+
|
| 28 |
+
from depth_anything_3.app.css_and_html import GRADIO_CSS, get_gradio_theme
|
| 29 |
+
from depth_anything_3.app.modules.event_handlers import EventHandlers
|
| 30 |
+
from depth_anything_3.app.modules.ui_components import UIComponents
|
| 31 |
+
|
| 32 |
+
# Set environment variables
|
| 33 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class DepthAnything3App:
|
| 37 |
+
"""
|
| 38 |
+
Main application class for Depth Anything 3 Gradio app.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
def __init__(self, model_dir: str = None, workspace_dir: str = None, gallery_dir: str = None):
|
| 42 |
+
"""
|
| 43 |
+
Initialize the application.
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
model_dir: Path to the model directory
|
| 47 |
+
workspace_dir: Path to the workspace directory
|
| 48 |
+
gallery_dir: Path to the gallery directory
|
| 49 |
+
"""
|
| 50 |
+
self.model_dir = model_dir
|
| 51 |
+
self.workspace_dir = workspace_dir
|
| 52 |
+
self.gallery_dir = gallery_dir
|
| 53 |
+
|
| 54 |
+
# Set environment variables for directories
|
| 55 |
+
if self.model_dir:
|
| 56 |
+
os.environ["DA3_MODEL_DIR"] = self.model_dir
|
| 57 |
+
if self.workspace_dir:
|
| 58 |
+
os.environ["DA3_WORKSPACE_DIR"] = self.workspace_dir
|
| 59 |
+
if self.gallery_dir:
|
| 60 |
+
os.environ["DA3_GALLERY_DIR"] = self.gallery_dir
|
| 61 |
+
|
| 62 |
+
self.event_handlers = EventHandlers()
|
| 63 |
+
self.ui_components = UIComponents()
|
| 64 |
+
|
| 65 |
+
def cache_examples(
|
| 66 |
+
self,
|
| 67 |
+
show_cam: bool = True,
|
| 68 |
+
filter_black_bg: bool = False,
|
| 69 |
+
filter_white_bg: bool = False,
|
| 70 |
+
save_percentage: float = 20.0,
|
| 71 |
+
num_max_points: int = 1000,
|
| 72 |
+
cache_gs_tag: str = "",
|
| 73 |
+
gs_trj_mode: str = "smooth",
|
| 74 |
+
gs_video_quality: str = "low",
|
| 75 |
+
) -> None:
|
| 76 |
+
"""
|
| 77 |
+
Pre-cache all example scenes at startup.
|
| 78 |
+
|
| 79 |
+
Args:
|
| 80 |
+
show_cam: Whether to show camera in visualization
|
| 81 |
+
filter_black_bg: Whether to filter black background
|
| 82 |
+
filter_white_bg: Whether to filter white background
|
| 83 |
+
save_percentage: Filter percentage for point cloud
|
| 84 |
+
num_max_points: Maximum number of points
|
| 85 |
+
cache_gs_tag: Tag to match scene names for high-res+3DGS caching (e.g., "dl3dv")
|
| 86 |
+
gs_trj_mode: Trajectory mode for 3DGS
|
| 87 |
+
gs_video_quality: Video quality for 3DGS
|
| 88 |
+
"""
|
| 89 |
+
from depth_anything_3.app.modules.utils import get_scene_info
|
| 90 |
+
|
| 91 |
+
examples_dir = os.path.join(self.workspace_dir, "examples")
|
| 92 |
+
if not os.path.exists(examples_dir):
|
| 93 |
+
print(f"Examples directory not found: {examples_dir}")
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
scenes = get_scene_info(examples_dir)
|
| 97 |
+
if not scenes:
|
| 98 |
+
print("No example scenes found to cache.")
|
| 99 |
+
return
|
| 100 |
+
|
| 101 |
+
print(f"\n{'='*60}")
|
| 102 |
+
print(f"Caching {len(scenes)} example scenes...")
|
| 103 |
+
print(f"{'='*60}\n")
|
| 104 |
+
|
| 105 |
+
for i, scene in enumerate(scenes, 1):
|
| 106 |
+
scene_name = scene["name"]
|
| 107 |
+
|
| 108 |
+
# Check if scene name matches the gs tag for high-res+3DGS caching
|
| 109 |
+
use_high_res_gs = cache_gs_tag and cache_gs_tag.lower() in scene_name.lower()
|
| 110 |
+
|
| 111 |
+
if use_high_res_gs:
|
| 112 |
+
print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (HIGH-RES + 3DGS)")
|
| 113 |
+
print(f" - Number of images: {scene['num_images']}")
|
| 114 |
+
print(f" - Matched tag: '{cache_gs_tag}' - using high_res + 3DGS")
|
| 115 |
+
else:
|
| 116 |
+
print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (LOW-RES)")
|
| 117 |
+
print(f" - Number of images: {scene['num_images']}")
|
| 118 |
+
|
| 119 |
+
try:
|
| 120 |
+
# Load example scene
|
| 121 |
+
_, target_dir, _, _, _, _, _, _, _ = self.event_handlers.load_example_scene(
|
| 122 |
+
scene_name
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
if target_dir and target_dir != "None":
|
| 126 |
+
# Run reconstruction with appropriate settings
|
| 127 |
+
print(" - Running reconstruction...")
|
| 128 |
+
result = self.event_handlers.gradio_demo(
|
| 129 |
+
target_dir=target_dir,
|
| 130 |
+
show_cam=show_cam,
|
| 131 |
+
filter_black_bg=filter_black_bg,
|
| 132 |
+
filter_white_bg=filter_white_bg,
|
| 133 |
+
process_res_method="high_res" if use_high_res_gs else "low_res",
|
| 134 |
+
save_percentage=save_percentage,
|
| 135 |
+
num_max_points=num_max_points,
|
| 136 |
+
infer_gs=use_high_res_gs,
|
| 137 |
+
ref_view_strategy="saddle_balanced",
|
| 138 |
+
gs_trj_mode=gs_trj_mode,
|
| 139 |
+
gs_video_quality=gs_video_quality,
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Check if successful
|
| 143 |
+
if result[0] is not None: # reconstruction_output
|
| 144 |
+
print(f" ✓ Scene '{scene_name}' cached successfully")
|
| 145 |
+
else:
|
| 146 |
+
print(f" ✗ Scene '{scene_name}' caching failed: {result[1]}")
|
| 147 |
+
else:
|
| 148 |
+
print(f" ✗ Scene '{scene_name}' loading failed")
|
| 149 |
+
|
| 150 |
+
except Exception as e:
|
| 151 |
+
print(f" ✗ Error caching scene '{scene_name}': {str(e)}")
|
| 152 |
+
|
| 153 |
+
print()
|
| 154 |
+
|
| 155 |
+
print("=" * 60)
|
| 156 |
+
print("Example scene caching completed!")
|
| 157 |
+
print("=" * 60 + "\n")
|
| 158 |
+
|
| 159 |
+
def create_app(self) -> gr.Blocks:
|
| 160 |
+
"""
|
| 161 |
+
Create and configure the Gradio application.
|
| 162 |
+
|
| 163 |
+
Returns:
|
| 164 |
+
Configured Gradio Blocks interface
|
| 165 |
+
"""
|
| 166 |
+
# Get theme and CSS
|
| 167 |
+
self._theme = get_gradio_theme()
|
| 168 |
+
self._css = GRADIO_CSS
|
| 169 |
+
|
| 170 |
+
with gr.Blocks(theme=self._theme, css=self._css) as demo:
|
| 171 |
+
# State variables for the tabbed interface
|
| 172 |
+
is_example = gr.Textbox(label="is_example", visible=False, value="None")
|
| 173 |
+
processed_data_state = gr.State(value=None)
|
| 174 |
+
measure_points_state = gr.State(value=[])
|
| 175 |
+
selected_image_index_state = gr.State(value=0) # Track selected image index
|
| 176 |
+
# current_view_index = gr.State(value=0) # noqa: F841 Track current view index
|
| 177 |
+
|
| 178 |
+
# Header and description
|
| 179 |
+
self.ui_components.create_header_section()
|
| 180 |
+
self.ui_components.create_description_section()
|
| 181 |
+
|
| 182 |
+
target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
|
| 183 |
+
|
| 184 |
+
# Main content area
|
| 185 |
+
with gr.Row():
|
| 186 |
+
with gr.Column(scale=2):
|
| 187 |
+
# Upload section
|
| 188 |
+
(
|
| 189 |
+
input_video,
|
| 190 |
+
s_time_interval,
|
| 191 |
+
input_images,
|
| 192 |
+
image_gallery,
|
| 193 |
+
) = self.ui_components.create_upload_section()
|
| 194 |
+
|
| 195 |
+
with gr.Column(scale=4):
|
| 196 |
+
with gr.Column():
|
| 197 |
+
# gr.Markdown("**Metric 3D Reconstruction (Point Cloud and Camera Poses)**")
|
| 198 |
+
# Reconstruction control section (buttons) - moved below tabs
|
| 199 |
+
|
| 200 |
+
log_output = gr.Markdown(
|
| 201 |
+
"Please upload a video or images, then click Reconstruct.",
|
| 202 |
+
elem_classes=["custom-log"],
|
| 203 |
+
)
|
| 204 |
+
|
| 205 |
+
# Tabbed interface
|
| 206 |
+
with gr.Tabs():
|
| 207 |
+
with gr.Tab("Point Cloud & Cameras"):
|
| 208 |
+
reconstruction_output = (
|
| 209 |
+
self.ui_components.create_3d_viewer_section()
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
with gr.Tab("Metric Depth"):
|
| 213 |
+
(
|
| 214 |
+
prev_measure_btn,
|
| 215 |
+
measure_view_selector,
|
| 216 |
+
next_measure_btn,
|
| 217 |
+
measure_image,
|
| 218 |
+
measure_depth_image,
|
| 219 |
+
measure_text,
|
| 220 |
+
) = self.ui_components.create_measure_section()
|
| 221 |
+
|
| 222 |
+
with gr.Tab("3DGS Rendered Novel Views"):
|
| 223 |
+
gs_video, gs_info = self.ui_components.create_nvs_video()
|
| 224 |
+
|
| 225 |
+
# Inference control section (before inference)
|
| 226 |
+
(
|
| 227 |
+
model_selector,
|
| 228 |
+
process_res_method_dropdown,
|
| 229 |
+
infer_gs,
|
| 230 |
+
ref_view_strategy_dropdown,
|
| 231 |
+
) = self.ui_components.create_inference_control_section()
|
| 232 |
+
|
| 233 |
+
# Display control section - includes 3DGS options, buttons, and Visualization Options # noqa: E501
|
| 234 |
+
(
|
| 235 |
+
show_cam,
|
| 236 |
+
filter_black_bg,
|
| 237 |
+
filter_white_bg,
|
| 238 |
+
save_percentage,
|
| 239 |
+
num_max_points,
|
| 240 |
+
gs_trj_mode,
|
| 241 |
+
gs_video_quality,
|
| 242 |
+
submit_btn,
|
| 243 |
+
clear_btn,
|
| 244 |
+
) = self.ui_components.create_display_control_section()
|
| 245 |
+
|
| 246 |
+
# bind visibility of gs_trj_mode to infer_gs
|
| 247 |
+
infer_gs.change(
|
| 248 |
+
fn=lambda checked: (
|
| 249 |
+
gr.update(visible=checked),
|
| 250 |
+
gr.update(visible=checked),
|
| 251 |
+
gr.update(visible=checked),
|
| 252 |
+
gr.update(visible=(not checked)),
|
| 253 |
+
),
|
| 254 |
+
inputs=infer_gs,
|
| 255 |
+
outputs=[gs_trj_mode, gs_video_quality, gs_video, gs_info],
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
# Example scenes section
|
| 259 |
+
gr.Markdown("## Example Scenes")
|
| 260 |
+
|
| 261 |
+
scenes = self.ui_components.create_example_scenes_section()
|
| 262 |
+
scene_components = self.ui_components.create_example_scene_grid(scenes)
|
| 263 |
+
|
| 264 |
+
# Set up event handlers
|
| 265 |
+
self._setup_event_handlers(
|
| 266 |
+
demo,
|
| 267 |
+
is_example,
|
| 268 |
+
processed_data_state,
|
| 269 |
+
measure_points_state,
|
| 270 |
+
target_dir_output,
|
| 271 |
+
input_video,
|
| 272 |
+
input_images,
|
| 273 |
+
s_time_interval,
|
| 274 |
+
image_gallery,
|
| 275 |
+
reconstruction_output,
|
| 276 |
+
log_output,
|
| 277 |
+
show_cam,
|
| 278 |
+
filter_black_bg,
|
| 279 |
+
filter_white_bg,
|
| 280 |
+
process_res_method_dropdown,
|
| 281 |
+
save_percentage,
|
| 282 |
+
submit_btn,
|
| 283 |
+
clear_btn,
|
| 284 |
+
num_max_points,
|
| 285 |
+
infer_gs,
|
| 286 |
+
ref_view_strategy_dropdown,
|
| 287 |
+
selected_image_index_state,
|
| 288 |
+
measure_view_selector,
|
| 289 |
+
measure_image,
|
| 290 |
+
measure_depth_image,
|
| 291 |
+
measure_text,
|
| 292 |
+
prev_measure_btn,
|
| 293 |
+
next_measure_btn,
|
| 294 |
+
scenes,
|
| 295 |
+
scene_components,
|
| 296 |
+
gs_video,
|
| 297 |
+
gs_info,
|
| 298 |
+
gs_trj_mode,
|
| 299 |
+
gs_video_quality,
|
| 300 |
+
model_selector,
|
| 301 |
+
s_time_interval,
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
# Acknowledgements
|
| 305 |
+
self.ui_components.create_acknowledgements_section()
|
| 306 |
+
|
| 307 |
+
return demo
|
| 308 |
+
|
| 309 |
+
def _setup_event_handlers(
|
| 310 |
+
self,
|
| 311 |
+
demo: gr.Blocks,
|
| 312 |
+
is_example: gr.Textbox,
|
| 313 |
+
processed_data_state: gr.State,
|
| 314 |
+
measure_points_state: gr.State,
|
| 315 |
+
target_dir_output: gr.Textbox,
|
| 316 |
+
input_video: gr.Video,
|
| 317 |
+
input_images: gr.File,
|
| 318 |
+
s_time_interval: gr.Slider,
|
| 319 |
+
image_gallery: gr.Gallery,
|
| 320 |
+
reconstruction_output: gr.Model3D,
|
| 321 |
+
log_output: gr.Markdown,
|
| 322 |
+
show_cam: gr.Checkbox,
|
| 323 |
+
filter_black_bg: gr.Checkbox,
|
| 324 |
+
filter_white_bg: gr.Checkbox,
|
| 325 |
+
process_res_method_dropdown: gr.Dropdown,
|
| 326 |
+
save_percentage: gr.Slider,
|
| 327 |
+
submit_btn: gr.Button,
|
| 328 |
+
clear_btn: gr.ClearButton,
|
| 329 |
+
num_max_points: gr.Slider,
|
| 330 |
+
infer_gs: gr.Checkbox,
|
| 331 |
+
ref_view_strategy_dropdown: gr.Dropdown,
|
| 332 |
+
selected_image_index_state: gr.State,
|
| 333 |
+
measure_view_selector: gr.Dropdown,
|
| 334 |
+
measure_image: gr.Image,
|
| 335 |
+
measure_depth_image: gr.Image,
|
| 336 |
+
measure_text: gr.Markdown,
|
| 337 |
+
prev_measure_btn: gr.Button,
|
| 338 |
+
next_measure_btn: gr.Button,
|
| 339 |
+
scenes: List[Dict[str, Any]],
|
| 340 |
+
scene_components: List, # List of gr.Image or gr.Video
|
| 341 |
+
gs_video: gr.Video,
|
| 342 |
+
gs_info: gr.Markdown,
|
| 343 |
+
gs_trj_mode: gr.Dropdown,
|
| 344 |
+
gs_video_quality: gr.Dropdown,
|
| 345 |
+
model_selector: gr.Dropdown,
|
| 346 |
+
s_time_interval_slider: gr.Slider,
|
| 347 |
+
) -> None:
|
| 348 |
+
"""
|
| 349 |
+
Set up all event handlers for the application.
|
| 350 |
+
|
| 351 |
+
Args:
|
| 352 |
+
demo: Gradio Blocks interface
|
| 353 |
+
All other arguments: Gradio components to connect
|
| 354 |
+
"""
|
| 355 |
+
# Configure clear button
|
| 356 |
+
clear_btn.add(
|
| 357 |
+
[
|
| 358 |
+
input_video,
|
| 359 |
+
input_images,
|
| 360 |
+
reconstruction_output,
|
| 361 |
+
log_output,
|
| 362 |
+
target_dir_output,
|
| 363 |
+
image_gallery,
|
| 364 |
+
gs_video,
|
| 365 |
+
]
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
# Main reconstruction button
|
| 369 |
+
submit_btn.click(
|
| 370 |
+
fn=self.event_handlers.gradio_demo,
|
| 371 |
+
inputs=[
|
| 372 |
+
target_dir_output,
|
| 373 |
+
show_cam,
|
| 374 |
+
filter_black_bg,
|
| 375 |
+
filter_white_bg,
|
| 376 |
+
process_res_method_dropdown,
|
| 377 |
+
save_percentage,
|
| 378 |
+
num_max_points,
|
| 379 |
+
infer_gs,
|
| 380 |
+
ref_view_strategy_dropdown,
|
| 381 |
+
gs_trj_mode,
|
| 382 |
+
gs_video_quality,
|
| 383 |
+
model_selector,
|
| 384 |
+
],
|
| 385 |
+
outputs=[
|
| 386 |
+
reconstruction_output,
|
| 387 |
+
log_output,
|
| 388 |
+
processed_data_state,
|
| 389 |
+
measure_image,
|
| 390 |
+
measure_depth_image,
|
| 391 |
+
measure_text,
|
| 392 |
+
measure_view_selector,
|
| 393 |
+
gs_video,
|
| 394 |
+
gs_info,
|
| 395 |
+
],
|
| 396 |
+
)
|
| 397 |
+
|
| 398 |
+
# Real-time visualization updates
|
| 399 |
+
self._setup_visualization_handlers(
|
| 400 |
+
show_cam,
|
| 401 |
+
filter_black_bg,
|
| 402 |
+
filter_white_bg,
|
| 403 |
+
process_res_method_dropdown,
|
| 404 |
+
target_dir_output,
|
| 405 |
+
is_example,
|
| 406 |
+
reconstruction_output,
|
| 407 |
+
log_output,
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# File upload handlers
|
| 411 |
+
input_video.change(
|
| 412 |
+
fn=self.event_handlers.handle_uploads,
|
| 413 |
+
inputs=[input_video, input_images, s_time_interval],
|
| 414 |
+
outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
|
| 415 |
+
)
|
| 416 |
+
input_images.change(
|
| 417 |
+
fn=self.event_handlers.handle_uploads,
|
| 418 |
+
inputs=[input_video, input_images, s_time_interval],
|
| 419 |
+
outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
|
| 420 |
+
)
|
| 421 |
+
|
| 422 |
+
# Navigation handlers
|
| 423 |
+
self._setup_navigation_handlers(
|
| 424 |
+
prev_measure_btn,
|
| 425 |
+
next_measure_btn,
|
| 426 |
+
measure_view_selector,
|
| 427 |
+
measure_image,
|
| 428 |
+
measure_depth_image,
|
| 429 |
+
measure_points_state,
|
| 430 |
+
processed_data_state,
|
| 431 |
+
)
|
| 432 |
+
|
| 433 |
+
# Measurement handler
|
| 434 |
+
measure_image.select(
|
| 435 |
+
fn=self.event_handlers.measure,
|
| 436 |
+
inputs=[processed_data_state, measure_points_state, measure_view_selector],
|
| 437 |
+
outputs=[measure_image, measure_depth_image, measure_points_state, measure_text],
|
| 438 |
+
)
|
| 439 |
+
|
| 440 |
+
# Example scene handlers
|
| 441 |
+
self._setup_example_scene_handlers(
|
| 442 |
+
scenes,
|
| 443 |
+
scene_components,
|
| 444 |
+
reconstruction_output,
|
| 445 |
+
target_dir_output,
|
| 446 |
+
image_gallery,
|
| 447 |
+
log_output,
|
| 448 |
+
is_example,
|
| 449 |
+
processed_data_state,
|
| 450 |
+
measure_view_selector,
|
| 451 |
+
measure_image,
|
| 452 |
+
measure_depth_image,
|
| 453 |
+
gs_video,
|
| 454 |
+
gs_info,
|
| 455 |
+
s_time_interval,
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
+
def _setup_visualization_handlers(
|
| 459 |
+
self,
|
| 460 |
+
show_cam: gr.Checkbox,
|
| 461 |
+
filter_black_bg: gr.Checkbox,
|
| 462 |
+
filter_white_bg: gr.Checkbox,
|
| 463 |
+
process_res_method_dropdown: gr.Dropdown,
|
| 464 |
+
target_dir_output: gr.Textbox,
|
| 465 |
+
is_example: gr.Textbox,
|
| 466 |
+
reconstruction_output: gr.Model3D,
|
| 467 |
+
log_output: gr.Markdown,
|
| 468 |
+
) -> None:
|
| 469 |
+
"""Set up visualization update handlers."""
|
| 470 |
+
# Common inputs for visualization updates
|
| 471 |
+
viz_inputs = [
|
| 472 |
+
target_dir_output,
|
| 473 |
+
show_cam,
|
| 474 |
+
is_example,
|
| 475 |
+
filter_black_bg,
|
| 476 |
+
filter_white_bg,
|
| 477 |
+
process_res_method_dropdown,
|
| 478 |
+
]
|
| 479 |
+
|
| 480 |
+
# Set up change handlers for all visualization controls
|
| 481 |
+
for component in [show_cam, filter_black_bg, filter_white_bg]:
|
| 482 |
+
component.change(
|
| 483 |
+
fn=self.event_handlers.update_visualization,
|
| 484 |
+
inputs=viz_inputs,
|
| 485 |
+
outputs=[reconstruction_output, log_output],
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
def _setup_navigation_handlers(
|
| 489 |
+
self,
|
| 490 |
+
prev_measure_btn: gr.Button,
|
| 491 |
+
next_measure_btn: gr.Button,
|
| 492 |
+
measure_view_selector: gr.Dropdown,
|
| 493 |
+
measure_image: gr.Image,
|
| 494 |
+
measure_depth_image: gr.Image,
|
| 495 |
+
measure_points_state: gr.State,
|
| 496 |
+
processed_data_state: gr.State,
|
| 497 |
+
) -> None:
|
| 498 |
+
"""Set up navigation handlers for measure tab."""
|
| 499 |
+
# Measure tab navigation
|
| 500 |
+
prev_measure_btn.click(
|
| 501 |
+
fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
|
| 502 |
+
processed_data, current_selector, -1
|
| 503 |
+
),
|
| 504 |
+
inputs=[processed_data_state, measure_view_selector],
|
| 505 |
+
outputs=[
|
| 506 |
+
measure_view_selector,
|
| 507 |
+
measure_image,
|
| 508 |
+
measure_depth_image,
|
| 509 |
+
measure_points_state,
|
| 510 |
+
],
|
| 511 |
+
)
|
| 512 |
+
|
| 513 |
+
next_measure_btn.click(
|
| 514 |
+
fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
|
| 515 |
+
processed_data, current_selector, 1
|
| 516 |
+
),
|
| 517 |
+
inputs=[processed_data_state, measure_view_selector],
|
| 518 |
+
outputs=[
|
| 519 |
+
measure_view_selector,
|
| 520 |
+
measure_image,
|
| 521 |
+
measure_depth_image,
|
| 522 |
+
measure_points_state,
|
| 523 |
+
],
|
| 524 |
+
)
|
| 525 |
+
|
| 526 |
+
measure_view_selector.change(
|
| 527 |
+
fn=lambda processed_data, selector_value: (
|
| 528 |
+
self.event_handlers.update_measure_view(
|
| 529 |
+
processed_data, int(selector_value.split()[1]) - 1
|
| 530 |
+
)
|
| 531 |
+
if selector_value
|
| 532 |
+
else (None, None, [])
|
| 533 |
+
),
|
| 534 |
+
inputs=[processed_data_state, measure_view_selector],
|
| 535 |
+
outputs=[measure_image, measure_depth_image, measure_points_state],
|
| 536 |
+
)
|
| 537 |
+
|
| 538 |
+
def _setup_example_scene_handlers(
|
| 539 |
+
self,
|
| 540 |
+
scenes: List[Dict[str, Any]],
|
| 541 |
+
scene_components: List, # List of gr.Image
|
| 542 |
+
reconstruction_output: gr.Model3D,
|
| 543 |
+
target_dir_output: gr.Textbox,
|
| 544 |
+
image_gallery: gr.Gallery,
|
| 545 |
+
log_output: gr.Markdown,
|
| 546 |
+
is_example: gr.Textbox,
|
| 547 |
+
processed_data_state: gr.State,
|
| 548 |
+
measure_view_selector: gr.Dropdown,
|
| 549 |
+
measure_image: gr.Image,
|
| 550 |
+
measure_depth_image: gr.Image,
|
| 551 |
+
gs_video: gr.Video,
|
| 552 |
+
gs_info: gr.Markdown,
|
| 553 |
+
s_time_interval: gr.Slider,
|
| 554 |
+
) -> None:
|
| 555 |
+
"""Set up example scene handlers."""
|
| 556 |
+
# Use assets/examples directory
|
| 557 |
+
examples_dir = os.environ.get("DA3_EXAMPLES_DIR", "assets/examples")
|
| 558 |
+
|
| 559 |
+
def load_and_update_measure(scene_name: str, fps: float):
|
| 560 |
+
"""Load example scene and update measure view."""
|
| 561 |
+
print(f"[load_and_update_measure] Called with scene_name={scene_name}, fps={fps}", flush=True)
|
| 562 |
+
result = self.event_handlers.load_example_scene(scene_name, examples_dir, fps)
|
| 563 |
+
print(f"[load_and_update_measure] target_dir from result[1]: {result[1]}", flush=True)
|
| 564 |
+
|
| 565 |
+
# Update measure view if processed_data is available
|
| 566 |
+
measure_img = None
|
| 567 |
+
measure_depth = None
|
| 568 |
+
if result[4] is not None: # processed_data exists
|
| 569 |
+
measure_img, measure_depth, _ = (
|
| 570 |
+
self.event_handlers.visualization_handler.update_measure_view(result[4], 0)
|
| 571 |
+
)
|
| 572 |
+
|
| 573 |
+
final_result = result + ("True", measure_img, measure_depth)
|
| 574 |
+
print(f"[load_and_update_measure] Returning {len(final_result)} values", flush=True)
|
| 575 |
+
return final_result
|
| 576 |
+
|
| 577 |
+
def create_scene_handler(scene_name: str):
|
| 578 |
+
"""Create a handler function for a specific scene."""
|
| 579 |
+
def handler(fps: float):
|
| 580 |
+
return load_and_update_measure(scene_name, fps)
|
| 581 |
+
return handler
|
| 582 |
+
|
| 583 |
+
for i, scene in enumerate(scenes):
|
| 584 |
+
if i < len(scene_components):
|
| 585 |
+
component = scene_components[i]
|
| 586 |
+
# Create handler with scene name bound
|
| 587 |
+
handler_fn = create_scene_handler(scene["name"])
|
| 588 |
+
outputs = [
|
| 589 |
+
reconstruction_output,
|
| 590 |
+
target_dir_output,
|
| 591 |
+
image_gallery,
|
| 592 |
+
log_output,
|
| 593 |
+
processed_data_state,
|
| 594 |
+
measure_view_selector,
|
| 595 |
+
gs_video,
|
| 596 |
+
gs_info,
|
| 597 |
+
is_example,
|
| 598 |
+
measure_image,
|
| 599 |
+
measure_depth_image,
|
| 600 |
+
]
|
| 601 |
+
|
| 602 |
+
# Use click event - s_time_interval value is passed as input
|
| 603 |
+
component.select(fn=handler_fn, inputs=[s_time_interval], outputs=outputs)
|
| 604 |
+
|
| 605 |
+
def launch(self, host: str = "127.0.0.1", port: int = 7860, **kwargs) -> None:
|
| 606 |
+
"""
|
| 607 |
+
Launch the application.
|
| 608 |
+
|
| 609 |
+
Args:
|
| 610 |
+
host: Host address to bind to
|
| 611 |
+
port: Port number to bind to
|
| 612 |
+
**kwargs: Additional arguments for demo.launch()
|
| 613 |
+
"""
|
| 614 |
+
demo = self.create_app()
|
| 615 |
+
demo.queue(max_size=20).launch(
|
| 616 |
+
show_error=True,
|
| 617 |
+
server_name=host,
|
| 618 |
+
server_port=port,
|
| 619 |
+
**kwargs,
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
def main():
|
| 624 |
+
"""Main function to run the application."""
|
| 625 |
+
parser = argparse.ArgumentParser(
|
| 626 |
+
description="Depth Anything 3 Gradio Application",
|
| 627 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 628 |
+
epilog="""
|
| 629 |
+
Examples:
|
| 630 |
+
# Basic usage
|
| 631 |
+
python gradio_app.py --help
|
| 632 |
+
python gradio_app.py --host 0.0.0.0 --port 8080
|
| 633 |
+
python gradio_app.py --model-dir /path/to/model --workspace-dir /path/to/workspace
|
| 634 |
+
|
| 635 |
+
# Cache examples at startup (all low-res)
|
| 636 |
+
python gradio_app.py --cache-examples
|
| 637 |
+
|
| 638 |
+
# Cache with selective high-res+3DGS for scenes matching tag
|
| 639 |
+
python gradio_app.py --cache-examples --cache-gs-tag dl3dv
|
| 640 |
+
# This will use high-res + 3DGS for scenes containing "dl3dv" in their name,
|
| 641 |
+
# and low-res only for other scenes
|
| 642 |
+
""",
|
| 643 |
+
)
|
| 644 |
+
|
| 645 |
+
# Server configuration
|
| 646 |
+
parser.add_argument(
|
| 647 |
+
"--host", default="127.0.0.1", help="Host address to bind to (default: 127.0.0.1)"
|
| 648 |
+
)
|
| 649 |
+
parser.add_argument(
|
| 650 |
+
"--port", type=int, default=7860, help="Port number to bind to (default: 7860)"
|
| 651 |
+
)
|
| 652 |
+
|
| 653 |
+
+    # Directory configuration
+    parser.add_argument(
+        "--model-dir",
+        default="depth-anything/DA3NESTED-GIANT-LARGE",
+        help="Path to the model directory (default: depth-anything/DA3NESTED-GIANT-LARGE)",
+    )
+    parser.add_argument(
+        "--workspace-dir",
+        default="workspace/gradio",
+        help="Path to the workspace directory (default: workspace/gradio)",
+    )
+    parser.add_argument(
+        "--gallery-dir",
+        default="workspace/gallery",
+        help="Path to the gallery directory (default: workspace/gallery)",
+    )
+
+    # Additional Gradio options
+    parser.add_argument("--share", action="store_true", help="Create a public link for the app")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+
+    # Example caching options
+    parser.add_argument(
+        "--cache-examples",
+        action="store_true",
+        help="Pre-cache all example scenes at startup for faster loading",
+    )
+    parser.add_argument(
+        "--cache-gs-tag",
+        type=str,
+        default="",
+        help=(
+            "Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). "
+            "Scenes containing this tag will use high_res and infer_gs=True; "
+            "others will use low_res only."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    # Create directories if they don't exist
+    os.makedirs(args.workspace_dir, exist_ok=True)
+    os.makedirs(args.gallery_dir, exist_ok=True)
+
+    # Initialize and launch the application
+    app = DepthAnything3App(
+        model_dir=args.model_dir, workspace_dir=args.workspace_dir, gallery_dir=args.gallery_dir
+    )
+
+    # Prepare launch arguments
+    launch_kwargs = {"share": args.share, "debug": args.debug}
+
+    print("Starting Depth Anything 3 Gradio App...")
+    print(f"Host: {args.host}")
+    print(f"Port: {args.port}")
+    print(f"Model Directory: {args.model_dir}")
+    print(f"Workspace Directory: {args.workspace_dir}")
+    print(f"Gallery Directory: {args.gallery_dir}")
+    print(f"Share: {args.share}")
+    print(f"Debug: {args.debug}")
+    print(f"Cache Examples: {args.cache_examples}")
+    if args.cache_examples:
+        if args.cache_gs_tag:
+            print(
+                f"Cache GS Tag: '{args.cache_gs_tag}' "
+                "(scenes matching this tag will use high-res + 3DGS)"
+            )
+        else:
+            print("Cache GS Tag: None (all scenes will use low-res only)")
+
+    # Pre-cache examples if requested
+    if args.cache_examples:
+        print("\n" + "=" * 60)
+        print("Pre-caching mode enabled")
+        if args.cache_gs_tag:
+            print(f"Scenes containing '{args.cache_gs_tag}' will use HIGH-RES + 3DGS")
+            print("Other scenes will use LOW-RES only")
+        else:
+            print("All scenes will use LOW-RES only")
+        print("=" * 60)
+        app.cache_examples(
+            show_cam=True,
+            filter_black_bg=False,
+            filter_white_bg=False,
+            save_percentage=5.0,
+            num_max_points=1000,
+            cache_gs_tag=args.cache_gs_tag,
+            gs_trj_mode="smooth",
+            gs_video_quality="low",
+        )
+
+    app.launch(host=args.host, port=args.port, **launch_kwargs)
+
+
+if __name__ == "__main__":
+    main()
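For reference, the flags above map one-to-one onto the `DepthAnything3App` constructor and `launch()` call, so the CLI is a thin wrapper. A minimal sketch of the equivalent programmatic launch (the host and port values here are illustrative, not taken from the diff):

from depth_anything_3.app.gradio_app import DepthAnything3App

# Same defaults as the --model-dir/--workspace-dir/--gallery-dir flags above.
app = DepthAnything3App(
    model_dir="depth-anything/DA3NESTED-GIANT-LARGE",
    workspace_dir="workspace/gradio",
    gallery_dir="workspace/gallery",
)
# main() forwards --host/--port here; 7860 is an assumed example value.
app.launch(host="127.0.0.1", port=7860, share=False, debug=False)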
src/depth_anything_3/app/modules/__init__.py
ADDED
@@ -0,0 +1,43 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Modules package for Depth Anything 3 Gradio app.
+
+This package contains all the modular components for the Gradio application.
+"""
+
+from depth_anything_3.app.modules.event_handlers import EventHandlers
+from depth_anything_3.app.modules.file_handlers import FileHandler
+from depth_anything_3.app.modules.model_inference import ModelInference
+from depth_anything_3.app.modules.ui_components import UIComponents
+from depth_anything_3.app.modules.utils import (
+    create_depth_visualization,
+    get_logo_base64,
+    get_scene_info,
+    save_to_gallery_func,
+)
+from depth_anything_3.app.modules.visualization import VisualizationHandler
+
+__all__ = [
+    "ModelInference",
+    "FileHandler",
+    "VisualizationHandler",
+    "EventHandlers",
+    "UIComponents",
+    "create_depth_visualization",
+    "save_to_gallery_func",
+    "get_scene_info",
+    "get_logo_base64",
+]
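With these re-exports in place, app code can pull the public names from the package root rather than from the individual modules; a quick sketch:

from depth_anything_3.app.modules import EventHandlers, get_scene_info

handlers = EventHandlers()  # bundles ModelInference, FileHandler, VisualizationHandler
scenes = get_scene_info("workspace/gradio/examples")  # directory path is illustrative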
src/depth_anything_3/app/modules/event_handlers.py
ADDED
@@ -0,0 +1,624 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Event handling module for Depth Anything 3 Gradio app.
+
+This module handles all event callbacks and user interactions.
+"""
+
+import os
+import time
+from glob import glob
+from typing import Dict, List, Optional, Tuple
+
+import gradio as gr
+import numpy as np
+import torch
+
+from depth_anything_3.app.modules.file_handlers import FileHandler
+from depth_anything_3.app.modules.model_inference import ModelInference
+from depth_anything_3.app.modules.visualization import VisualizationHandler
+from depth_anything_3.utils.memory import cleanup_cuda_memory
+
+
+class EventHandlers:
+    """
+    Handles all event callbacks and user interactions for the Gradio app.
+    """
+
+    def __init__(self):
+        """Initialize the event handlers."""
+        self.model_inference = ModelInference()
+        self.file_handler = FileHandler()
+        self.visualization_handler = VisualizationHandler()
+
+    def clear_fields(self) -> None:
+        """
+        Clears the 3D viewer, the stored target_dir, and empties the gallery.
+        """
+        return None
+
+    def update_log(self) -> str:
+        """
+        Display a quick log message while waiting.
+        """
+        return "Loading and Reconstructing..."
+
+    def save_current_visualization(
+        self,
+        target_dir: str,
+        save_percentage: float,
+        show_cam: bool,
+        filter_black_bg: bool,
+        filter_white_bg: bool,
+        processed_data: Optional[Dict],
+        scene_name: str = "",
+    ) -> str:
+        """
+        Save the current visualization results to the gallery with the
+        specified save percentage.
+
+        Args:
+            target_dir: Directory containing results
+            save_percentage: Percentage of points to save (0-100)
+            show_cam: Whether to show cameras
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            processed_data: Processed data from reconstruction
+            scene_name: Optional name used as a prefix for the gallery entry
+
+        Returns:
+            Status message
+        """
+        if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+            return "No reconstruction available. Please run 'Reconstruct' first."
+
+        if processed_data is None:
+            return "No processed data available. Please run 'Reconstruct' first."
+
+        try:
+            import datetime
+
+            from .utils import save_to_gallery_func
+
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            if scene_name and scene_name.strip():
+                gallery_name = f"{scene_name.strip()}_{timestamp}_pct{save_percentage:.0f}"
+            else:
+                gallery_name = f"save_{timestamp}_pct{save_percentage:.0f}"
+
+            success, message = save_to_gallery_func(
+                target_dir=target_dir, processed_data=processed_data, gallery_name=gallery_name
+            )
+
+            if success:
+                return (
+                    "Successfully saved to gallery!\n"
+                    f"Gallery name: {gallery_name}\n"
+                    f"Save percentage: {save_percentage}%\n"
+                    f"Show cameras: {show_cam}\n"
+                    f"Filter black bg: {filter_black_bg}\n"
+                    f"Filter white bg: {filter_white_bg}\n\n"
+                    f"{message}"
+                )
+            else:
+                return f"Failed to save to gallery: {message}"
+
+        except Exception as e:
+            return f"Error saving visualization: {str(e)}"
+
+    def gradio_demo(
+        self,
+        target_dir: str,
+        show_cam: bool = True,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        process_res_method: str = "upper_bound_resize",
+        save_percentage: float = 30.0,
+        num_max_points: int = 1_000_000,
+        infer_gs: bool = False,
+        ref_view_strategy: str = "saddle_balanced",
+        gs_trj_mode: str = "extend",
+        gs_video_quality: str = "high",
+        model_name: Optional[str] = None,
+    ):
+        """
+        Perform reconstruction using the already-created target_dir/images.
+
+        Args:
+            target_dir: Directory containing images
+            show_cam: Whether to show cameras
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            process_res_method: Method for resizing input images
+            save_percentage: Filter percentage for the point cloud
+            num_max_points: Maximum number of points, in thousands
+                (converted to an absolute count before inference)
+            infer_gs: Whether to infer 3D Gaussian Splatting
+            ref_view_strategy: Reference view selection strategy
+            gs_trj_mode: Trajectory mode for the 3DGS video
+            gs_video_quality: Render quality for the 3DGS video
+            model_name: Model to use (da3-base, da3-large, da3nested-giant-large)
+
+        Returns:
+            Tuple of reconstruction results
+        """
+        from depth_anything_3.app.modules.model_inference import DEFAULT_MODEL
+
+        if model_name is None:
+            model_name = DEFAULT_MODEL
+
+        print(f"[gradio_demo] Called with target_dir={target_dir}, model={model_name}", flush=True)
+
+        if target_dir is None or not os.path.isdir(target_dir) or target_dir == "None":
+            print("[gradio_demo] Invalid target_dir, returning early")
+            return (
+                None,
+                "No valid target directory found. Please upload first.",
+                None,
+                None,
+                None,
+                "",
+                None,
+                gr.update(value=None, visible=False),  # gs_video
+                gr.update(visible=True),  # gs_info
+            )
+
+        start_time = time.time()
+        cleanup_cuda_memory()
+
+        # Get image files for logging
+        target_dir_images = os.path.join(target_dir, "images")
+        all_files = (
+            sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
+        )
+
+        print(f"[gradio_demo] Running {model_name} on {len(all_files)} images...")
+        print(f"[gradio_demo] Reference view strategy: {ref_view_strategy}")
+
+        try:
+            with torch.no_grad():
+                prediction, processed_data = self.model_inference.run_inference(
+                    target_dir,
+                    process_res_method=process_res_method,
+                    show_camera=show_cam,
+                    save_percentage=save_percentage,
+                    num_max_points=int(num_max_points * 1000),  # Convert K to actual count
+                    infer_gs=infer_gs,
+                    ref_view_strategy=ref_view_strategy,
+                    gs_trj_mode=gs_trj_mode,
+                    gs_video_quality=gs_video_quality,
+                    model_name=model_name,
+                )
+        except Exception as e:
+            error_msg = f"Reconstruction failed: {str(e)}"
+            print(f"[ERROR] {error_msg}")
+            import traceback
+
+            traceback.print_exc()
+            return (
+                None,
+                error_msg,
+                None,
+                None,
+                None,
+                "",
+                None,
+                gr.update(value=None, visible=False),
+                gr.update(visible=True),
+            )
+
+        # The GLB file is already generated by the API
+        glbfile = os.path.join(target_dir, "scene.glb")
+
+        # Handle the 3DGS video based on the infer_gs flag
+        gsvideo_path = None
+        gs_video_visible = False
+        gs_info_visible = True
+
+        if infer_gs:
+            try:
+                gsvideo_path = sorted(glob(os.path.join(target_dir, "gs_video", "*.mp4")))[-1]
+                gs_video_visible = True
+                gs_info_visible = False
+            except IndexError:
+                gsvideo_path = None
+                print("3DGS video not found, but infer_gs was enabled")
+
+        # Cleanup
+        cleanup_cuda_memory()
+
+        end_time = time.time()
+        print(f"Total time: {end_time - start_time:.2f} seconds")
+        log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
+
+        # Populate visualization tabs with processed data
+        depth_vis, measure_img, measure_depth_vis, measure_pts = (
+            self.visualization_handler.populate_visualization_tabs(processed_data)
+        )
+
+        # Update view selectors based on available views
+        depth_selector, measure_selector = self.visualization_handler.update_view_selectors(
+            processed_data
+        )
+
+        return (
+            glbfile,
+            log_msg,
+            processed_data,
+            measure_img,  # measure_image
+            measure_depth_vis,  # measure_depth_image
+            "",  # measure_text (empty initially)
+            measure_selector,  # measure_view_selector
+            gr.update(value=gsvideo_path, visible=gs_video_visible),  # gs_video
+            gr.update(visible=gs_info_visible),  # gs_info visibility
+        )
+
+    def update_visualization(
+        self,
+        target_dir: str,
+        show_cam: bool,
+        is_example: str,
+        filter_black_bg: bool = False,
+        filter_white_bg: bool = False,
+        process_res_method: str = "upper_bound_resize",
+    ) -> Tuple[gr.update, str]:
+        """
+        Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
+        and return it for the 3D viewer.
+
+        Args:
+            target_dir: Directory containing results
+            show_cam: Whether to show cameras
+            is_example: Whether this is an example scene
+            filter_black_bg: Whether to filter black background
+            filter_white_bg: Whether to filter white background
+            process_res_method: Method for resizing input images
+
+        Returns:
+            Tuple of (glb_file, log_message)
+        """
+        if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
+            return (
+                gr.update(),
+                "No reconstruction available. Please click the Reconstruct button first.",
+            )
+
+        # Check if the GLB exists (could be a cached example or a reconstructed scene)
+        glbfile = os.path.join(target_dir, "scene.glb")
+        if os.path.exists(glbfile):
+            return (
+                glbfile,
+                (
+                    "Visualization loaded from cache."
+                    if is_example == "True"
+                    else "Visualization updated."
+                ),
+            )
+
+        # If there is no GLB but it's an example that hasn't been reconstructed yet
+        if is_example == "True":
+            return (
+                gr.update(),
+                "No reconstruction available. Please click the Reconstruct button first.",
+            )
+
+        # For non-examples, check predictions.npz
+        predictions_path = os.path.join(target_dir, "predictions.npz")
+        if not os.path.exists(predictions_path):
+            error_message = (
+                f"No reconstruction available at {predictions_path}. "
+                "Please run 'Reconstruct' first."
+            )
+            return gr.update(), error_message
+
+        loaded = np.load(predictions_path, allow_pickle=True)
+        predictions = {key: loaded[key] for key in loaded.keys()}  # noqa: F841
+
+        return (
+            glbfile,
+            "Visualization updated.",
+        )
+
+    def handle_uploads(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
+        """
+        Handle file uploads and update the gallery.
+
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        return self.file_handler.update_gallery_on_upload(
+            input_video, input_images, s_time_interval
+        )
+
+    def load_example_scene(
+        self, scene_name: str, examples_dir: Optional[str] = None,
+        s_time_interval: Optional[float] = None
+    ) -> Tuple[
+        Optional[str],
+        Optional[str],
+        Optional[List],
+        str,
+        Optional[Dict],
+        gr.Dropdown,  # measure_view_selector
+        dict,  # gs_video update (value + visibility)
+        dict,  # gs_info update (visibility)
+    ]:
+        """
+        Load a scene from the examples directory.
+
+        Args:
+            scene_name: Name of the scene to load
+            examples_dir: Path to examples directory (if None, uses workspace_dir/examples)
+            s_time_interval: Sampling FPS for video frame extraction (default 1.0)
+
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message,
+            processed_data, measure_view_selector, gs_video, gs_info)
+        """
+        if examples_dir is None:
+            # Get the workspace directory from the environment variable
+            workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+            examples_dir = os.path.join(workspace_dir, "examples")
+
+        # Default FPS for video extraction
+        if s_time_interval is None:
+            s_time_interval = 1.0
+
+        reconstruction_output, target_dir, image_paths, log_message = (
+            self.file_handler.load_example_scene(scene_name, examples_dir, s_time_interval)
+        )
+
+        # Try to load cached processed data if available
+        processed_data = None
+        measure_view_selector = gr.Dropdown(choices=["View 1"], value="View 1")
+        gs_video_path = None
+        gs_video_visible = False
+        gs_info_visible = True
+
+        if target_dir and target_dir != "None":
+            predictions_path = os.path.join(target_dir, "predictions.npz")
+            if os.path.exists(predictions_path):
+                try:
+                    # Load predictions from cache
+                    loaded = np.load(predictions_path, allow_pickle=True)
+                    predictions = {key: loaded[key] for key in loaded.keys()}
+
+                    # Reconstruct the processed_data structure
+                    num_images = len(predictions.get("images", []))
+                    processed_data = {}
+
+                    for i in range(num_images):
+                        processed_data[i] = {
+                            "image": predictions["images"][i] if "images" in predictions else None,
+                            "depth": predictions["depths"][i] if "depths" in predictions else None,
+                            "depth_image": os.path.join(
+                                target_dir, "depth_vis", f"{i:04d}.jpg"  # Fixed: use .jpg not .png
+                            ),
+                            "intrinsics": (
+                                predictions["intrinsics"][i]
+                                if "intrinsics" in predictions
+                                and i < len(predictions["intrinsics"])
+                                else None
+                            ),
+                            "mask": None,
+                        }
+
+                    # Update the measure view selector
+                    choices = [f"View {i + 1}" for i in range(num_images)]
+                    measure_view_selector = gr.Dropdown(choices=choices, value=choices[0])
+
+                except Exception as e:
+                    print(f"Error loading cached data: {e}")
+
+            # Check for a cached 3DGS video
+            gs_video_dir = os.path.join(target_dir, "gs_video")
+            if os.path.exists(gs_video_dir):
+                try:
+                    gs_videos = sorted(glob(os.path.join(gs_video_dir, "*.mp4")))
+                    if gs_videos:
+                        gs_video_path = gs_videos[-1]
+                        gs_video_visible = True
+                        gs_info_visible = False
+                        print(f"Loaded cached 3DGS video: {gs_video_path}")
+                except Exception as e:
+                    print(f"Error loading cached 3DGS video: {e}")
+
+        return (
+            reconstruction_output,
+            target_dir,
+            image_paths,
+            log_message,
+            processed_data,
+            measure_view_selector,
+            gr.update(value=gs_video_path, visible=gs_video_visible),  # gs_video
+            gr.update(visible=gs_info_visible),  # gs_info
+        )
+
+    def navigate_depth_view(
+        self,
+        processed_data: Optional[dict],
+        current_selector: str,
+        direction: int,
+    ) -> Tuple[str, Optional[str]]:
+        """
+        Navigate the depth view.
+
+        Args:
+            processed_data: Processed data dictionary
+            current_selector: Current selector value
+            direction: Direction to navigate
+
+        Returns:
+            Tuple of (new_selector_value, depth_vis)
+        """
+        return self.visualization_handler.navigate_depth_view(
+            processed_data, current_selector, direction
+        )
+
+    def update_depth_view(
+        self, processed_data: Optional[dict], view_index: int
+    ) -> Optional[str]:
+        """
+        Update the depth view for a specific view index.
+
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to update
+
+        Returns:
+            Path to the depth visualization image, or None
+        """
+        return self.visualization_handler.update_depth_view(processed_data, view_index)
+
+    def navigate_measure_view(
+        self,
+        processed_data: Optional[dict],
+        current_selector: str,
+        direction: int,
+    ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
+        """
+        Navigate the measure view.
+
+        Args:
+            processed_data: Processed data dictionary
+            current_selector: Current selector value
+            direction: Direction to navigate
+
+        Returns:
+            Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
+        """
+        return self.visualization_handler.navigate_measure_view(
+            processed_data, current_selector, direction
+        )
+
+    def update_measure_view(
+        self, processed_data: Optional[dict], view_index: int
+    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
+        """
+        Update the measure view for a specific view index.
+
+        Args:
+            processed_data: Processed data dictionary
+            view_index: Index of the view to update
+
+        Returns:
+            Tuple of (measure_image, depth_right_half, measure_points)
+        """
+        return self.visualization_handler.update_measure_view(processed_data, view_index)
+
+    def measure(
+        self,
+        processed_data: Optional[dict],
+        measure_points: List,
+        current_view_selector: str,
+        event: gr.SelectData,
+    ) -> List:
+        """
+        Handle measurement clicks on images.
+
+        Args:
+            processed_data: Processed data dictionary
+            measure_points: List of current measure points
+            current_view_selector: Current view selector value
+            event: Gradio select event
+
+        Returns:
+            List of [image, depth_right_half, measure_points, text]
+        """
+        return self.visualization_handler.measure(
+            processed_data, measure_points, current_view_selector, event
+        )
+
+    def select_first_frame(
+        self, image_gallery: List, selected_index: int = 0
+    ) -> Tuple[List, str, str]:
+        """
+        Select the first frame from the image gallery.
+
+        Args:
+            image_gallery: List of images in the gallery
+            selected_index: Index of the selected image (default: 0)
+
+        Returns:
+            Tuple of (updated_image_gallery, log_message, selected_frame_path)
+        """
+        try:
+            if not image_gallery or len(image_gallery) == 0:
+                return image_gallery, "No images available to select as first frame.", ""
+
+            # Handle None or invalid selected_index
+            # (log the offending value before resetting it)
+            if (
+                selected_index is None
+                or selected_index < 0
+                or selected_index >= len(image_gallery)
+            ):
+                print(f"Invalid selected_index: {selected_index}, using default: 0")
+                selected_index = 0
+
+            # Get the selected image based on the index
+            selected_image = image_gallery[selected_index]
+            print(f"Selected image index: {selected_index}")
+            print(f"Total images: {len(image_gallery)}")
+
+            # Extract the file path from the selected image
+            selected_frame_path = ""
+            print(f"Selected image type: {type(selected_image)}")
+            print(f"Selected image: {selected_image}")
+
+            if isinstance(selected_image, tuple):
+                # Gradio Gallery returns a tuple (path, caption)
+                selected_frame_path = selected_image[0]
+            elif isinstance(selected_image, str):
+                selected_frame_path = selected_image
+            elif hasattr(selected_image, "name"):
+                selected_frame_path = selected_image.name
+            elif isinstance(selected_image, dict):
+                if "name" in selected_image:
+                    selected_frame_path = selected_image["name"]
+                elif "path" in selected_image:
+                    selected_frame_path = selected_image["path"]
+                elif "src" in selected_image:
+                    selected_frame_path = selected_image["src"]
+            else:
+                # Fall back to string conversion
+                selected_frame_path = str(selected_image)
+
+            print(f"Extracted path: {selected_frame_path}")
+
+            # Extract the filename from the path for matching
+            selected_filename = os.path.basename(selected_frame_path)
+            print(f"Selected filename: {selected_filename}")
+
+            # Move the selected image to the front
+            updated_gallery = [selected_image] + [
+                img for img in image_gallery if img != selected_image
+            ]
+
+            log_message = (
+                f"Selected frame: {selected_filename}. "
+                f"Moved to first position. Total frames: {len(updated_gallery)}"
+            )
+            return updated_gallery, log_message, selected_filename
+
+        except Exception as e:
+            print(f"Error selecting first frame: {e}")
+            return image_gallery, f"Error selecting first frame: {e}", ""
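Note that `gradio_demo` returns a fixed 9-tuple (GLB path, log message, processed data, measure image, measure depth image, measure text, measure selector, gs_video update, gs_info update), so the Gradio `outputs` list it is wired to must match that order exactly. A sketch of the wiring, with illustrative component variable names rather than the app's actual ones:

handlers = EventHandlers()
reconstruct_btn.click(
    fn=handlers.gradio_demo,
    inputs=[target_dir_state, show_cam, filter_black_bg, filter_white_bg,
            process_res_method, save_percentage, num_max_points, infer_gs,
            ref_view_strategy, gs_trj_mode, gs_video_quality, model_name],
    outputs=[viewer_3d, log_box, processed_data_state, measure_image,
             measure_depth_image, measure_text, measure_view_selector,
             gs_video, gs_info],  # must mirror the 9-tuple returned above
)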
src/depth_anything_3/app/modules/file_handlers.py
ADDED
@@ -0,0 +1,327 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+File handling module for Depth Anything 3 Gradio app.
+
+This module handles file uploads, video processing, and file operations.
+"""
+
+import os
+import shutil
+import time
+from datetime import datetime
+from typing import List, Optional, Tuple
+
+import cv2
+from PIL import Image
+from pillow_heif import register_heif_opener
+
+register_heif_opener()
+
+
+class FileHandler:
+    """
+    Handles file uploads and processing for the Gradio app.
+    """
+
+    def __init__(self):
+        """Initialize the file handler."""
+
+    def handle_uploads(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[str, List[str]]:
+        """
+        Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
+        images or frames extracted from a video into it.
+
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+
+        Returns:
+            Tuple of (target_dir, image_paths)
+        """
+        start_time = time.time()
+
+        # Get the workspace directory from the environment variable, or use the default
+        workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+        if not os.path.exists(workspace_dir):
+            os.makedirs(workspace_dir)
+
+        # Create the input_images subdirectory
+        input_images_dir = os.path.join(workspace_dir, "input_images")
+        if not os.path.exists(input_images_dir):
+            os.makedirs(input_images_dir)
+
+        # Create a unique folder name within input_images
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        target_dir = os.path.join(input_images_dir, f"session_{timestamp}")
+        target_dir_images = os.path.join(target_dir, "images")
+
+        # Clean up if somehow that folder already exists
+        if os.path.exists(target_dir):
+            shutil.rmtree(target_dir)
+        os.makedirs(target_dir)
+        os.makedirs(target_dir_images)
+
+        image_paths = []
+
+        # Handle images
+        if input_images is not None:
+            image_paths.extend(self._process_images(input_images, target_dir_images))
+
+        # Handle video
+        if input_video is not None:
+            image_paths.extend(
+                self._process_video(input_video, target_dir_images, s_time_interval)
+            )
+
+        # Sort the final images for the gallery
+        image_paths = sorted(image_paths)
+
+        end_time = time.time()
+        print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
+        return target_dir, image_paths
+
+    def _process_images(self, input_images: List, target_dir_images: str) -> List[str]:
+        """
+        Process uploaded images.
+
+        Args:
+            input_images: List of input image files
+            target_dir_images: Target directory for images
+
+        Returns:
+            List of processed image paths
+        """
+        image_paths = []
+
+        for file_data in input_images:
+            if isinstance(file_data, dict) and "name" in file_data:
+                file_path = file_data["name"]
+            else:
+                file_path = file_data
+
+            # Check if the file is a HEIC image
+            file_ext = os.path.splitext(file_path)[1].lower()
+            if file_ext in [".heic", ".heif"]:
+                # Convert HEIC to JPEG for better gallery compatibility
+                try:
+                    with Image.open(file_path) as img:
+                        # Convert to RGB if necessary (HEIC can have different color modes)
+                        if img.mode not in ("RGB", "L"):
+                            img = img.convert("RGB")
+
+                        # Create the JPEG filename
+                        base_name = os.path.splitext(os.path.basename(file_path))[0]
+                        dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
+
+                        # Save as JPEG with high quality
+                        img.save(dst_path, "JPEG", quality=95)
+                        image_paths.append(dst_path)
+                        print(
+                            f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> "
+                            f"{os.path.basename(dst_path)}"
+                        )
+                except Exception as e:
+                    print(f"Error converting HEIC file {file_path}: {e}")
+                    # Fall back to copying as is
+                    dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                    shutil.copy(file_path, dst_path)
+                    image_paths.append(dst_path)
+            else:
+                # Regular image files - copy as is
+                dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                shutil.copy(file_path, dst_path)
+                image_paths.append(dst_path)
+
+        return image_paths
+
+    def _process_video(
+        self, input_video: str, target_dir_images: str, s_time_interval: float
+    ) -> List[str]:
+        """
+        Process a video file and extract frames.
+
+        Args:
+            input_video: Path to input video file
+            target_dir_images: Target directory for extracted frames
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+
+        Returns:
+            List of extracted frame paths
+        """
+        image_paths = []
+
+        if isinstance(input_video, dict) and "name" in input_video:
+            video_path = input_video["name"]
+        else:
+            video_path = input_video
+
+        vs = cv2.VideoCapture(video_path)
+        fps = vs.get(cv2.CAP_PROP_FPS)
+        frame_interval = max(1, int(fps / s_time_interval))  # Convert FPS to frame interval
+
+        count = 0
+        video_frame_num = 0
+        while True:
+            gotit, frame = vs.read()
+            if not gotit:
+                break
+            count += 1
+            if count % frame_interval == 0:
+                image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
+                cv2.imwrite(image_path, frame)
+                image_paths.append(image_path)
+                video_frame_num += 1
+        vs.release()  # Release the capture handle once all frames are read
+
+        return image_paths
+
+    def update_gallery_on_upload(
+        self,
+        input_video: Optional[str],
+        input_images: Optional[List],
+        s_time_interval: float = 10.0,
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
+        """
+        Handle file uploads and update the gallery.
+
+        Args:
+            input_video: Path to input video file
+            input_images: List of input image files
+            s_time_interval: Sampling FPS (frames per second) for frame extraction
+
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        if not input_video and not input_images:
+            return None, None, None, None
+
+        target_dir, image_paths = self.handle_uploads(input_video, input_images, s_time_interval)
+        return (
+            None,
+            target_dir,
+            image_paths,
+            "Upload complete. Click 'Reconstruct' to begin 3D processing.",
+        )
+
+    def load_example_scene(
+        self, scene_name: str, examples_dir: str = "examples", s_time_interval: float = 1.0
+    ) -> Tuple[Optional[str], Optional[str], Optional[List], str]:
+        """
+        Load a scene from the examples directory.
+
+        Args:
+            scene_name: Name of the scene to load
+            examples_dir: Path to examples directory
+            s_time_interval: Sampling FPS for video frame extraction (default 1.0)
+
+        Returns:
+            Tuple of (reconstruction_output, target_dir, image_paths, log_message)
+        """
+        from depth_anything_3.app.modules.utils import get_scene_info
+
+        scenes = get_scene_info(examples_dir)
+
+        # Find the selected scene
+        selected_scene = None
+        for scene in scenes:
+            if scene["name"] == scene_name:
+                selected_scene = scene
+                break
+
+        if selected_scene is None:
+            return None, None, None, "Scene not found"
+
+        # Check if this is a video scene
+        is_video_scene = selected_scene.get("type") == "video"
+
+        # Use a fixed directory name for examples (not timestamp-based)
+        workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
+        input_images_dir = os.path.join(workspace_dir, "input_images")
+        if not os.path.exists(input_images_dir):
+            os.makedirs(input_images_dir)
+
+        # For video scenes, include the FPS in the folder name so different FPS = different cache
+        if is_video_scene:
+            target_dir = os.path.join(
+                input_images_dir, f"example_{scene_name}_fps{s_time_interval:.1f}"
+            )
+        else:
+            target_dir = os.path.join(input_images_dir, f"example_{scene_name}")
+        target_dir_images = os.path.join(target_dir, "images")
+
+        # Check if already cached (GLB file exists)
+        glb_path = os.path.join(target_dir, "scene.glb")
+        is_cached = os.path.exists(glb_path)
+
+        # Create the directory if it doesn't exist
+        if not os.path.exists(target_dir):
+            os.makedirs(target_dir)
+            os.makedirs(target_dir_images)
+
+        # Process images or extract video frames if the directory is new or empty
+        if not os.path.exists(target_dir_images) or len(os.listdir(target_dir_images)) == 0:
+            os.makedirs(target_dir_images, exist_ok=True)
+            image_paths = []
+
+            if is_video_scene:
+                # Extract frames from the video using the specified FPS
+                video_path = selected_scene.get("video_file")
+                if video_path:
+                    image_paths = self._process_video(
+                        video_path, target_dir_images, s_time_interval
+                    )
+            else:
+                # Copy images
+                for file_path in selected_scene["image_files"]:
+                    dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
+                    shutil.copy(file_path, dst_path)
+                    image_paths.append(dst_path)
+        else:
+            # Use existing images
+            image_paths = sorted(
+                [
+                    os.path.join(target_dir_images, f)
+                    for f in os.listdir(target_dir_images)
+                    if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"))
+                ]
+            )
+
+        num_frames = len(image_paths)
+        scene_type = "video" if is_video_scene else "scene"
+
+        # Return the cached GLB if available
+        if is_cached:
+            return (
+                glb_path,  # Return cached reconstruction
+                target_dir,  # Set target directory
+                image_paths,  # Set gallery
+                f"Loaded cached {scene_type} '{scene_name}' with {num_frames} frames.",
+            )
+        else:
+            return (
+                None,  # No cached reconstruction
+                target_dir,  # Set target directory
+                image_paths,  # Set gallery
+                (
+                    f"Loaded {scene_type} '{scene_name}' with {num_frames} frames. "
+                    "Click 'Reconstruct' to begin 3D processing."
+                ),
+            )
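One point worth spelling out: `s_time_interval` is a sampling rate, not a time gap. `_process_video` converts it to a frame stride with `frame_interval = max(1, int(fps / s_time_interval))`. A worked example of that arithmetic:

# For a 30 fps video sampled at s_time_interval = 10.0 (target: ~10 frames
# per second of footage), every 3rd decoded frame is kept.
fps, s_time_interval = 30.0, 10.0
frame_interval = max(1, int(fps / s_time_interval))
assert frame_interval == 3  # keeps frames 3, 6, 9, ... of the decode loop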
src/depth_anything_3/app/modules/model_inference.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
# Optimizations (c) Delanoe Pirard / Aedelon - Apache 2.0
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
|
| 16 |
+
"""
|
| 17 |
+
Model inference module for Depth Anything 3 Gradio app.
|
| 18 |
+
|
| 19 |
+
This module handles all model-related operations including inference,
|
| 20 |
+
data processing, and result preparation.
|
| 21 |
+
|
| 22 |
+
Optimizations based on benchmarks:
|
| 23 |
+
- Smart batch sizing per model/device (MPS: B=4 for small/base, B=2 for large, B=1 for giant)
|
| 24 |
+
- CUDA: Adaptive batching at 85% memory utilization
|
| 25 |
+
- CPU: Always batch=1
|
| 26 |
+
- Model caching for 200x faster subsequent loads
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import glob
|
| 30 |
+
import os
|
| 31 |
+
import time
|
| 32 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 33 |
+
|
| 34 |
+
import numpy as np
|
| 35 |
+
import torch
|
| 36 |
+
|
| 37 |
+
from depth_anything_3.api import DepthAnything3
|
| 38 |
+
from depth_anything_3.utils.export.glb import export_to_glb
|
| 39 |
+
from depth_anything_3.utils.export.gs import export_to_gs_video
|
| 40 |
+
from depth_anything_3.utils.memory import cleanup_cuda_memory
|
| 41 |
+
|
| 42 |
+
# Available models for UI selection
|
| 43 |
+
AVAILABLE_MODELS = {
|
| 44 |
+
"da3-small": "Small (fastest, ~27 img/s)",
|
| 45 |
+
"da3-base": "Base (fast, ~10 img/s)",
|
| 46 |
+
"da3-large": "Large (balanced, ~4 img/s)",
|
| 47 |
+
"da3-giant": "Giant (high quality, ~1.6 img/s)",
|
| 48 |
+
"da3nested-giant-large": "Giant+Large (best quality, ~1.5 img/s)",
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
# Mapping from UI names to HuggingFace repo IDs
|
| 52 |
+
MODEL_TO_HF_REPO = {
|
| 53 |
+
"da3-small": "depth-anything/DA3-SMALL",
|
| 54 |
+
"da3-base": "depth-anything/DA3-BASE",
|
| 55 |
+
"da3-large": "depth-anything/DA3-LARGE",
|
| 56 |
+
"da3-giant": "depth-anything/DA3-GIANT",
|
| 57 |
+
"da3nested-giant-large": "depth-anything/DA3NESTED-GIANT-LARGE",
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
DEFAULT_MODEL = "da3nested-giant-large"
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class ModelInference:
|
| 64 |
+
"""
|
| 65 |
+
Handles model inference and data processing for Depth Anything 3.
|
| 66 |
+
|
| 67 |
+
Uses benchmark-optimized batch sizes:
|
| 68 |
+
- MPS: B=4 for small/base, B=2 for large, B=1 for giant
|
| 69 |
+
- CUDA: Adaptive batching (85% VRAM utilization)
|
| 70 |
+
- CPU: B=1 always
|
| 71 |
+
"""
|
| 72 |
+
|
| 73 |
+
def __init__(self):
|
| 74 |
+
"""Initialize the model inference handler."""
|
| 75 |
+
self.model: Optional[DepthAnything3] = None
|
| 76 |
+
self.current_model_name: Optional[str] = None
|
| 77 |
+
self.device: Optional[torch.device] = None
|
| 78 |
+
|
| 79 |
+
def _get_optimal_batch_size(
|
| 80 |
+
self, num_images: int, model_name: str, device_type: str
|
| 81 |
+
) -> int:
|
| 82 |
+
"""
|
| 83 |
+
Get optimal batch size based on benchmarks.
|
| 84 |
+
|
| 85 |
+
Benchmark results (MPS, 1280x720):
|
| 86 |
+
- da3-small: B=4 → 27.2 img/s (vs B=1 → 22.2 img/s)
|
| 87 |
+
- da3-base: B=4 → 11.6 img/s (vs B=1 → 10.7 img/s)
|
| 88 |
+
- da3-large: B=2 → 3.8 img/s (B=4 slower due to memory pressure)
|
| 89 |
+
- da3-giant: B=1 → 1.6 img/s (B=4 → 1.2 img/s, worse!)
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
num_images: Number of images to process
|
| 93 |
+
model_name: Name of the model
|
| 94 |
+
device_type: Device type ('cuda', 'mps', 'cpu')
|
| 95 |
+
|
| 96 |
+
Returns:
|
| 97 |
+
Optimal batch size
|
| 98 |
+
"""
|
| 99 |
+
if device_type == "cpu":
|
| 100 |
+
return 1
|
| 101 |
+
|
| 102 |
+
# MPS: Use benchmark-optimized fixed batch sizes
|
| 103 |
+
if device_type == "mps":
|
| 104 |
+
if "small" in model_name:
|
| 105 |
+
return min(4, num_images)
|
| 106 |
+
elif "base" in model_name:
|
| 107 |
+
return min(4, num_images)
|
| 108 |
+
elif "giant" in model_name:
|
| 109 |
+
return 1
|
| 110 |
+
else: # large
|
| 111 |
+
return min(2, num_images)
|
| 112 |
+
|
| 113 |
+
# CUDA: Conservative batch size, can be tuned
|
| 114 |
+
if "giant" in model_name:
|
| 115 |
+
return min(2, num_images)
|
| 116 |
+
elif "large" in model_name:
|
| 117 |
+
return min(4, num_images)
|
| 118 |
+
else:
|
| 119 |
+
return min(8, num_images)
|
| 120 |
+
|
| 121 |
+
    def initialize_model(self, device: torch.device, model_name: str = None) -> None:
        """
        Initialize the DepthAnything3 model.

        Args:
            device: Device to load the model on
            model_name: Model name to load (default: da3-base)
        """
        if model_name is None:
            model_name = os.environ.get("DA3_MODEL_NAME", DEFAULT_MODEL)

        # Check if we need to reload the model
        need_reload = (
            self.model is None
            or self.current_model_name != model_name
            or self.device != device
        )

        if need_reload:
            # Clean up the old model if one is loaded
            if self.model is not None:
                print(f"[ModelInference] Unloading {self.current_model_name}")
                del self.model
                self.model = None
                cleanup_cuda_memory()

            # Get HuggingFace repo ID from model name
            hf_repo = MODEL_TO_HF_REPO.get(model_name, model_name)
            print(f"[ModelInference] Loading model: {model_name} ({hf_repo}) on {device}")
            start_time = time.time()

            # Use from_pretrained to load from HuggingFace
            self.model = DepthAnything3.from_pretrained(hf_repo)
            self.model = self.model.to(device)
            self.current_model_name = model_name
            self.device = device

            load_time = time.time() - start_time
            print(f"[ModelInference] Model loaded in {load_time:.2f}s")
        else:
            print(f"[ModelInference] Reusing cached model: {model_name}")

        self.model.eval()

    def run_inference(
        self,
        target_dir: str,
        filter_black_bg: bool = False,
        filter_white_bg: bool = False,
        process_res_method: str = "upper_bound_resize",
        show_camera: bool = True,
        save_percentage: float = 30.0,
        num_max_points: int = 1_000_000,
        infer_gs: bool = False,
        ref_view_strategy: str = "saddle_balanced",
        gs_trj_mode: str = "extend",
        gs_video_quality: str = "high",
        model_name: str = None,
    ) -> Tuple[Any, dict]:
        """
        Run DepthAnything3 model inference on images.

        Images are processed in a single batch when they fit within the optimal
        batch size; larger sets are split into batches and the predictions merged.

        Args:
            target_dir: Directory containing images
            filter_black_bg: Whether to filter black background
            filter_white_bg: Whether to filter white background
            process_res_method: Method for resizing input images
            show_camera: Whether to show camera in 3D view
            save_percentage: Percentage of points to save (0-100)
            num_max_points: Maximum number of points in point cloud
            infer_gs: Whether to infer 3D Gaussian Splatting
            ref_view_strategy: Reference view selection strategy
            gs_trj_mode: Trajectory mode for 3DGS
            gs_video_quality: Video quality for 3DGS
            model_name: Model to use (default: da3-base)

        Returns:
            Tuple of (prediction, processed_data)
        """
        inference_start = time.time()
        print(f"[ModelInference] Processing images from {target_dir}")

        # Device check - support CUDA, MPS (Apple Silicon), and CPU
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")

        # Initialize model (with caching)
        if model_name is None:
            model_name = DEFAULT_MODEL
        self.initialize_model(device, model_name)

        # Get image paths
        image_folder_path = os.path.join(target_dir, "images")
        all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))

        # Filter for image files
        image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
        image_paths = [
            path
            for path in all_image_paths
            if any(path.lower().endswith(ext) for ext in image_extensions)
        ]

        num_images = len(image_paths)
        print(f"[ModelInference] Found {num_images} images")

        if num_images == 0:
            raise ValueError("No images found. Check your upload.")

        # Map UI options to actual method names; unmapped values pass through
        method_mapping = {"high_res": "lower_bound_resize", "low_res": "upper_bound_resize"}
        actual_method = method_mapping.get(process_res_method, process_res_method)

        # Get optimal batch size based on benchmarks
        batch_size = self._get_optimal_batch_size(num_images, model_name, device.type)
        print(
            f"[ModelInference] Batched inference: model={model_name}, "
            f"device={device.type}, images={num_images}, batch_size={batch_size}"
        )

        # Run model inference with batching
        with torch.no_grad():
            if num_images <= batch_size:
                # Single batch - process all at once
                prediction = self.model.inference(
                    image_paths,
                    export_dir=None,
                    process_res_method=actual_method,
                    infer_gs=infer_gs,
                    ref_view_strategy=ref_view_strategy,
                )
            else:
                # Multiple batches - process in chunks and merge
                predictions = []
                num_batches = (num_images + batch_size - 1) // batch_size
                for i in range(0, num_images, batch_size):
                    batch_paths = image_paths[i : i + batch_size]
                    print(
                        f"[ModelInference] Processing batch {i // batch_size + 1}/"
                        f"{num_batches} ({len(batch_paths)} images)"
                    )
                    batch_pred = self.model.inference(
                        batch_paths,
                        export_dir=None,
                        process_res_method=actual_method,
                        infer_gs=False,  # Only infer GS on final merged result
                        ref_view_strategy=ref_view_strategy,
                    )
                    predictions.append(batch_pred)

                # Merge all batch predictions
                prediction = self._merge_predictions(predictions)

        export_to_glb(
            prediction,
            filter_black_bg=filter_black_bg,
            filter_white_bg=filter_white_bg,
            export_dir=target_dir,
            show_cameras=show_camera,
            conf_thresh_percentile=save_percentage,
            num_max_points=int(num_max_points),
        )

        # Export to a 3DGS video if requested
        if infer_gs:
            mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
            backend_mode = mode_mapping.get(gs_trj_mode, "extend")
            print(f"GS mode: {gs_trj_mode}; Backend mode: {backend_mode}")
            export_to_gs_video(
                prediction,
                export_dir=target_dir,
                chunk_size=4,
                trj_mode=backend_mode,
                enable_tqdm=True,
                vis_depth="hcat",
                video_quality=gs_video_quality,
            )

        # Save predictions.npz for caching metric depth data
        self._save_predictions_cache(target_dir, prediction)

        # Process results
        processed_data = self._process_results(target_dir, prediction, image_paths)

        # Clean up using centralized memory utilities for consistency with backend
        cleanup_cuda_memory()

        inference_time = time.time() - inference_start
        throughput = num_images / inference_time if inference_time > 0 else 0
        print(
            f"[ModelInference] Completed in {inference_time:.2f}s "
            f"({throughput:.1f} img/s)"
        )

        return prediction, processed_data

    def _merge_predictions(self, predictions: List[Any]) -> Any:
        """
        Merge multiple batch predictions into a single prediction.

        Args:
            predictions: List of Prediction objects from batch inference

        Returns:
            Merged Prediction object
        """
        if not predictions:
            return None
        if len(predictions) == 1:
            return predictions[0]

        from depth_anything_3.specs import Prediction

        # Concatenate arrays from all predictions
        merged_depth = np.concatenate([p.depth for p in predictions], axis=0)
        merged_conf = (
            np.concatenate([p.conf for p in predictions], axis=0)
            if predictions[0].conf is not None
            else None
        )
        merged_processed_images = (
            np.concatenate([p.processed_images for p in predictions], axis=0)
            if predictions[0].processed_images is not None
            else None
        )
        merged_extrinsics = (
            np.concatenate([p.extrinsics for p in predictions], axis=0)
            if predictions[0].extrinsics is not None
            else None
        )
        merged_intrinsics = (
            np.concatenate([p.intrinsics for p in predictions], axis=0)
            if predictions[0].intrinsics is not None
            else None
        )

        # Create merged prediction (use is_metric from first batch)
        merged = Prediction(
            depth=merged_depth,
            is_metric=predictions[0].is_metric,
            conf=merged_conf,
            extrinsics=merged_extrinsics,
            intrinsics=merged_intrinsics,
            processed_images=merged_processed_images,
        )

        print(f"[ModelInference] Merged {len(predictions)} batches into single prediction")
        return merged

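    # Illustration of the merge (shapes assumed from the axis-0 concatenation):
    # batches with depth arrays of shape (2, H, W) and (3, H, W) merge into a
    # single (5, H, W) prediction, so view i is indexed exactly as in the
    # single-batch path.
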
    def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
        """
        Save predictions data to predictions.npz for caching.

        Args:
            target_dir: Directory to save the cache
            prediction: Model prediction object
        """
        try:
            output_file = os.path.join(target_dir, "predictions.npz")

            # Build save dict with prediction data
            save_dict = {}

            # Save processed images if available
            if prediction.processed_images is not None:
                save_dict["images"] = prediction.processed_images

            # Save depth data
            if prediction.depth is not None:
                save_dict["depths"] = np.round(prediction.depth, 6)

            # Save confidence if available
            if prediction.conf is not None:
                save_dict["conf"] = np.round(prediction.conf, 2)

            # Save camera parameters
            if prediction.extrinsics is not None:
                save_dict["extrinsics"] = prediction.extrinsics
            if prediction.intrinsics is not None:
                save_dict["intrinsics"] = prediction.intrinsics

            # Save to file
            np.savez_compressed(output_file, **save_dict)
            print(f"Saved predictions cache to: {output_file}")

        except Exception as e:
            print(f"Warning: Failed to save predictions cache: {e}")

    def _process_results(
        self, target_dir: str, prediction: Any, image_paths: list
    ) -> dict:
        """
        Process model results into structured data.

        Args:
            target_dir: Directory containing results
            prediction: Model prediction object
            image_paths: List of input image paths

        Returns:
            Dictionary containing processed data for each view
        """
        processed_data = {}

        # Read generated depth visualization files
        depth_vis_dir = os.path.join(target_dir, "depth_vis")

        if os.path.exists(depth_vis_dir):
            depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
            for i, depth_file in enumerate(depth_files):
                # Use processed images directly from API
                processed_image = None
                if prediction.processed_images is not None and i < len(
                    prediction.processed_images
                ):
                    processed_image = prediction.processed_images[i]

                processed_data[i] = {
                    "depth_image": depth_file,
                    "image": processed_image,
                    "original_image_path": image_paths[i] if i < len(image_paths) else None,
                    "depth": prediction.depth[i] if i < len(prediction.depth) else None,
                    "intrinsics": (
                        prediction.intrinsics[i]
                        if prediction.intrinsics is not None and i < len(prediction.intrinsics)
                        else None
                    ),
                    "mask": None,  # No mask information available
                }

        return processed_data


# cleanup() removed: call cleanup_cuda_memory() directly where needed.
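
For orientation, a minimal driver for the class above could look like the following. This is a hedged sketch, not part of the commit: the zero-argument ModelInference() constructor and the workspace path are assumptions for illustration.

# Hedged usage sketch: the zero-argument constructor and the workspace path
# below are assumptions for illustration, not part of this commit.
from depth_anything_3.app.modules.model_inference import ModelInference

runner = ModelInference()
# run_inference expects target_dir to contain an "images/" subfolder.
prediction, processed_data = runner.run_inference(
    "workspace/example_scene",
    process_res_method="low_res",  # mapped internally to "upper_bound_resize"
    model_name="da3-base",
)
print(prediction.depth.shape, len(processed_data))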
src/depth_anything_3/app/modules/ui_components.py
ADDED
@@ -0,0 +1,497 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
UI components module for Depth Anything 3 Gradio app.

This module contains UI component definitions and layout functions.
"""

import os
from typing import Any, Dict, List, Tuple

import gradio as gr

from depth_anything_3.app.modules.utils import get_logo_base64, get_scene_info


class UIComponents:
    """
    Handles UI component creation and layout for the Gradio app.
    """

    def __init__(self):
        """Initialize the UI components handler."""

    def create_upload_section(self) -> Tuple[gr.Video, gr.Slider, gr.File, gr.Gallery]:
        """
        Create the upload section with video, images, and gallery components.

        Returns:
            A tuple of Gradio components: (input_video, s_time_interval, input_images, image_gallery).
        """
        input_video = gr.Video(label="Upload Video", interactive=True)
        s_time_interval = gr.Slider(
            minimum=0.1,
            maximum=60,
            value=10,
            step=0.1,
            label="Sampling FPS (Frames Per Second)",
            interactive=True,
            visible=True,
        )
        input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
        image_gallery = gr.Gallery(
            label="Preview",
            columns=4,
            height="300px",
            object_fit="contain",
            allow_preview=True,
            interactive=False,
        )

        return input_video, s_time_interval, input_images, image_gallery

    def create_3d_viewer_section(self) -> gr.Model3D:
        """
        Create the 3D viewer component.

        Returns:
            3D model viewer component
        """
        return gr.Model3D(
            height=520,
            zoom_speed=0.5,
            pan_speed=0.5,
            clear_color=[0.0, 0.0, 0.0, 0.0],
            key="persistent_3d_viewer",
            elem_id="reconstruction_3d_viewer",
        )

    def create_nvs_video(self) -> Tuple[gr.Video, gr.Markdown]:
        """
        Create the 3DGS rendered video display component and info message.

        Returns:
            Tuple of (video component, info message component)
        """
        with gr.Column():
            gs_info = gr.Markdown(
                (
                    "‼️ **3D Gaussian Splatting rendering is currently DISABLED.** <br><br><br>"
                    "To render novel views from 3DGS, "
                    "enable **Infer 3D Gaussian Splatting** below. <br>"
                    "Next, in **Visualization Options**, "
                    "*optionally* configure the **rendering trajectory** (default: smooth) "
                    "and **video quality** (default: low), "
                    "then click **Reconstruct**."
                ),
                visible=True,
                height=520,
            )
            gs_video = gr.Video(
                height=520,
                label="3DGS Rendered NVS Video (depth shown for reference only)",
                interactive=False,
                visible=False,
            )
        return gs_video, gs_info

    def create_depth_section(self) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image]:
        """
        Create the depth visualization section.

        Returns:
            A tuple of (prev_depth_btn, depth_view_selector, next_depth_btn, depth_map)
        """
        with gr.Row(elem_classes=["navigation-row"]):
            prev_depth_btn = gr.Button("◀ Previous", size="sm", scale=1)
            depth_view_selector = gr.Dropdown(
                choices=["View 1"],
                value="View 1",
                label="Select View",
                scale=2,
                interactive=True,
                allow_custom_value=True,
            )
            next_depth_btn = gr.Button("Next ▶", size="sm", scale=1)
        depth_map = gr.Image(
            type="numpy",
            label="Colorized Depth Map",
            format="png",
            interactive=False,
        )

        return prev_depth_btn, depth_view_selector, next_depth_btn, depth_map

    def create_measure_section(
        self,
    ) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image, gr.Image, gr.Markdown]:
        """
        Create the measurement section.

        Returns:
            A tuple of (prev_measure_btn, measure_view_selector, next_measure_btn, measure_image,
            measure_depth_image, measure_text)
        """
        from depth_anything_3.app.css_and_html import MEASURE_INSTRUCTIONS_HTML

        gr.Markdown(MEASURE_INSTRUCTIONS_HTML)
        with gr.Row(elem_classes=["navigation-row"]):
            prev_measure_btn = gr.Button("◀ Previous", size="sm", scale=1)
            measure_view_selector = gr.Dropdown(
                choices=["View 1"],
                value="View 1",
                label="Select View",
                scale=2,
                interactive=True,
                allow_custom_value=True,
            )
            next_measure_btn = gr.Button("Next ▶", size="sm", scale=1)
        with gr.Row():
            measure_image = gr.Image(
                type="numpy",
                show_label=False,
                format="webp",
                interactive=False,
                sources=[],
                label="RGB Image",
                scale=1,
                height=275,
            )
            measure_depth_image = gr.Image(
                type="numpy",
                show_label=False,
                format="webp",
                interactive=False,
                sources=[],
                label="Depth Visualization (Right Half)",
                scale=1,
                height=275,
            )
        gr.Markdown(
            "**Note:** Images have been adjusted to model processing size. "
            "Click two points on the RGB image to measure distance."
        )
        measure_text = gr.Markdown("")

        return (
            prev_measure_btn,
            measure_view_selector,
            next_measure_btn,
            measure_image,
            measure_depth_image,
            measure_text,
        )

    def create_inference_control_section(
        self,
    ) -> Tuple[gr.Dropdown, gr.Dropdown, gr.Checkbox, gr.Dropdown]:
        """
        Create the inference control section (before inference).

        Returns:
            Tuple of (model_selector, process_res_method_dropdown, infer_gs, ref_view_strategy)
        """
        from depth_anything_3.app.modules.model_inference import AVAILABLE_MODELS, DEFAULT_MODEL

        with gr.Row():
            # Model selector - most important control
            model_selector = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value=DEFAULT_MODEL,
                label="Model",
                info="da3-base: fast | da3-large: balanced | giant: best quality",
                scale=1,
            )
            process_res_method_dropdown = gr.Dropdown(
                choices=["high_res", "low_res"],
                value="low_res",
                label="Image Processing Method",
                info="low_res scales to many more images",
                scale=1,
            )
            infer_gs = gr.Checkbox(
                label="Infer 3D Gaussian Splatting",
                value=False,
                info=(
                    'Enable novel view rendering from 3DGS (<i class="fas fa-triangle-exclamation '
                    'fa-color-red"></i> requires extra processing time)'
                ),
                scale=1,
            )
            ref_view_strategy = gr.Dropdown(
                choices=["saddle_balanced", "saddle_sim_range", "first", "middle"],
                value="saddle_balanced",
                label="Reference View Strategy",
                info="Strategy for selecting reference view from multiple inputs",
                scale=1,
            )

        return (model_selector, process_res_method_dropdown, infer_gs, ref_view_strategy)

    def create_display_control_section(
        self,
    ) -> Tuple[
        gr.Checkbox,
        gr.Checkbox,
        gr.Checkbox,
        gr.Slider,
        gr.Slider,
        gr.Dropdown,
        gr.Dropdown,
        gr.Button,
        gr.ClearButton,
    ]:
        """
        Create the display control section (options for visualization).

        Returns:
            Tuple of display control components including buttons
        """
        with gr.Column():
            # 3DGS options at the top
            with gr.Row():
                gs_trj_mode = gr.Dropdown(
                    choices=["smooth", "extend"],
                    value="smooth",
                    label="Rendering trajectory for 3DGS viewpoints (requires n_views ≥ 2)",
                    info="'smooth' for view interpolation; 'extend' for longer trajectory",
                    visible=False,  # initially hidden
                )
                gs_video_quality = gr.Dropdown(
                    choices=["low", "medium", "high"],
                    value="low",
                    label="Video quality for 3DGS rendered outputs",
                    info="'low' for faster loading speed; 'high' for better visual quality",
                    visible=False,  # initially hidden
                )

            # Reconstruct and Clear buttons (before Visualization Options)
            with gr.Row():
                submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
                clear_btn = gr.ClearButton(scale=1)

            gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
            show_cam = gr.Checkbox(label="Show Camera", value=True)
            filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
            filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
            save_percentage = gr.Slider(
                minimum=0,
                maximum=100,
                value=10,
                step=1,
                label="Filter Percentage",
                info="Confidence Threshold (%): Higher values filter more points.",
            )
            num_max_points = gr.Slider(
                minimum=1000,
                maximum=100000,
                value=1000,
                step=1000,
                label="Max Points (K points)",
                info="Maximum number of points to export to GLB (in thousands)",
            )

        return (
            show_cam,
            filter_black_bg,
            filter_white_bg,
            save_percentage,
            num_max_points,
            gs_trj_mode,
            gs_video_quality,
            submit_btn,
            clear_btn,
        )

    def create_control_section(
        self,
    ) -> Tuple[
        gr.Button,
        gr.ClearButton,
        gr.Dropdown,
        gr.Checkbox,
        gr.Checkbox,
        gr.Checkbox,
        gr.Checkbox,
        gr.Checkbox,
        gr.Dropdown,
        gr.Checkbox,
        gr.Textbox,
    ]:
        """
        Create the control section with buttons and options.

        Returns:
            Tuple of control components
        """
        with gr.Row():
            submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
            clear_btn = gr.ClearButton(
                scale=1,
            )

        with gr.Row():
            frame_filter = gr.Dropdown(
                choices=["All"], value="All", label="Show Points from Frame"
            )
            with gr.Column():
                gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
                show_cam = gr.Checkbox(label="Show Camera", value=True)
                show_mesh = gr.Checkbox(label="Show Mesh", value=True)
                filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
                filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
                gr.Markdown("### Reconstruction Options: (updated on next run)")
                apply_mask_checkbox = gr.Checkbox(
                    label="Apply mask for predicted ambiguous depth classes & edges",
                    value=True,
                )
                process_res_method_dropdown = gr.Dropdown(
                    choices=[
                        "upper_bound_resize",
                        "upper_bound_crop",
                        "lower_bound_resize",
                        "lower_bound_crop",
                    ],
                    value="upper_bound_resize",
                    label="Image Processing Method",
                    info="Method for resizing input images",
                )
                save_to_gallery_checkbox = gr.Checkbox(
                    label="Save to Gallery",
                    value=False,
                    info="Save current reconstruction results to gallery directory",
                )
                gallery_name_input = gr.Textbox(
                    label="Gallery Name",
                    placeholder="Enter a name for the gallery folder",
                    value="",
                    info="Leave empty for auto-generated name with timestamp",
                )

        return (
            submit_btn,
            clear_btn,
            frame_filter,
            show_cam,
            show_mesh,
            filter_black_bg,
            filter_white_bg,
            apply_mask_checkbox,
            process_res_method_dropdown,
            save_to_gallery_checkbox,
            gallery_name_input,
        )

    def create_example_scenes_section(self) -> List[Dict[str, Any]]:
        """
        Create the example scenes section.

        Returns:
            List of scene information dictionaries
        """
        # Use assets/examples directory for example scenes
        examples_dir = os.environ.get("DA3_EXAMPLES_DIR", "assets/examples")

        # Get scene information
        scenes = get_scene_info(examples_dir)

        return scenes

    def create_example_scene_grid(self, scenes: List[Dict[str, Any]]) -> List:
        """
        Create the example scene grid.

        Args:
            scenes: List of scene information dictionaries

        Returns:
            List of scene components (gr.Image or gr.Video) in same order as scenes
        """
        scene_components = []

        if scenes:
            for i in range(0, len(scenes), 4):  # Process 4 scenes per row
                with gr.Row():
                    for j in range(4):
                        scene_idx = i + j
                        if scene_idx < len(scenes):
                            scene = scenes[scene_idx]
                            scene_type = scene.get("type", "images")

                            with gr.Column(scale=1, elem_classes=["clickable-thumbnail"]):
                                # Use Image for both image and video scenes
                                # (video scenes use first frame as thumbnail)
                                scene_component = gr.Image(
                                    value=scene["thumbnail"],
                                    height=150,
                                    interactive=False,
                                    show_label=False,
                                    elem_id=f"scene_thumb_{scene['name']}",
                                    sources=[],
                                )
                                scene_components.append(scene_component)

                                if scene_type == "video":
                                    # Scene name for video
                                    gr.Markdown(
                                        f"**{scene['name']}** \n 🎬 video",
                                        elem_classes=["scene-info"],
                                    )
                                else:
                                    # Scene name and image count
                                    gr.Markdown(
                                        f"**{scene['name']}** \n {scene['num_images']} images",
                                        elem_classes=["scene-info"],
                                    )
                        else:
                            # Empty column to maintain grid structure
                            with gr.Column(scale=1):
                                pass

        return scene_components

    def create_header_section(self) -> gr.HTML:
        """
        Create the header section with logo and title.

        Returns:
            Header HTML component
        """
        from depth_anything_3.app.css_and_html import get_header_html

        return gr.HTML(get_header_html(get_logo_base64()))

    def create_description_section(self) -> gr.HTML:
        """
        Create the description section.

        Returns:
            Description HTML component
        """
        from depth_anything_3.app.css_and_html import get_description_html

        return gr.HTML(get_description_html())

    def create_acknowledgements_section(self) -> gr.HTML:
        """
        Create the acknowledgements section.

        Returns:
            Acknowledgements HTML component
        """
        from depth_anything_3.app.css_and_html import get_acknowledgements_html

        return gr.HTML(get_acknowledgements_html())
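
A rough sketch of how these builders compose inside a Blocks layout; the arrangement below is an assumption for illustration only (the actual wiring lives in src/depth_anything_3/app/gradio_app.py).

# Assumed arrangement for illustration; see gradio_app.py for the real wiring.
import gradio as gr

from depth_anything_3.app.modules.ui_components import UIComponents

ui = UIComponents()
with gr.Blocks() as demo:
    ui.create_header_section()
    with gr.Row():
        input_video, fps_slider, input_images, gallery = ui.create_upload_section()
        viewer = ui.create_3d_viewer_section()
    model_sel, res_method, infer_gs, ref_strategy = ui.create_inference_control_section()

demo.launch()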
src/depth_anything_3/app/modules/utils.py
ADDED
@@ -0,0 +1,269 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utility functions for Depth Anything 3 Gradio app.

This module contains helper functions for data processing, visualization,
and file operations.
"""


import json
import os
import shutil
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np


def create_depth_visualization(depth: np.ndarray) -> Optional[np.ndarray]:
    """
    Create a colored depth visualization.

    Args:
        depth: Depth array

    Returns:
        Colored depth visualization or None
    """
    if depth is None:
        return None

    # Normalize depth to 0-1 range
    depth_min = depth[depth > 0].min() if (depth > 0).any() else 0
    depth_max = depth.max()

    if depth_max <= depth_min:
        return None

    # Normalize depth
    depth_norm = (depth - depth_min) / (depth_max - depth_min)
    depth_norm = np.clip(depth_norm, 0, 1)

    # Apply colormap (using matplotlib's viridis colormap)
    import matplotlib.cm as cm

    # Convert to colored image
    depth_colored = cm.viridis(depth_norm)[:, :, :3]  # Remove alpha channel
    depth_colored = (depth_colored * 255).astype(np.uint8)

    return depth_colored


def save_to_gallery_func(
    target_dir: str, processed_data: dict, gallery_name: Optional[str] = None
) -> Tuple[bool, str]:
    """
    Save the current reconstruction results to the gallery directory.

    Args:
        target_dir: Source directory containing reconstruction results
        processed_data: Processed data dictionary
        gallery_name: Name for the gallery folder

    Returns:
        Tuple of (success, message)
    """
    try:
        # Get gallery directory from environment variable or use default
        gallery_dir = os.environ.get(
            "DA3_GALLERY_DIR",
            "workspace/gallery",
        )
        if not os.path.exists(gallery_dir):
            os.makedirs(gallery_dir)

        # Use provided name or create a unique name
        if gallery_name is None or gallery_name.strip() == "":
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            gallery_name = f"reconstruction_{timestamp}"

        gallery_path = os.path.join(gallery_dir, gallery_name)

        # Check if directory already exists
        if os.path.exists(gallery_path):
            return False, f"Save failed: folder '{gallery_name}' already exists"

        # Create the gallery directory
        os.makedirs(gallery_path, exist_ok=True)

        # Copy GLB file
        glb_source = os.path.join(target_dir, "scene.glb")
        glb_dest = os.path.join(gallery_path, "scene.glb")
        if os.path.exists(glb_source):
            shutil.copy2(glb_source, glb_dest)

        # Copy depth visualization images
        depth_vis_dir = os.path.join(target_dir, "depth_vis")
        if os.path.exists(depth_vis_dir):
            gallery_depth_vis = os.path.join(gallery_path, "depth_vis")
            shutil.copytree(depth_vis_dir, gallery_depth_vis)

        # Copy original images
        images_source = os.path.join(target_dir, "images")
        if os.path.exists(images_source):
            gallery_images = os.path.join(gallery_path, "images")
            shutil.copytree(images_source, gallery_images)

        # Copy the scene preview, guarding against a missing file
        scene_preview_source = os.path.join(target_dir, "scene.jpg")
        scene_preview_dest = os.path.join(gallery_path, "scene.jpg")
        if os.path.exists(scene_preview_source):
            shutil.copy2(scene_preview_source, scene_preview_dest)

        # Save metadata
        metadata = {
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
            "num_images": len(processed_data) if processed_data else 0,
            "gallery_name": gallery_name,
        }

        with open(os.path.join(gallery_path, "metadata.json"), "w") as f:
            json.dump(metadata, f, indent=2)

        print(f"Saved reconstruction to gallery: {gallery_path}")
        return True, f"Save successful: saved to {gallery_path}"

    except Exception as e:
        print(f"Error saving to gallery: {e}")
        return False, f"Save failed: {str(e)}"


def _extract_video_thumbnail(video_path: str) -> str:
    """
    Extract the first frame of a video as a thumbnail image.

    Args:
        video_path: Path to the video file

    Returns:
        Path to the thumbnail image (or video path if extraction fails)
    """
    import tempfile

    import cv2

    try:
        cap = cv2.VideoCapture(video_path)
        ret, frame = cap.read()
        cap.release()

        if ret and frame is not None:
            # Save thumbnail to temp directory
            video_name = os.path.splitext(os.path.basename(video_path))[0]
            thumbnail_dir = os.path.join(tempfile.gettempdir(), "da3_video_thumbnails")
            os.makedirs(thumbnail_dir, exist_ok=True)
            thumbnail_path = os.path.join(thumbnail_dir, f"{video_name}_thumb.jpg")
            cv2.imwrite(thumbnail_path, frame)
            return thumbnail_path
    except Exception as e:
        print(f"Error extracting video thumbnail: {e}")

    # Fallback to video path if extraction fails
    return video_path


def get_scene_info(examples_dir: str) -> List[Dict[str, Any]]:
    """
    Get information about scenes in the examples directory.

    Supports:
    - Folders containing images (scene folders)
    - Video files at the root level

    Args:
        examples_dir: Path to examples directory

    Returns:
        List of scene information dictionaries
    """
    import glob

    scenes = []
    if not os.path.exists(examples_dir):
        return scenes

    for item in sorted(os.listdir(examples_dir)):
        item_path = os.path.join(examples_dir, item)

        if os.path.isdir(item_path):
            # Find all image files in the scene folder
            image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
            image_files = []
            for ext in image_extensions:
                image_files.extend(glob.glob(os.path.join(item_path, ext)))
                image_files.extend(glob.glob(os.path.join(item_path, ext.upper())))

            if image_files:
                # Sort images and get the first one for thumbnail
                image_files = sorted(image_files)
                first_image = image_files[0]
                num_images = len(image_files)

                scenes.append(
                    {
                        "name": item,
                        "path": item_path,
                        "thumbnail": first_image,
                        "num_images": num_images,
                        "image_files": image_files,
                        "type": "images",
                    }
                )

        elif os.path.isfile(item_path):
            # Check if it's a video file
            video_extensions = [".mp4", ".avi", ".mov", ".mkv", ".webm"]
            ext = os.path.splitext(item)[1].lower()
            if ext in video_extensions:
                name = os.path.splitext(item)[0]
                # Extract first frame as thumbnail
                thumbnail_path = _extract_video_thumbnail(item_path)
                scenes.append(
                    {
                        "name": name,
                        "path": item_path,
                        "thumbnail": thumbnail_path,  # First frame as thumbnail
                        "num_images": 0,
                        "image_files": [],
                        "video_file": item_path,
                        "type": "video",
                    }
                )

    return scenes


# NOTE: cleanup was moved to a single canonical helper in
# `depth_anything_3.utils.memory.cleanup_cuda_memory`.
# Callers should import and call that directly instead of using this module.


def get_logo_base64() -> Optional[str]:
    """
    Convert WAI logo to base64 for embedding in HTML.

    Returns:
        Base64 encoded logo string or None
    """
    import base64

    logo_path = "examples/WAI-Logo/wai_logo.png"
    try:
        with open(logo_path, "rb") as img_file:
            img_data = img_file.read()
            base64_str = base64.b64encode(img_data).decode()
            return f"data:image/png;base64,{base64_str}"
    except FileNotFoundError:
        return None
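
As a quick sanity check of create_depth_visualization above, a minimal sketch with synthetic input (illustrative only; assumes matplotlib is installed):

# Synthetic smoke test for create_depth_visualization (illustrative only).
import numpy as np

from depth_anything_3.app.modules.utils import create_depth_visualization

depth = np.linspace(0.5, 4.0, 64 * 64, dtype=np.float32).reshape(64, 64)
vis = create_depth_visualization(depth)
assert vis is not None and vis.shape == (64, 64, 3) and vis.dtype == np.uint8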
src/depth_anything_3/app/modules/visualization.py
ADDED
@@ -0,0 +1,435 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Visualization module for Depth Anything 3 Gradio app.

This module handles visualization updates, navigation, and measurement functionality.
"""

import os
from typing import Any, Dict, List, Optional, Tuple

import cv2
import gradio as gr
import numpy as np


class VisualizationHandler:
    """
    Handles visualization updates and navigation for the Gradio app.
    """

    def __init__(self):
        """Initialize the visualization handler."""

    def update_view_selectors(
        self, processed_data: Optional[dict]
    ) -> Tuple[gr.Dropdown, gr.Dropdown]:
        """
        Update view selector dropdowns based on available views.

        Args:
            processed_data: Processed data dictionary

        Returns:
            Tuple of (depth_view_selector, measure_view_selector)
        """
        if processed_data is None or len(processed_data) == 0:
            choices = ["View 1"]
        else:
            num_views = len(processed_data)
            choices = [f"View {i + 1}" for i in range(num_views)]

        return (
            gr.Dropdown(choices=choices, value=choices[0]),  # depth_view_selector
            gr.Dropdown(choices=choices, value=choices[0]),  # measure_view_selector
        )

    def get_view_data_by_index(
        self, processed_data: Optional[dict], view_index: int
    ) -> Optional[Dict[str, Any]]:
        """
        Get view data by index, handling bounds.

        Args:
            processed_data: Processed data dictionary
            view_index: Index of the view to get

        Returns:
            View data dictionary or None
        """
        if processed_data is None or len(processed_data) == 0:
            return None

        view_keys = list(processed_data.keys())
        if view_index < 0 or view_index >= len(view_keys):
            view_index = 0

        return processed_data[view_keys[view_index]]

    def update_depth_view(
        self, processed_data: Optional[dict], view_index: int
    ) -> Optional[str]:
        """
        Update depth view for a specific view index.

        Args:
            processed_data: Processed data dictionary
            view_index: Index of the view to update

        Returns:
            Path to depth visualization image or None
        """
        view_data = self.get_view_data_by_index(processed_data, view_index)
        if view_data is None or view_data.get("depth_image") is None:
            return None

        # Return the depth visualization image directly
        return view_data["depth_image"]

    def navigate_depth_view(
        self,
        processed_data: Optional[dict],
        current_selector_value: str,
        direction: int,
    ) -> Tuple[str, Optional[str]]:
        """
        Navigate depth view (direction: -1 for previous, +1 for next).

        Args:
            processed_data: Processed data dictionary
            current_selector_value: Current selector value
            direction: Direction to navigate (-1 for previous, +1 for next)

        Returns:
            Tuple of (new_selector_value, depth_vis)
        """
        if processed_data is None or len(processed_data) == 0:
            return "View 1", None

        # Parse current view number
        try:
            current_view = int(current_selector_value.split()[1]) - 1
        except:  # noqa
            current_view = 0

        num_views = len(processed_data)
        new_view = (current_view + direction) % num_views

        new_selector_value = f"View {new_view + 1}"
        depth_vis = self.update_depth_view(processed_data, new_view)

        return new_selector_value, depth_vis

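    # Worked example of the wrap-around above (illustration, not source code):
    # with num_views == 4, "next" from "View 4" gives (3 + 1) % 4 == 0, i.e.
    # "View 1", and "previous" from "View 1" gives (0 - 1) % 4 == 3, i.e. "View 4".
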
    def update_measure_view(
        self, processed_data: Optional[dict], view_index: int
    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
        """
        Update measure view for a specific view index.

        Args:
            processed_data: Processed data dictionary
            view_index: Index of the view to update

        Returns:
            Tuple of (measure_image, depth_right_half, measure_points)
        """
        view_data = self.get_view_data_by_index(processed_data, view_index)
        if view_data is None:
            return None, None, []  # image, depth_right_half, measure_points

        # Get the processed (resized) image
        if "image" in view_data and view_data["image"] is not None:
            image = view_data["image"].copy()
        else:
            return None, None, []

        # Ensure image is in uint8 format
        if image.dtype != np.uint8:
            if image.max() <= 1.0:
                image = (image * 255).astype(np.uint8)
            else:
                image = image.astype(np.uint8)

        # Extract right half of the depth visualization (pure depth part)
        depth_image_path = view_data.get("depth_image", None)
        depth_right_half = None

        if depth_image_path and os.path.exists(depth_image_path):
            try:
                # Load the combined depth visualization image;
                # check for a failed read before converting color spaces
                depth_combined = cv2.imread(depth_image_path)
                if depth_combined is not None:
                    depth_combined = cv2.cvtColor(depth_combined, cv2.COLOR_BGR2RGB)
                    height, width = depth_combined.shape[:2]
                    # Extract right half (depth visualization part)
                    depth_right_half = depth_combined[:, width // 2 :]
            except Exception as e:
                print(f"Error extracting depth right half: {e}")

        return image, depth_right_half, []

    def navigate_measure_view(
        self,
        processed_data: Optional[dict],
        current_selector_value: str,
        direction: int,
    ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
        """
        Navigate measure view (direction: -1 for previous, +1 for next).

        Args:
            processed_data: Processed data dictionary
            current_selector_value: Current selector value
            direction: Direction to navigate (-1 for previous, +1 for next)

        Returns:
            Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
        """
        if processed_data is None or len(processed_data) == 0:
            return "View 1", None, None, []

        # Parse current view number
        try:
            current_view = int(current_selector_value.split()[1]) - 1
        except:  # noqa
            current_view = 0

        num_views = len(processed_data)
        new_view = (current_view + direction) % num_views

        new_selector_value = f"View {new_view + 1}"
        measure_image, depth_right_half, measure_points = self.update_measure_view(
            processed_data, new_view
        )

        return new_selector_value, measure_image, depth_right_half, measure_points

    def populate_visualization_tabs(
        self, processed_data: Optional[dict]
    ) -> Tuple[Optional[str], Optional[np.ndarray], Optional[np.ndarray], List]:
        """
        Populate the depth and measure tabs with processed data.

        Args:
            processed_data: Processed data dictionary

        Returns:
            Tuple of (depth_vis, measure_img, depth_right_half, measure_points)
        """
        if processed_data is None or len(processed_data) == 0:
            return None, None, None, []

        # Use update function to get depth visualization
        depth_vis = self.update_depth_view(processed_data, 0)
        measure_img, depth_right_half, _ = self.update_measure_view(processed_data, 0)

        return depth_vis, measure_img, depth_right_half, []

    def reset_measure(
        self, processed_data: Optional[dict]
    ) -> Tuple[Optional[np.ndarray], List, str]:
        """
        Reset measure points.

        Args:
            processed_data: Processed data dictionary

        Returns:
            Tuple of (image, measure_points, text)
        """
        if processed_data is None or len(processed_data) == 0:
            return None, [], ""

        # Return the first view image
        first_view = list(processed_data.values())[0]
        return first_view["image"], [], ""

    def measure(
        self,
        processed_data: Optional[dict],
        measure_points: List,
        current_view_selector: str,
        event: gr.SelectData,
    ) -> List:
        """
        Handle measurement on images.

        Args:
            processed_data: Processed data dictionary
            measure_points: List of current measure points
            current_view_selector: Current view selector value
            event: Gradio select event

        Returns:
            List of [image, depth_right_half, measure_points, text]
        """
        try:
            print(f"Measure function called with selector: {current_view_selector}")

            if processed_data is None or len(processed_data) == 0:
                return [None, [], "No data available"]

            # Use the currently selected view instead of always using the first view
            try:
                current_view_index = int(current_view_selector.split()[1]) - 1
            except:  # noqa
                current_view_index = 0

            print(f"Using view index: {current_view_index}")

            # Get view data safely
            if current_view_index < 0 or current_view_index >= len(processed_data):
                current_view_index = 0

            view_keys = list(processed_data.keys())
            current_view = processed_data[view_keys[current_view_index]]

            if current_view is None:
                return [None, [], "No view data available"]

            point2d = event.index[0], event.index[1]
            print(f"Clicked point: {point2d}")

            measure_points.append(point2d)

            # Get image and depth visualization
            image, depth_right_half, _ = self.update_measure_view(
                processed_data, current_view_index
            )
            if image is None:
                return [None, [], "No image available"]

            image = image.copy()

            # Ensure image is in uint8 format for proper cv2 operations
            try:
                if image.dtype != np.uint8:
                    if image.max() <= 1.0:
                        # Image is in [0, 1] range, convert to [0, 255]
                        image = (image * 255).astype(np.uint8)
                    else:
                        # Image is already in [0, 255] range
                        image = image.astype(np.uint8)
            except Exception as e:
                print(f"Image conversion error: {e}")
                return [None, [], f"Image conversion error: {e}"]

            # Draw circles for points
            try:
                for p in measure_points:
                    if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
                        image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
            except Exception as e:
                print(f"Drawing error: {e}")
                return [None, [], f"Drawing error: {e}"]

            # Get depth information from processed_data
            depth_text = ""
            try:
                for i, p in enumerate(measure_points):
                    if (
                        current_view["depth"] is not None
                        and 0 <= p[1] < current_view["depth"].shape[0]
                        and 0 <= p[0] < current_view["depth"].shape[1]
                    ):
                        d = current_view["depth"][p[1], p[0]]
                        depth_text += f"- **P{i + 1} depth: {d:.2f}m**\n"
                    else:
                        depth_text += f"- **P{i + 1}: Click position ({p[0]}, {p[1]}) - No depth information**\n"  # noqa: E501
            except Exception as e:
                print(f"Depth text error: {e}")
                depth_text = f"Error computing depth: {e}\n"

            if len(measure_points) == 2:
                try:
                    point1, point2 = measure_points
                    # Draw line
                    if (
                        0 <= point1[0] < image.shape[1]
                        and 0 <= point1[1] < image.shape[0]
                        and 0 <= point2[0] < image.shape[1]
                        and 0 <= point2[1] < image.shape[0]
                    ):
                        image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)

                    # Compute 3D distance using depth information and camera intrinsics
                    distance_text = "- **Distance: Unable to calculate 3D distance**"
                    if (
                        current_view["depth"] is not None
                        and 0 <= point1[1] < current_view["depth"].shape[0]
                        and 0 <= point1[0] < current_view["depth"].shape[1]
|
| 374 |
+
and 0 <= point2[1] < current_view["depth"].shape[0]
|
| 375 |
+
and 0 <= point2[0] < current_view["depth"].shape[1]
|
| 376 |
+
):
|
| 377 |
+
try:
|
| 378 |
+
# Get depth values at the two points
|
| 379 |
+
d1 = current_view["depth"][point1[1], point1[0]]
|
| 380 |
+
d2 = current_view["depth"][point2[1], point2[0]]
|
| 381 |
+
|
| 382 |
+
# Convert 2D pixel coordinates to 3D world coordinates
|
| 383 |
+
if current_view["intrinsics"] is not None:
|
| 384 |
+
# Get camera intrinsics
|
| 385 |
+
K = current_view["intrinsics"] # 3x3 intrinsic matrix
|
| 386 |
+
fx, fy = K[0, 0], K[1, 1] # focal lengths
|
| 387 |
+
cx, cy = K[0, 2], K[1, 2] # principal point
|
| 388 |
+
|
| 389 |
+
# Convert pixel coordinates to normalized camera coordinates
|
| 390 |
+
# Point 1: (u1, v1) -> (x1, y1, z1)
|
| 391 |
+
u1, v1 = point1[0], point1[1]
|
| 392 |
+
x1 = (u1 - cx) * d1 / fx
|
| 393 |
+
y1 = (v1 - cy) * d1 / fy
|
| 394 |
+
z1 = d1
|
| 395 |
+
|
| 396 |
+
# Point 2: (u2, v2) -> (x2, y2, z2)
|
| 397 |
+
u2, v2 = point2[0], point2[1]
|
| 398 |
+
x2 = (u2 - cx) * d2 / fx
|
| 399 |
+
y2 = (v2 - cy) * d2 / fy
|
| 400 |
+
z2 = d2
|
| 401 |
+
|
| 402 |
+
# Calculate 3D Euclidean distance
|
| 403 |
+
p1_3d = np.array([x1, y1, z1])
|
| 404 |
+
p2_3d = np.array([x2, y2, z2])
|
| 405 |
+
distance_3d = np.linalg.norm(p1_3d - p2_3d)
|
| 406 |
+
|
| 407 |
+
distance_text = f"- **Distance: {distance_3d:.2f}m**"
|
| 408 |
+
else:
|
| 409 |
+
# Fallback to simplified calculation if no intrinsics
|
| 410 |
+
pixel_distance = np.sqrt(
|
| 411 |
+
(point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2
|
| 412 |
+
)
|
| 413 |
+
avg_depth = (d1 + d2) / 2
|
| 414 |
+
scale_factor = avg_depth / 1000 # Rough scaling factor
|
| 415 |
+
estimated_3d_distance = pixel_distance * scale_factor
|
| 416 |
+
distance_text = f"- **Distance: {estimated_3d_distance:.2f}m (estimated, no intrinsics)**" # noqa: E501
|
| 417 |
+
|
| 418 |
+
except Exception as e:
|
| 419 |
+
print(f"Distance computation error: {e}")
|
| 420 |
+
distance_text = f"- **Distance computation error: {e}**"
|
| 421 |
+
|
| 422 |
+
measure_points = []
|
| 423 |
+
text = depth_text + distance_text
|
| 424 |
+
print(f"Measurement complete: {text}")
|
| 425 |
+
return [image, depth_right_half, measure_points, text]
|
| 426 |
+
except Exception as e:
|
| 427 |
+
print(f"Final measurement error: {e}")
|
| 428 |
+
return [None, [], f"Measurement error: {e}"]
|
| 429 |
+
else:
|
| 430 |
+
print(f"Single point measurement: {depth_text}")
|
| 431 |
+
return [image, depth_right_half, measure_points, depth_text]
|
| 432 |
+
|
| 433 |
+
except Exception as e:
|
| 434 |
+
print(f"Overall measure function error: {e}")
|
| 435 |
+
return [None, [], f"Measure function error: {e}"]
|
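For readers who want the geometry without the Gradio plumbing, the two-click distance above boils down to unprojecting each pixel through the pinhole model and taking a Euclidean norm. A minimal standalone sketch follows; `pixel_pair_distance` is a hypothetical helper name for illustration, not a function exported by the app modules.

import numpy as np

def pixel_pair_distance(p1, p2, depth, K):
    # Hypothetical sketch of the unprojection performed in measure():
    # lift each pixel (u, v) to camera space with x = (u - cx) * d / fx,
    # y = (v - cy) * d / fy, z = d, then take the 3D Euclidean distance.
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    pts = []
    for u, v in (p1, p2):
        d = depth[v, u]  # depth maps index as (row, col) = (v, u)
        pts.append(np.array([(u - cx) * d / fx, (v - cy) * d / fy, d]))
    return float(np.linalg.norm(pts[0] - pts[1]))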
src/depth_anything_3/cache.py
ADDED
@@ -0,0 +1,190 @@
"""
Model caching utilities for Depth Anything 3.

Provides model caching functionality to avoid reloading model weights on every instantiation.
This significantly reduces latency for repeated model creation (2-5s gain).
"""

from __future__ import annotations

import threading
from typing import Dict, Optional, Tuple

import torch
import torch.nn as nn

from depth_anything_3.utils.logger import logger


class ModelCache:
    """
    Thread-safe singleton cache for Depth Anything 3 models.

    Caches loaded model weights to avoid reloading from disk on every instantiation.
    Each unique combination of (model_name, device) is cached separately.

    Usage:
        cache = ModelCache()
        model = cache.get(model_name, device, loader_fn)
        # loader_fn is only called on a cache miss

    Thread Safety:
        Uses threading.Lock to ensure thread-safe access to the cache.

    Memory Management:
        - Models are kept in cache until explicitly cleared
        - Use clear() to free memory when needed
        - Use clear_device() to clear models on a specific device
    """

    _instance: Optional["ModelCache"] = None
    _lock = threading.Lock()

    def __new__(cls):
        """Singleton pattern to ensure a single cache instance."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialize cache storage."""
        if self._initialized:
            return

        self._cache: Dict[Tuple[str, str], nn.Module] = {}
        self._cache_lock = threading.Lock()
        self._initialized = True
        logger.info("ModelCache initialized")

    def get(
        self,
        model_name: str,
        device: torch.device | str,
        loader_fn: callable,
    ) -> nn.Module:
        """
        Get the cached model, or load it if not in cache.

        Args:
            model_name: Name of the model (e.g., "da3-large")
            device: Target device (cuda, mps, cpu)
            loader_fn: Function to load the model on a cache miss.
                Should return nn.Module

        Returns:
            Cached or freshly loaded model on the specified device

        Example:
            >>> cache = ModelCache()
            >>> model = cache.get(
            ...     "da3-large",
            ...     "cuda",
            ...     lambda: create_model()
            ... )
        """
        device_str = str(device)
        cache_key = (model_name, device_str)

        with self._cache_lock:
            if cache_key in self._cache:
                logger.debug(f"Model cache HIT: {model_name} on {device_str}")
                return self._cache[cache_key]

            logger.info(f"Model cache MISS: {model_name} on {device_str}. Loading...")
            model = loader_fn()
            self._cache[cache_key] = model
            logger.info(f"Model cached: {model_name} on {device_str}")

            return model

    def clear(self) -> None:
        """
        Clear the entire cache and free memory.

        Removes all cached models and forces garbage collection.
        Useful when switching between many different models.
        """
        with self._cache_lock:
            num_cached = len(self._cache)
            self._cache.clear()

            # Force garbage collection to free GPU memory
            import gc

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            if hasattr(torch, "mps") and torch.backends.mps.is_available():
                torch.mps.empty_cache()

            logger.info(f"Model cache cleared ({num_cached} models removed)")

    def clear_device(self, device: torch.device | str) -> None:
        """
        Clear all models on a specific device.

        Args:
            device: Device to clear (e.g., "cuda", "mps", "cpu")

        Example:
            >>> cache = ModelCache()
            >>> cache.clear_device("cuda")  # Clear all CUDA models
        """
        device_str = str(device)

        with self._cache_lock:
            keys_to_remove = [key for key in self._cache if key[1] == device_str]
            for key in keys_to_remove:
                del self._cache[key]

            # Free device memory
            if "cuda" in device_str and torch.cuda.is_available():
                torch.cuda.empty_cache()
            elif "mps" in device_str and hasattr(torch, "mps") and torch.backends.mps.is_available():
                torch.mps.empty_cache()

            logger.info(
                f"Model cache cleared for device {device_str} ({len(keys_to_remove)} models removed)"
            )

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache info:
            - total: Total number of cached models
            - by_device: Number of models per device
        """
        with self._cache_lock:
            info = {
                "total": len(self._cache),
                "by_device": {},
            }

            for model_name, device_str in self._cache.keys():
                if device_str not in info["by_device"]:
                    info["by_device"][device_str] = 0
                info["by_device"][device_str] += 1

            return info


# Global singleton instance
_global_cache = ModelCache()


def get_model_cache() -> ModelCache:
    """
    Get the global model cache instance.

    Returns:
        Singleton ModelCache instance

    Example:
        >>> from depth_anything_3.cache import get_model_cache
        >>> cache = get_model_cache()
        >>> cache.clear()
    """
    return _global_cache
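A short usage sketch of the cache API defined above; `build_model` is a stand-in for whatever loader the caller supplies, not a function defined in this package.

import torch.nn as nn

from depth_anything_3.cache import get_model_cache

def build_model() -> nn.Module:
    # Stand-in loader; real code would construct the DA3 network
    # and load its checkpoint from disk.
    return nn.Identity()

cache = get_model_cache()

# The first call misses the cache and runs build_model();
# the second call returns the same module instance.
model = cache.get("da3-large", "cpu", build_model)
assert cache.get("da3-large", "cpu", build_model) is model

print(cache.get_cache_info())  # {'total': 1, 'by_device': {'cpu': 1}}
cache.clear_device("cpu")      # evict models on one device only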
src/depth_anything_3/cfg.py
ADDED
@@ -0,0 +1,145 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Configuration utility functions
"""

import importlib
from pathlib import Path
from typing import Any, Callable, List, Union

from omegaconf import DictConfig, ListConfig, OmegaConf

try:
    OmegaConf.register_new_resolver("eval", eval)
except Exception as e:
    # If the resolver is already registered, just report and continue
    print(f"Error registering eval resolver: {e}")


def load_config(path: str, argv: List[str] = None) -> Union[DictConfig, ListConfig]:
    """
    Load a configuration. Will resolve inheritance.
    Supports both file paths and module paths (e.g., depth_anything_3.configs.giant).
    """
    # Check if path is a module path (contains dots but no slashes and doesn't end with .yaml)
    if "." in path and "/" not in path and not path.endswith(".yaml"):
        # It's a module path, load from package resources
        path_parts = path.split(".")[1:]
        config_path = Path(__file__).resolve().parent
        for part in path_parts:
            config_path = config_path.joinpath(part)
        config_path = config_path.with_suffix(".yaml")
        config = OmegaConf.load(str(config_path))
    else:
        # It's a file path (absolute, relative, or with .yaml extension)
        config = OmegaConf.load(path)

    if argv is not None:
        config_argv = OmegaConf.from_dotlist(argv)
        config = OmegaConf.merge(config, config_argv)
    config = resolve_recursive(config, resolve_inheritance)
    return config


def resolve_recursive(
    config: Any,
    resolver: Callable[[Union[DictConfig, ListConfig]], Union[DictConfig, ListConfig]],
) -> Any:
    config = resolver(config)
    if isinstance(config, DictConfig):
        for k in config.keys():
            v = config.get(k)
            if isinstance(v, (DictConfig, ListConfig)):
                config[k] = resolve_recursive(v, resolver)
    if isinstance(config, ListConfig):
        for i in range(len(config)):
            v = config.get(i)
            if isinstance(v, (DictConfig, ListConfig)):
                config[i] = resolve_recursive(v, resolver)
    return config


def resolve_inheritance(config: Union[DictConfig, ListConfig]) -> Any:
    """
    Recursively resolve inheritance if the config contains:
    __inherit__: path/to/parent.yaml or a ListConfig of such paths.
    """
    if isinstance(config, DictConfig):
        inherit = config.pop("__inherit__", None)

        if inherit:
            inherit_list = inherit if isinstance(inherit, ListConfig) else [inherit]

            parent_config = None
            for parent_path in inherit_list:
                assert isinstance(parent_path, str)
                parent_config = (
                    load_config(parent_path)
                    if parent_config is None
                    else OmegaConf.merge(parent_config, load_config(parent_path))
                )

            if len(config.keys()) > 0:
                config = OmegaConf.merge(parent_config, config)
            else:
                config = parent_config
    return config


def import_item(path: str, name: str) -> Any:
    """
    Import a python item. Example: import_item("path.to.file", "MyClass") -> MyClass
    """
    return getattr(importlib.import_module(path), name)


def create_object(config: DictConfig) -> Any:
    """
    Create an object from config.
    The config is expected to contain the following:
    __object__:
        path: path.to.module
        name: MyClass
        args: as_config | as_params (defaults to as_config)
    """
    config = DictConfig(config)
    item = import_item(
        path=config.__object__.path,
        name=config.__object__.name,
    )
    args = config.__object__.get("args", "as_config")
    if args == "as_config":
        return item(config)
    if args == "as_params":
        config = OmegaConf.to_object(config)
        config.pop("__object__")
        return item(**config)
    raise NotImplementedError(f"Unknown args type: {args}")


def create_dataset(path: str, *args, **kwargs) -> Any:
    """
    Create a dataset. Requires the file to contain a "create_dataset" function.
    """
    return import_item(path, "create_dataset")(*args, **kwargs)


def to_dict_recursive(config_obj):
    if isinstance(config_obj, DictConfig):
        return {k: to_dict_recursive(v) for k, v in config_obj.items()}
    elif isinstance(config_obj, ListConfig):
        return [to_dict_recursive(item) for item in config_obj]
    return config_obj
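To see how `__inherit__` and `__object__` fit together, here is a minimal sketch with two illustrative YAML files; the file names and the `my_pkg.models.Encoder` target are assumptions for illustration, not configs shipped with the package.

from depth_anything_3.cfg import create_object, load_config

# base.yaml (illustrative):
#   model:
#     __object__: {path: my_pkg.models, name: Encoder, args: as_params}
#     hidden_dim: 256
#
# child.yaml (illustrative):
#   __inherit__: base.yaml
#   model:
#     hidden_dim: 512

# load_config merges dotlist overrides first, then resolves __inherit__,
# so the final model.hidden_dim is 1024.
cfg = load_config("child.yaml", argv=["model.hidden_dim=1024"])

# With args: as_params, __object__ is stripped and the remaining keys
# become keyword arguments: Encoder(hidden_dim=1024).
model = create_object(cfg.model)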