Upload folder using huggingface_hub
Browse files
- .gitignore +1 -0
- README.md +85 -78
- config.json +3 -0
- export_onnx.py +18 -10
- preprocessor_config.json +1 -1
- web/script.js +2 -2
.gitignore
CHANGED
|
@@ -3,4 +3,5 @@ __pycache__/
|
|
| 3 |
*.pyc
|
| 4 |
.DS_Store
|
| 5 |
*.safetensors
|
|
|
|
| 6 |
.vscode/
|
|
|
|
| 3 |
*.pyc
|
| 4 |
.DS_Store
|
| 5 |
*.safetensors
|
| 6 |
+
*.onnx
|
| 7 |
.vscode/
|
README.md
CHANGED
|
@@ -1,91 +1,98 @@
|
|
| 1 |
-
-
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
library_name: onnx
|
| 4 |
-
tags:
|
| 5 |
-
- depth-estimation
|
| 6 |
-
- panoramic
|
| 7 |
-
- 360-degree
|
| 8 |
-
- webgpu
|
| 9 |
-
- onnx
|
| 10 |
-
pipeline_tag: depth-estimation
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
# DA-2: Depth Anything in Any Direction (ONNX WebGPU Version)
|
| 14 |
-
|
| 15 |
-
This repository contains the **ONNX** weights for [DA-2: Depth Anything in Any Direction](https://github.com/EnVision-Research/DA-2), optimized for **WebGPU** inference in the browser.
|
| 16 |
-
|
| 17 |
-
## Model Details
|
| 18 |
-
|
| 19 |
-
- **Original Model:** [haodongli/DA-2](https://huggingface.co/haodongli/DA-2)
|
| 20 |
-
- **Framework:** ONNX (Opset 17)
|
| 21 |
-
- **Precision:** FP32 (Full Precision)
|
| 22 |
-
- **Input Resolution:** 1092x546
|
| 23 |
-
- **Size:** ~1.4 GB
|
| 24 |
-
|
| 25 |
-
## Conversion Details
|
| 26 |
-
|
| 27 |
-
This model was converted from the original PyTorch weights to ONNX to enable client-side inference using `onnxruntime-web`.
|
| 28 |
-
|
| 29 |
-
- **Optimization:** Constant folding applied.
|
| 30 |
-
- **Compatibility:** Verified with WebGPU backend.
|
| 31 |
-
- **Modifications:**
|
| 32 |
-
- Replaced `clamp` operators with `Max`/`Min` combinations to ensure WebGPU kernel compatibility.
|
| 33 |
-
- Removed internal normalization layers to allow raw 0-1 input from the browser.
|
| 34 |
-
|
| 35 |
-
## Usage (Transformers.js)
|
| 36 |
-
|
| 37 |
-
You can also run this model using [Transformers.js](https://huggingface.co/docs/transformers.js).
|
| 38 |
-
|
| 39 |
-
```javascript
|
| 40 |
-
import { pipeline } from '@xenova/transformers';
|
| 41 |
-
|
| 42 |
-
// Initialize the pipeline
|
| 43 |
-
const depth_estimator = await pipeline('depth-estimation', 'phiph/DA-2-WebGPU', {
|
| 44 |
-
device: 'webgpu',
|
| 45 |
-
dtype: 'fp32', // Use FP32 as exported
|
| 46 |
-
});
|
| 47 |
-
|
| 48 |
-
// Run inference
|
| 49 |
-
const url = 'path/to/your/panorama.jpg';
|
| 50 |
-
const output = await depth_estimator(url);
|
| 51 |
-
// output.depth is the raw tensor
|
| 52 |
-
// output.mask is the visualized depth map
|
| 53 |
-
```
|
| 54 |
-
|
| 55 |
-
## Usage (ONNX Runtime Web)
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
| 60 |
-
import * as ort from 'onnxruntime-web/webgpu';
|
| 61 |
|
| 62 |
-
|
| 63 |
-
// Note: Model is now in the 'onnx' subdirectory
|
| 64 |
-
const session = await ort.InferenceSession.create('https://huggingface.co/phiph/DA-2-WebGPU/resolve/main/onnx/model.onnx', {
|
| 65 |
-
executionProviders: ['webgpu'],
|
| 66 |
-
preferredOutputLocation: { last_hidden_state: 'gpu-buffer' }
|
| 67 |
-
});
|
| 68 |
|
| 69 |
-
/
|
| 70 |
-
// Note: Do NOT apply ImageNet mean/std normalization. The model expects raw 0-1 floats.
|
| 71 |
-
const tensor = new ort.Tensor('float32', float32Data, [1, 3, 546, 1092]);
|
| 72 |
-
|
| 73 |
-
// 3. Run Inference
|
| 74 |
-
const results = await session.run({ images: tensor });
|
| 75 |
-
const depthMap = results.depth; // Access output
|
| 76 |
-
```
|
| 77 |
|
| 78 |
-
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
Please cite the original authors if you use this model:
|
| 83 |
|
| 84 |
```bibtex
|
| 85 |
-
@article{
|
| 86 |
-
title={
|
| 87 |
author={Li, Haodong and Zheng, Wangguangdong and He, Jing and Liu, Yuhao and Lin, Xin and Yang, Xin and Chen, Ying-Cong and Guo, Chunchao},
|
| 88 |
journal={arXiv preprint arXiv:2509.26618},
|
| 89 |
year={2025}
|
| 90 |
}
|
| 91 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DA-2 WebGPU Port
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
This repository contains a port of the **DA-2 (Depth Anything in Any Direction)** model to run entirely in the browser using **WebGPU** and **ONNX Runtime**.
|
| 4 |
|
| 5 |
+
The original work was developed by EnVision-Research. This port enables real-time, client-side depth estimation from panoramic images without requiring a backend server for inference.
|
|
|
|
| 6 |
|
| 7 |
+
## 🔗 Original Work
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
**DA<sup>2</sup>: Depth Anything in Any Direction**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
* **Repository:** [EnVision-Research/DA-2](https://github.com/EnVision-Research/DA-2)
|
| 12 |
+
* **Paper:** [arXiv:2509.26618](https://arxiv.org/abs/2509.26618)
|
| 13 |
+
* **Project Page:** [depth-any-in-any-dir.github.io](https://depth-any-in-any-dir.github.io/)
|
| 14 |
|
| 15 |
+
Please cite the original paper if you use this work:
|
|
|
|
|
|
|
| 16 |
|
| 17 |
```bibtex
|
| 18 |
+
@article{li2025da2,
|
| 19 |
+
title={DA$^{2}$: Depth Anything in Any Direction},
|
| 20 |
author={Li, Haodong and Zheng, Wangguangdong and He, Jing and Liu, Yuhao and Lin, Xin and Yang, Xin and Chen, Ying-Cong and Guo, Chunchao},
|
| 21 |
journal={arXiv preprint arXiv:2509.26618},
|
| 22 |
year={2025}
|
| 23 |
}
|
| 24 |
```
|
| 25 |
+
|
| 26 |
+
## 🚀 WebGPU Demo
|
| 27 |
+
|
| 28 |
+
This project includes a web-based demo that runs the model directly in your browser.
|
| 29 |
+
|
| 30 |
+
### Prerequisites
|
| 31 |
+
|
| 32 |
+
* **Python 3.10+** (for model export)
|
| 33 |
+
* **Web Browser** with WebGPU support (Chrome 113+, Edge 113+, or Firefox Nightly).
|
| 34 |
+
|
| 35 |
+
### Installation
|
| 36 |
+
|
| 37 |
+
1. **Clone the repository:**
|
| 38 |
+
```bash
|
| 39 |
+
git clone <your-repo-url>
|
| 40 |
+
cd DA-2-Web
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
2. **Set up Python environment:**
|
| 44 |
+
```bash
|
| 45 |
+
python3 -m venv venv
|
| 46 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 47 |
+
pip install -r requirements.txt
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### Model Preparation
|
| 51 |
+
|
| 52 |
+
To run the demo, you first need to convert the PyTorch model to ONNX format.
|
| 53 |
+
|
| 54 |
+
1. **Download the model weights:**
|
| 55 |
+
Download `model.safetensors` from the [HuggingFace repository](https://huggingface.co/haodongli/DA-2) and place it in the root directory of this project.
|
| 56 |
+
|
| 57 |
+
2. **Export to ONNX:**
|
| 58 |
+
Run the export script. This script handles the conversion to FP16 and applies necessary fixes for WebGPU compatibility (e.g., replacing `clamp` with `max`/`min`).
|
| 59 |
+
```bash
|
| 60 |
+
python export_onnx.py
|
| 61 |
+
```
|
| 62 |
+
This will generate `onnx/model.onnx` and, if the quantization step succeeds, `onnx/model_quantized.onnx`.
|
| 63 |
+
|
| 64 |
+
3. **Merge ONNX files:**
|
| 65 |
+
The export process might generate external data files. Use the merge script to create a single `.onnx` file for easier web loading.
|
| 66 |
+
```bash
|
| 67 |
+
python merge_onnx.py
|
| 68 |
+
```
|
| 69 |
+
This will generate `da2_model_single.onnx`.
|
| 70 |
+
|
| 71 |
+
### Running the Demo
|
| 72 |
+
|
| 73 |
+
1. **Start a local web server:**
|
| 74 |
+
You need to serve the files over HTTP(S) for the browser to load the model and WebGPU context.
|
| 75 |
+
```bash
|
| 76 |
+
python3 -m http.server 8000
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
2. **Open in Browser:**
|
| 80 |
+
Navigate to `http://localhost:8000/web/` in your WebGPU-compatible browser.
|
| 81 |
+
|
| 82 |
+
3. **Usage:**
|
| 83 |
+
* Click "Choose File" to upload a panoramic image.
|
| 84 |
+
* Click "Run Inference" to generate the depth map.
|
| 85 |
+
* The process runs entirely locally on your GPU.
|
| 86 |
+
|
| 87 |
+
## 🛠️ Technical Details of the Port
|
| 88 |
+
|
| 89 |
+
* **Precision:** The model was converted to **FP16 (Half Precision)** to reduce file size (~1.4GB -> ~700MB) and improve performance on consumer GPUs.
|
| 90 |
+
* **Opset:** Exported using **ONNX Opset 17**.
|
| 91 |
+
* **Modifications:**
|
| 92 |
+
* The `SphereViT` and `ViT_w_Esphere` modules were modified to ensure strict FP16 compatibility.
|
| 93 |
+
* `torch.clamp` operations were replaced with `torch.max` and `torch.min` combinations to avoid `Clip` operator issues in `onnxruntime-web` when handling mixed scalar/tensor inputs.
|
| 94 |
+
* Sphere embeddings are pre-calculated and cast to FP16 within the model graph.
|
| 95 |
+
|
| 96 |
+
## 📄 License
|
| 97 |
+
|
| 98 |
+
This project follows the license of the original [DA-2 repository](https://github.com/EnVision-Research/DA-2). Please refer to the original repository for license details.
|
config.json
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
| 2 |
"model_type": "depth_anything",
|
| 3 |
"transformers_version": "4.39.0",
|
| 4 |
"image_size": [
|
|
|
|
| 1 |
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"DepthAnythingForDepthEstimation"
|
| 4 |
+
],
|
| 5 |
"model_type": "depth_anything",
|
| 6 |
"transformers_version": "4.39.0",
|
| 7 |
"image_size": [
|
export_onnx.py
CHANGED
|
@@ -51,14 +51,14 @@ model.eval()
|
|
| 51 |
dummy_input = torch.randn(1, 3, H, W)
|
| 52 |
|
| 53 |
# Export
|
| 54 |
-
output_file = "model.onnx"
|
| 55 |
print(f"Exporting to {output_file}...")
|
| 56 |
try:
|
| 57 |
torch.onnx.export(
|
| 58 |
model,
|
| 59 |
dummy_input,
|
| 60 |
output_file,
|
| 61 |
-
opset_version=
|
| 62 |
input_names=["pixel_values"],
|
| 63 |
output_names=["predicted_depth"],
|
| 64 |
dynamic_axes={
|
|
@@ -69,15 +69,23 @@ try:
|
|
| 69 |
do_constant_folding=True,
|
| 70 |
verbose=False
|
| 71 |
)
|
| 72 |
-
print(f"Successfully exported to {output_file}")
|
| 73 |
-
|
| 74 |
-
# Force single file (merge external data if any)
|
| 75 |
-
import onnx
|
| 76 |
-
print("Ensuring single ONNX file...")
|
| 77 |
-
onnx_model = onnx.load(output_file)
|
| 78 |
-
onnx.save_model(onnx_model, output_file, save_as_external_data=False)
|
| 79 |
-
print("Saved as single file.")
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
except Exception as e:
|
| 82 |
print(f"Error exporting to ONNX: {e}")
|
| 83 |
import traceback
|
|
|
|
| 51 |
dummy_input = torch.randn(1, 3, H, W)
|
| 52 |
|
| 53 |
# Export
|
| 54 |
+
output_file = "onnx/model.onnx"
|
| 55 |
print(f"Exporting to {output_file}...")
|
| 56 |
try:
|
| 57 |
torch.onnx.export(
|
| 58 |
model,
|
| 59 |
dummy_input,
|
| 60 |
output_file,
|
| 61 |
+
opset_version=17,
|
| 62 |
input_names=["pixel_values"],
|
| 63 |
output_names=["predicted_depth"],
|
| 64 |
dynamic_axes={
|
|
|
|
| 69 |
do_constant_folding=True,
|
| 70 |
verbose=False
|
| 71 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
+
print(f"Successfully exported to {output_file}")
|
| 74 |
+
# Quantize the exported ONNX model
|
| 75 |
+
try:
|
| 76 |
+
from onnxruntime.quantization import quantize_dynamic, QuantType
|
| 77 |
+
quantized_output_file = "onnx/model_quantized.onnx"
|
| 78 |
+
print(f"Quantizing model to {quantized_output_file}...")
|
| 79 |
+
quantize_dynamic(
|
| 80 |
+
output_file,
|
| 81 |
+
quantized_output_file,
|
| 82 |
+
weight_type=QuantType.QInt8
|
| 83 |
+
)
|
| 84 |
+
print(f"Successfully quantized to {quantized_output_file}")
|
| 85 |
+
except Exception as qe:
|
| 86 |
+
print(f"Error during quantization: {qe}")
|
| 87 |
+
import traceback
|
| 88 |
+
traceback.print_exc()
|
| 89 |
except Exception as e:
|
| 90 |
print(f"Error exporting to ONNX: {e}")
|
| 91 |
import traceback
|
preprocessor_config.json
CHANGED
|
@@ -8,5 +8,5 @@
|
|
| 8 |
},
|
| 9 |
"do_rescale": true,
|
| 10 |
"rescale_factor": 0.00392156862745098,
|
| 11 |
-
"image_processor_type": "
|
| 12 |
}
|
|
|
|
| 8 |
},
|
| 9 |
"do_rescale": true,
|
| 10 |
"rescale_factor": 0.00392156862745098,
|
| 11 |
+
"image_processor_type": "DepthAnythingImageProcessor"
|
| 12 |
}
|
web/script.js
CHANGED
|
@@ -71,9 +71,9 @@ runBtn.addEventListener('click', async () => {
|
|
| 71 |
const tensor = preprocess(imageData);
|
| 72 |
|
| 73 |
// Run inference
|
| 74 |
-
const feeds = {
|
| 75 |
const results = await session.run(feeds);
|
| 76 |
-
const output = results.
|
| 77 |
|
| 78 |
// Postprocess and visualize
|
| 79 |
visualize(output.data, INPUT_WIDTH, INPUT_HEIGHT);
|
|
|
|
| 71 |
const tensor = preprocess(imageData);
|
| 72 |
|
| 73 |
// Run inference
|
| 74 |
+
const feeds = { pixel_values: tensor };
|
| 75 |
const results = await session.run(feeds);
|
| 76 |
+
const output = results.predicted_depth;
|
| 77 |
|
| 78 |
// Postprocess and visualize
|
| 79 |
visualize(output.data, INPUT_WIDTH, INPUT_HEIGHT);
|