Commit ·
2ec64ce
0
Parent(s):
Initial DIRECT model release
Browse files
- .gitattributes +35 -0
- README.md +43 -0
- condition_embedder.safetensors +3 -0
- config.json +38 -0
- image_projector.safetensors +3 -0
- lora.safetensors +3 -0
- pooled_image_projector.safetensors +3 -0
- time_text_embed.safetensors +3 -0
- x_embedder.safetensors +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model:
|
| 3 |
+
- black-forest-labs/FLUX.1-Fill-dev
|
| 4 |
+
- microsoft/TRELLIS-image-large
|
| 5 |
+
tags:
|
| 6 |
+
- object-insertion
|
| 7 |
+
- image-to-image
|
| 8 |
+
- 3d-aware
|
| 9 |
+
- pose-controllable-generation
|
| 10 |
+
pipeline_tag: image-to-image
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# DIRECT
|
| 14 |
+
|
| 15 |
+
This repository contains the model weights for **Direct 3D-Aware Object Insertion via Decomposed Visual Proxies**.
|
| 16 |
+
|
| 17 |
+
DIRECT performs pose-controllable object insertion by decomposing the insertion condition into visual proxies, including a reference object image, a geometry proxy rendered from a reconstructed 3D object, and a scene context image.
|
| 18 |
+
|
| 19 |
+
Project page: https://gong1130.github.io/DIRECT/
|
| 20 |
+
|
| 21 |
+
Code: https://github.com/Gong1130/DIRECT
|
| 22 |
+
|
| 23 |
+
## Usage
|
| 24 |
+
|
| 25 |
+
Please refer to the official code repository (https://github.com/Gong1130/DIRECT) for installation instructions and **interactive demo** usage.
|
| 26 |
+
|
| 27 |
+
## Model Details
|
| 28 |
+
|
| 29 |
+
This repository contains **only** the **DIRECT-specific** weights and their configuration file:
|
| 30 |
+
|
| 31 |
+
- `lora.safetensors`
|
| 32 |
+
- `condition_embedder.safetensors`
|
| 33 |
+
- `x_embedder.safetensors`
|
| 34 |
+
- `time_text_embed.safetensors`
|
| 35 |
+
- `pooled_image_projector.safetensors`
|
| 36 |
+
- `image_projector.safetensors`
|
| 37 |
+
- `config.json`
|
| 38 |
+
|
| 39 |
+
The model also requires the following **external** models, which are not included in this repository:
|
| 40 |
+
|
| 41 |
+
- `black-forest-labs/FLUX.1-Fill-dev`
|
| 42 |
+
- `google/siglip2-so400m-patch14-384`
|
| 43 |
+
- `microsoft/TRELLIS-image-large`
|
condition_embedder.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:955a10ab50142b229ad9cc6ef807bac9c7bee6c8ef3b6fc0a14edc7400e34a77
|
| 3 |
+
size 798872
|
config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "direct_pipeline",
|
| 3 |
+
"flux_model": "black-forest-labs/FLUX.1-Fill-dev",
|
| 4 |
+
"siglip_model": "google/siglip2-so400m-patch14-384",
|
| 5 |
+
"torch_dtype": "bfloat16",
|
| 6 |
+
"lora": {
|
| 7 |
+
"ranks": [128, 128],
|
| 8 |
+
"alphas": [128, 128],
|
| 9 |
+
"weights": [1, 1],
|
| 10 |
+
"n_loras": 2,
|
| 11 |
+
"double_blocks": 19,
|
| 12 |
+
"single_blocks": 38,
|
| 13 |
+
"text": {
|
| 14 |
+
"rank": 128,
|
| 15 |
+
"alpha": 128,
|
| 16 |
+
"token_length": 729
|
| 17 |
+
}
|
| 18 |
+
},
|
| 19 |
+
"condition_embedder": {
|
| 20 |
+
"input_dim": 64
|
| 21 |
+
},
|
| 22 |
+
"pooled_image_projector": {
|
| 23 |
+
"input_dim": 1152,
|
| 24 |
+
"output_dim": 768
|
| 25 |
+
},
|
| 26 |
+
"image_projector": {
|
| 27 |
+
"input_dim": 1152,
|
| 28 |
+
"output_dim": 4096
|
| 29 |
+
},
|
| 30 |
+
"weight_files": {
|
| 31 |
+
"lora": "lora.safetensors",
|
| 32 |
+
"condition_embedder": "condition_embedder.safetensors",
|
| 33 |
+
"x_embedder": "x_embedder.safetensors",
|
| 34 |
+
"time_text_embed": "time_text_embed.safetensors",
|
| 35 |
+
"pooled_image_projector": "pooled_image_projector.safetensors",
|
| 36 |
+
"image_projector": "image_projector.safetensors"
|
| 37 |
+
}
|
| 38 |
+
}
|
image_projector.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a083dd6d0a6cca808b92c9933046400d203f9492943504215ce7f25dad85e6d4
|
| 3 |
+
size 18890904
|
lora.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15a1cb8165f291acd0850bbdd0c7e7694051921a22b993d059a180f1dce4876b
|
| 3 |
+
size 896689296
|
pooled_image_projector.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71c61c3f4990a02150ac3109ef5f86949ae32272e2e2b68eb24fe747984c481e
|
| 3 |
+
size 3542160
|
time_text_embed.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eab6be9862c33ef88fb131d9538f79468e871dd49bf86d99cba58ca6ebc2223a
|
| 3 |
+
size 64525712
|
x_embedder.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7720552159ecc53f14a40792184e0c5b4bc02b0d829a13b3661ddb3d1c5aa90
|
| 3 |
+
size 2365616
|