Upload 16 files
Browse files- checkpoints/pipeline.yaml +106 -0
- checkpoints/slat_decoder_gs.ckpt +3 -0
- checkpoints/slat_decoder_gs.yaml +25 -0
- checkpoints/slat_decoder_gs_4.ckpt +3 -0
- checkpoints/slat_decoder_gs_4.yaml +25 -0
- checkpoints/slat_decoder_mesh.ckpt +3 -0
- checkpoints/slat_decoder_mesh.pt +3 -0
- checkpoints/slat_decoder_mesh.yaml +12 -0
- checkpoints/slat_generator.ckpt +3 -0
- checkpoints/slat_generator.yaml +60 -0
- checkpoints/ss_decoder.ckpt +3 -0
- checkpoints/ss_decoder.yaml +10 -0
- checkpoints/ss_encoder.safetensors +3 -0
- checkpoints/ss_encoder.yaml +0 -0
- checkpoints/ss_generator.ckpt +3 -0
- checkpoints/ss_generator.yaml +141 -0
checkpoints/pipeline.yaml
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: sam3d_objects.pipeline.inference_pipeline_pointmap.InferencePipelinePointMap
|
| 2 |
+
ss_generator_config_path: ss_generator.yaml
|
| 3 |
+
ss_generator_ckpt_path: ss_generator.ckpt
|
| 4 |
+
slat_generator_config_path: slat_generator.yaml
|
| 5 |
+
slat_generator_ckpt_path: slat_generator.ckpt
|
| 6 |
+
ss_decoder_config_path: ss_decoder.yaml
|
| 7 |
+
ss_decoder_ckpt_path: ss_decoder.ckpt
|
| 8 |
+
slat_decoder_gs_config_path: slat_decoder_gs.yaml
|
| 9 |
+
slat_decoder_gs_ckpt_path: slat_decoder_gs.ckpt
|
| 10 |
+
slat_decoder_gs_4_config_path: slat_decoder_gs_4.yaml
|
| 11 |
+
slat_decoder_gs_4_ckpt_path: slat_decoder_gs_4.ckpt
|
| 12 |
+
slat_decoder_mesh_config_path: slat_decoder_mesh.yaml
|
| 13 |
+
slat_decoder_mesh_ckpt_path: slat_decoder_mesh.ckpt
|
| 14 |
+
pad_size: 1.0
|
| 15 |
+
dtype: float16
|
| 16 |
+
version: 3dfy_v9
|
| 17 |
+
slat_cfg_strength: 1
|
| 18 |
+
slat_rescale_t: 1
|
| 19 |
+
downsample_ss_dist: 1
|
| 20 |
+
compile_model: true
|
| 21 |
+
ss_condition_input_mapping: []
|
| 22 |
+
ss_preprocessor:
|
| 23 |
+
_target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
|
| 24 |
+
img_mask_joint_transform: []
|
| 25 |
+
img_mask_pointmap_joint_transform:
|
| 26 |
+
- _partial_: true
|
| 27 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.resize_all_to_same_size
|
| 28 |
+
- _partial_: true
|
| 29 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
|
| 30 |
+
box_size_factor: 1.2
|
| 31 |
+
padding_factor: 0.0
|
| 32 |
+
img_transform:
|
| 33 |
+
_target_: torchvision.transforms.Compose
|
| 34 |
+
transforms:
|
| 35 |
+
- _partial_: true
|
| 36 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
|
| 37 |
+
- _target_: torchvision.transforms.Resize
|
| 38 |
+
size: 518
|
| 39 |
+
mask_transform:
|
| 40 |
+
_target_: torchvision.transforms.Compose
|
| 41 |
+
transforms:
|
| 42 |
+
- _partial_: true
|
| 43 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
|
| 44 |
+
- _target_: torchvision.transforms.Resize
|
| 45 |
+
interpolation: 0
|
| 46 |
+
size: 518
|
| 47 |
+
normalize_pointmap: true
|
| 48 |
+
pointmap_normalizer:
|
| 49 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.ObjectCentricSSI
|
| 50 |
+
allow_scale_and_shift_override: true
|
| 51 |
+
use_scene_scale: true
|
| 52 |
+
pointmap_transform:
|
| 53 |
+
_target_: torchvision.transforms.Compose
|
| 54 |
+
transforms:
|
| 55 |
+
- _partial_: true
|
| 56 |
+
_target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
|
| 57 |
+
- _target_: torchvision.transforms.Resize
|
| 58 |
+
interpolation: 0
|
| 59 |
+
size: 518
|
| 60 |
+
pose_decoder_name: ScaleShiftInvariant
|
| 61 |
+
depth_model:
|
| 62 |
+
_target_: sam3d_objects.pipeline.depth_models.moge.MoGe
|
| 63 |
+
model:
|
| 64 |
+
_target_: moge.model.v1.MoGeModel.from_pretrained
|
| 65 |
+
pretrained_model_name_or_path: Ruicheng/moge-vitl
|
| 66 |
+
slat_condition_input_mapping: []
|
| 67 |
+
slat_preprocessor:
|
| 68 |
+
_target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
|
| 69 |
+
img_transform:
|
| 70 |
+
_target_: torchvision.transforms.Compose
|
| 71 |
+
transforms:
|
| 72 |
+
- _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
|
| 73 |
+
_partial_: true
|
| 74 |
+
- _target_: torchvision.transforms.Resize
|
| 75 |
+
size: 518
|
| 76 |
+
mask_transform:
|
| 77 |
+
_target_: torchvision.transforms.Compose
|
| 78 |
+
transforms:
|
| 79 |
+
- _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
|
| 80 |
+
_partial_: true
|
| 81 |
+
- _target_: torchvision.transforms.Resize
|
| 82 |
+
size: 518
|
| 83 |
+
interpolation: 0
|
| 84 |
+
img_mask_joint_transform:
|
| 85 |
+
- _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
|
| 86 |
+
_partial_: true
|
| 87 |
+
box_size_factor: 1.2
|
| 88 |
+
padding_factor: 0.0
|
| 89 |
+
slat_mean:
|
| 90 |
+
- 0.12211431
|
| 91 |
+
- 0.37204156
|
| 92 |
+
- -1.26521907
|
| 93 |
+
- -2.05276058
|
| 94 |
+
- -3.10432536
|
| 95 |
+
- -0.11294304
|
| 96 |
+
- -0.85146744
|
| 97 |
+
- 0.45506954
|
| 98 |
+
slat_std:
|
| 99 |
+
- 2.37326008
|
| 100 |
+
- 2.13174402
|
| 101 |
+
- 2.2413953
|
| 102 |
+
- 2.30589401
|
| 103 |
+
- 2.1191894
|
| 104 |
+
- 1.8969511
|
| 105 |
+
- 2.41684989
|
| 106 |
+
- 2.08374642
|
checkpoints/slat_decoder_gs.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8077c36a06eaf890dd93cda1937411f793dea1eb80b3dd9329f2038ba84a111
|
| 3 |
+
size 171476155
|
checkpoints/slat_decoder_gs.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
|
| 2 |
+
resolution: 64
|
| 3 |
+
model_channels: 768
|
| 4 |
+
latent_channels: 8
|
| 5 |
+
num_blocks: 12
|
| 6 |
+
num_heads: 12
|
| 7 |
+
mlp_ratio: 4
|
| 8 |
+
attn_mode: swin
|
| 9 |
+
window_size: 8
|
| 10 |
+
representation_config:
|
| 11 |
+
lr:
|
| 12 |
+
_xyz: 1.0
|
| 13 |
+
_features_dc: 1.0
|
| 14 |
+
_opacity: 1.0
|
| 15 |
+
_scaling: 1.0
|
| 16 |
+
_rotation: 0.1
|
| 17 |
+
perturb_offset: true
|
| 18 |
+
voxel_size: 1.5
|
| 19 |
+
num_gaussians: 32
|
| 20 |
+
2d_filter_kernel_size: 0.1
|
| 21 |
+
3d_filter_kernel_size: 0.0009
|
| 22 |
+
scaling_bias: 0.004
|
| 23 |
+
opacity_bias: 0.1
|
| 24 |
+
scaling_activation: softplus
|
| 25 |
+
use_fp16: true
|
checkpoints/slat_decoder_gs_4.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:731a0eceaa47945b52aa27f650d695b2aea9cc70945751e5609e5cb5b49f0186
|
| 3 |
+
size 170269801
|
checkpoints/slat_decoder_gs_4.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
|
| 2 |
+
resolution: 64
|
| 3 |
+
model_channels: 768
|
| 4 |
+
latent_channels: 8
|
| 5 |
+
num_blocks: 12
|
| 6 |
+
num_heads: 12
|
| 7 |
+
mlp_ratio: 4
|
| 8 |
+
attn_mode: swin
|
| 9 |
+
window_size: 8
|
| 10 |
+
representation_config:
|
| 11 |
+
lr:
|
| 12 |
+
_xyz: 1.0
|
| 13 |
+
_features_dc: 1.0
|
| 14 |
+
_opacity: 1.0
|
| 15 |
+
_scaling: 1.0
|
| 16 |
+
_rotation: 0.1
|
| 17 |
+
perturb_offset: true
|
| 18 |
+
voxel_size: 1.5
|
| 19 |
+
num_gaussians: 4
|
| 20 |
+
2d_filter_kernel_size: 0.1
|
| 21 |
+
3d_filter_kernel_size: 0.0009
|
| 22 |
+
scaling_bias: 0.004
|
| 23 |
+
opacity_bias: 0.1
|
| 24 |
+
scaling_activation: softplus
|
| 25 |
+
use_fp16: true
|
checkpoints/slat_decoder_mesh.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85907b37b67d8ce5b099a96629bdcfbd873eb407dee6b3aa9a75deb15038db33
|
| 3 |
+
size 363726862
|
checkpoints/slat_decoder_mesh.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93333fcd57a3e36ded0b3bca6969e05ce2b35142029dadab514f41df46d2f985
|
| 3 |
+
size 363728714
|
checkpoints/slat_decoder_mesh.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_mesh.SLatMeshDecoderTdfyWrapper
|
| 2 |
+
resolution: 64
|
| 3 |
+
model_channels: 768
|
| 4 |
+
latent_channels: 8
|
| 5 |
+
num_blocks: 12
|
| 6 |
+
num_heads: 12
|
| 7 |
+
mlp_ratio: 4
|
| 8 |
+
attn_mode: swin
|
| 9 |
+
window_size: 8
|
| 10 |
+
representation_config:
|
| 11 |
+
use_color: true
|
| 12 |
+
use_fp16: true
|
checkpoints/slat_generator.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91529bde8e7daa12d09618a66c319e3a5a6398db6b23b958cedcb1c3f28faabb
|
| 3 |
+
size 4906537684
|
checkpoints/slat_generator.yaml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
module:
|
| 2 |
+
condition_embedder:
|
| 3 |
+
backbone:
|
| 4 |
+
_target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
|
| 5 |
+
embedder_list:
|
| 6 |
+
- - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
|
| 7 |
+
dino_model: dinov2_vitl14_reg
|
| 8 |
+
input_size: 518
|
| 9 |
+
normalize_images: true
|
| 10 |
+
prenorm_features: true
|
| 11 |
+
- - - image
|
| 12 |
+
- cropped
|
| 13 |
+
- - rgb_image
|
| 14 |
+
- full
|
| 15 |
+
- - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
|
| 16 |
+
dino_model: dinov2_vitl14_reg
|
| 17 |
+
input_size: 518
|
| 18 |
+
normalize_images: true
|
| 19 |
+
prenorm_features: true
|
| 20 |
+
- - - mask
|
| 21 |
+
- cropped
|
| 22 |
+
- - rgb_image_mask
|
| 23 |
+
- full
|
| 24 |
+
projection_net_hidden_dim_multiplier: 4.0
|
| 25 |
+
use_pos_embedding: learned
|
| 26 |
+
generator:
|
| 27 |
+
backbone:
|
| 28 |
+
_target_: sam3d_objects.model.backbone.generator.flow_matching.model.FlowMatching
|
| 29 |
+
inference_steps: 12
|
| 30 |
+
reverse_fn:
|
| 31 |
+
_target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidance
|
| 32 |
+
backbone:
|
| 33 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_flow.SLatFlowModelTdfyWrapper
|
| 34 |
+
cond_channels: 1024
|
| 35 |
+
condition_embedder: null
|
| 36 |
+
force_zeros_cond: true
|
| 37 |
+
in_channels: 8
|
| 38 |
+
io_block_channels:
|
| 39 |
+
- 128
|
| 40 |
+
mlp_ratio: 4
|
| 41 |
+
model_channels: 1024
|
| 42 |
+
num_blocks: 24
|
| 43 |
+
num_heads: 16
|
| 44 |
+
num_io_res_blocks: 2
|
| 45 |
+
out_channels: 8
|
| 46 |
+
patch_size: 2
|
| 47 |
+
pe_mode: ape
|
| 48 |
+
qk_rms_norm: true
|
| 49 |
+
resolution: 64
|
| 50 |
+
use_fp16: true
|
| 51 |
+
p_unconditional: 0.0
|
| 52 |
+
strength: 0.0
|
| 53 |
+
unconditional_handling: add_flag
|
| 54 |
+
sigma_min: 0.0
|
| 55 |
+
time_scale: 1000.0
|
| 56 |
+
training_time_sampler_fn:
|
| 57 |
+
_partial_: true
|
| 58 |
+
_target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
|
| 59 |
+
mean: -1.0
|
| 60 |
+
std: 1.0
|
checkpoints/ss_decoder.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6dac1cd7b7fda5a38e0614fadae441f1794f80e39ea2981f1ac8aff0a7e99340
|
| 3 |
+
size 147609242
|
checkpoints/ss_decoder.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.sparse_structure_vae.SparseStructureDecoderTdfyWrapper
|
| 2 |
+
out_channels: 1
|
| 3 |
+
latent_channels: 8
|
| 4 |
+
num_res_blocks: 2
|
| 5 |
+
num_res_blocks_middle: 2
|
| 6 |
+
channels:
|
| 7 |
+
- 512
|
| 8 |
+
- 128
|
| 9 |
+
- 32
|
| 10 |
+
reshape_input_to_cube: false
|
checkpoints/ss_encoder.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
| 3 |
+
size 0
|
checkpoints/ss_encoder.yaml
ADDED
|
File without changes
|
checkpoints/ss_generator.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:225f40479e4cff4f39d6fa14c55be3abad1475bf55b61af3bec1e19ed2f6c146
|
| 3 |
+
size 6690136964
|
checkpoints/ss_generator.yaml
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
module:
|
| 2 |
+
condition_embedder:
|
| 3 |
+
backbone:
|
| 4 |
+
_target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
|
| 5 |
+
drop_modalities_weight:
|
| 6 |
+
- - - pointmap
|
| 7 |
+
- rgb_pointmap
|
| 8 |
+
- 1.0
|
| 9 |
+
dropout_prob: 0.1
|
| 10 |
+
embedder_list:
|
| 11 |
+
- - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
|
| 12 |
+
dino_model: dinov2_vitl14_reg
|
| 13 |
+
input_size: 518
|
| 14 |
+
normalize_images: true
|
| 15 |
+
- - - image
|
| 16 |
+
- cropped
|
| 17 |
+
- - rgb_image
|
| 18 |
+
- full
|
| 19 |
+
- - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
|
| 20 |
+
dino_model: dinov2_vitl14_reg
|
| 21 |
+
input_size: 518
|
| 22 |
+
normalize_images: true
|
| 23 |
+
- - - mask
|
| 24 |
+
- cropped
|
| 25 |
+
- - rgb_image_mask
|
| 26 |
+
- full
|
| 27 |
+
- - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
|
| 28 |
+
embed_dim: 512
|
| 29 |
+
input_size: 256
|
| 30 |
+
patch_size: 8
|
| 31 |
+
remap_output: linear
|
| 32 |
+
- - - pointmap
|
| 33 |
+
- cropped
|
| 34 |
+
- - rgb_pointmap
|
| 35 |
+
- full
|
| 36 |
+
force_drop_modalities: null
|
| 37 |
+
freeze: true
|
| 38 |
+
projection_net_hidden_dim_multiplier: 4.0
|
| 39 |
+
use_pos_embedding: learned
|
| 40 |
+
generator:
|
| 41 |
+
backbone:
|
| 42 |
+
_target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
|
| 43 |
+
batch_mode: true
|
| 44 |
+
cfg_modalities:
|
| 45 |
+
- shape
|
| 46 |
+
inference_steps: 2
|
| 47 |
+
loss_weights:
|
| 48 |
+
6drotation_normalized: 0.1
|
| 49 |
+
_target_: sam3d_objects.config.utils.make_dict
|
| 50 |
+
scale: 0.1
|
| 51 |
+
shape: 0
|
| 52 |
+
translation: 1.0
|
| 53 |
+
translation_scale: 0.0
|
| 54 |
+
ratio_cfg_samples_in_self_consistency_target: 0.25
|
| 55 |
+
rescale_t: 1
|
| 56 |
+
reverse_fn:
|
| 57 |
+
_target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
|
| 58 |
+
backbone:
|
| 59 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
|
| 60 |
+
cond_channels: 1024
|
| 61 |
+
condition_embedder: null
|
| 62 |
+
force_zeros_cond: true
|
| 63 |
+
freeze_d_time_embedder: true
|
| 64 |
+
freeze_shared_parameters: true
|
| 65 |
+
in_channels: 8
|
| 66 |
+
is_shortcut_model: true
|
| 67 |
+
latent_mapping:
|
| 68 |
+
6drotation_normalized:
|
| 69 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
|
| 70 |
+
in_channels: 6
|
| 71 |
+
model_channels: 1024
|
| 72 |
+
pos_embedder:
|
| 73 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
|
| 74 |
+
model_channels: 1024
|
| 75 |
+
token_len: 1
|
| 76 |
+
scale:
|
| 77 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
|
| 78 |
+
in_channels: 3
|
| 79 |
+
model_channels: 1024
|
| 80 |
+
pos_embedder:
|
| 81 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
|
| 82 |
+
model_channels: 1024
|
| 83 |
+
token_len: 1
|
| 84 |
+
shape:
|
| 85 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
|
| 86 |
+
in_channels: 8
|
| 87 |
+
model_channels: 1024
|
| 88 |
+
pos_embedder:
|
| 89 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
|
| 90 |
+
model_channels: 1024
|
| 91 |
+
patch_size: 1
|
| 92 |
+
resolution: 16
|
| 93 |
+
translation:
|
| 94 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
|
| 95 |
+
in_channels: 3
|
| 96 |
+
model_channels: 1024
|
| 97 |
+
pos_embedder:
|
| 98 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
|
| 99 |
+
model_channels: 1024
|
| 100 |
+
token_len: 1
|
| 101 |
+
translation_scale:
|
| 102 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
|
| 103 |
+
in_channels: 1
|
| 104 |
+
model_channels: 1024
|
| 105 |
+
pos_embedder:
|
| 106 |
+
_target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
|
| 107 |
+
model_channels: 1024
|
| 108 |
+
token_len: 1
|
| 109 |
+
latent_share_transformer:
|
| 110 |
+
6drotation_normalized:
|
| 111 |
+
- 6drotation_normalized
|
| 112 |
+
- translation
|
| 113 |
+
- scale
|
| 114 |
+
- translation_scale
|
| 115 |
+
mlp_ratio: 4
|
| 116 |
+
model_channels: 1024
|
| 117 |
+
num_blocks: 24
|
| 118 |
+
num_heads: 16
|
| 119 |
+
out_channels: 8
|
| 120 |
+
patch_size: 1
|
| 121 |
+
pe_mode: ape
|
| 122 |
+
qk_rms_norm: true
|
| 123 |
+
resolution: 16
|
| 124 |
+
use_checkpoint: false
|
| 125 |
+
use_fp16: false
|
| 126 |
+
interval:
|
| 127 |
+
- 0
|
| 128 |
+
- 500
|
| 129 |
+
p_unconditional: 0.1
|
| 130 |
+
strength: 2.0
|
| 131 |
+
unconditional_handling: add_flag
|
| 132 |
+
self_consistency_cfg_strength: 2.0
|
| 133 |
+
self_consistency_prob: 0.25
|
| 134 |
+
shortcut_loss_weight: 1.0
|
| 135 |
+
sigma_min: 0.0
|
| 136 |
+
time_scale: 1000.0
|
| 137 |
+
training_time_sampler_fn:
|
| 138 |
+
_partial_: true
|
| 139 |
+
_target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
|
| 140 |
+
mean: -1.0
|
| 141 |
+
std: 1.0
|