jetjodh committed on
Commit
f29e43c
·
verified ·
1 Parent(s): cb70b87

Upload 16 files

Browse files
checkpoints/pipeline.yaml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: sam3d_objects.pipeline.inference_pipeline_pointmap.InferencePipelinePointMap
2
+ ss_generator_config_path: ss_generator.yaml
3
+ ss_generator_ckpt_path: ss_generator.ckpt
4
+ slat_generator_config_path: slat_generator.yaml
5
+ slat_generator_ckpt_path: slat_generator.ckpt
6
+ ss_decoder_config_path: ss_decoder.yaml
7
+ ss_decoder_ckpt_path: ss_decoder.ckpt
8
+ slat_decoder_gs_config_path: slat_decoder_gs.yaml
9
+ slat_decoder_gs_ckpt_path: slat_decoder_gs.ckpt
10
+ slat_decoder_gs_4_config_path: slat_decoder_gs_4.yaml
11
+ slat_decoder_gs_4_ckpt_path: slat_decoder_gs_4.ckpt
12
+ slat_decoder_mesh_config_path: slat_decoder_mesh.yaml
13
+ slat_decoder_mesh_ckpt_path: slat_decoder_mesh.ckpt
14
+ pad_size: 1.0
15
+ dtype: float16
16
+ version: 3dfy_v9
17
+ slat_cfg_strength: 1
18
+ slat_rescale_t: 1
19
+ downsample_ss_dist: 1
20
+ compile_model: true
21
+ ss_condition_input_mapping: []
22
+ ss_preprocessor:
23
+ _target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
24
+ img_mask_joint_transform: []
25
+ img_mask_pointmap_joint_transform:
26
+ - _partial_: true
27
+ _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.resize_all_to_same_size
28
+ - _partial_: true
29
+ _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
30
+ box_size_factor: 1.2
31
+ padding_factor: 0.0
32
+ img_transform:
33
+ _target_: torchvision.transforms.Compose
34
+ transforms:
35
+ - _partial_: true
36
+ _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
37
+ - _target_: torchvision.transforms.Resize
38
+ size: 518
39
+ mask_transform:
40
+ _target_: torchvision.transforms.Compose
41
+ transforms:
42
+ - _partial_: true
43
+ _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
44
+ - _target_: torchvision.transforms.Resize
45
+ interpolation: 0
46
+ size: 518
47
+ normalize_pointmap: true
48
+ pointmap_normalizer:
49
+ _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.ObjectCentricSSI
50
+ allow_scale_and_shift_override: true
51
+ use_scene_scale: true
52
+ pointmap_transform:
53
+ _target_: torchvision.transforms.Compose
54
+ transforms:
55
+ - _partial_: true
56
+ _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
57
+ - _target_: torchvision.transforms.Resize
58
+ interpolation: 0
59
+ size: 518
60
+ pose_decoder_name: ScaleShiftInvariant
61
+ depth_model:
62
+ _target_: sam3d_objects.pipeline.depth_models.moge.MoGe
63
+ model:
64
+ _target_: moge.model.v1.MoGeModel.from_pretrained
65
+ pretrained_model_name_or_path: Ruicheng/moge-vitl
66
+ slat_condition_input_mapping: []
67
+ slat_preprocessor:
68
+ _target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
69
+ img_transform:
70
+ _target_: torchvision.transforms.Compose
71
+ transforms:
72
+ - _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
73
+ _partial_: true
74
+ - _target_: torchvision.transforms.Resize
75
+ size: 518
76
+ mask_transform:
77
+ _target_: torchvision.transforms.Compose
78
+ transforms:
79
+ - _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
80
+ _partial_: true
81
+ - _target_: torchvision.transforms.Resize
82
+ size: 518
83
+ interpolation: 0
84
+ img_mask_joint_transform:
85
+ - _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
86
+ _partial_: true
87
+ box_size_factor: 1.2
88
+ padding_factor: 0.0
89
+ slat_mean:
90
+ - 0.12211431
91
+ - 0.37204156
92
+ - -1.26521907
93
+ - -2.05276058
94
+ - -3.10432536
95
+ - -0.11294304
96
+ - -0.85146744
97
+ - 0.45506954
98
+ slat_std:
99
+ - 2.37326008
100
+ - 2.13174402
101
+ - 2.2413953
102
+ - 2.30589401
103
+ - 2.1191894
104
+ - 1.8969511
105
+ - 2.41684989
106
+ - 2.08374642
checkpoints/slat_decoder_gs.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8077c36a06eaf890dd93cda1937411f793dea1eb80b3dd9329f2038ba84a111
3
+ size 171476155
checkpoints/slat_decoder_gs.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
2
+ resolution: 64
3
+ model_channels: 768
4
+ latent_channels: 8
5
+ num_blocks: 12
6
+ num_heads: 12
7
+ mlp_ratio: 4
8
+ attn_mode: swin
9
+ window_size: 8
10
+ representation_config:
11
+ lr:
12
+ _xyz: 1.0
13
+ _features_dc: 1.0
14
+ _opacity: 1.0
15
+ _scaling: 1.0
16
+ _rotation: 0.1
17
+ perturb_offset: true
18
+ voxel_size: 1.5
19
+ num_gaussians: 32
20
+ 2d_filter_kernel_size: 0.1
21
+ 3d_filter_kernel_size: 0.0009
22
+ scaling_bias: 0.004
23
+ opacity_bias: 0.1
24
+ scaling_activation: softplus
25
+ use_fp16: true
checkpoints/slat_decoder_gs_4.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:731a0eceaa47945b52aa27f650d695b2aea9cc70945751e5609e5cb5b49f0186
3
+ size 170269801
checkpoints/slat_decoder_gs_4.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
2
+ resolution: 64
3
+ model_channels: 768
4
+ latent_channels: 8
5
+ num_blocks: 12
6
+ num_heads: 12
7
+ mlp_ratio: 4
8
+ attn_mode: swin
9
+ window_size: 8
10
+ representation_config:
11
+ lr:
12
+ _xyz: 1.0
13
+ _features_dc: 1.0
14
+ _opacity: 1.0
15
+ _scaling: 1.0
16
+ _rotation: 0.1
17
+ perturb_offset: true
18
+ voxel_size: 1.5
19
+ num_gaussians: 4
20
+ 2d_filter_kernel_size: 0.1
21
+ 3d_filter_kernel_size: 0.0009
22
+ scaling_bias: 0.004
23
+ opacity_bias: 0.1
24
+ scaling_activation: softplus
25
+ use_fp16: true
checkpoints/slat_decoder_mesh.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85907b37b67d8ce5b099a96629bdcfbd873eb407dee6b3aa9a75deb15038db33
3
+ size 363726862
checkpoints/slat_decoder_mesh.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93333fcd57a3e36ded0b3bca6969e05ce2b35142029dadab514f41df46d2f985
3
+ size 363728714
checkpoints/slat_decoder_mesh.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_mesh.SLatMeshDecoderTdfyWrapper
2
+ resolution: 64
3
+ model_channels: 768
4
+ latent_channels: 8
5
+ num_blocks: 12
6
+ num_heads: 12
7
+ mlp_ratio: 4
8
+ attn_mode: swin
9
+ window_size: 8
10
+ representation_config:
11
+ use_color: true
12
+ use_fp16: true
checkpoints/slat_generator.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91529bde8e7daa12d09618a66c319e3a5a6398db6b23b958cedcb1c3f28faabb
3
+ size 4906537684
checkpoints/slat_generator.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module:
2
+ condition_embedder:
3
+ backbone:
4
+ _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
5
+ embedder_list:
6
+ - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
7
+ dino_model: dinov2_vitl14_reg
8
+ input_size: 518
9
+ normalize_images: true
10
+ prenorm_features: true
11
+ - - - image
12
+ - cropped
13
+ - - rgb_image
14
+ - full
15
+ - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
16
+ dino_model: dinov2_vitl14_reg
17
+ input_size: 518
18
+ normalize_images: true
19
+ prenorm_features: true
20
+ - - - mask
21
+ - cropped
22
+ - - rgb_image_mask
23
+ - full
24
+ projection_net_hidden_dim_multiplier: 4.0
25
+ use_pos_embedding: learned
26
+ generator:
27
+ backbone:
28
+ _target_: sam3d_objects.model.backbone.generator.flow_matching.model.FlowMatching
29
+ inference_steps: 12
30
+ reverse_fn:
31
+ _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidance
32
+ backbone:
33
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_flow.SLatFlowModelTdfyWrapper
34
+ cond_channels: 1024
35
+ condition_embedder: null
36
+ force_zeros_cond: true
37
+ in_channels: 8
38
+ io_block_channels:
39
+ - 128
40
+ mlp_ratio: 4
41
+ model_channels: 1024
42
+ num_blocks: 24
43
+ num_heads: 16
44
+ num_io_res_blocks: 2
45
+ out_channels: 8
46
+ patch_size: 2
47
+ pe_mode: ape
48
+ qk_rms_norm: true
49
+ resolution: 64
50
+ use_fp16: true
51
+ p_unconditional: 0.0
52
+ strength: 0.0
53
+ unconditional_handling: add_flag
54
+ sigma_min: 0.0
55
+ time_scale: 1000.0
56
+ training_time_sampler_fn:
57
+ _partial_: true
58
+ _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
59
+ mean: -1.0
60
+ std: 1.0
checkpoints/ss_decoder.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dac1cd7b7fda5a38e0614fadae441f1794f80e39ea2981f1ac8aff0a7e99340
3
+ size 147609242
checkpoints/ss_decoder.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.sparse_structure_vae.SparseStructureDecoderTdfyWrapper
2
+ out_channels: 1
3
+ latent_channels: 8
4
+ num_res_blocks: 2
5
+ num_res_blocks_middle: 2
6
+ channels:
7
+ - 512
8
+ - 128
9
+ - 32
10
+ reshape_input_to_cube: false
checkpoints/ss_encoder.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
checkpoints/ss_encoder.yaml ADDED
File without changes
checkpoints/ss_generator.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:225f40479e4cff4f39d6fa14c55be3abad1475bf55b61af3bec1e19ed2f6c146
3
+ size 6690136964
checkpoints/ss_generator.yaml ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module:
2
+ condition_embedder:
3
+ backbone:
4
+ _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
5
+ drop_modalities_weight:
6
+ - - - pointmap
7
+ - rgb_pointmap
8
+ - 1.0
9
+ dropout_prob: 0.1
10
+ embedder_list:
11
+ - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
12
+ dino_model: dinov2_vitl14_reg
13
+ input_size: 518
14
+ normalize_images: true
15
+ - - - image
16
+ - cropped
17
+ - - rgb_image
18
+ - full
19
+ - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
20
+ dino_model: dinov2_vitl14_reg
21
+ input_size: 518
22
+ normalize_images: true
23
+ - - - mask
24
+ - cropped
25
+ - - rgb_image_mask
26
+ - full
27
+ - - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
28
+ embed_dim: 512
29
+ input_size: 256
30
+ patch_size: 8
31
+ remap_output: linear
32
+ - - - pointmap
33
+ - cropped
34
+ - - rgb_pointmap
35
+ - full
36
+ force_drop_modalities: null
37
+ freeze: true
38
+ projection_net_hidden_dim_multiplier: 4.0
39
+ use_pos_embedding: learned
40
+ generator:
41
+ backbone:
42
+ _target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
43
+ batch_mode: true
44
+ cfg_modalities:
45
+ - shape
46
+ inference_steps: 2
47
+ loss_weights:
48
+ 6drotation_normalized: 0.1
49
+ _target_: sam3d_objects.config.utils.make_dict
50
+ scale: 0.1
51
+ shape: 0
52
+ translation: 1.0
53
+ translation_scale: 0.0
54
+ ratio_cfg_samples_in_self_consistency_target: 0.25
55
+ rescale_t: 1
56
+ reverse_fn:
57
+ _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
58
+ backbone:
59
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
60
+ cond_channels: 1024
61
+ condition_embedder: null
62
+ force_zeros_cond: true
63
+ freeze_d_time_embedder: true
64
+ freeze_shared_parameters: true
65
+ in_channels: 8
66
+ is_shortcut_model: true
67
+ latent_mapping:
68
+ 6drotation_normalized:
69
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
70
+ in_channels: 6
71
+ model_channels: 1024
72
+ pos_embedder:
73
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
74
+ model_channels: 1024
75
+ token_len: 1
76
+ scale:
77
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
78
+ in_channels: 3
79
+ model_channels: 1024
80
+ pos_embedder:
81
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
82
+ model_channels: 1024
83
+ token_len: 1
84
+ shape:
85
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
86
+ in_channels: 8
87
+ model_channels: 1024
88
+ pos_embedder:
89
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
90
+ model_channels: 1024
91
+ patch_size: 1
92
+ resolution: 16
93
+ translation:
94
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
95
+ in_channels: 3
96
+ model_channels: 1024
97
+ pos_embedder:
98
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
99
+ model_channels: 1024
100
+ token_len: 1
101
+ translation_scale:
102
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
103
+ in_channels: 1
104
+ model_channels: 1024
105
+ pos_embedder:
106
+ _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
107
+ model_channels: 1024
108
+ token_len: 1
109
+ latent_share_transformer:
110
+ 6drotation_normalized:
111
+ - 6drotation_normalized
112
+ - translation
113
+ - scale
114
+ - translation_scale
115
+ mlp_ratio: 4
116
+ model_channels: 1024
117
+ num_blocks: 24
118
+ num_heads: 16
119
+ out_channels: 8
120
+ patch_size: 1
121
+ pe_mode: ape
122
+ qk_rms_norm: true
123
+ resolution: 16
124
+ use_checkpoint: false
125
+ use_fp16: false
126
+ interval:
127
+ - 0
128
+ - 500
129
+ p_unconditional: 0.1
130
+ strength: 2.0
131
+ unconditional_handling: add_flag
132
+ self_consistency_cfg_strength: 2.0
133
+ self_consistency_prob: 0.25
134
+ shortcut_loss_weight: 1.0
135
+ sigma_min: 0.0
136
+ time_scale: 1000.0
137
+ training_time_sampler_fn:
138
+ _partial_: true
139
+ _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
140
+ mean: -1.0
141
+ std: 1.0