Upload 16 files

Browse files

Files changed (16) hide show

checkpoints/pipeline.yaml +106 -0
checkpoints/slat_decoder_gs.ckpt +3 -0
checkpoints/slat_decoder_gs.yaml +25 -0
checkpoints/slat_decoder_gs_4.ckpt +3 -0
checkpoints/slat_decoder_gs_4.yaml +25 -0
checkpoints/slat_decoder_mesh.ckpt +3 -0
checkpoints/slat_decoder_mesh.pt +3 -0
checkpoints/slat_decoder_mesh.yaml +12 -0
checkpoints/slat_generator.ckpt +3 -0
checkpoints/slat_generator.yaml +60 -0
checkpoints/ss_decoder.ckpt +3 -0
checkpoints/ss_decoder.yaml +10 -0
checkpoints/ss_encoder.safetensors +3 -0
checkpoints/ss_encoder.yaml +0 -0
checkpoints/ss_generator.ckpt +3 -0
checkpoints/ss_generator.yaml +141 -0

checkpoints/pipeline.yaml ADDED Viewed

	@@ -0,0 +1,106 @@

+_target_: sam3d_objects.pipeline.inference_pipeline_pointmap.InferencePipelinePointMap
+ss_generator_config_path: ss_generator.yaml
+ss_generator_ckpt_path: ss_generator.ckpt
+slat_generator_config_path: slat_generator.yaml
+slat_generator_ckpt_path: slat_generator.ckpt
+ss_decoder_config_path: ss_decoder.yaml
+ss_decoder_ckpt_path: ss_decoder.ckpt
+slat_decoder_gs_config_path: slat_decoder_gs.yaml
+slat_decoder_gs_ckpt_path: slat_decoder_gs.ckpt
+slat_decoder_gs_4_config_path: slat_decoder_gs_4.yaml
+slat_decoder_gs_4_ckpt_path: slat_decoder_gs_4.ckpt
+slat_decoder_mesh_config_path: slat_decoder_mesh.yaml
+slat_decoder_mesh_ckpt_path: slat_decoder_mesh.ckpt
+pad_size: 1.0
+dtype: float16
+version: 3dfy_v9
+slat_cfg_strength: 1
+slat_rescale_t: 1
+downsample_ss_dist: 1
+compile_model: true
+ss_condition_input_mapping: []
+ss_preprocessor:
+  _target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
+  img_mask_joint_transform: []
+  img_mask_pointmap_joint_transform:
+  - _partial_: true
+    _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.resize_all_to_same_size
+  - _partial_: true
+    _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
+    box_size_factor: 1.2
+    padding_factor: 0.0
+  img_transform:
+    _target_: torchvision.transforms.Compose
+    transforms:
+    - _partial_: true
+      _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
+    - _target_: torchvision.transforms.Resize
+      size: 518
+  mask_transform:
+    _target_: torchvision.transforms.Compose
+    transforms:
+    - _partial_: true
+      _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
+    - _target_: torchvision.transforms.Resize
+      interpolation: 0
+      size: 518
+  normalize_pointmap: true
+  pointmap_normalizer:
+    _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.ObjectCentricSSI
+    allow_scale_and_shift_override: true
+    use_scene_scale: true
+  pointmap_transform:
+    _target_: torchvision.transforms.Compose
+    transforms:
+    - _partial_: true
+      _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
+    - _target_: torchvision.transforms.Resize
+      interpolation: 0
+      size: 518
+pose_decoder_name: ScaleShiftInvariant
+depth_model:
+  _target_: sam3d_objects.pipeline.depth_models.moge.MoGe
+  model:
+    _target_: moge.model.v1.MoGeModel.from_pretrained
+    pretrained_model_name_or_path: Ruicheng/moge-vitl
+slat_condition_input_mapping: []
+slat_preprocessor:
+  _target_: sam3d_objects.data.dataset.tdfy.preprocessor.PreProcessor
+  img_transform:
+    _target_: torchvision.transforms.Compose
+    transforms:
+    - _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
+      _partial_: true
+    - _target_: torchvision.transforms.Resize
+      size: 518
+  mask_transform:
+    _target_: torchvision.transforms.Compose
+    transforms:
+    - _target_: sam3d_objects.data.dataset.tdfy.img_processing.pad_to_square_centered
+      _partial_: true
+    - _target_: torchvision.transforms.Resize
+      size: 518
+      interpolation: 0
+  img_mask_joint_transform:
+  - _target_: sam3d_objects.data.dataset.tdfy.img_and_mask_transforms.crop_around_mask_with_padding
+    _partial_: true
+    box_size_factor: 1.2
+    padding_factor: 0.0
+slat_mean:
+- 0.12211431
+- 0.37204156
+- -1.26521907
+- -2.05276058
+- -3.10432536
+- -0.11294304
+- -0.85146744
+- 0.45506954
+slat_std:
+- 2.37326008
+- 2.13174402
+- 2.2413953
+- 2.30589401
+- 2.1191894
+- 1.8969511
+- 2.41684989
+- 2.08374642

checkpoints/slat_decoder_gs.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8077c36a06eaf890dd93cda1937411f793dea1eb80b3dd9329f2038ba84a111
+size 171476155

checkpoints/slat_decoder_gs.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
+resolution: 64
+model_channels: 768
+latent_channels: 8
+num_blocks: 12
+num_heads: 12
+mlp_ratio: 4
+attn_mode: swin
+window_size: 8
+representation_config:
+  lr:
+    _xyz: 1.0
+    _features_dc: 1.0
+    _opacity: 1.0
+    _scaling: 1.0
+    _rotation: 0.1
+  perturb_offset: true
+  voxel_size: 1.5
+  num_gaussians: 32
+  2d_filter_kernel_size: 0.1
+  3d_filter_kernel_size: 0.0009
+  scaling_bias: 0.004
+  opacity_bias: 0.1
+  scaling_activation: softplus
+use_fp16: true

checkpoints/slat_decoder_gs_4.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:731a0eceaa47945b52aa27f650d695b2aea9cc70945751e5609e5cb5b49f0186
+size 170269801

checkpoints/slat_decoder_gs_4.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_gs.SLatGaussianDecoderTdfyWrapper
+resolution: 64
+model_channels: 768
+latent_channels: 8
+num_blocks: 12
+num_heads: 12
+mlp_ratio: 4
+attn_mode: swin
+window_size: 8
+representation_config:
+  lr:
+    _xyz: 1.0
+    _features_dc: 1.0
+    _opacity: 1.0
+    _scaling: 1.0
+    _rotation: 0.1
+  perturb_offset: true
+  voxel_size: 1.5
+  num_gaussians: 4
+  2d_filter_kernel_size: 0.1
+  3d_filter_kernel_size: 0.0009
+  scaling_bias: 0.004
+  opacity_bias: 0.1
+  scaling_activation: softplus
+use_fp16: true

checkpoints/slat_decoder_mesh.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85907b37b67d8ce5b099a96629bdcfbd873eb407dee6b3aa9a75deb15038db33
+size 363726862

checkpoints/slat_decoder_mesh.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93333fcd57a3e36ded0b3bca6969e05ce2b35142029dadab514f41df46d2f985
+size 363728714

checkpoints/slat_decoder_mesh.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+_target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_vae.decoder_mesh.SLatMeshDecoderTdfyWrapper
+resolution: 64
+model_channels: 768
+latent_channels: 8
+num_blocks: 12
+num_heads: 12
+mlp_ratio: 4
+attn_mode: swin
+window_size: 8
+representation_config:
+  use_color: true
+use_fp16: true

checkpoints/slat_generator.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91529bde8e7daa12d09618a66c319e3a5a6398db6b23b958cedcb1c3f28faabb
+size 4906537684

checkpoints/slat_generator.yaml ADDED Viewed

	@@ -0,0 +1,60 @@

+module:
+  condition_embedder:
+    backbone:
+      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
+      embedder_list:
+      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
+          dino_model: dinov2_vitl14_reg
+          input_size: 518
+          normalize_images: true
+          prenorm_features: true
+        - - - image
+            - cropped
+          - - rgb_image
+            - full
+      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
+          dino_model: dinov2_vitl14_reg
+          input_size: 518
+          normalize_images: true
+          prenorm_features: true
+        - - - mask
+            - cropped
+          - - rgb_image_mask
+            - full
+      projection_net_hidden_dim_multiplier: 4.0
+      use_pos_embedding: learned
+  generator:
+    backbone:
+      _target_: sam3d_objects.model.backbone.generator.flow_matching.model.FlowMatching
+      inference_steps: 12
+      reverse_fn:
+        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidance
+        backbone:
+          _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_flow.SLatFlowModelTdfyWrapper
+          cond_channels: 1024
+          condition_embedder: null
+          force_zeros_cond: true
+          in_channels: 8
+          io_block_channels:
+          - 128
+          mlp_ratio: 4
+          model_channels: 1024
+          num_blocks: 24
+          num_heads: 16
+          num_io_res_blocks: 2
+          out_channels: 8
+          patch_size: 2
+          pe_mode: ape
+          qk_rms_norm: true
+          resolution: 64
+          use_fp16: true
+        p_unconditional: 0.0
+        strength: 0.0
+        unconditional_handling: add_flag
+      sigma_min: 0.0
+      time_scale: 1000.0
+      training_time_sampler_fn:
+        _partial_: true
+        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
+        mean: -1.0
+        std: 1.0

checkpoints/ss_decoder.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dac1cd7b7fda5a38e0614fadae441f1794f80e39ea2981f1ac8aff0a7e99340
+size 147609242

checkpoints/ss_decoder.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+_target_: sam3d_objects.model.backbone.tdfy_dit.models.sparse_structure_vae.SparseStructureDecoderTdfyWrapper
+out_channels: 1
+latent_channels: 8
+num_res_blocks: 2
+num_res_blocks_middle: 2
+channels:
+- 512
+- 128
+- 32
+reshape_input_to_cube: false

checkpoints/ss_encoder.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

checkpoints/ss_encoder.yaml ADDED Viewed

File without changes

checkpoints/ss_generator.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:225f40479e4cff4f39d6fa14c55be3abad1475bf55b61af3bec1e19ed2f6c146
+size 6690136964

checkpoints/ss_generator.yaml ADDED Viewed

	@@ -0,0 +1,141 @@

+module:
+  condition_embedder:
+    backbone:
+      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
+      drop_modalities_weight:
+      - - - pointmap
+          - rgb_pointmap
+        - 1.0
+      dropout_prob: 0.1
+      embedder_list:
+      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
+          dino_model: dinov2_vitl14_reg
+          input_size: 518
+          normalize_images: true
+        - - - image
+            - cropped
+          - - rgb_image
+            - full
+      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
+          dino_model: dinov2_vitl14_reg
+          input_size: 518
+          normalize_images: true
+        - - - mask
+            - cropped
+          - - rgb_image_mask
+            - full
+      - - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
+          embed_dim: 512
+          input_size: 256
+          patch_size: 8
+          remap_output: linear
+        - - - pointmap
+            - cropped
+          - - rgb_pointmap
+            - full
+      force_drop_modalities: null
+      freeze: true
+      projection_net_hidden_dim_multiplier: 4.0
+      use_pos_embedding: learned
+  generator:
+    backbone:
+      _target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
+      batch_mode: true
+      cfg_modalities:
+      - shape
+      inference_steps: 2
+      loss_weights:
+        6drotation_normalized: 0.1
+        _target_: sam3d_objects.config.utils.make_dict
+        scale: 0.1
+        shape: 0
+        translation: 1.0
+        translation_scale: 0.0
+      ratio_cfg_samples_in_self_consistency_target: 0.25
+      rescale_t: 1
+      reverse_fn:
+        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
+        backbone:
+          _target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
+          cond_channels: 1024
+          condition_embedder: null
+          force_zeros_cond: true
+          freeze_d_time_embedder: true
+          freeze_shared_parameters: true
+          in_channels: 8
+          is_shortcut_model: true
+          latent_mapping:
+            6drotation_normalized:
+              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
+              in_channels: 6
+              model_channels: 1024
+              pos_embedder:
+                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
+                model_channels: 1024
+                token_len: 1
+            scale:
+              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
+              in_channels: 3
+              model_channels: 1024
+              pos_embedder:
+                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
+                model_channels: 1024
+                token_len: 1
+            shape:
+              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
+              in_channels: 8
+              model_channels: 1024
+              pos_embedder:
+                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
+                model_channels: 1024
+                patch_size: 1
+                resolution: 16
+            translation:
+              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
+              in_channels: 3
+              model_channels: 1024
+              pos_embedder:
+                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
+                model_channels: 1024
+                token_len: 1
+            translation_scale:
+              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
+              in_channels: 1
+              model_channels: 1024
+              pos_embedder:
+                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
+                model_channels: 1024
+                token_len: 1
+          latent_share_transformer:
+            6drotation_normalized:
+            - 6drotation_normalized
+            - translation
+            - scale
+            - translation_scale
+          mlp_ratio: 4
+          model_channels: 1024
+          num_blocks: 24
+          num_heads: 16
+          out_channels: 8
+          patch_size: 1
+          pe_mode: ape
+          qk_rms_norm: true
+          resolution: 16
+          use_checkpoint: false
+          use_fp16: false
+        interval:
+        - 0
+        - 500
+        p_unconditional: 0.1
+        strength: 2.0
+        unconditional_handling: add_flag
+      self_consistency_cfg_strength: 2.0
+      self_consistency_prob: 0.25
+      shortcut_loss_weight: 1.0
+      sigma_min: 0.0
+      time_scale: 1000.0
+      training_time_sampler_fn:
+        _partial_: true
+        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
+        mean: -1.0
+        std: 1.0