# Model configuration with two components under `module`:
# `condition_embedder` and `generator`.
#
# NOTE(review): the block structure below was reconstructed from a
# whitespace-collapsed source using key ordering (keys are alphabetically
# sorted at every level) and the sequence markers. Confirm the nesting of the
# list-valued fields against the consuming code before relying on it.
# `_target_` / `_partial_` look like Hydra instantiate keys - TODO confirm.
module:
  condition_embedder:
    backbone:
      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
      # One entry: a list of two names followed by a float.
      # NOTE(review): presumably (names to drop, weight) - verify.
      drop_modalities_weight:
        - - [pointmap, rgb_pointmap]
          - 1.0
      dropout_prob: 0.1
      # Each entry is a two-element list: an embedder spec, then a list of
      # two-string pairs.
      # NOTE(review): the meaning of each pair is not visible here - verify
      # against EmbedderFuser.
      embedder_list:
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
          - - [image, cropped]
            - [rgb_image, full]
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
          - - [mask, cropped]
            - [rgb_image_mask, full]
        - - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
            embed_dim: 512
            input_size: 256
            patch_size: 8
            remap_output: linear
          - - [pointmap, cropped]
            - [rgb_pointmap, full]
      force_drop_modalities: null
      freeze: true
      projection_net_hidden_dim_multiplier: 4.0
      use_pos_embedding: learned

  generator:
    backbone:
      _target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
      batch_mode: true
      cfg_modalities:
        - shape
      inference_steps: 2
      # Keys here are the same five names used under `latent_mapping` below,
      # plus a `_target_` entry.
      loss_weights:
        6drotation_normalized: 0.1
        _target_: sam3d_objects.config.utils.make_dict
        scale: 0.1
        shape: 0
        translation: 1.0
        translation_scale: 0.0
      ratio_cfg_samples_in_self_consistency_target: 0.25
      rescale_t: 1
      reverse_fn:
        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
        backbone:
          _target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
          cond_channels: 1024
          # Explicitly null here; a separate embedder is configured under
          # `module.condition_embedder` above.
          condition_embedder: null
          force_zeros_cond: true
          freeze_d_time_embedder: true
          freeze_shared_parameters: true
          in_channels: 8
          is_shortcut_model: true
          # One Latent spec per name. Every entry uses model_channels 1024
          # (the same value as the wrapper's `model_channels`); only `shape`
          # uses ShapePositionEmbedder, the rest use LearntPositionEmbedder
          # with token_len 1.
          latent_mapping:
            6drotation_normalized:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 6
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            shape:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 8
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
                model_channels: 1024
                patch_size: 1
                resolution: 16
            translation:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            translation_scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 1
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
          # Lists every latent name except `shape` under a single key.
          # NOTE(review): presumably these latents share one transformer
          # stream - verify against the wrapper.
          latent_share_transformer:
            6drotation_normalized:
              - 6drotation_normalized
              - translation
              - scale
              - translation_scale
          mlp_ratio: 4
          model_channels: 1024
          num_blocks: 24
          num_heads: 16
          out_channels: 8
          patch_size: 1
          pe_mode: ape
          qk_rms_norm: true
          resolution: 16
          use_checkpoint: false
          use_fp16: false
        interval: [0, 500]
        p_unconditional: 0.1
        strength: 2.0
        unconditional_handling: add_flag
      self_consistency_cfg_strength: 2.0
      self_consistency_prob: 0.25
      shortcut_loss_weight: 1.0
      sigma_min: 0.0
      time_scale: 1000.0
      training_time_sampler_fn:
        _partial_: true
        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
        mean: -1.0
        std: 1.0