| module: | |
| condition_embedder: | |
| backbone: | |
| _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser | |
| embedder_list: | |
| - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino | |
| dino_model: dinov2_vitl14_reg | |
| input_size: 518 | |
| normalize_images: true | |
| prenorm_features: true | |
| - - - image | |
| - cropped | |
| - - rgb_image | |
| - full | |
| - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino | |
| dino_model: dinov2_vitl14_reg | |
| input_size: 518 | |
| normalize_images: true | |
| prenorm_features: true | |
| - - - mask | |
| - cropped | |
| - - rgb_image_mask | |
| - full | |
| projection_net_hidden_dim_multiplier: 4.0 | |
| use_pos_embedding: learned | |
| generator: | |
| backbone: | |
| _target_: sam3d_objects.model.backbone.generator.flow_matching.model.FlowMatching | |
| inference_steps: 12 | |
| reverse_fn: | |
| _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidance | |
| backbone: | |
| _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_flow.SLatFlowModelTdfyWrapper | |
| cond_channels: 1024 | |
| condition_embedder: null | |
| force_zeros_cond: true | |
| in_channels: 8 | |
| io_block_channels: | |
| - 128 | |
| mlp_ratio: 4 | |
| model_channels: 1024 | |
| num_blocks: 24 | |
| num_heads: 16 | |
| num_io_res_blocks: 2 | |
| out_channels: 8 | |
| patch_size: 2 | |
| pe_mode: ape | |
| qk_rms_norm: true | |
| resolution: 64 | |
| use_fp16: true | |
| p_unconditional: 0.0 | |
| strength: 0.0 | |
| unconditional_handling: add_flag | |
| sigma_min: 0.0 | |
| time_scale: 1000.0 | |
| training_time_sampler_fn: | |
| _partial_: true | |
| _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler | |
| mean: -1.0 | |
| std: 1.0 | |