---
module:
  condition_embedder:
    backbone:
      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
      drop_modalities_weight:
      - - - pointmap
          - rgb_pointmap
        - 1.0
      dropout_prob: 0.1
      embedder_list:
      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
          dino_model: dinov2_vitl14_reg
          input_size: 518
          normalize_images: true
        - - - image
            - cropped
          - - rgb_image
            - full
      - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
          dino_model: dinov2_vitl14_reg
          input_size: 518
          normalize_images: true
        - - - mask
            - cropped
          - - rgb_image_mask
            - full
      - - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
          embed_dim: 512
          input_size: 256
          patch_size: 8
          remap_output: linear
        - - - pointmap
            - cropped
          - - rgb_pointmap
            - full
      force_drop_modalities: null
      freeze: true
      projection_net_hidden_dim_multiplier: 4.0
      use_pos_embedding: learned
  generator:
    backbone:
      _target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
      batch_mode: true
      cfg_modalities:
      - shape
      inference_steps: 2
      loss_weights:
        6drotation_normalized: 0.1
        _target_: sam3d_objects.config.utils.make_dict
        scale: 0.1
        shape: 0
        translation: 1.0
        translation_scale: 0.0
      ratio_cfg_samples_in_self_consistency_target: 0.25
      rescale_t: 1
      reverse_fn:
        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
        backbone:
          _target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
          cond_channels: 1024
          condition_embedder: null
          force_zeros_cond: true
          freeze_d_time_embedder: true
          freeze_shared_parameters: true
          in_channels: 8
          is_shortcut_model: true
          latent_mapping:
            6drotation_normalized:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 6
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            shape:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 8
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
                model_channels: 1024
                patch_size: 1
                resolution: 16
            translation:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            translation_scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 1
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
          latent_share_transformer:
            6drotation_normalized:
            - 6drotation_normalized
            - translation
            - scale
            - translation_scale
          mlp_ratio: 4
          model_channels: 1024
          num_blocks: 24
          num_heads: 16
          out_channels: 8
          patch_size: 1
          pe_mode: ape
          qk_rms_norm: true
          resolution: 16
          use_checkpoint: false
          use_fp16: false
        interval:
        - 0
        - 500
        p_unconditional: 0.1
        strength: 2.0
        unconditional_handling: add_flag
      self_consistency_cfg_strength: 2.0
      self_consistency_prob: 0.25
      shortcut_loss_weight: 1.0
      sigma_min: 0.0
      time_scale: 1000.0
      training_time_sampler_fn:
        _partial_: true
        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
        mean: -1.0
        std: 1.0