File size: 5,076 Bytes
ee0b416 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# Model configuration with two components under `module`:
# `condition_embedder` and `generator`.
# Mappings carrying a `_target_` key name a dotted import path; this looks like
# the Hydra `instantiate` convention — confirm against the loader.
# NOTE(review): the nesting below was reconstructed from a flattened copy using
# the alphabetical key order at each level. The grouping of the nested lists
# under `drop_modalities_weight` and `embedder_list` is inferred — verify
# against the consuming code before relying on it.
module:
  condition_embedder:
    backbone:
      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
      # A list of [list-of-names, number] entries; a single entry here.
      # Presumably (modalities to drop together, weight) — TODO confirm.
      drop_modalities_weight:
        - - - pointmap
            - rgb_pointmap
          - 1.0
      dropout_prob: 0.1
      # Each entry is a two-element list: an embedder spec followed by a list of
      # [name, tag] pairs. The pairs look like (input key, "cropped"/"full"
      # group) — meaning assumed; confirm against EmbedderFuser.
      embedder_list:
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
          - - - image
              - cropped
            - - rgb_image
              - full
        # Same Dino settings as the first entry, paired with the mask inputs.
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
          - - - mask
              - cropped
            - - rgb_image_mask
              - full
        - - _target_: sam3d_objects.model.backbone.dit.embedder.pointmap.PointPatchEmbed
            embed_dim: 512
            input_size: 256
            patch_size: 8
            remap_output: linear
          - - - pointmap
              - cropped
            - - rgb_pointmap
              - full
      force_drop_modalities: null
      freeze: true
      projection_net_hidden_dim_multiplier: 4.0
      use_pos_embedding: learned
  generator:
    backbone:
      _target_: sam3d_objects.model.backbone.generator.shortcut.model.ShortCut
      batch_mode: true
      cfg_modalities:
        - shape
      inference_steps: 2
      # Per-key loss weights, built through `make_dict`. `shape` is weighted 0
      # and `translation_scale` 0.0 in this config.
      loss_weights:
        6drotation_normalized: 0.1
        _target_: sam3d_objects.config.utils.make_dict
        scale: 0.1
        shape: 0
        translation: 1.0
        translation_scale: 0.0
      ratio_cfg_samples_in_self_consistency_target: 0.25
      rescale_t: 1
      # Classifier-free-guidance wrapper around the flow backbone; its own
      # settings (`interval`, `p_unconditional`, `strength`,
      # `unconditional_handling`) follow the nested `backbone` mapping.
      reverse_fn:
        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidanceWithExternalUnconditionalProbability
        backbone:
          _target_: sam3d_objects.model.backbone.tdfy_dit.models.mot_sparse_structure_flow.SparseStructureFlowTdfyWrapper
          cond_channels: 1024
          # No embedder is configured at this level; the embedder in this file
          # is defined under `module.condition_embedder`.
          condition_embedder: null
          force_zeros_cond: true
          freeze_d_time_embedder: true
          freeze_shared_parameters: true
          in_channels: 8
          is_shortcut_model: true
          # One `Latent` spec per key. `in_channels` differs per key
          # (6 / 3 / 8 / 3 / 1); `model_channels` is 1024 throughout.
          latent_mapping:
            6drotation_normalized:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 6
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            # `shape` is the only key using ShapePositionEmbedder (with
            # `patch_size` / `resolution`) instead of LearntPositionEmbedder.
            shape:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 8
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.ShapePositionEmbedder
                model_channels: 1024
                patch_size: 1
                resolution: 16
            translation:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 3
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
            translation_scale:
              _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.Latent
              in_channels: 1
              model_channels: 1024
              pos_embedder:
                _target_: sam3d_objects.model.backbone.tdfy_dit.models.mm_latent.LearntPositionEmbedder
                model_channels: 1024
                token_len: 1
          # Maps a group name to the latent keys in that group; presumably the
          # listed keys share one transformer — TODO confirm. `shape` is not
          # listed in any group.
          latent_share_transformer:
            6drotation_normalized:
              - 6drotation_normalized
              - translation
              - scale
              - translation_scale
          mlp_ratio: 4
          model_channels: 1024
          num_blocks: 24
          num_heads: 16
          out_channels: 8
          patch_size: 1
          pe_mode: ape
          qk_rms_norm: true
          resolution: 16
          use_checkpoint: false
          use_fp16: false
        # Two-element range; units are not shown here (note `time_scale` is
        # 1000.0 below) — confirm against the guidance class.
        interval:
          - 0
          - 500
        p_unconditional: 0.1
        strength: 2.0
        unconditional_handling: add_flag
      self_consistency_cfg_strength: 2.0
      self_consistency_prob: 0.25
      shortcut_loss_weight: 1.0
      sigma_min: 0.0
      time_scale: 1000.0
      # `_partial_: true` alongside `_target_` looks like Hydra's partial
      # instantiation (yielding a callable with `mean`/`std` bound) — confirm.
      training_time_sampler_fn:
        _partial_: true
        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
        mean: -1.0
        std: 1.0
|