# File size: 1,986 Bytes
# Revision: ee0b416
# Model configuration with two top-level components under `module`:
#   * condition_embedder - builds conditioning features from image/mask inputs
#   * generator          - flow-matching generator wrapped with
#                          classifier-free guidance
#
# `_target_` values are dotted import paths and `_partial_: true` marks a
# partially-applied callable; presumably this file is consumed by Hydra's
# `instantiate` - TODO confirm against the loader.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped. Key order inside each mapping is
# alphabetical, which was used to decide where each nested mapping ends.
module:
  condition_embedder:
    backbone:
      _target_: sam3d_objects.model.backbone.dit.embedder.embedder_fuser.EmbedderFuser
      # Each entry is a two-item list: [embedder spec, list of input pairs].
      # NOTE(review): the pairs look like [input key, variant] - confirm
      # against EmbedderFuser's expected schema.
      embedder_list:
        # Embedder applied to the RGB inputs.
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
            prenorm_features: true
          - - [image, cropped]
            - [rgb_image, full]
        # Embedder applied to the mask inputs (same Dino settings as above).
        - - _target_: sam3d_objects.model.backbone.dit.embedder.dino.Dino
            dino_model: dinov2_vitl14_reg
            input_size: 518
            normalize_images: true
            prenorm_features: true
          - - [mask, cropped]
            - [rgb_image_mask, full]
      projection_net_hidden_dim_multiplier: 4.0
      use_pos_embedding: learned

  generator:
    backbone:
      _target_: sam3d_objects.model.backbone.generator.flow_matching.model.FlowMatching
      inference_steps: 12
      reverse_fn:
        _target_: sam3d_objects.model.backbone.generator.classifier_free_guidance.ClassifierFreeGuidance
        backbone:
          _target_: sam3d_objects.model.backbone.tdfy_dit.models.structured_latent_flow.SLatFlowModelTdfyWrapper
          cond_channels: 1024
          # Explicitly null here; the embedder is configured separately under
          # module.condition_embedder.
          condition_embedder: null
          force_zeros_cond: true
          in_channels: 8
          io_block_channels:
            - 128
          mlp_ratio: 4
          model_channels: 1024
          num_blocks: 24
          num_heads: 16
          num_io_res_blocks: 2
          out_channels: 8
          patch_size: 2
          pe_mode: ape
          qk_rms_norm: true
          resolution: 64
          use_fp16: true
        p_unconditional: 0.0
        strength: 0.0
        unconditional_handling: add_flag
      sigma_min: 0.0
      time_scale: 1000.0
      training_time_sampler_fn:
        _partial_: true
        _target_: sam3d_objects.model.backbone.generator.flow_matching.model.lognorm_sampler
        mean: -1.0
        std: 1.0