horizon171852 committed on
Commit
5af4e2d
·
verified ·
1 Parent(s): 8371b9f

Upload folder using huggingface_hub

Browse files
SceneMaker_indoor_ckpts/config.yaml ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: image-to-pose-diffusion/mixed-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct
2
+ description: ''
3
+ tag: michelangelo-autoencoder+n16384+lr0.0001+shape2vec+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+both-mode+scratch+train-pcd
4
+ seed: 0
5
+ use_timestamp: true
6
+ timestamp: ''
7
+ exp_root_dir: outputs
8
+ exp_dir: outputs/image-to-pose-diffusion/mixed-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct
9
+ trial_name: michelangelo-autoencoder+n16384+lr0.0001+shape2vec+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+both-mode+scratch+train-pcd
10
+ trial_dir: outputs/image-to-pose-diffusion/mixed-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct/michelangelo-autoencoder+n16384+lr0.0001+shape2vec+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+both-mode+scratch+train-pcd
11
+ n_gpus: 8
12
+ resume: null
13
+ data_type: Front3D-mixed-datamodule
14
+ data:
15
+ midi_cfg:
16
+ scene_list: /comp_robot/shiyukai/datasets/midi/3D-Front/midi_room_ids.json
17
+ object_list: /comp_robot/shiyukai/datasets/midi/3D-Front/midi_furniture_ids.json
18
+ surface_root_dir: /comp_robot/shiyukai/datasets/midi/3D-Front/3D-FRONT-SURFACE/
19
+ image_data_path: /comp_robot/shiyukai/datasets/midi/3D-Front/3D-FRONT-RENDER/
20
+ mask_path: /comp_robot/shiyukai/datasets/midi/3D-Front/3D-FRONT-RENDER-independent-norm/
21
+ geo_data_path: /comp_robot/shiyukai/datasets/instPiFU/datasets/normalized_watertight/sampling_objects
22
+ train_indices:
23
+ - 0
24
+ - -1000
25
+ val_indices:
26
+ - -1010
27
+ - -990
28
+ test_indices:
29
+ - -1000
30
+ - null
31
+ render_mode: both
32
+ instpifu_cfg:
33
+ data_path: /comp_robot/shiyukai/datasets/instPiFU/datasets/prepare_data/
34
+ geo_data_path: /comp_robot/shiyukai/datasets/instPiFU/datasets/normalized_watertight/sampling_objects
35
+ avg_layout_path: data/3dfront/avg_layout.pkl
36
+ geo_data_type: sdf
37
+ with_sharp_data: true
38
+ sampling_strategy: fps
39
+ n_samples: 16384
40
+ noise_sigma: 0.0
41
+ random_flip: true
42
+ random_color_jitter: true
43
+ shuffle: true
44
+ load_supervision: false
45
+ supervision_type: sdf
46
+ n_supervision: 10000
47
+ load_image: true
48
+ image_type: rgb_or_normal
49
+ image_type_ratio: 0.95
50
+ idx:
51
+ - 0
52
+ - 1
53
+ - 2
54
+ - 3
55
+ - 4
56
+ - 5
57
+ - 6
58
+ - 7
59
+ - 8
60
+ - 9
61
+ - 10
62
+ - 11
63
+ - 12
64
+ - 13
65
+ - 14
66
+ - 15
67
+ - 16
68
+ - 17
69
+ - 18
70
+ - 19
71
+ n_views: 1
72
+ background_color:
73
+ - 255
74
+ - 255
75
+ - 255
76
+ images_per_sample: 1
77
+ max_objs: 5
78
+ min_pcd: 1024
79
+ translation_mode: pcd2_aug
80
+ refine_mask: false
81
+ use_scene_geometry: false
82
+ only_use_pitch: true
83
+ use_mix_coord: true
84
+ image_width: 512
85
+ image_height: 512
86
+ batch_size: 8
87
+ num_workers: 8
88
+ system_type: direct-unify-flow-system
89
+ system:
90
+ val_samples_json: val_data/images/val_samples_rgb_image.json
91
+ z_scale_factor: 1.0
92
+ guidance_scale: 3.0
93
+ num_inference_steps: 50
94
+ eta: 0.0
95
+ compute_metric: true
96
+ visualize_mesh: true
97
+ extract_mesh_func: mc
98
+ remove_bg: true
99
+ octree_depth: 5
100
+ max_objs: 5
101
+ weighting_scheme: cosmap
102
+ pretrain_pcd: shape2vec
103
+ lambda_rot: 1
104
+ lambda_trans: 1
105
+ lambda_kl: 0.0
106
+ lambda_cd: 0.0
107
+ sup_latents: false
108
+ loss_mode: object
109
+ use_scene_img: true
110
+ use_scene_mask: false
111
+ use_scene_mask_img: true
112
+ use_scene_pcd: true
113
+ use_caption: false
114
+ freeze_pose_enc: false
115
+ freeze_pcd_model: false
116
+ shape_model_type: michelangelo-autoencoder
117
+ shape_model:
118
+ pretrained_model_name_or_path: ckpts/new-sharp-msvae-2048-tokens.ckpt
119
+ n_samples: 16384
120
+ with_sharp_data: true
121
+ use_downsample: true
122
+ num_latents: 512
123
+ embed_dim: 64
124
+ point_feats: 3
125
+ out_dim: 1
126
+ num_freqs: 8
127
+ include_pi: false
128
+ heads: 12
129
+ width: 768
130
+ num_encoder_layers: 8
131
+ num_decoder_layers: 16
132
+ use_ln_post: true
133
+ init_scale: 0.25
134
+ qkv_bias: false
135
+ use_flash: true
136
+ use_checkpoint: true
137
+ pose_model_type: pose-ae
138
+ pose_model:
139
+ in_dim: 6
140
+ out_dim: 6
141
+ embed_dim: 64
142
+ embed_type: fourier
143
+ num_latents: 5
144
+ include_pi: false
145
+ init_scale: 0.25
146
+ enable_ln_affine: true
147
+ context_dim: 1024
148
+ enable_translation: true
149
+ num_tokens: 3
150
+ pcd_model_type: shape2vectset-autoencoder
151
+ pcd_model:
152
+ num_latents: 512
153
+ embed_dim: 8
154
+ use_fps: true
155
+ condition_model_type: dinov2-embedder
156
+ condition_model:
157
+ pretrained_dino_name_or_path: facebook/dinov2-with-registers-large
158
+ encode_camera: false
159
+ n_views: 1
160
+ empty_embeds_ratio: 0.0
161
+ normalize_embeds: false
162
+ zero_uncond_embeds: true
163
+ image_size_dino: 224
164
+ caption_condition_type: t5-encoder
165
+ caption_condition:
166
+ pretrained_t5_name_or_path: google-t5/t5-small
167
+ empty_embeds_ratio: 0.1
168
+ normalize_embeds: false
169
+ zero_uncond_embeds: true
170
+ caption_condition_dim: 512
171
+ text_max_length: 77
172
+ denoiser_model_type: dit-pose-denoiser
173
+ denoiser_model:
174
+ input_channels: 64
175
+ output_channels: 64
176
+ width: 1024
177
+ layers: 16
178
+ pre_heads: 16
179
+ curr_heads: 16
180
+ context_dim: 1024
181
+ init_scale: 1.0
182
+ use_checkpoint: true
183
+ condition_type: dinov2
184
+ use_rope: true
185
+ use_pe: false
186
+ use_caption: false
187
+ num_shape_latents: 512
188
+ num_pose_latents: 3
189
+ num_pcd_latents: 512
190
+ num_img_latents: 257
191
+ num_text_latents: 77
192
+ attn_mode: scene
193
+ block_mode: sep_both
194
+ proj_mode: sep
195
+ noise_scheduler_type: diffusers.schedulers.FlowMatchEulerDiscreteScheduler
196
+ noise_scheduler:
197
+ num_train_timesteps: 1000
198
+ shift: 1.0
199
+ denoise_scheduler_type: diffusers.schedulers.FlowMatchEulerDiscreteScheduler
200
+ denoise_scheduler:
201
+ num_train_timesteps: 1000
202
+ shift: 1.0
203
+ loggers:
204
+ wandb:
205
+ enable: false
206
+ project: CraftsMan
207
+ name: image-to-shape-diffusion+image-to-pose-diffusion/mixed-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct+michelangelo-autoencoder+n16384+lr0.0001+shape2vec+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+both-mode+scratch+train-pcd
208
+ loss:
209
+ loss_type: mse
210
+ lambda_diffusion: 1.0
211
+ optimizer:
212
+ name: AdamW
213
+ args:
214
+ lr: 0.0001
215
+ betas:
216
+ - 0.9
217
+ - 0.99
218
+ eps: 1.0e-06
219
+ scheduler:
220
+ interval: step
221
+ name: CosineAnnealingLR
222
+ args:
223
+ T_max: 20000
224
+ eta_min: 0.0001
225
+ trainer:
226
+ num_nodes: 1
227
+ max_epochs: 600
228
+ log_every_n_steps: 5
229
+ num_sanity_val_steps: 1
230
+ check_val_every_n_epoch: 1
231
+ enable_progress_bar: true
232
+ precision: bf16-mixed
233
+ strategy: deepspeed_stage_2
234
+ accumulate_grad_batches: 2
235
+ checkpoint:
236
+ save_last: true
237
+ save_top_k: -1
238
+ every_n_train_steps: 2000
SceneMaker_indoor_ckpts/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795f49e74dbe7cb72d5b5579548d789a7a2bad7b6f0917e13436f3d419fa1232
3
+ size 4148439802
SceneMaker_openset_ckpts/config.yaml ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: image-to-pose-diffusion/openset-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct
2
+ description: ''
3
+ tag: michelangelo-autoencoder+n16384+lr0.0001+shape2vec-pcd+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+mixed-coord-True+render-mode+scratch+50k+rectify+train-pcd
4
+ seed: 0
5
+ use_timestamp: true
6
+ timestamp: ''
7
+ exp_root_dir: outputs
8
+ exp_dir: outputs/image-to-pose-diffusion/openset-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct
9
+ trial_name: michelangelo-autoencoder+n16384+lr0.0001+shape2vec-pcd+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+mixed-coord-True+render-mode+scratch+50k+rectify+train-pcd
10
+ trial_dir: outputs/image-to-pose-diffusion/openset-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct/michelangelo-autoencoder+n16384+lr0.0001+shape2vec-pcd+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+mixed-coord-True+render-mode+scratch+50k+rectify+train-pcd
11
+ n_gpus: 8
12
+ resume: null
13
+ data_type: Openset-datamodule
14
+ data:
15
+ scene_list: /comp_robot/shiyukai/datasets/openset_scene/openset50k/data/all_json_files.json
16
+ geo_data_path: /comp_robot/shiyukai/datasets/objaverse/objaverse_clean/high_quality_xl_190k_images/geometry/sample/
17
+ train_indices:
18
+ - 0
19
+ - -100
20
+ val_indices:
21
+ - -100
22
+ - null
23
+ test_indices:
24
+ - -100
25
+ - null
26
+ geo_data_type: sdf
27
+ with_sharp_data: true
28
+ sampling_strategy: fps
29
+ n_samples: 16384
30
+ noise_sigma: 0.0
31
+ random_flip: true
32
+ random_color_jitter: true
33
+ shuffle: true
34
+ load_supervision: false
35
+ supervision_type: sdf
36
+ n_supervision: 10000
37
+ load_image: true
38
+ image_type: rgb_or_normal
39
+ image_type_ratio: 0.95
40
+ idx:
41
+ - 0
42
+ - 1
43
+ - 2
44
+ - 3
45
+ - 4
46
+ - 5
47
+ - 6
48
+ - 7
49
+ - 8
50
+ - 9
51
+ - 10
52
+ - 11
53
+ - 12
54
+ - 13
55
+ - 14
56
+ - 15
57
+ - 16
58
+ - 17
59
+ - 18
60
+ - 19
61
+ n_views: 20
62
+ background_color:
63
+ - 255
64
+ - 255
65
+ - 255
66
+ images_per_sample: 1
67
+ max_objs: 5
68
+ min_pcd: 1024
69
+ translation_mode: pcd2_aug
70
+ refine_mask: false
71
+ use_scene_geometry: false
72
+ only_use_pitch: true
73
+ use_mix_coord: true
74
+ image_width: 512
75
+ image_height: 512
76
+ render_mode: render
77
+ batch_size: 8
78
+ num_workers: 8
79
+ system_type: direct-unify-flow-system
80
+ system:
81
+ val_samples_json: val_data/images/val_samples_rgb_image.json
82
+ z_scale_factor: 1.0
83
+ guidance_scale: 3.0
84
+ num_inference_steps: 50
85
+ eta: 0.0
86
+ compute_metric: true
87
+ visualize_mesh: true
88
+ extract_mesh_func: mc
89
+ remove_bg: true
90
+ octree_depth: 5
91
+ max_objs: 5
92
+ weighting_scheme: cosmap
93
+ pretrain_pcd: shape2vec
94
+ lambda_rot: 1
95
+ lambda_trans: 1
96
+ lambda_kl: 0.0
97
+ lambda_cd: 0.0
98
+ sup_latents: false
99
+ loss_mode: object
100
+ use_scene_img: true
101
+ use_scene_mask: false
102
+ use_scene_mask_img: true
103
+ use_scene_pcd: true
104
+ use_caption: false
105
+ freeze_pose_enc: false
106
+ freeze_pcd_model: false
107
+ shape_model_type: michelangelo-autoencoder
108
+ shape_model:
109
+ pretrained_model_name_or_path: ckpts/new-sharp-msvae-2048-tokens.ckpt
110
+ n_samples: 16384
111
+ with_sharp_data: true
112
+ use_downsample: true
113
+ num_latents: 512
114
+ embed_dim: 64
115
+ point_feats: 3
116
+ out_dim: 1
117
+ num_freqs: 8
118
+ include_pi: false
119
+ heads: 12
120
+ width: 768
121
+ num_encoder_layers: 8
122
+ num_decoder_layers: 16
123
+ use_ln_post: true
124
+ init_scale: 0.25
125
+ qkv_bias: false
126
+ use_flash: true
127
+ use_checkpoint: true
128
+ pose_model_type: pose-ae
129
+ pose_model:
130
+ in_dim: 6
131
+ out_dim: 6
132
+ embed_dim: 64
133
+ embed_type: fourier
134
+ num_latents: 5
135
+ include_pi: false
136
+ init_scale: 0.25
137
+ enable_ln_affine: true
138
+ context_dim: 1024
139
+ enable_translation: true
140
+ num_tokens: 3
141
+ pcd_model_type: shape2vectset-autoencoder
142
+ pcd_model:
143
+ num_latents: 512
144
+ embed_dim: 8
145
+ use_fps: true
146
+ condition_model_type: dinov2-embedder
147
+ condition_model:
148
+ pretrained_dino_name_or_path: facebook/dinov2-with-registers-large
149
+ encode_camera: false
150
+ n_views: 20
151
+ empty_embeds_ratio: 0.0
152
+ normalize_embeds: false
153
+ zero_uncond_embeds: true
154
+ image_size_dino: 224
155
+ caption_condition_type: t5-encoder
156
+ caption_condition:
157
+ pretrained_t5_name_or_path: google-t5/t5-small
158
+ empty_embeds_ratio: 0.1
159
+ normalize_embeds: false
160
+ zero_uncond_embeds: true
161
+ caption_condition_dim: 512
162
+ text_max_length: 77
163
+ denoiser_model_type: dit-pose-denoiser
164
+ denoiser_model:
165
+ input_channels: 64
166
+ output_channels: 64
167
+ width: 1024
168
+ layers: 16
169
+ pre_heads: 16
170
+ curr_heads: 16
171
+ context_dim: 1024
172
+ init_scale: 1.0
173
+ use_checkpoint: true
174
+ condition_type: dinov2
175
+ use_rope: true
176
+ use_pe: false
177
+ use_caption: false
178
+ num_shape_latents: 512
179
+ num_pose_latents: 3
180
+ num_pcd_latents: 512
181
+ num_img_latents: 257
182
+ num_text_latents: 77
183
+ attn_mode: scene
184
+ block_mode: sep_both
185
+ proj_mode: sep
186
+ noise_scheduler_type: diffusers.schedulers.FlowMatchEulerDiscreteScheduler
187
+ noise_scheduler:
188
+ num_train_timesteps: 1000
189
+ shift: 1.0
190
+ denoise_scheduler_type: diffusers.schedulers.FlowMatchEulerDiscreteScheduler
191
+ denoise_scheduler:
192
+ num_train_timesteps: 1000
193
+ shift: 1.0
194
+ loggers:
195
+ wandb:
196
+ enable: false
197
+ project: CraftsMan
198
+ name: image-to-shape-diffusion+image-to-pose-diffusion/openset-dinov2reglarge336-PatchEmbed-pixartflow-sharpvae-dit32-160wdata-width1280-rectify-size+6Drotation+pose-direct+michelangelo-autoencoder+n16384+lr0.0001+shape2vec-pcd+scene-attn+sep_both-block+pcd2_aug-mode+lambda-rot-1+lambda-trans-1+proj-mode-sep+loss-mode-object+scene-mask-img+only-pitch-True+mixed-coord-True+render-mode+scratch+50k+rectify+train-pcd
199
+ loss:
200
+ loss_type: mse
201
+ lambda_diffusion: 1.0
202
+ optimizer:
203
+ name: AdamW
204
+ args:
205
+ lr: 0.0001
206
+ betas:
207
+ - 0.9
208
+ - 0.99
209
+ eps: 1.0e-06
210
+ scheduler:
211
+ interval: step
212
+ name: CosineAnnealingLR
213
+ args:
214
+ T_max: 20000
215
+ eta_min: 0.0001
216
+ trainer:
217
+ num_nodes: 1
218
+ max_epochs: 600
219
+ log_every_n_steps: 5
220
+ num_sanity_val_steps: 1
221
+ check_val_every_n_epoch: 5
222
+ enable_progress_bar: true
223
+ precision: bf16-mixed
224
+ strategy: deepspeed_stage_2
225
+ accumulate_grad_batches: 2
226
+ checkpoint:
227
+ save_last: true
228
+ save_top_k: -1
229
+ every_n_train_steps: 2000
SceneMaker_openset_ckpts/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9491e037e7248f5e111e79e24328cac00781c6ffe7dd7609e3307c9074ac305a
3
+ size 4148439802
new-sharp-msvae-2048-tokens.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:897296f19c0ef654e242b8164a0c9dcc6261f301d3505d76cf097a01e0104390
3
+ size 766482726