OpenDriveLab-org committed on
Commit
be9143b
·
verified ·
1 Parent(s): 06c863e

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ resim_data_jsons/navsim_token2info_train_list.json filter=lfs diff=lfs merge=lfs -text
37
+ resim_data_jsons/navsim_token2info_val_list.json filter=lfs diff=lfs merge=lfs -text
38
+ resim_data_jsons/nus_val_4k.json filter=lfs diff=lfs merge=lfs -text
resim_ckpts/exp0_no_carla/30000-ema/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd35a2490883ec9bcf810c5a5f90cf7313273f07be5a205638ba4e78fab62314
3
+ size 23667958479
resim_ckpts/exp0_no_carla/30000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de5e087fffaf3a7eba4e8dcf2553830963119b461659880c6b0ddd67d7ddd07
3
+ size 23667958479
resim_ckpts/exp0_no_carla/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ 30000
resim_ckpts/exp0_no_carla/model_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_class": "SATVideoDiffusionEngine",
3
+ "model_parallel_size": 1
4
+ }
resim_ckpts/exp0_no_carla/training_config.yaml ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ args:
2
+ checkpoint_activations: true
3
+ model_parallel_size: 1
4
+ experiment_name: nus
5
+ mode: finetune
6
+ load: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/sat/ckpts0/main5_joint_stage2_high_small-lr-12-08-06-48
7
+ no_load_rng: true
8
+ train_iters: 300000
9
+ eval_iters: 1
10
+ eval_interval: 10000
11
+ eval_batch_size: 1
12
+ save: ckpts
13
+ save_interval: 2500
14
+ log_interval: 20
15
+ train_data:
16
+ - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/youtube_json/YouTube_svd_clip-len-49_interval-10_5M_flow_round2.json
17
+ - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/navsim/token2info_train_list.json
18
+ valid_data:
19
+ - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/navsim/token2info_test_all_list.json
20
+ train_data_weights:
21
+ - 1
22
+ - 40
23
+ split: 1,0,0
24
+ num_workers: 8
25
+ force_train: true
26
+ only_log_video_latents: false
27
+ lr_decay_style: constant
28
+ data:
29
+ target: data_multi.MultiSourceDataset
30
+ params:
31
+ video_size:
32
+ - 512
33
+ - 896
34
+ fps: 10
35
+ max_num_frames: 49
36
+ skip_frms_num: 7.0
37
+ prefix_prompt: This video depicts a realistic view from the driver's perspective
38
+ of a car driving on the road.
39
+ merge_static: true
40
+ exclude_highly_static: true
41
+ p_mask_out_heading: 0.5
42
+ p_drop_action_caption: 0.5
43
+ n_repeat_of_actions:
44
+ Static: 1
45
+ Moving_Forward: 1
46
+ Turning_Left: 5
47
+ Turning_Right: 5
48
+ deepspeed:
49
+ train_micro_batch_size_per_gpu: 2
50
+ gradient_accumulation_steps: 1
51
+ steps_per_print: 50
52
+ gradient_clipping: 0.1
53
+ zero_optimization:
54
+ stage: 2
55
+ cpu_offload: false
56
+ contiguous_gradients: false
57
+ overlap_comm: true
58
+ reduce_scatter: true
59
+ reduce_bucket_size: 1000000000
60
+ allgather_bucket_size: 1000000000
61
+ load_from_fp32_weights: false
62
+ zero_allow_untested_optimizer: true
63
+ bf16:
64
+ enabled: false
65
+ fp16:
66
+ enabled: true
67
+ loss_scale: 0
68
+ loss_scale_window: 400
69
+ hysteresis: 2
70
+ min_loss_scale: 1
71
+ optimizer:
72
+ type: sat.ops.FusedEmaAdam
73
+ params:
74
+ lr: 1.0e-05
75
+ betas:
76
+ - 0.9
77
+ - 0.95
78
+ eps: 1.0e-08
79
+ weight_decay: 0.0001
80
+ activation_checkpointing:
81
+ partition_activations: false
82
+ contiguous_memory_optimization: false
83
+ wall_clock_breakdown: false
84
+ model:
85
+ scale_factor: 1.15258426
86
+ disable_first_stage_autocast: true
87
+ log_keys:
88
+ - txt
89
+ en_and_decode_n_samples_a_time: 1
90
+ en_and_decode_n_frames_a_time: 17
91
+ truncate_n_frames_decode: 8
92
+ cond_inds_sampling:
93
+ - 0
94
+ - 1
95
+ - 2
96
+ denoiser_config:
97
+ target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
98
+ params:
99
+ num_idx: 1000
100
+ quantize_c_noise: false
101
+ weighting_config:
102
+ target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
103
+ scaling_config:
104
+ target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
105
+ discretization_config:
106
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
107
+ params:
108
+ shift_scale: 3.0
109
+ network_config:
110
+ target: dit_video_concat.DiffusionTransformer
111
+ params:
112
+ time_embed_dim: 512
113
+ elementwise_affine: true
114
+ num_frames: 49
115
+ time_compressed_rate: 4
116
+ latent_width: 112
117
+ latent_height: 64
118
+ num_layers: 30
119
+ patch_size: 2
120
+ in_channels: 16
121
+ out_channels: 16
122
+ hidden_size: 1920
123
+ adm_in_channels: 256
124
+ num_attention_heads: 30
125
+ transformer_args:
126
+ checkpoint_activations: true
127
+ vocab_size: 1
128
+ max_sequence_length: 64
129
+ layernorm_order: pre
130
+ skip_init: false
131
+ model_parallel_size: 1
132
+ is_decoder: false
133
+ modules:
134
+ pos_embed_config:
135
+ target: dit_video_concat.Basic3DPositionEmbeddingMixin
136
+ params:
137
+ text_length: 235
138
+ height_interpolation: 2.0
139
+ width_interpolation: 2.3333
140
+ lora_config:
141
+ target: sat.model.finetune.lora2.LoraMixin
142
+ params:
143
+ r: 128
144
+ patch_embed_config:
145
+ target: dit_video_concat.ImagePatchEmbeddingMixin
146
+ params:
147
+ text_hidden_size: 4096
148
+ cond_emb_in_dim: 512
149
+ adaln_layer_config:
150
+ target: dit_video_concat.AdaLNMixin
151
+ params:
152
+ qk_ln: true
153
+ final_layer_config:
154
+ target: dit_video_concat.FinalLayerMixin
155
+ conditioner_config:
156
+ target: sgm.modules.GeneralConditioner
157
+ params:
158
+ emb_models:
159
+ - is_trainable: false
160
+ input_key: txt
161
+ ucg_rate: 0.2
162
+ target: sgm.modules.encoders.modules.FrozenT5Embedder
163
+ params:
164
+ model_dir: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/ckpts/CogVideoX-2b-sat/t5-v1_1-xxl
165
+ max_length: 226
166
+ - is_trainable: true
167
+ input_key: fut_traj
168
+ ucg_rate: 0.5
169
+ target: sgm.modules.encoders.traj_encoder.TrajEncoder
170
+ params:
171
+ seq_len: 8
172
+ dim: 1024
173
+ out_dim: 4096
174
+ depth: 2
175
+ mlp_dim: 2048
176
+ pos_emb: sine
177
+ avoid_first_ln: true
178
+ zero_init: true
179
+ use_all_tokens: true
180
+ first_stage_config:
181
+ target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
182
+ params:
183
+ cp_size: 1
184
+ ckpt_path: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/ckpts/CogVideoX-2b-sat/vae/3d-vae.pt
185
+ ignore_keys:
186
+ - loss
187
+ loss_config:
188
+ target: torch.nn.Identity
189
+ regularizer_config:
190
+ target: vae_modules.regularizers.DiagonalGaussianRegularizer
191
+ encoder_config:
192
+ target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
193
+ params:
194
+ double_z: true
195
+ z_channels: 16
196
+ resolution: 256
197
+ in_channels: 3
198
+ out_ch: 3
199
+ ch: 128
200
+ ch_mult:
201
+ - 1
202
+ - 2
203
+ - 2
204
+ - 4
205
+ attn_resolutions: []
206
+ num_res_blocks: 3
207
+ dropout: 0.0
208
+ gather_norm: true
209
+ decoder_config:
210
+ target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
211
+ params:
212
+ double_z: true
213
+ z_channels: 16
214
+ resolution: 256
215
+ in_channels: 3
216
+ out_ch: 3
217
+ ch: 128
218
+ ch_mult:
219
+ - 1
220
+ - 2
221
+ - 2
222
+ - 4
223
+ attn_resolutions: []
224
+ num_res_blocks: 3
225
+ dropout: 0.0
226
+ gather_norm: false
227
+ loss_fn_config:
228
+ target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
229
+ params:
230
+ offset_noise_level: 0
231
+ cond_inds:
232
+ - []
233
+ - - 0
234
+ - - 0
235
+ - 1
236
+ - - 0
237
+ - 1
238
+ - 2
239
+ cond_inds_prob:
240
+ - 0.1
241
+ - 0.15
242
+ - 0.15
243
+ - 0.6
244
+ apply_cond_aug: V2
245
+ sigma_sampler_config:
246
+ target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
247
+ params:
248
+ uniform_sampling: false
249
+ custom_sampling: true
250
+ num_idx: 1000
251
+ discretization_config:
252
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
253
+ params:
254
+ shift_scale: 3.0
255
+ sampler_config:
256
+ target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
257
+ params:
258
+ num_steps: 50
259
+ verbose: true
260
+ fixed_frames: 3
261
+ cond_inds_sampling:
262
+ - 0
263
+ - 1
264
+ - 2
265
+ apply_cond_aug: V2
266
+ apply_cond_aug_chunk_inference: min
267
+ discretization_config:
268
+ target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
269
+ params:
270
+ shift_scale: 3.0
271
+ guider_config:
272
+ target: sgm.modules.diffusionmodules.guiders.DynamicCFG
273
+ params:
274
+ scale: 6
275
+ exp: 5
276
+ num_steps: 50
resim_data_jsons/navsim_token2info_train_list.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e2f740701e2b59ad3170d8aa9c0862046fd8a22655b6038961edf1470c896d6
3
+ size 524831112
resim_data_jsons/navsim_token2info_val_list.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6753a55d04e9ceaaa25f560aa42fe80896c5f20fde6033aafb864f7beb0076d2
3
+ size 111971071
resim_data_jsons/nus_val_4k.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89a741317e69195b8722d41e9c967438bb414ebb7c39df58297b79ac4a4f3d45
3
+ size 27978143