ReSim_Assets

Model card | Files and versions

xet

Community

OpenDriveLab-org committed 3 days ago

Commit

be9143b

verified ·

1 Parent(s): 06c863e

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

.gitattributes +3 -0
resim_ckpts/exp0_no_carla/30000-ema/mp_rank_00_model_states.pt +3 -0
resim_ckpts/exp0_no_carla/30000/mp_rank_00_model_states.pt +3 -0
resim_ckpts/exp0_no_carla/latest +1 -0
resim_ckpts/exp0_no_carla/model_config.json +4 -0
resim_ckpts/exp0_no_carla/training_config.yaml +276 -0
resim_data_jsons/navsim_token2info_train_list.json +3 -0
resim_data_jsons/navsim_token2info_val_list.json +3 -0
resim_data_jsons/nus_val_4k.json +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+resim_data_jsons/navsim_token2info_train_list.json filter=lfs diff=lfs merge=lfs -text
+resim_data_jsons/navsim_token2info_val_list.json filter=lfs diff=lfs merge=lfs -text
+resim_data_jsons/nus_val_4k.json filter=lfs diff=lfs merge=lfs -text

resim_ckpts/exp0_no_carla/30000-ema/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bd35a2490883ec9bcf810c5a5f90cf7313273f07be5a205638ba4e78fab62314
+size 23667958479

resim_ckpts/exp0_no_carla/30000/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7de5e087fffaf3a7eba4e8dcf2553830963119b461659880c6b0ddd67d7ddd07
+size 23667958479

resim_ckpts/exp0_no_carla/latest ADDED Viewed

	@@ -0,0 +1 @@


+30000

resim_ckpts/exp0_no_carla/model_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "model_class": "SATVideoDiffusionEngine",
+    "model_parallel_size": 1
+}

resim_ckpts/exp0_no_carla/training_config.yaml ADDED Viewed

	@@ -0,0 +1,276 @@

+args:
+  checkpoint_activations: true
+  model_parallel_size: 1
+  experiment_name: nus
+  mode: finetune
+  load: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/sat/ckpts0/main5_joint_stage2_high_small-lr-12-08-06-48
+  no_load_rng: true
+  train_iters: 300000
+  eval_iters: 1
+  eval_interval: 10000
+  eval_batch_size: 1
+  save: ckpts
+  save_interval: 2500
+  log_interval: 20
+  train_data:
+  - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/youtube_json/YouTube_svd_clip-len-49_interval-10_5M_flow_round2.json
+  - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/navsim/token2info_train_list.json
+  valid_data:
+  - /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/custom_data/navsim/token2info_test_all_list.json
+  train_data_weights:
+  - 1
+  - 40
+  split: 1,0,0
+  num_workers: 8
+  force_train: true
+  only_log_video_latents: false
+  lr_decay_style: constant
+data:
+  target: data_multi.MultiSourceDataset
+  params:
+    video_size:
+    - 512
+    - 896
+    fps: 10
+    max_num_frames: 49
+    skip_frms_num: 7.0
+    prefix_prompt: This video depicts a realistic view from the driver's perspective
+      of a car driving on the road.
+    merge_static: true
+    exclude_highly_static: true
+    p_mask_out_heading: 0.5
+    p_drop_action_caption: 0.5
+    n_repeat_of_actions:
+      Static: 1
+      Moving_Forward: 1
+      Turning_Left: 5
+      Turning_Right: 5
+deepspeed:
+  train_micro_batch_size_per_gpu: 2
+  gradient_accumulation_steps: 1
+  steps_per_print: 50
+  gradient_clipping: 0.1
+  zero_optimization:
+    stage: 2
+    cpu_offload: false
+    contiguous_gradients: false
+    overlap_comm: true
+    reduce_scatter: true
+    reduce_bucket_size: 1000000000
+    allgather_bucket_size: 1000000000
+    load_from_fp32_weights: false
+  zero_allow_untested_optimizer: true
+  bf16:
+    enabled: false
+  fp16:
+    enabled: true
+    loss_scale: 0
+    loss_scale_window: 400
+    hysteresis: 2
+    min_loss_scale: 1
+  optimizer:
+    type: sat.ops.FusedEmaAdam
+    params:
+      lr: 1.0e-05
+      betas:
+      - 0.9
+      - 0.95
+      eps: 1.0e-08
+      weight_decay: 0.0001
+  activation_checkpointing:
+    partition_activations: false
+    contiguous_memory_optimization: false
+  wall_clock_breakdown: false
+model:
+  scale_factor: 1.15258426
+  disable_first_stage_autocast: true
+  log_keys:
+  - txt
+  en_and_decode_n_samples_a_time: 1
+  en_and_decode_n_frames_a_time: 17
+  truncate_n_frames_decode: 8
+  cond_inds_sampling:
+  - 0
+  - 1
+  - 2
+  denoiser_config:
+    target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+    params:
+      num_idx: 1000
+      quantize_c_noise: false
+      weighting_config:
+        target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
+      scaling_config:
+        target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 3.0
+  network_config:
+    target: dit_video_concat.DiffusionTransformer
+    params:
+      time_embed_dim: 512
+      elementwise_affine: true
+      num_frames: 49
+      time_compressed_rate: 4
+      latent_width: 112
+      latent_height: 64
+      num_layers: 30
+      patch_size: 2
+      in_channels: 16
+      out_channels: 16
+      hidden_size: 1920
+      adm_in_channels: 256
+      num_attention_heads: 30
+      transformer_args:
+        checkpoint_activations: true
+        vocab_size: 1
+        max_sequence_length: 64
+        layernorm_order: pre
+        skip_init: false
+        model_parallel_size: 1
+        is_decoder: false
+      modules:
+        pos_embed_config:
+          target: dit_video_concat.Basic3DPositionEmbeddingMixin
+          params:
+            text_length: 235
+            height_interpolation: 2.0
+            width_interpolation: 2.3333
+        lora_config:
+          target: sat.model.finetune.lora2.LoraMixin
+          params:
+            r: 128
+        patch_embed_config:
+          target: dit_video_concat.ImagePatchEmbeddingMixin
+          params:
+            text_hidden_size: 4096
+            cond_emb_in_dim: 512
+        adaln_layer_config:
+          target: dit_video_concat.AdaLNMixin
+          params:
+            qk_ln: true
+        final_layer_config:
+          target: dit_video_concat.FinalLayerMixin
+  conditioner_config:
+    target: sgm.modules.GeneralConditioner
+    params:
+      emb_models:
+      - is_trainable: false
+        input_key: txt
+        ucg_rate: 0.2
+        target: sgm.modules.encoders.modules.FrozenT5Embedder
+        params:
+          model_dir: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/ckpts/CogVideoX-2b-sat/t5-v1_1-xxl
+          max_length: 226
+      - is_trainable: true
+        input_key: fut_traj
+        ucg_rate: 0.5
+        target: sgm.modules.encoders.traj_encoder.TrajEncoder
+        params:
+          seq_len: 8
+          dim: 1024
+          out_dim: 4096
+          depth: 2
+          mlp_dim: 2048
+          pos_emb: sine
+          avoid_first_ln: true
+          zero_init: true
+          use_all_tokens: true
+  first_stage_config:
+    target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
+    params:
+      cp_size: 1
+      ckpt_path: /cpfs01/user/yangjiazhi/workspace/DVGen/CogVideo/ckpts/CogVideoX-2b-sat/vae/3d-vae.pt
+      ignore_keys:
+      - loss
+      loss_config:
+        target: torch.nn.Identity
+      regularizer_config:
+        target: vae_modules.regularizers.DiagonalGaussianRegularizer
+      encoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelEncoder3D
+        params:
+          double_z: true
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 2
+          - 4
+          attn_resolutions: []
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: true
+      decoder_config:
+        target: vae_modules.cp_enc_dec.ContextParallelDecoder3D
+        params:
+          double_z: true
+          z_channels: 16
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 2
+          - 4
+          attn_resolutions: []
+          num_res_blocks: 3
+          dropout: 0.0
+          gather_norm: false
+  loss_fn_config:
+    target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
+    params:
+      offset_noise_level: 0
+      cond_inds:
+      - []
+      - - 0
+      - - 0
+        - 1
+      - - 0
+        - 1
+        - 2
+      cond_inds_prob:
+      - 0.1
+      - 0.15
+      - 0.15
+      - 0.6
+      apply_cond_aug: V2
+      sigma_sampler_config:
+        target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
+        params:
+          uniform_sampling: false
+          custom_sampling: true
+          num_idx: 1000
+          discretization_config:
+            target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+            params:
+              shift_scale: 3.0
+  sampler_config:
+    target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler
+    params:
+      num_steps: 50
+      verbose: true
+      fixed_frames: 3
+      cond_inds_sampling:
+      - 0
+      - 1
+      - 2
+      apply_cond_aug: V2
+      apply_cond_aug_chunk_inference: min
+      discretization_config:
+        target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization
+        params:
+          shift_scale: 3.0
+      guider_config:
+        target: sgm.modules.diffusionmodules.guiders.DynamicCFG
+        params:
+          scale: 6
+          exp: 5
+          num_steps: 50

resim_data_jsons/navsim_token2info_train_list.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e2f740701e2b59ad3170d8aa9c0862046fd8a22655b6038961edf1470c896d6
+size 524831112

resim_data_jsons/navsim_token2info_val_list.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6753a55d04e9ceaaa25f560aa42fe80896c5f20fde6033aafb864f7beb0076d2
+size 111971071

resim_data_jsons/nus_val_4k.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89a741317e69195b8722d41e9c967438bb414ebb7c39df58297b79ac4a4f3d45
+size 27978143