anthony.gosselin committed on
Commit
2006222
·
1 Parent(s): 5dabcda
checkpoint-33400/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cea5b252d6739c4a0ebc2110ad460797b3f0020e48a4d8585fadeb568266f10f
3
+ size 12198228894
checkpoint-33400/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:033caefb0ceb526b1e246a589a060a8550d522d3910eca5a7de325ca2dc7f2ef
3
+ size 14344
checkpoint-33400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c9e92d7ca7e11f8c8280a110c0e95f7fc8368104e07cf4940880e3ea39426da
3
+ size 988
checkpoint-33400/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2b3dfff9c1ad4d9153cf1a31ee06b9a882cf0259a3aa7a98c6220f5eeb88c15
3
+ size 1000
checkpoint-33400/unet/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNetSpatioTemporalConditionModel",
3
+ "_diffusers_version": "0.27.2",
4
+ "_name_or_path": "/network/scratch/a/anthony.gosselin/Results/ctrlv/nuscenes_box_predict_2/checkpoint-22400",
5
+ "addition_time_embed_dim": 256,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "cross_attention_dim": 1024,
13
+ "down_block_types": [
14
+ "CrossAttnDownBlockSpatioTemporal",
15
+ "CrossAttnDownBlockSpatioTemporal",
16
+ "CrossAttnDownBlockSpatioTemporal",
17
+ "DownBlockSpatioTemporal"
18
+ ],
19
+ "in_channels": 8,
20
+ "layers_per_block": 2,
21
+ "num_attention_heads": [
22
+ 5,
23
+ 10,
24
+ 20,
25
+ 20
26
+ ],
27
+ "num_frames": 25,
28
+ "out_channels": 4,
29
+ "projection_class_embeddings_input_dim": 768,
30
+ "sample_size": 96,
31
+ "transformer_layers_per_block": 1,
32
+ "up_block_types": [
33
+ "UpBlockSpatioTemporal",
34
+ "CrossAttnUpBlockSpatioTemporal",
35
+ "CrossAttnUpBlockSpatioTemporal",
36
+ "CrossAttnUpBlockSpatioTemporal"
37
+ ]
38
+ }
checkpoint-33400/unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ba9a0bba0f415a5d0c135e3e48922ce1b89a605eb4edcde9903bdd942ac5d0
3
+ size 6099139590
train_scripts.sh ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9
2
+ # Training launcher: sets run paths, snapshots this script into the output directory, then starts tools/train_video_diffusion.py via accelerate.
3
+ timestamp=$(date +%y%m%d_%H%M%S) # only referenced by the commented-out NAME variant below
4
+ DATASET="nuscenes" #"kitti/vkitti/bdd100k/..."
5
+ DATASET_PATH="/network/scratch/a/anthony.gosselin"
6
+ NAME="${DATASET}_box_predict_2" #"${DATASET}_box_predict_${timestamp}"
7
+ OUT_DIR="/network/scratch/a/anthony.gosselin/Results/ctrlv/${NAME}"
8
+ mkdir -p "$OUT_DIR"
9
+ # Value forwarded to --project_name in the launch command.
10
+ PROJECT_NAME='ctrl_v'
11
+ # Keep a copy of this launcher next to the run outputs.
12
+ SCRIPT_PATH=$0
13
+ SAVE_SCRIPT_PATH="${OUT_DIR}/train_scripts.sh"
14
+ cp "$SCRIPT_PATH" "$SAVE_SCRIPT_PATH"
15
+ echo "Saved script to ${SAVE_SCRIPT_PATH}"
16
+ # Expansions are quoted so each value reaches the trainer as exactly one argument, even if it contains whitespace or glob characters.
17
+ CUDA_LAUNCH_BLOCKING=1 accelerate launch tools/train_video_diffusion.py \
18
+ --run_name "$NAME" \
19
+ --data_root "$DATASET_PATH" \
20
+ --project_name "$PROJECT_NAME" \
21
+ --pretrained_model_name_or_path stabilityai/stable-video-diffusion-img2vid-xt \
22
+ --output_dir "$OUT_DIR" \
23
+ --variant fp16 \
24
+ --dataset_name "$DATASET" \
25
+ --train_batch_size 1 \
26
+ --learning_rate 5e-6 \
27
+ --checkpoints_total_limit 2 \
28
+ --checkpointing_steps 200 \
29
+ --gradient_accumulation_steps 5 \
30
+ --validation_steps 100 \
31
+ --enable_gradient_checkpointing \
32
+ --lr_scheduler constant \
33
+ --report_to wandb \
34
+ --seed 1234 \
35
+ --mixed_precision fp16 \
36
+ --clip_length 25 \
37
+ --min_guidance_scale 3 \
38
+ --max_guidance_scale 7 \
39
+ --noise_aug_strength 0.01 \
40
+ --bbox_dropout_prob 0.1 \
41
+ --conditioning_dropout_prob 0.0 \
42
+ --num_demo_samples 10 \
43
+ --backprop_temporal_blocks_start_iter -1 \
44
+ --num_train_epochs 2 \
45
+ --predict_bbox \
46
+ --num_inference_steps 30 \
47
+ --resume_from_checkpoint latest \
48
+ --num_cond_bbox_frames 3 \
49
+ --wandb_entity chris-pal \
50
+ --fps 7
51
+ # --if_last_frame_trajectory
unet/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNetSpatioTemporalConditionModel",
3
+ "_diffusers_version": "0.27.2",
4
+ "_name_or_path": "/network/scratch/a/anthony.gosselin/Results/ctrlv/nuscenes_box_predict_2/checkpoint-21000",
5
+ "addition_time_embed_dim": 256,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "cross_attention_dim": 1024,
13
+ "down_block_types": [
14
+ "CrossAttnDownBlockSpatioTemporal",
15
+ "CrossAttnDownBlockSpatioTemporal",
16
+ "CrossAttnDownBlockSpatioTemporal",
17
+ "DownBlockSpatioTemporal"
18
+ ],
19
+ "in_channels": 8,
20
+ "layers_per_block": 2,
21
+ "num_attention_heads": [
22
+ 5,
23
+ 10,
24
+ 20,
25
+ 20
26
+ ],
27
+ "num_frames": 25,
28
+ "out_channels": 4,
29
+ "projection_class_embeddings_input_dim": 768,
30
+ "sample_size": 96,
31
+ "transformer_layers_per_block": 1,
32
+ "up_block_types": [
33
+ "UpBlockSpatioTemporal",
34
+ "CrossAttnUpBlockSpatioTemporal",
35
+ "CrossAttnUpBlockSpatioTemporal",
36
+ "CrossAttnUpBlockSpatioTemporal"
37
+ ]
38
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a013992e7f9ac1259b96f764dc753d18f51624a5b193b9895d5916d324d3ce68
3
+ size 6098682464