jayLEE0301 commited on
Commit
b4480bd
·
1 Parent(s): 467fc83

Release TraceGen checkpoint

Browse files
Files changed (3) hide show
  1. tracegen_bridge.pth +3 -0
  2. tracegen_bridge.safetensors +3 -0
  3. train.yaml +82 -0
tracegen_bridge.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c826c9b4515cf3e72fef3ebd491e7721a3efcf29f734e4c486d35efcd8d20bf
3
+ size 3298381340
tracegen_bridge.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e0d161cfa394f7a316bcf1c58b3a89eba3d292da00d6d1eb09cd98e46ff438
3
+ size 2698339840
train.yaml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ project: "siglip_trajectory_diffusion"
2
+ seed: 1337
3
+ num_kps: 400 # For dataset compatibility
4
+ d_model: 768
5
+ siglip_ckpt: "google/siglip-base-patch16-384"
6
+ freeze_vision: true
7
+ freeze_t5: true
8
+ t5_model: "t5-base"
9
+ absolute_action: false  # alternative: true (use absolute rather than relative actions — confirm against trainer)
10
+ dinov2_model: vit_large_patch16_dinov3.lvd1689m  # alternative: vit_base_patch16_dinov3.lvd1689m
11
+ trajectory_horizon: 32 # Number of frames to generate (future frames only)
12
+ # Model architecture
13
+ model:
14
+ vision_encoder:
15
+ patch_size: 16
16
+ image_size: 384
17
+ text_encoder:
18
+ max_length: 64
19
+ decoder:
20
+ latent_dim: 768 # Trunk output dimension
21
+ num_attention_heads: 8 # CogVideoX attention heads
22
+ attention_head_dim: 64 # CogVideoX attention head dimension
23
+ put_frames_in_channels: 2 # Fold this many frames into channels instead of the temporal dimension (B T C H W -> B T/2 C*2 H W)
24
+ in_channels: 3 # Number of input channels in latent space (CogVideoX)
25
+ out_channels: 3 # Number of output channels in latent space (CogVideoX)
26
+ num_layers: 4 # Number of CogVideoX transformer layers
27
+ num_frames: ${trajectory_horizon} # Number of frames to generate (future frames only)
28
+ frame_size: 20 # Size of latent frames (20x20)
29
+ patch_size: 2 # Patch size for latents
30
+ patch_size_t: 1 # Patch size for temporal dimension
31
+ max_text_seq_length: 704 # Maximum text sequence length for CogVideoX
32
+ text_embed_dim: 768 # Text embedding dimension for CogVideoX
33
+ use_rotary_positional_embeddings: false # Use rotary embeddings
34
+ scale_factor: 0.7 # CogVideoX-specific scaling factor
35
+ scale_factor_spatial: 1 # Spatial scaling factor
36
+ scale_factor_temporal: 1 # Temporal scaling factor
37
+ enable_encoder_hidden_states_grad: true # Enable gradient flow through conditioning
38
+ # device: "cuda:1"
39
+
40
+ # Training configuration
41
+ train:
42
+ epochs: 700
43
+ batch_size: 32
44
+ lr_decoder: 2.0e-4
45
+ lr_backbone: 2.0e-5
46
+ weight_decay: 0.05
47
+ warmup_steps: 100
48
+ clip_grad_norm: 1.0
49
+ save_every: 1
50
+ num_log_steps_per_epoch: 0
51
+ eval_every: 1
52
+ visualize_every: 1
53
+ visualize_during_validation: true
54
+
55
+ # Data configuration
56
+ # NOTE: dataset_dirs should be overridden in train.local.yaml
57
+ data:
58
+ dataset_dirs: [] # Override this in .local.yaml with your machine-specific paths
59
+ cache_dir: "./dataset_cache" # Where to store cache files
60
+ val_split: 0.01 # 1% of episodes for validation
61
+ random_seed: 42 # For reproducible train/val split
62
+ num_workers: 16
63
+ pin_memory: false
64
+ augmentation:
65
+
66
+ # Logging configuration
67
+ # NOTE: checkpoint_dir should be overridden in train.local.yaml
68
+ logging:
69
+ wandb_project: "tracegen"
70
+ use_wandb: false # Set to false to disable wandb
71
+ log_every: 100
72
+ save_dir: "./checkpoints"
73
+ checkpoint_dir: "./checkpoint/" # Override this in .local.yaml with your machine-specific path
74
+
75
+ # Hardware configuration
76
+ hardware:
77
+ device: "cuda"
78
+ mixed_precision: true
79
+ compile_model: true
80
+
81
+ # NOTE: test_path should be overridden in train.local.yaml
82
+ test_path: null # Override this in .local.yaml with your machine-specific path