project: "siglip_trajectory_diffusion"
seed: 1337
num_kps: 400  # For dataset compatibility
d_model: 768
siglip_ckpt: "google/siglip-base-patch16-384"
freeze_vision: true
freeze_t5: true
t5_model: "t5-base"
absolute_action: false  # alternative: true
dinov2_model: "vit_large_patch16_dinov3.lvd1689m"  # alternative: vit_base_patch16_dinov3.lvd1689m
trajectory_horizon: 32  # Number of frames to generate (future frames only)
# Model architecture
model:
  vision_encoder:
    patch_size: 16
    image_size: 384
  text_encoder:
    max_length: 64
  decoder:
    latent_dim: 768  # Trunk output dimension
    num_attention_heads: 8  # CogVideoX attention heads
    attention_head_dim: 64  # CogVideoX attention head dimension
    put_frames_in_channels: 2  # Put frames in channels instead of temporal dimension (B T C H W -> B T/4 C*4 H W)
    in_channels: 3  # Number of input channels in latent space (CogVideoX)
    out_channels: 3  # Number of output channels in latent space (CogVideoX)
    num_layers: 4  # Number of CogVideoX transformer layers
    num_frames: ${trajectory_horizon}  # Number of frames to generate (future frames only)
    frame_size: 20  # Spatial size of latent frames (20x20)
    patch_size: 2  # Patch size for latents
    patch_size_t: 1  # Patch size for temporal dimension
    max_text_seq_length: 704  # Maximum text sequence length for CogVideoX
    text_embed_dim: 768  # Text embedding dimension for CogVideoX
    use_rotary_positional_embeddings: false  # Use rotary embeddings
    scale_factor: 0.7  # CogVideoX-specific scaling factor
    scale_factor_spatial: 1  # Spatial scaling factor
    scale_factor_temporal: 1  # Temporal scaling factor
    enable_encoder_hidden_states_grad: true  # Enable gradient flow through conditioning
# device: "cuda:1"
# Training configuration
train:
  epochs: 700
  batch_size: 32
  lr_decoder: 2.0e-4
  lr_backbone: 2.0e-5
  weight_decay: 0.05
  warmup_steps: 100
  clip_grad_norm: 1.0
  save_every: 1
  num_log_steps_per_epoch: 0
  eval_every: 1
  visualize_every: 1
  visualize_during_validation: true
# Data configuration
# NOTE: dataset_dirs should be overridden in train.local.yaml
data:
  dataset_dirs: []  # Override this in .local.yaml with your machine-specific paths
  cache_dir: "./dataset_cache"  # Where to store cache files
  val_split: 0.01  # 1% of episodes for validation
  random_seed: 42  # For reproducible train/val split
  num_workers: 16
  pin_memory: false
  augmentation: {}  # explicit empty mapping (bare `augmentation:` would parse as null)
# Logging configuration
# NOTE: checkpoint_dir should be overridden in train.local.yaml
logging:
  wandb_project: "tracegen"
  use_wandb: false  # Set to false to disable wandb
  log_every: 100
  save_dir: "./checkpoints"
  checkpoint_dir: "./checkpoint/"  # Override this in .local.yaml with your machine-specific path
# Hardware configuration
hardware:
  device: "cuda"
  mixed_precision: true
  compile_model: true
# NOTE: test_path should be overridden in train.local.yaml
test_path: null  # Override this in .local.yaml with your machine-specific path