project: "siglip_trajectory_diffusion"
seed: 1337
num_kps: 400 # For dataset compatibility
d_model: 768
siglip_ckpt: "google/siglip-base-patch16-384"
freeze_vision: true
freeze_t5: true
t5_model: "t5-base"
absolute_action: false # Set to true to use absolute actions
dinov2_model: vit_large_patch16_dinov3.lvd1689m # Alternative: vit_base_patch16_dinov3.lvd1689m
trajectory_horizon: 32 # Number of frames to generate (future frames only)
# Model architecture
model:
    vision_encoder:
        patch_size: 16
        image_size: 384
    text_encoder:
        max_length: 64
    decoder:
        latent_dim: 768 # Trunk output dimension
        num_attention_heads: 8 # CogVideoX attention heads
        attention_head_dim: 64 # CogVideoX attention head dimension
        put_frames_in_channels: 2 # Pack groups of frames into channels instead of the temporal dimension (B T C H W -> B T/n C*n H W, where n is this value)
        in_channels: 3 # Number of input channels in latent space (CogVideoX)
        out_channels: 3 # Number of output channels in latent space (CogVideoX)
        num_layers: 4 # Number of CogVideoX transformer layers
        num_frames: ${trajectory_horizon} # Number of frames to generate (future frames only)
        frame_size: 20 # Spatial size of latent frames (20x20)
        patch_size: 2 # Patch size for latents
        patch_size_t: 1 # Patch size for temporal dimension
        max_text_seq_length: 704 # Maximum text sequence length for CogVideoX
        text_embed_dim: 768 # Text embedding dimension for CogVideoX
        use_rotary_positional_embeddings: false # Use rotary embeddings
        scale_factor: 0.7 # CogVideoX-specific scaling factor
        scale_factor_spatial: 1 # Spatial scaling factor
        scale_factor_temporal: 1 # Temporal scaling factor
        enable_encoder_hidden_states_grad: true # Enable gradient flow through conditioning
        # device: "cuda:1"
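        # Rough latent sequence-length sketch (an assumption derived from the comments above,
        # not taken from the training code): frame_size / patch_size = 20 / 2 = 10, so each
        # latent frame yields 10 x 10 = 100 tokens; packing num_frames = 32 in groups of 2
        # with patch_size_t = 1 gives 16 temporal slots, i.e. roughly 16 * 100 = 1600 visual
        # tokens per sample, before the text tokens (up to max_text_seq_length = 704) are added.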

# Training configuration
train:
    epochs: 700
    batch_size: 32
    lr_decoder: 2.0e-4
    lr_backbone: 2.0e-5
    weight_decay: 0.05
    warmup_steps: 100
    clip_grad_norm: 1.0
    save_every: 1
    num_log_steps_per_epoch: 0
    eval_every: 1
    visualize_every: 1
    visualize_during_validation: true

# Data configuration
# NOTE: dataset_dirs should be overridden in train.local.yaml
data:
    dataset_dirs: []  # Override this in .local.yaml with your machine-specific paths
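    # Minimal train.local.yaml override sketch (the paths below are hypothetical
    # placeholders, not real dataset locations):
    # data:
    #     dataset_dirs:
    #         - "/path/to/episodes_machine_a"
    #         - "/path/to/episodes_machine_b"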
    cache_dir: "./dataset_cache"  # Where to store cache files
    val_split: 0.01 # 1% of episodes for validation
    random_seed: 42 # For reproducible train/val split
    num_workers: 16
    pin_memory: false
    augmentation: # Empty: no augmentation settings specified by default

# Logging configuration
# NOTE: checkpoint_dir should be overridden in train.local.yaml
logging:
    wandb_project: "tracegen"
    use_wandb: false # Set to false to disable wandb
    log_every: 100
    save_dir: "./checkpoints"
    checkpoint_dir: "./checkpoint/"  # Override this in .local.yaml with your machine-specific path
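    # Corresponding train.local.yaml override sketch (hypothetical path):
    # logging:
    #     checkpoint_dir: "/your/machine/specific/checkpoints/"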

# Hardware configuration
hardware:
    device: "cuda"
    mixed_precision: true
    compile_model: true

# NOTE: test_path should be overridden in train.local.yaml
test_path: null  # Override this in .local.yaml with your machine-specific path
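# Example train.local.yaml override sketch (hypothetical path, adjust to your setup):
# test_path: "/path/to/test_episodes"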