# model / config.yaml
# Hugging Face file-page header (not part of the configuration):
#   tlefur's picture
#   Upload folder using huggingface_hub
#   cbfefc3 verified
# Checkpoint settings.
# Booleans and integers in this section are quoted, so a YAML parser returns
# them as strings (e.g. 'False', '100') rather than bool/int.
# NOTE(review): confirm the consumer converts them; a non-empty string such as
# 'False' is truthy in Python.
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the dump (the source had lost its indentation); confirm `verbose` belongs at
# this level rather than under `type`.
checkpoint:
  broadcast_via_filesystem: 'False'
  dcp_allow_mismatched_size: 'False'
  dcp_async_mode_enabled: 'False'
  # JIT options; `enabled` is 'False' here.
  jit:
    device: cuda
    dtype: bfloat16
    enabled: 'False'
    input_shape: null
    strict: 'True'
  keys_not_to_resume: []
  load_ema_to_reg: 'False'
  # Empty string: no load path is configured in this file.
  load_path: ''
  load_training_state: 'False'
  only_load_scheduler_state: 'False'
  save_iter: '100'
  strict_resume: 'True'
  # The target is stored as a Python class repr, not a dotted import path.
  type:
    _target_: <class 'cosmos_predict2.checkpointer.Checkpointer'>
    callbacks: null
  verbose: 'True'
# No data_config is set; the key is explicitly null.
data_config: null
# Training data loader.
# Scalar settings are quoted, so they parse as strings ('2', 'True', ...).
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the dump; `shuffle`, `timeout` and `worker_init_fn` are placed on the loader,
# not on the sampler — confirm.
dataloader_train:
  _target_: <class 'torch.utils.data.dataloader.DataLoader'>
  batch_sampler: null
  batch_size: '2'
  collate_fn: null
  # Training split (is_val is 'False').
  dataset:
    _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
    data_fps: '30.0'
    dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
    exclude_with_substring: null
    include_only_with_substrings: null
    is_multi_img: 'False'
    is_val: 'False'
    num_frames: '61'
    obs_history: '5'
    val_ratio: '0.0'
    video_size:
      - '480'
      - '640'
  drop_last: 'True'
  generator: null
  in_order: 'False'
  multiprocessing_context: null
  num_workers: '12'
  persistent_workers: 'True'
  pin_memory: 'True'
  pin_memory_device: ''
  prefetch_factor: '8'
  sampler:
    # Function repr containing a process-specific memory address; it is not an
    # importable dotted path.
    _target_: <function get_sampler at 0x79dcbfd48310>
    # Same settings as the loader-level `dataset` entry.
    dataset:
      _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
      data_fps: '30.0'
      dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
      exclude_with_substring: null
      include_only_with_substrings: null
      is_multi_img: 'False'
      is_val: 'False'
      num_frames: '61'
      obs_history: '5'
      val_ratio: '0.0'
      video_size:
        - '480'
        - '640'
  shuffle: null
  timeout: '0'
  worker_init_fn: null
# Validation data loader.
# Scalar settings are quoted, so they parse as strings ('1', 'False', ...).
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the dump; `shuffle`, `timeout` and `worker_init_fn` are placed on the loader,
# not on the sampler — confirm.
dataloader_val:
  _target_: <class 'torch.utils.data.dataloader.DataLoader'>
  batch_sampler: null
  batch_size: '1'
  collate_fn: null
  # Validation split (is_val is 'True'); val_ratio is '0.0'.
  dataset:
    _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
    data_fps: '30.0'
    dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
    exclude_with_substring: null
    include_only_with_substrings: null
    is_multi_img: 'False'
    is_val: 'True'
    num_frames: '61'
    obs_history: '5'
    val_ratio: '0.0'
    video_size:
      - '480'
      - '640'
  drop_last: 'False'
  generator: null
  in_order: 'False'
  multiprocessing_context: null
  num_workers: '0'
  persistent_workers: 'False'
  pin_memory: 'False'
  pin_memory_device: ''
  prefetch_factor: null
  sampler:
    # Function repr containing a process-specific memory address; it is not an
    # importable dotted path.
    _target_: <function get_sampler at 0x79dcbfd48310>
    # Same settings as the loader-level `dataset` entry.
    dataset:
      _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
      data_fps: '30.0'
      dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
      exclude_with_substring: null
      include_only_with_substrings: null
      is_multi_img: 'False'
      is_val: 'True'
      num_frames: '61'
      obs_history: '5'
      val_ratio: '0.0'
      video_size:
        - '480'
        - '640'
  shuffle: null
  timeout: '0'
  worker_init_fn: null
# Defaults list: `_self_` first, then one single-key mapping per config group.
# Only `optimizer`, `scheduler` and `callbacks` select a non-null option.
defaults:
  - _self_
  - data_config: null
  - video_dataset_train: null
  - video_dataset_val: null
  - dataloader_train: null
  - dataloader_val: null
  - world2action_pipe: null
  - optimizer: fusedadamw
  - scheduler: constant
  - model: null
  # `callbacks` takes a list of options; here it has one entry.
  - callbacks:
      - basic
  - net: null
  - ema: null
  - checkpoint: null
  - ckpt_type: null
  - experiment: null
# Job identification: project, group and run name.
# NOTE(review): the name embeds `lr1.778e-04` and `bsz32`, while this file sets
# optimizer.lr to '4.445e-05', dataloader_train.batch_size to '2' and
# trainer.grad_accum_iter to '4'. Presumably the name reflects values scaled by
# the device count — confirm.
job:
  group: video2world
  name: v2w_push_lora_rank32_lr1.778e-04_bsz32
  project: posttraining
# Model definition: a Predict2Video2WorldModel configured for LoRA training
# (train_architecture: lora, lora_rank 32, lora_alpha 32).
# Values directly under `config` and `pipe_config` are native YAML types
# (true, 0.05, 16); values inside `_target_` nodes are quoted strings
# ('256', 'True').
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the dump (the source had lost its indentation); confirm against the original
# file.
model:
  _recursive_: 'False'
  _target_: <class 'cosmos_predict2.models.video2world_model.Predict2Video2WorldModel'>
  config:
    adjust_video_noise: true
    debug_without_randomness: false
    fsdp_shard_size: 0
    high_sigma_ratio: 0.05
    init_lora_weights: true
    input_image_key: images
    input_video_key: video
    lora_alpha: 32
    lora_rank: 32
    # Comma-separated module names stored as a single string.
    lora_target_modules: q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2
    loss_reduce: mean
    loss_scale: 100.0
    model_manager_config:
      _target_: cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig
      # Redundant `//` separator collapsed to `/`, matching the other paths in
      # this section.
      dit_path: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone/v2w_pretrained_cosmos.pt
      text_encoder_path: ''
    pipe_config:
      adjust_video_noise: true
      # Conditioner inputs; every entry has dropout_rate '0.0'.
      conditioner:
        _target_: <class 'cosmos_predict2.conditioner.VideoConditioner'>
        fps:
          _target_: <class 'cosmos_predict2.conditioner.ReMapkey'>
          dropout_rate: '0.0'
          dtype: null
          input_key: fps
          output_key: fps
        padding_mask:
          _target_: <class 'cosmos_predict2.conditioner.ReMapkey'>
          dropout_rate: '0.0'
          dtype: null
          input_key: padding_mask
          output_key: padding_mask
        text:
          _target_: <class 'cosmos_predict2.conditioner.TextAttr'>
          dropout_rate: '0.0'
          input_key:
            - obs/language_embedding
        use_video_condition:
          _target_: <class 'cosmos_predict2.conditioner.BooleanFlag'>
          dropout_rate: '0.0'
          input_key: fps
          output_key: use_video_condition
      conditioning_strategy: frame_replace
      # EMA is disabled (enabled: 'False').
      ema:
        _target_: cosmos_predict2.configs.defaults.ema.EMAConfig
        enabled: 'False'
        iteration_shift: '0'
        rate: '0.1'
      # Guardrail is disabled (enabled: false).
      guardrail_config:
        checkpoint_dir: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints
        enabled: false
        offload_model_to_cpu: true
      input_image_key: images
      input_video_key: video
      max_num_conditional_frames: 2
      min_num_conditional_frames: 1
      # Network hyper-parameters; all values are quoted strings.
      net:
        _target_: <class 'cosmos_predict2.models.video2world_dit.MinimalV1LVGDiT'>
        adaln_lora_dim: '256'
        atten_backend: minimal_a2a
        concat_padding_mask: 'True'
        extra_per_block_abs_pos_emb: 'False'
        in_channels: '16'
        max_frames: '128'
        max_img_h: '240'
        max_img_w: '240'
        model_channels: '2048'
        num_blocks: '28'
        num_heads: '16'
        out_channels: '16'
        patch_spatial: '2'
        patch_temporal: '1'
        pos_emb_cls: rope3d
        pos_emb_interpolation: crop
        pos_emb_learnable: 'True'
        rope_enable_fps_modulation: 'False'
        rope_h_extrapolation_ratio: '3.0'
        rope_t_extrapolation_ratio: '1.0'
        rope_w_extrapolation_ratio: '3.0'
        sac_config:
          _target_: cosmos_predict2.models.text2image_dit.SACConfig
          every_n_blocks: '1'
          mode: predict2_2b_720
        use_adaln_lora: 'True'
      precision: bfloat16
      rectified_flow_loss_weight_uniform: true
      rectified_flow_t_scaling_factor: 1.0
      resize_online: false
      resolution: '480'
      sigma_conditional: 0.0001
      sigma_data: 1.0
      state_ch: 16
      state_t: 16
      text_encoder:
        # PyYAML-specific tag: loading it constructs a Python object, and safe
        # loaders (yaml.safe_load) reject it. Left unchanged; only load this
        # file from a trusted source.
        cls: !!python/object/apply:imaginaire.constants.TextEncoderClass
          - t5
        t5:
          ckpt_path: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b
          embed_dim: 1024
          num_tokens: 512
      timestamps:
        is_forward: false
        nfe: 35
        order: 7.0
        t_max: 80.0
        t_min: 0.002
      tokenizer:
        _target_: <class 'cosmos_predict2.tokenizers.tokenizer.TokenizerInterface'>
        chunk_duration: '81'
        load_mean_std: 'False'
        name: tokenizer
        temporal_window: '16'
        # Redundant `//` separator collapsed to `/`, matching the other paths
        # in this section.
        vae_pth: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone/tokenizer/tokenizer.pth
    precision: bfloat16
    train_architecture: lora
# Model-parallel settings: a single flat mapping.
# Every configured *_parallel_size is 1 (virtual_pipeline_model_parallel_size
# is null). Dtype values such as torch.float32 are plain strings in YAML.
model_parallel:
  _cpu_offloading_context: null
  async_tensor_model_parallel_allreduce: false
  autocast_dtype: torch.float32
  barrier_with_L1_time: true
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: false
  context_parallel_size: 1
  cpu_offloading: false
  cpu_offloading_activations: false
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: false
  cross_entropy_fusion_impl: native
  cross_entropy_loss_fusion: false
  deallocate_pipeline_outputs: false
  defer_embedding_wgrad_compute: false
  deterministic_mode: false
  enable_autocast: false
  expert_model_parallel_size: 1
  expert_tensor_parallel_size: 1
  finalize_model_grads_func: null
  fp16: false
  grad_scale_func: null
  grad_sync_func: null
  gradient_accumulation_fusion: false
  hierarchical_context_parallel_sizes: null
  microbatch_group_size_per_vp_stage: 1
  moe_extended_tp: false
  no_sync_func: null
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: false
  overlap_p2p_comm_warmup_flush: false
  param_sync_func: null
  params_dtype: torch.float32
  perform_initialization: true
  pipeline_dtype: null
  pipeline_model_parallel_comm_backend: null
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: null
  sequence_parallel: false
  tensor_model_parallel_size: 1
  timers: null
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bootstrap_backend: nccl
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  use_cpu_initialization: false
  use_ring_exchange_p2p: false
  use_te_rng_tracker: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  wgrad_deferral_limit: 0
# Optimizer settings. Numeric and boolean values are stored as quoted strings.
optimizer:
  # Function repr containing a process-specific memory address; it is not an
  # importable dotted path.
  _target_: <function get_base_optimizer at 0x79dcb0bc2b90>
  betas:
    - '0.9'
    - '0.99'
  capturable: 'True'
  # Quoted so every parser reads a string, like the sibling values. A bare
  # 1e-08 resolves to a string under YAML 1.1 (no decimal point) but to a float
  # under the YAML 1.2 core schema.
  eps: '1e-08'
  lr: '4.445e-05'
  master_weights: 'True'
  model: null
  optim_type: fusedadam
  weight_decay: '0.1'
# Learning-rate scheduler: only a target class is given, with no parameters.
scheduler:
  _target_: <class 'cosmos_predict2.configs.defaults.scheduler.ConstantScheduler'>
# Trainer settings. Scalars are quoted, so they parse as strings.
# NOTE(review): nesting was reconstructed from the alphabetical key order of
# the dump (the source had lost its indentation); confirm against the original
# file.
trainer:
  # Callbacks keyed by name; each entry names its target class.
  callbacks:
    device_monitor:
      _target_: <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>
      every_n: '1000'
      log_memory_detail: 'True'
      step_size: '1'
    ema:
      _target_: <class 'imaginaire.utils.callback.EMAModelCallback'>
      config: null
      trainer: null
    grad_clip:
      _target_: <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>
      clip_norm: '10.0'
      force_finite: 'True'
      log_wandb: 'False'
    iter_speed:
      _target_: <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>
      every_n: '1000'
      hit_thres: '5'
    low_prec:
      _target_: <class 'imaginaire.utils.callback.LowPrecisionCallback'>
      config: null
      trainer: null
      update_iter: '1'
    manual_gc:
      _target_: <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>
      every_n: '5'
      warm_up: '5'
    progress_bar:
      _target_: <class 'imaginaire.utils.callback.ProgressBarCallback'>
      config: null
      trainer: null
    video_eval:
      _target_: <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>
      fuse_lora: 'True'
  cudnn:
    benchmark: 'True'
    deterministic: 'False'
  ddp:
    broadcast_buffers: 'True'
    find_unused_parameters: 'False'
    static_graph: 'True'
  distributed_parallelism: ddp
  grad_accum_iter: '4'
  grad_scaler_args:
    enabled: 'False'
  logging_iter: '1000'
  max_iter: '500'
  max_val_iter: null
  memory_format: torch.preserve_format
  # Profiling is off (enable_profiling and enable_memory_snapshot are 'False').
  profiling:
    enable_memory_snapshot: 'False'
    enable_profiling: 'False'
    first_n_rank: '4'
    profile_freq: '1'
    profile_memory: 'True'
    record_shape: 'True'
    with_modules: 'True'
    with_stack: 'True'
  # Validation is off (run_validation is 'False').
  run_validation: 'False'
  seed: '0'
  timeout_period: '999999999'
  type: <class 'imaginaire.trainer.ImaginaireTrainer'>
  validation_iter: '999999999'
# Training video dataset (is_val is 'False'). Scalars are quoted strings.
# The same settings appear under dataloader_train.dataset.
video_dataset_train:
  _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
  data_fps: '30.0'
  dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
  exclude_with_substring: null
  include_only_with_substrings: null
  is_multi_img: 'False'
  is_val: 'False'
  num_frames: '61'
  obs_history: '5'
  val_ratio: '0.0'
  video_size:
    - '480'
    - '640'
# Validation video dataset (is_val is 'True'). Scalars are quoted strings.
# The same settings appear under dataloader_val.dataset.
# NOTE(review): val_ratio is '0.0' although is_val is 'True' — confirm that an
# empty validation split is intended (trainer.run_validation is 'False').
video_dataset_val:
  _target_: <class 'cosmos_predict2.data.dataset_video.Dataset'>
  data_fps: '30.0'
  dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic
  exclude_with_substring: null
  include_only_with_substrings: null
  is_multi_img: 'False'
  is_val: 'True'
  num_frames: '61'
  obs_history: '5'
  val_ratio: '0.0'
  video_size:
    - '480'
    - '640'
# No world2action_pipe is set; the key is explicitly null.
world2action_pipe: null