| [05-07 17:16:59|INFO|imaginaire/trainer.py:116:__init__] Config: |
| [36m* [0m[32mmodel[0m: [33m{'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': <class 'cosmos_predict2.conditioner.TextAttr'>}, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.BooleanFlag'>}, '_target_': <class 'cosmos_predict2.conditioner.VideoConditioner'>}, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': <class 'cosmos_predict2.models.video2world_dit.MinimalV1LVGDiT'>}, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': <class 'cosmos_predict2.tokenizers.tokenizer.TokenizerInterface'>}, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': <TextEncoderClass.T5: 't5'>, 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': <class 'cosmos_predict2.models.video2world_model.Predict2Video2WorldModel'>}[0m |
| [36m* [0m[32mworld2action_pipe[0m: [33mNone[0m |
| [36m* [0m[32moptimizer[0m: [33m{'optim_type': 'fusedadam', 'model': None, 'lr': 0.0001778279410038923, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': <function get_base_optimizer at 0x7f0a19f36d40>}[0m |
| [36m* [0m[32mscheduler[0m: [33m{'_target_': <class 'cosmos_predict2.configs.defaults.scheduler.ConstantScheduler'>}[0m |
| [36m* [0m[32mdata_config[0m: [33mNone[0m |
| [36m* [0m[32mvideo_dataset_train[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mvideo_dataset_val[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mdataloader_train[0m: [33m{'batch_size': 4, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x7f0a293a84c0>}, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mdataloader_val[0m: [33m{'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x7f0a293a84c0>}, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mjob[0m: |
| [36m* [0m[32mproject[0m: [33mposttraining[0m |
| [36m* [0m[32mgroup[0m: [33mvideo2world[0m |
| [36m* [0m[32mname[0m: [33mv2w_push_lora_rank32_lr1.778e-04_bsz32[0m |
| [36m* [0m[32mtrainer[0m: |
| [36m* [0m[32mtype[0m: [33m<class 'imaginaire.trainer.ImaginaireTrainer'>[0m |
| [36m* [0m[32mcallbacks[0m: [33m{'ema': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>}, 'progress_bar': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>}, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>}, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>}, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>}, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>}, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>}, 'video_eval': {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>}}[0m |
| [36m* [0m[32mdistributed_parallelism[0m: [33mddp[0m |
| [36m* [0m[32mddp[0m: |
| [36m* [0m[32mfind_unused_parameters[0m: [33mFalse[0m |
| [36m* [0m[32mstatic_graph[0m: [33mTrue[0m |
| [36m* [0m[32mbroadcast_buffers[0m: [33mTrue[0m |
| [36m* [0m[32mcudnn[0m: |
| [36m* [0m[32mdeterministic[0m: [33mFalse[0m |
| [36m* [0m[32mbenchmark[0m: [33mTrue[0m |
| [36m* [0m[32mseed[0m: [33m0[0m |
| [36m* [0m[32mgrad_scaler_args[0m: [33m{'enabled': False}[0m |
| [36m* [0m[32mmax_iter[0m: [33m500[0m |
| [36m* [0m[32mmax_val_iter[0m: [33mNone[0m |
| [36m* [0m[32mlogging_iter[0m: [33m1000[0m |
| [36m* [0m[32mrun_validation[0m: [33mFalse[0m |
| [36m* [0m[32mvalidation_iter[0m: [33m999999999[0m |
| [36m* [0m[32mtimeout_period[0m: [33m999999999[0m |
| [36m* [0m[32mmemory_format[0m: [33mtorch.preserve_format[0m |
| [36m* [0m[32mgrad_accum_iter[0m: [33m8[0m |
| [36m* [0m[32mprofiling[0m: |
| [36m* [0m[32menable_profiling[0m: [33mFalse[0m |
| [36m* [0m[32menable_memory_snapshot[0m: [33mFalse[0m |
| [36m* [0m[32mprofile_freq[0m: [33m1[0m |
| [36m* [0m[32mfirst_n_rank[0m: [33m4[0m |
| [36m* [0m[32mrecord_shape[0m: [33mTrue[0m |
| [36m* [0m[32mprofile_memory[0m: [33mTrue[0m |
| [36m* [0m[32mwith_stack[0m: [33mTrue[0m |
| [36m* [0m[32mwith_modules[0m: [33mTrue[0m |
| [36m* [0m[32mmodel_parallel[0m: [33mModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True)[0m |
| [36m* [0m[32mcheckpoint[0m: |
| [36m* [0m[32mtype[0m: [33m{'callbacks': None, '_target_': <class 'cosmos_predict2.checkpointer.Checkpointer'>}[0m |
| [36m* [0m[32mdcp_async_mode_enabled[0m: [33mFalse[0m |
| [36m* [0m[32msave_iter[0m: [33m100[0m |
| [36m* [0m[32mload_path[0m: [33m[0m |
| [36m* [0m[32mload_training_state[0m: [33mFalse[0m |
| [36m* [0m[32monly_load_scheduler_state[0m: [33mFalse[0m |
| [36m* [0m[32mstrict_resume[0m: [33mTrue[0m |
| [36m* [0m[32mjit[0m: |
| [36m* [0m[32menabled[0m: [33mFalse[0m |
| [36m* [0m[32minput_shape[0m: [33mNone[0m |
| [36m* [0m[32mdevice[0m: [33mcuda[0m |
| [36m* [0m[32mdtype[0m: [33mbfloat16[0m |
| [36m* [0m[32mstrict[0m: [33mTrue[0m |
| [36m* [0m[32mverbose[0m: [33mTrue[0m |
| [36m* [0m[32mkeys_not_to_resume[0m: [33m[][0m |
| [36m* [0m[32mbroadcast_via_filesystem[0m: [33mFalse[0m |
| [36m* [0m[32mload_ema_to_reg[0m: [33mFalse[0m |
| [36m* [0m[32mdcp_allow_mismatched_size[0m: [33mFalse[0m |
| [36m* [0m[32mdefaults[0m: [33m['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}][0m |
| [05-07 17:16:59|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable [32mTORCH_HOME[0m not set! |
| [05-07 17:16:59|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable [32mIMAGINAIRE_OUTPUT_ROOT[0m: [33m/home/ubuntu/checkpoints[0m |
| [05-07 17:16:59|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>} |
| [05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>} |
| [05-07 17:16:59|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 |
| [05-07 17:16:59|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 |
| [05-07 17:17:00|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:17:00|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:17:00|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. |
| [05-07 17:17:00|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:17:04|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 |
| [05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. |
| [05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:17:05|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 |
| [05-07 17:17:05|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True |
| [05-07 17:17:05|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. |
| [05-07 17:17:05|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0013 seconds |
| [05-07 17:17:05|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp |
| [05-07 17:17:06|INFO|imaginaire/trainer.py:186:train] Starting training... |
| [05-07 17:17:06|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor |
| [05-07 17:17:09|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. |
| [05-07 17:18:29|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.0617 | Time: 89.90s |
| [05-07 17:19:00|INFO|imaginaire/trainer.py:116:__init__] Config: |
| [36m* [0m[32mmodel[0m: [33m{'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': <class 'cosmos_predict2.conditioner.TextAttr'>}, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.BooleanFlag'>}, '_target_': <class 'cosmos_predict2.conditioner.VideoConditioner'>}, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': <class 'cosmos_predict2.models.video2world_dit.MinimalV1LVGDiT'>}, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': <class 'cosmos_predict2.tokenizers.tokenizer.TokenizerInterface'>}, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': <TextEncoderClass.T5: 't5'>, 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': <class 'cosmos_predict2.models.video2world_model.Predict2Video2WorldModel'>}[0m |
| [36m* [0m[32mworld2action_pipe[0m: [33mNone[0m |
| [36m* [0m[32moptimizer[0m: [33m{'optim_type': 'fusedadam', 'model': None, 'lr': 0.0001778279410038923, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': <function get_base_optimizer at 0x734ac793ab90>}[0m |
| [36m* [0m[32mscheduler[0m: [33m{'_target_': <class 'cosmos_predict2.configs.defaults.scheduler.ConstantScheduler'>}[0m |
| [36m* [0m[32mdata_config[0m: [33mNone[0m |
| [36m* [0m[32mvideo_dataset_train[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mvideo_dataset_val[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mdataloader_train[0m: [33m{'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x734ad73ac310>}, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mdataloader_val[0m: [33m{'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x734ad73ac310>}, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mjob[0m: |
| [36m* [0m[32mproject[0m: [33mposttraining[0m |
| [36m* [0m[32mgroup[0m: [33mvideo2world[0m |
| [36m* [0m[32mname[0m: [33mv2w_push_lora_rank32_lr1.778e-04_bsz32[0m |
| [36m* [0m[32mtrainer[0m: |
| [36m* [0m[32mtype[0m: [33m<class 'imaginaire.trainer.ImaginaireTrainer'>[0m |
| [36m* [0m[32mcallbacks[0m: [33m{'ema': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>}, 'progress_bar': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>}, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>}, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>}, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>}, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>}, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>}, 'video_eval': {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>}}[0m |
| [36m* [0m[32mdistributed_parallelism[0m: [33mddp[0m |
| [36m* [0m[32mddp[0m: |
| [36m* [0m[32mfind_unused_parameters[0m: [33mFalse[0m |
| [36m* [0m[32mstatic_graph[0m: [33mTrue[0m |
| [36m* [0m[32mbroadcast_buffers[0m: [33mTrue[0m |
| [36m* [0m[32mcudnn[0m: |
| [36m* [0m[32mdeterministic[0m: [33mFalse[0m |
| [36m* [0m[32mbenchmark[0m: [33mTrue[0m |
| [36m* [0m[32mseed[0m: [33m0[0m |
| [36m* [0m[32mgrad_scaler_args[0m: [33m{'enabled': False}[0m |
| [36m* [0m[32mmax_iter[0m: [33m500[0m |
| [36m* [0m[32mmax_val_iter[0m: [33mNone[0m |
| [36m* [0m[32mlogging_iter[0m: [33m1000[0m |
| [36m* [0m[32mrun_validation[0m: [33mFalse[0m |
| [36m* [0m[32mvalidation_iter[0m: [33m999999999[0m |
| [36m* [0m[32mtimeout_period[0m: [33m999999999[0m |
| [36m* [0m[32mmemory_format[0m: [33mtorch.preserve_format[0m |
| [36m* [0m[32mgrad_accum_iter[0m: [33m4[0m |
| [36m* [0m[32mprofiling[0m: |
| [36m* [0m[32menable_profiling[0m: [33mFalse[0m |
| [36m* [0m[32menable_memory_snapshot[0m: [33mFalse[0m |
| [36m* [0m[32mprofile_freq[0m: [33m1[0m |
| [36m* [0m[32mfirst_n_rank[0m: [33m4[0m |
| [36m* [0m[32mrecord_shape[0m: [33mTrue[0m |
| [36m* [0m[32mprofile_memory[0m: [33mTrue[0m |
| [36m* [0m[32mwith_stack[0m: [33mTrue[0m |
| [36m* [0m[32mwith_modules[0m: [33mTrue[0m |
| [36m* [0m[32mmodel_parallel[0m: [33mModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True)[0m |
| [36m* [0m[32mcheckpoint[0m: |
| [36m* [0m[32mtype[0m: [33m{'callbacks': None, '_target_': <class 'cosmos_predict2.checkpointer.Checkpointer'>}[0m |
| [36m* [0m[32mdcp_async_mode_enabled[0m: [33mFalse[0m |
| [36m* [0m[32msave_iter[0m: [33m100[0m |
| [36m* [0m[32mload_path[0m: [33m[0m |
| [36m* [0m[32mload_training_state[0m: [33mFalse[0m |
| [36m* [0m[32monly_load_scheduler_state[0m: [33mFalse[0m |
| [36m* [0m[32mstrict_resume[0m: [33mTrue[0m |
| [36m* [0m[32mjit[0m: |
| [36m* [0m[32menabled[0m: [33mFalse[0m |
| [36m* [0m[32minput_shape[0m: [33mNone[0m |
| [36m* [0m[32mdevice[0m: [33mcuda[0m |
| [36m* [0m[32mdtype[0m: [33mbfloat16[0m |
| [36m* [0m[32mstrict[0m: [33mTrue[0m |
| [36m* [0m[32mverbose[0m: [33mTrue[0m |
| [36m* [0m[32mkeys_not_to_resume[0m: [33m[][0m |
| [36m* [0m[32mbroadcast_via_filesystem[0m: [33mFalse[0m |
| [36m* [0m[32mload_ema_to_reg[0m: [33mFalse[0m |
| [36m* [0m[32mdcp_allow_mismatched_size[0m: [33mFalse[0m |
| [36m* [0m[32mdefaults[0m: [33m['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}][0m |
| [05-07 17:19:00|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable [32mTORCH_HOME[0m not set! |
| [05-07 17:19:00|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable [32mIMAGINAIRE_OUTPUT_ROOT[0m: [33m/home/ubuntu/checkpoints[0m |
| [05-07 17:19:00|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>} |
| [05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>} |
| [05-07 17:19:00|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 |
| [05-07 17:19:00|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 |
| [05-07 17:19:00|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:19:00|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:19:00|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. |
| [05-07 17:19:00|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:19:05|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:19:05|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 |
| [05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. |
| [05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:19:06|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 |
| [05-07 17:19:06|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True |
| [05-07 17:19:06|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. |
| [05-07 17:19:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0014 seconds |
| [05-07 17:19:06|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp |
| [05-07 17:19:06|INFO|imaginaire/trainer.py:186:train] Starting training... |
| [05-07 17:19:06|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor |
| [05-07 17:19:08|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. |
| [05-07 17:19:35|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.5430 | Time: 35.34s |
| [05-07 17:19:52|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 2: Hit counter: 2/5 | Loss: 7.1425 | Time: 17.27s |
| [05-07 17:20:09|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 3: Hit counter: 3/5 | Loss: 4.8634 | Time: 17.24s |
| [05-07 17:20:27|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 4: Hit counter: 4/5 | Loss: 4.9483 | Time: 17.32s |
| [05-07 17:21:34|INFO|imaginaire/trainer.py:116:__init__] Config: |
| [36m* [0m[32mmodel[0m: [33m{'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': <class 'cosmos_predict2.conditioner.TextAttr'>}, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.BooleanFlag'>}, '_target_': <class 'cosmos_predict2.conditioner.VideoConditioner'>}, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': <class 'cosmos_predict2.models.video2world_dit.MinimalV1LVGDiT'>}, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': <class 'cosmos_predict2.tokenizers.tokenizer.TokenizerInterface'>}, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': <TextEncoderClass.T5: 't5'>, 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': <class 'cosmos_predict2.models.video2world_model.Predict2Video2WorldModel'>}[0m |
| [36m* [0m[32mworld2action_pipe[0m: [33mNone[0m |
| [36m* [0m[32moptimizer[0m: [33m{'optim_type': 'fusedadam', 'model': None, 'lr': 4.445e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': <function get_base_optimizer at 0x745843b1ab90>}[0m |
| [36m* [0m[32mscheduler[0m: [33m{'_target_': <class 'cosmos_predict2.configs.defaults.scheduler.ConstantScheduler'>}[0m |
| [36m* [0m[32mdata_config[0m: [33mNone[0m |
| [36m* [0m[32mvideo_dataset_train[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mvideo_dataset_val[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mdataloader_train[0m: [33m{'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x745852f4c310>}, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mdataloader_val[0m: [33m{'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x745852f4c310>}, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mjob[0m: |
| [36m* [0m[32mproject[0m: [33mposttraining[0m |
| [36m* [0m[32mgroup[0m: [33mvideo2world[0m |
| [36m* [0m[32mname[0m: [33mv2w_push_lora_rank32_lr1.778e-04_bsz32[0m |
| [36m* [0m[32mtrainer[0m: |
| [36m* [0m[32mtype[0m: [33m<class 'imaginaire.trainer.ImaginaireTrainer'>[0m |
| [36m* [0m[32mcallbacks[0m: [33m{'ema': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>}, 'progress_bar': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>}, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>}, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>}, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>}, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>}, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>}, 'video_eval': {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>}}[0m |
| [36m* [0m[32mdistributed_parallelism[0m: [33mddp[0m |
| [36m* [0m[32mddp[0m: |
| [36m* [0m[32mfind_unused_parameters[0m: [33mFalse[0m |
| [36m* [0m[32mstatic_graph[0m: [33mTrue[0m |
| [36m* [0m[32mbroadcast_buffers[0m: [33mTrue[0m |
| [36m* [0m[32mcudnn[0m: |
| [36m* [0m[32mdeterministic[0m: [33mFalse[0m |
| [36m* [0m[32mbenchmark[0m: [33mTrue[0m |
| [36m* [0m[32mseed[0m: [33m0[0m |
| [36m* [0m[32mgrad_scaler_args[0m: [33m{'enabled': False}[0m |
| [36m* [0m[32mmax_iter[0m: [33m500[0m |
| [36m* [0m[32mmax_val_iter[0m: [33mNone[0m |
| [36m* [0m[32mlogging_iter[0m: [33m1000[0m |
| [36m* [0m[32mrun_validation[0m: [33mFalse[0m |
| [36m* [0m[32mvalidation_iter[0m: [33m999999999[0m |
| [36m* [0m[32mtimeout_period[0m: [33m999999999[0m |
| [36m* [0m[32mmemory_format[0m: [33mtorch.preserve_format[0m |
| [36m* [0m[32mgrad_accum_iter[0m: [33m4[0m |
| [36m* [0m[32mprofiling[0m: |
| [36m* [0m[32menable_profiling[0m: [33mFalse[0m |
| [36m* [0m[32menable_memory_snapshot[0m: [33mFalse[0m |
| [36m* [0m[32mprofile_freq[0m: [33m1[0m |
| [36m* [0m[32mfirst_n_rank[0m: [33m4[0m |
| [36m* [0m[32mrecord_shape[0m: [33mTrue[0m |
| [36m* [0m[32mprofile_memory[0m: [33mTrue[0m |
| [36m* [0m[32mwith_stack[0m: [33mTrue[0m |
| [36m* [0m[32mwith_modules[0m: [33mTrue[0m |
| [36m* [0m[32mmodel_parallel[0m: [33mModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True)[0m |
| [36m* [0m[32mcheckpoint[0m: |
| [36m* [0m[32mtype[0m: [33m{'callbacks': None, '_target_': <class 'cosmos_predict2.checkpointer.Checkpointer'>}[0m |
| [36m* [0m[32mdcp_async_mode_enabled[0m: [33mFalse[0m |
| [36m* [0m[32msave_iter[0m: [33m100[0m |
| [36m* [0m[32mload_path[0m: [33m[0m |
| [36m* [0m[32mload_training_state[0m: [33mFalse[0m |
| [36m* [0m[32monly_load_scheduler_state[0m: [33mFalse[0m |
| [36m* [0m[32mstrict_resume[0m: [33mTrue[0m |
| [36m* [0m[32mjit[0m: |
| [36m* [0m[32menabled[0m: [33mFalse[0m |
| [36m* [0m[32minput_shape[0m: [33mNone[0m |
| [36m* [0m[32mdevice[0m: [33mcuda[0m |
| [36m* [0m[32mdtype[0m: [33mbfloat16[0m |
| [36m* [0m[32mstrict[0m: [33mTrue[0m |
| [36m* [0m[32mverbose[0m: [33mTrue[0m |
| [36m* [0m[32mkeys_not_to_resume[0m: [33m[][0m |
| [36m* [0m[32mbroadcast_via_filesystem[0m: [33mFalse[0m |
| [36m* [0m[32mload_ema_to_reg[0m: [33mFalse[0m |
| [36m* [0m[32mdcp_allow_mismatched_size[0m: [33mFalse[0m |
| [36m* [0m[32mdefaults[0m: [33m['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}][0m |
| [05-07 17:21:34|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable [32mTORCH_HOME[0m not set! |
| [05-07 17:21:34|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable [32mIMAGINAIRE_OUTPUT_ROOT[0m: [33m/home/ubuntu/checkpoints[0m |
| [05-07 17:21:34|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>} |
| [05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>} |
| [05-07 17:21:34|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 |
| [05-07 17:21:34|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 |
| [05-07 17:21:34|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:21:34|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 17:21:34|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. |
| [05-07 17:21:34|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:21:39|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 |
| [05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. |
| [05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 17:21:40|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 |
| [05-07 17:21:40|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True |
| [05-07 17:21:40|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. |
| [05-07 17:21:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0014 seconds |
| [05-07 17:21:40|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp |
| [05-07 17:21:40|INFO|imaginaire/trainer.py:186:train] Starting training... |
| [05-07 17:21:40|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor |
| [05-07 17:21:42|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. |
| [05-07 17:22:09|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.1693 | Time: 35.39s |
| [05-07 17:22:26|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 2: Hit counter: 2/5 | Loss: 6.1576 | Time: 17.27s |
| [05-07 17:22:44|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 3: Hit counter: 3/5 | Loss: 4.6536 | Time: 17.24s |
| [05-07 17:23:01|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 4: Hit counter: 4/5 | Loss: 3.3145 | Time: 17.34s |
| [05-07 17:23:18|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 5: Hit counter: 5/5 | Loss: 4.0082 | Time: 17.26s |
| [05-07 17:29:03|CRITICAL|imaginaire/callbacks/manual_gc.py:48:every_n_impl] Garbage collection disabled |
| [05-07 17:29:17|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000025.pt |
| [05-07 17:29:26|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 11.6441 seconds |
| [05-07 17:29:27|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000025.pt |
| [05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.5409 seconds |
| [05-07 17:29:29|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000025.pt |
| [05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds |
| [05-07 17:29:29|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000025.pt |
| [05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 17:36:54|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000051.pt |
| [05-07 17:37:02|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 11.2918 seconds |
| [05-07 17:37:04|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000051.pt |
| [05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.2635 seconds |
| [05-07 17:37:06|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000051.pt |
| [05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0018 seconds |
| [05-07 17:37:06|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000051.pt |
| [05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0022 seconds |
| [05-07 17:44:30|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000076.pt |
| [05-07 17:44:37|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.5314 seconds |
| [05-07 17:44:38|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000076.pt |
| [05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.4604 seconds |
| [05-07 17:44:40|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000076.pt |
| [05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds |
| [05-07 17:44:40|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000076.pt |
| [05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds |
| [05-07 17:51:31|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt |
| [05-07 17:51:38|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 10.0176 seconds |
| [05-07 17:51:39|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt |
| [05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.3693 seconds |
| [05-07 17:51:42|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt |
| [05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds |
| [05-07 17:51:42|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt |
| [05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 18:13:39|INFO|imaginaire/trainer.py:116:__init__] Config: |
| [36m* [0m[32mmodel[0m: [33m{'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': <class 'cosmos_predict2.conditioner.ReMapkey'>}, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': <class 'cosmos_predict2.conditioner.TextAttr'>}, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': <class 'cosmos_predict2.conditioner.BooleanFlag'>}, '_target_': <class 'cosmos_predict2.conditioner.VideoConditioner'>}, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': <class 'cosmos_predict2.models.video2world_dit.MinimalV1LVGDiT'>}, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': <class 'cosmos_predict2.tokenizers.tokenizer.TokenizerInterface'>}, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': <TextEncoderClass.T5: 't5'>, 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': <class 'cosmos_predict2.models.video2world_model.Predict2Video2WorldModel'>}[0m |
| [36m* [0m[32mworld2action_pipe[0m: [33mNone[0m |
| [36m* [0m[32moptimizer[0m: [33m{'optim_type': 'fusedadam', 'model': None, 'lr': 4.445e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': <function get_base_optimizer at 0x79dcb0bc2b90>}[0m |
| [36m* [0m[32mscheduler[0m: [33m{'_target_': <class 'cosmos_predict2.configs.defaults.scheduler.ConstantScheduler'>}[0m |
| [36m* [0m[32mdata_config[0m: [33mNone[0m |
| [36m* [0m[32mvideo_dataset_train[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mvideo_dataset_val[0m: [33m{'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}[0m |
| [36m* [0m[32mdataloader_train[0m: [33m{'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x79dcbfd48310>}, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mdataloader_val[0m: [33m{'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <function get_sampler at 0x79dcbfd48310>}, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': <class 'cosmos_predict2.data.dataset_video.Dataset'>}, '_target_': <class 'torch.utils.data.dataloader.DataLoader'>}[0m |
| [36m* [0m[32mjob[0m: |
| [36m* [0m[32mproject[0m: [33mposttraining[0m |
| [36m* [0m[32mgroup[0m: [33mvideo2world[0m |
| [36m* [0m[32mname[0m: [33mv2w_push_lora_rank32_lr1.778e-04_bsz32[0m |
| [36m* [0m[32mtrainer[0m: |
| [36m* [0m[32mtype[0m: [33m<class 'imaginaire.trainer.ImaginaireTrainer'>[0m |
| [36m* [0m[32mcallbacks[0m: [33m{'ema': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>}, 'progress_bar': {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>}, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>}, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>}, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>}, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>}, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>}, 'video_eval': {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>}}[0m |
| [36m* [0m[32mdistributed_parallelism[0m: [33mddp[0m |
| [36m* [0m[32mddp[0m: |
| [36m* [0m[32mfind_unused_parameters[0m: [33mFalse[0m |
| [36m* [0m[32mstatic_graph[0m: [33mTrue[0m |
| [36m* [0m[32mbroadcast_buffers[0m: [33mTrue[0m |
| [36m* [0m[32mcudnn[0m: |
| [36m* [0m[32mdeterministic[0m: [33mFalse[0m |
| [36m* [0m[32mbenchmark[0m: [33mTrue[0m |
| [36m* [0m[32mseed[0m: [33m0[0m |
| [36m* [0m[32mgrad_scaler_args[0m: [33m{'enabled': False}[0m |
| [36m* [0m[32mmax_iter[0m: [33m500[0m |
| [36m* [0m[32mmax_val_iter[0m: [33mNone[0m |
| [36m* [0m[32mlogging_iter[0m: [33m1000[0m |
| [36m* [0m[32mrun_validation[0m: [33mFalse[0m |
| [36m* [0m[32mvalidation_iter[0m: [33m999999999[0m |
| [36m* [0m[32mtimeout_period[0m: [33m999999999[0m |
| [36m* [0m[32mmemory_format[0m: [33mtorch.preserve_format[0m |
| [36m* [0m[32mgrad_accum_iter[0m: [33m4[0m |
| [36m* [0m[32mprofiling[0m: |
| [36m* [0m[32menable_profiling[0m: [33mFalse[0m |
| [36m* [0m[32menable_memory_snapshot[0m: [33mFalse[0m |
| [36m* [0m[32mprofile_freq[0m: [33m1[0m |
| [36m* [0m[32mfirst_n_rank[0m: [33m4[0m |
| [36m* [0m[32mrecord_shape[0m: [33mTrue[0m |
| [36m* [0m[32mprofile_memory[0m: [33mTrue[0m |
| [36m* [0m[32mwith_stack[0m: [33mTrue[0m |
| [36m* [0m[32mwith_modules[0m: [33mTrue[0m |
| [36m* [0m[32mmodel_parallel[0m: [33mModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True)[0m |
| [36m* [0m[32mcheckpoint[0m: |
| [36m* [0m[32mtype[0m: [33m{'callbacks': None, '_target_': <class 'cosmos_predict2.checkpointer.Checkpointer'>}[0m |
| [36m* [0m[32mdcp_async_mode_enabled[0m: [33mFalse[0m |
| [36m* [0m[32msave_iter[0m: [33m100[0m |
| [36m* [0m[32mload_path[0m: [33m[0m |
| [36m* [0m[32mload_training_state[0m: [33mFalse[0m |
| [36m* [0m[32monly_load_scheduler_state[0m: [33mFalse[0m |
| [36m* [0m[32mstrict_resume[0m: [33mTrue[0m |
| [36m* [0m[32mjit[0m: |
| [36m* [0m[32menabled[0m: [33mFalse[0m |
| [36m* [0m[32minput_shape[0m: [33mNone[0m |
| [36m* [0m[32mdevice[0m: [33mcuda[0m |
| [36m* [0m[32mdtype[0m: [33mbfloat16[0m |
| [36m* [0m[32mstrict[0m: [33mTrue[0m |
| [36m* [0m[32mverbose[0m: [33mTrue[0m |
| [36m* [0m[32mkeys_not_to_resume[0m: [33m[][0m |
| [36m* [0m[32mbroadcast_via_filesystem[0m: [33mFalse[0m |
| [36m* [0m[32mload_ema_to_reg[0m: [33mFalse[0m |
| [36m* [0m[32mdcp_allow_mismatched_size[0m: [33mFalse[0m |
| [36m* [0m[32mdefaults[0m: [33m['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}][0m |
| [05-07 18:13:39|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable [32mTORCH_HOME[0m not set! |
| [05-07 18:13:39|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable [32mIMAGINAIRE_OUTPUT_ROOT[0m: [33m/home/ubuntu/checkpoints[0m |
| [05-07 18:13:39|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.EMAModelCallback'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': <class 'imaginaire.utils.callback.ProgressBarCallback'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': <class 'imaginaire.utils.callback.LowPrecisionCallback'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': <class 'cosmos_predict2.callbacks.iter_speed.IterSpeed'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': <class 'cosmos_predict2.callbacks.device_monitor.DeviceMonitor'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': <class 'imaginaire.callbacks.manual_gc.ManualGarbageCollection'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': <class 'cosmos_predict2.callbacks.grad_clip.GradClip'>} |
| [05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': <class 'cosmos_predict2.callbacks.video_eval.VideoEvalCallback'>} |
| [05-07 18:13:39|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 |
| [05-07 18:13:39|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 |
| [05-07 18:13:40|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 18:13:40|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth |
| [05-07 18:13:40|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. |
| [05-07 18:13:40|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 18:13:45|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt |
| [05-07 18:13:45|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 |
| [05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. |
| [05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total |
| [05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. |
| [05-07 18:13:46|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 |
| [05-07 18:13:46|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True |
| [05-07 18:13:46|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt |
| [05-07 18:13:48|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt |
| [05-07 18:13:48|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt |
| [05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt |
| [05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt |
| [05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:212:load] - Loading the model... |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:260:load] - Loading the scheduler... |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:267:load] - Loading the optimizer... |
| [05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:280:load] - Loading the gradient scaler... |
| [05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:282:load] Done with loading the checkpoint (iteration 100). |
| [05-07 18:13:49|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 3.4351 seconds |
| [05-07 18:13:49|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp |
| [05-07 18:13:50|INFO|imaginaire/trainer.py:186:train] Starting training... |
| [05-07 18:13:50|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor |
| [05-07 18:13:51|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. |
| [05-07 18:14:13|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 101: Hit counter: 1/5 | Loss: 4.8886 | Time: 33.45s |
| [05-07 18:14:30|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 102: Hit counter: 2/5 | Loss: 5.8927 | Time: 17.23s |
| [05-07 18:14:49|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 103: Hit counter: 3/5 | Loss: 4.2789 | Time: 19.49s |
| [05-07 18:15:07|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 104: Hit counter: 4/5 | Loss: 3.6165 | Time: 17.32s |
| [05-07 18:15:24|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 105: Hit counter: 5/5 | Loss: 4.6682 | Time: 17.27s |
| [05-07 18:21:09|CRITICAL|imaginaire/callbacks/manual_gc.py:48:every_n_impl] Garbage collection disabled |
| [05-07 18:42:54|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000200.pt |
| [05-07 18:43:03|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 12.1174 seconds |
| [05-07 18:43:04|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000200.pt |
| [05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.6517 seconds |
| [05-07 18:43:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000200.pt |
| [05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 18:43:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000200.pt |
| [05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 19:12:02|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000300.pt |
| [05-07 19:12:09|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.9800 seconds |
| [05-07 19:12:10|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000300.pt |
| [05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.4690 seconds |
| [05-07 19:12:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000300.pt |
| [05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds |
| [05-07 19:12:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000300.pt |
| [05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 19:41:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000400.pt |
| [05-07 19:41:14|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 10.2649 seconds |
| [05-07 19:41:16|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000400.pt |
| [05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.5255 seconds |
| [05-07 19:41:18|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000400.pt |
| [05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 19:41:18|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000400.pt |
| [05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0023 seconds |
| [05-07 20:10:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000500.pt |
| [05-07 20:10:19|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.8569 seconds |
| [05-07 20:10:20|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000500.pt |
| [05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.6050 seconds |
| [05-07 20:10:22|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000500.pt |
| [05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds |
| [05-07 20:10:22|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000500.pt |
| [05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds |
| [05-07 20:10:22|SUCCESS|imaginaire/trainer.py:288:train] Done with training. |
|
|