Upload folder using huggingface_hub
Browse files
action_decoder/chain/w2a_so101_task3_1_103ep_lora32_iter2500_75m_2gpu/stdout.log
CHANGED
|
@@ -82,3 +82,22 @@
|
|
| 82 |
[05-20 18:40:12|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/nvidia/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth
|
| 83 |
[05-20 18:40:12|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0, 1].
|
| 84 |
[05-20 18:40:12|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/nvidia/pdt-mimic/mimic-video/model/checkpoints/video_backbone/v2w_push_combined_lora_rank32_lr1.778e-04_bsz32_iter_000002500_fused.pt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
[05-20 18:40:12|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/nvidia/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth
|
| 83 |
[05-20 18:40:12|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0, 1].
|
| 84 |
[05-20 18:40:12|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/nvidia/pdt-mimic/mimic-video/model/checkpoints/video_backbone/v2w_push_combined_lora_rank32_lr1.778e-04_bsz32_iter_000002500_fused.pt
|
| 85 |
+
[05-20 18:40:19|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/nvidia/pdt-mimic/mimic-video/model/checkpoints/video_backbone/v2w_push_combined_lora_rank32_lr1.778e-04_bsz32_iter_000002500_fused.pt
|
| 86 |
+
[05-20 18:40:19|INFO|cosmos_predict2/models/world2action_model.py:148:__init__] Total parameters: 2.04B, Frozen parameters: 1,956,413,440, Trainable parameters: 80,859,000
|
| 87 |
+
[05-20 18:40:19|INFO|cosmos_predict2/models/world2action_model.py:167:__init__] FSDP (Fully Sharded Data Parallel) is disabled.
|
| 88 |
+
[05-20 18:40:20|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 80,859,000
|
| 89 |
+
[05-20 18:40:20|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True
|
| 90 |
+
[05-20 18:40:20|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch.
|
| 91 |
+
[05-20 18:40:20|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0003 seconds
|
| 92 |
+
[05-20 18:40:20|CRITICAL|imaginaire/trainer.py:179:train] Distributed parallelism mode: ddp
|
| 93 |
+
[05-20 18:40:20|INFO|imaginaire/trainer.py:187:train] Starting training...
|
| 94 |
+
[05-20 18:40:20|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: checkpoints/vam/so101/w2a_so101_task3_1_103ep_lora32_iter2500_75m_2gpu/DeviceMonitor
|
| 95 |
+
[05-20 18:40:22|INFO|imaginaire/trainer.py:388:validate] Validating at iteration 0...
|
| 96 |
+
[05-20 18:41:14|INFO|imaginaire/trainer.py:193:train] Initial validation done.
|
| 97 |
+
[05-20 18:41:18|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced.
|
| 98 |
+
[05-20 18:41:49|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 39.6939 | Time: 99.82s
|
| 99 |
+
[05-20 18:42:06|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 2: Hit counter: 2/5 | Loss: 32.9864 | Time: 17.41s
|
| 100 |
+
[05-20 18:42:23|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 3: Hit counter: 3/5 | Loss: 28.2805 | Time: 17.47s
|
| 101 |
+
[05-20 18:42:41|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 4: Hit counter: 4/5 | Loss: 23.8440 | Time: 17.56s
|
| 102 |
+
[05-20 18:42:59|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 5: Hit counter: 5/5 | Loss: 25.2305 | Time: 17.57s
|
| 103 |
+
[05-20 18:48:49|CRITICAL|imaginaire/callbacks/manual_gc.py:48:every_n_impl] Garbage collection disabled
|