BryanW commited on
Commit
4b46d4c
Β·
verified Β·
1 Parent(s): c2925de

Upload code from /mnt/43.oT_eV

Browse files
This view is limited to 50 files because it contains too many changes. Β  See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. 128_128_17/video_codes.tar.zst +3 -0
  3. 256_256_17/video_codes.tar.zst +3 -0
  4. Meissonic/cosmos_test_output/comparison_video_0.mp4 +0 -0
  5. Meissonic/cosmos_test_output/comparison_video_1.mp4 +3 -0
  6. Meissonic/cosmos_test_output/comparison_video_2.mp4 +3 -0
  7. Meissonic/cosmos_test_output/comparison_video_3.mp4 +3 -0
  8. Meissonic/model/diffusion_pytorch_model.safetensors +3 -0
  9. Meissonic/src/__pycache__/pipeline.cpython-310.pyc +0 -0
  10. Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc +0 -0
  11. Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc +0 -0
  12. Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc +0 -0
  13. Meissonic/src/__pycache__/scheduler.cpython-310.pyc +0 -0
  14. Meissonic/src/__pycache__/scheduler.cpython-313.pyc +0 -0
  15. Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc +0 -0
  16. Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc +0 -0
  17. Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc +0 -0
  18. Meissonic/src/__pycache__/transformer.cpython-310.pyc +0 -0
  19. Meissonic/src/__pycache__/transformer.cpython-313.pyc +0 -0
  20. Meissonic/src/__pycache__/transformer_video.cpython-310.pyc +0 -0
  21. Meissonic/src/__pycache__/transformer_video.cpython-313.pyc +0 -0
  22. Meissonic/src/__pycache__/transformer_video.cpython-314.pyc +0 -0
  23. Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc +0 -0
  24. Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc +0 -0
  25. Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc +0 -0
  26. Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc +0 -0
  27. Meissonic/wandb/debug-internal.log +11 -0
  28. Meissonic/wandb/debug.log +24 -0
  29. Meissonic/wandb/run-20251229_081634-hjn0m6c2/files/output.log +17 -0
  30. Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-core.log +14 -0
  31. Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log +11 -0
  32. Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log +24 -0
  33. Meissonic/wandb/run-20251229_081752-78ojckdj/files/output.log +17 -0
  34. Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-core.log +14 -0
  35. Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log +11 -0
  36. Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log +24 -0
  37. Meissonic/wandb/run-20251229_081959-tvb7bjux/files/output.log +8 -0
  38. Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-core.log +7 -0
  39. Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log +6 -0
  40. Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log +22 -0
  41. Meissonic/wandb/run-20251229_082208-d5bens3y/files/output.log +68 -0
  42. Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-core.log +14 -0
  43. Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log +11 -0
  44. Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log +24 -0
  45. Meissonic/wandb/run-20251229_082348-xdcob8vv/files/output.log +68 -0
  46. Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-core.log +14 -0
  47. Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log +11 -0
  48. Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log +24 -0
  49. Meissonic/wandb/run-20251229_082735-s2rbngfj/files/output.log +68 -0
  50. Meissonic/wandb/run-20251229_082735-s2rbngfj/logs/debug-core.log +14 -0
.gitattributes CHANGED
@@ -876,3 +876,6 @@ Meissonic/wandb/run-20251229_093500-yyrdgepk/run-yyrdgepk.wandb filter=lfs diff=
876
  OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
877
  Wan2.1-T2V-1.3B/examples/i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
878
  Wan2.1-T2V-1.3B/google/umt5-xxl/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
876
  OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
877
  Wan2.1-T2V-1.3B/examples/i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
878
  Wan2.1-T2V-1.3B/google/umt5-xxl/tokenizer.json filter=lfs diff=lfs merge=lfs -text
879
+ Meissonic/cosmos_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
880
+ Meissonic/cosmos_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text
881
+ Meissonic/cosmos_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
128_128_17/video_codes.tar.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f283283fcbaa8e88678c39ffaf7b37d14c2f234798403f77fbda59ea65b5e0
3
+ size 2966606624
256_256_17/video_codes.tar.zst ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdb5b8c632876b41b319e069c021e517fe3ab6477b49e4ad5ed950646d58bcd5
3
+ size 11880937045
Meissonic/cosmos_test_output/comparison_video_0.mp4 ADDED
Binary file (36.3 kB). View file
 
Meissonic/cosmos_test_output/comparison_video_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7311b27e36333219d20c8d835432ecadf9ebe5977bcf760bc6706a85a95cabd
3
+ size 1089113
Meissonic/cosmos_test_output/comparison_video_2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e02445cac3531ab68bda4ba1bc90ac570a7b423f78b9493471acb4d6e5f9a28
3
+ size 1618316
Meissonic/cosmos_test_output/comparison_video_3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017fcf1133dc553228724625c5ad6ec7f58f97ddc27c91201aa88a07423a76e2
3
+ size 931953
Meissonic/model/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96b6b242ca1c2f24e9d02cd6596066fab6d310e2d7538f33ae267cb18d957e8f
3
+ size 5676070424
Meissonic/src/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (11.4 kB). View file
 
Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc ADDED
Binary file (27.9 kB). View file
 
Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc ADDED
Binary file (41.8 kB). View file
 
Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc ADDED
Binary file (44.8 kB). View file
 
Meissonic/src/__pycache__/scheduler.cpython-310.pyc ADDED
Binary file (5.09 kB). View file
 
Meissonic/src/__pycache__/scheduler.cpython-313.pyc ADDED
Binary file (9.32 kB). View file
 
Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc ADDED
Binary file (5.27 kB). View file
 
Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc ADDED
Binary file (9.87 kB). View file
 
Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc ADDED
Binary file (11.1 kB). View file
 
Meissonic/src/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (33 kB). View file
 
Meissonic/src/__pycache__/transformer.cpython-313.pyc ADDED
Binary file (52 kB). View file
 
Meissonic/src/__pycache__/transformer_video.cpython-310.pyc ADDED
Binary file (29.4 kB). View file
 
Meissonic/src/__pycache__/transformer_video.cpython-313.pyc ADDED
Binary file (49.1 kB). View file
 
Meissonic/src/__pycache__/transformer_video.cpython-314.pyc ADDED
Binary file (49 kB). View file
 
Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc ADDED
Binary file (28.2 kB). View file
 
Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc ADDED
Binary file (50 kB). View file
 
Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc ADDED
Binary file (1.27 kB). View file
 
Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc ADDED
Binary file (2.03 kB). View file
 
Meissonic/wandb/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T09:35:00.674748488Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T09:35:00.840745763Z","level":"INFO","msg":"stream: created new stream","id":"yyrdgepk"}
3
+ {"time":"2025-12-29T09:35:00.840887309Z","level":"INFO","msg":"handler: started","stream_id":"yyrdgepk"}
4
+ {"time":"2025-12-29T09:35:00.840989877Z","level":"INFO","msg":"stream: started","id":"yyrdgepk"}
5
+ {"time":"2025-12-29T09:35:00.841004187Z","level":"INFO","msg":"writer: started","stream_id":"yyrdgepk"}
6
+ {"time":"2025-12-29T09:35:00.841006253Z","level":"INFO","msg":"sender: started","stream_id":"yyrdgepk"}
7
+ {"time":"2025-12-29T09:42:02.535940574Z","level":"INFO","msg":"stream: closing","id":"yyrdgepk"}
8
+ {"time":"2025-12-29T09:42:02.752587654Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-29T09:42:02.857589578Z","level":"INFO","msg":"handler: closed","stream_id":"yyrdgepk"}
10
+ {"time":"2025-12-29T09:42:02.857716241Z","level":"INFO","msg":"sender: closed","stream_id":"yyrdgepk"}
11
+ {"time":"2025-12-29T09:42:02.857727173Z","level":"INFO","msg":"stream: closed","id":"yyrdgepk"}
Meissonic/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Configure stats pid to 843534
3
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_093500-yyrdgepk/logs/debug.log
7
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_093500-yyrdgepk/logs/debug-internal.log
8
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 09:35:00,668 INFO MainThread:843534 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 09:35:00,673 INFO MainThread:843534 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 09:35:00,674 INFO MainThread:843534 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 09:35:00,678 INFO MainThread:843534 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 09:35:01,041 INFO MainThread:843534 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 09:35:01,128 INFO MainThread:843534 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 09:35:01,130 INFO MainThread:843534 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128', 'empty_embeds_path': None}
23
+ 2025-12-29 09:42:02,535 INFO wandb-AsyncioManager-main:843534 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2025-12-29 09:42:02,535 INFO wandb-AsyncioManager-main:843534 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
Meissonic/wandb/run-20251229_081634-hjn0m6c2/files/output.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:16:35 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:16:35 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 62.15it/s]
4
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
5
+ 12/29/2025 08:16:38 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
6
+ Traceback (most recent call last):
7
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1892, in <module>
8
+ main(parse_args())
9
+ File "/mnt/Meissonic/train/train_mei_video.py", line 554, in main
10
+ dataset.tokenizer = tokenizer
11
+ UnboundLocalError: local variable 'dataset' referenced before assignment
12
+ [rank0]: Traceback (most recent call last):
13
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1892, in <module>
14
+ [rank0]: main(parse_args())
15
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 554, in main
16
+ [rank0]: dataset.tokenizer = tokenizer
17
+ [rank0]: UnboundLocalError: local variable 'dataset' referenced before assignment
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:16:34.925791368Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpjvxtgfwa/port-680831.txt","pid":680831,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:16:34.92651504Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":680831}
3
+ {"time":"2025-12-29T08:16:34.926493614Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-680831-681084-3226924194/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:16:35.112196944Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:16:35.118201645Z","level":"INFO","msg":"handleInformInit: received","streamId":"hjn0m6c2","id":"1(@)"}
6
+ {"time":"2025-12-29T08:16:35.284535005Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hjn0m6c2","id":"1(@)"}
7
+ {"time":"2025-12-29T08:16:38.409050659Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-29T08:16:38.409094413Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-12-29T08:16:38.409089535Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2025-12-29T08:16:38.409131426Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-12-29T08:16:38.409243761Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-680831-681084-3226924194/socket","Net":"unix"}}
12
+ {"time":"2025-12-29T08:16:38.912785622Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-29T08:16:38.912803973Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-29T08:16:38.912818214Z","level":"INFO","msg":"server is closed"}
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:16:35.118294642Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T08:16:35.284331327Z","level":"INFO","msg":"stream: created new stream","id":"hjn0m6c2"}
3
+ {"time":"2025-12-29T08:16:35.28441448Z","level":"INFO","msg":"handler: started","stream_id":"hjn0m6c2"}
4
+ {"time":"2025-12-29T08:16:35.284528509Z","level":"INFO","msg":"stream: started","id":"hjn0m6c2"}
5
+ {"time":"2025-12-29T08:16:35.284552699Z","level":"INFO","msg":"sender: started","stream_id":"hjn0m6c2"}
6
+ {"time":"2025-12-29T08:16:35.284556048Z","level":"INFO","msg":"writer: started","stream_id":"hjn0m6c2"}
7
+ {"time":"2025-12-29T08:16:38.40910837Z","level":"INFO","msg":"stream: closing","id":"hjn0m6c2"}
8
+ {"time":"2025-12-29T08:16:38.726721311Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-29T08:16:38.907987768Z","level":"INFO","msg":"handler: closed","stream_id":"hjn0m6c2"}
10
+ {"time":"2025-12-29T08:16:38.908080631Z","level":"INFO","msg":"sender: closed","stream_id":"hjn0m6c2"}
11
+ {"time":"2025-12-29T08:16:38.908087916Z","level":"INFO","msg":"stream: closed","id":"hjn0m6c2"}
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Configure stats pid to 680831
3
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log
7
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log
8
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 08:16:35,112 INFO MainThread:680831 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 08:16:35,116 INFO MainThread:680831 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 08:16:35,118 INFO MainThread:680831 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 08:16:35,123 INFO MainThread:680831 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 08:16:35,554 INFO MainThread:680831 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 08:16:35,681 INFO MainThread:680831 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 08:16:35,682 INFO MainThread:680831 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
23
+ 2025-12-29 08:16:38,409 INFO wandb-AsyncioManager-main:680831 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2025-12-29 08:16:38,409 INFO wandb-AsyncioManager-main:680831 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
Meissonic/wandb/run-20251229_081752-78ojckdj/files/output.log ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:17:53 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:17:53 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 68.25it/s]
4
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
5
+ 12/29/2025 08:17:55 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
6
+ Traceback (most recent call last):
7
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1891, in <module>
8
+ main(parse_args())
9
+ File "/mnt/Meissonic/train/train_mei_video.py", line 553, in main
10
+ dataset.tokenizer = tokenizer
11
+ UnboundLocalError: local variable 'dataset' referenced before assignment
12
+ [rank0]: Traceback (most recent call last):
13
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1891, in <module>
14
+ [rank0]: main(parse_args())
15
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 553, in main
16
+ [rank0]: dataset.tokenizer = tokenizer
17
+ [rank0]: UnboundLocalError: local variable 'dataset' referenced before assignment
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:17:52.415361788Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpoautak8q/port-681864.txt","pid":681864,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:17:52.415911531Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":681864}
3
+ {"time":"2025-12-29T08:17:52.415892317Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-681864-682101-615016650/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:17:52.600038892Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:17:52.605938403Z","level":"INFO","msg":"handleInformInit: received","streamId":"78ojckdj","id":"1(@)"}
6
+ {"time":"2025-12-29T08:17:52.775428685Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"78ojckdj","id":"1(@)"}
7
+ {"time":"2025-12-29T08:17:55.715872394Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-29T08:17:55.715918634Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-12-29T08:17:55.715913241Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2025-12-29T08:17:55.71601316Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-681864-682101-615016650/socket","Net":"unix"}}
11
+ {"time":"2025-12-29T08:17:55.716036224Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
12
+ {"time":"2025-12-29T08:17:56.359848916Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-29T08:17:56.359873934Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-29T08:17:56.359888804Z","level":"INFO","msg":"server is closed"}
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:17:52.606062282Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T08:17:52.77520788Z","level":"INFO","msg":"stream: created new stream","id":"78ojckdj"}
3
+ {"time":"2025-12-29T08:17:52.775295249Z","level":"INFO","msg":"handler: started","stream_id":"78ojckdj"}
4
+ {"time":"2025-12-29T08:17:52.775420221Z","level":"INFO","msg":"stream: started","id":"78ojckdj"}
5
+ {"time":"2025-12-29T08:17:52.775434881Z","level":"INFO","msg":"writer: started","stream_id":"78ojckdj"}
6
+ {"time":"2025-12-29T08:17:52.775434899Z","level":"INFO","msg":"sender: started","stream_id":"78ojckdj"}
7
+ {"time":"2025-12-29T08:17:55.715926892Z","level":"INFO","msg":"stream: closing","id":"78ojckdj"}
8
+ {"time":"2025-12-29T08:17:56.25572227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-29T08:17:56.355939724Z","level":"INFO","msg":"handler: closed","stream_id":"78ojckdj"}
10
+ {"time":"2025-12-29T08:17:56.35603204Z","level":"INFO","msg":"sender: closed","stream_id":"78ojckdj"}
11
+ {"time":"2025-12-29T08:17:56.356037202Z","level":"INFO","msg":"stream: closed","id":"78ojckdj"}
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 08:17:52,347 INFO MainThread:681864 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Configure stats pid to 681864
3
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log
7
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log
8
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 08:17:52,600 INFO MainThread:681864 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 08:17:52,604 INFO MainThread:681864 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 08:17:52,605 INFO MainThread:681864 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 08:17:52,609 INFO MainThread:681864 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 08:17:52,979 INFO MainThread:681864 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 08:17:53,103 INFO MainThread:681864 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 08:17:53,105 INFO MainThread:681864 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 08:17:53,106 INFO MainThread:681864 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
23
+ 2025-12-29 08:17:55,715 INFO wandb-AsyncioManager-main:681864 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2025-12-29 08:17:55,716 INFO wandb-AsyncioManager-main:681864 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
Meissonic/wandb/run-20251229_081959-tvb7bjux/files/output.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:20:00 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:20:00 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 68.27it/s]
4
+ '(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3ca70188-ebaa-40b0-a3ff-1473c60ab7d9)')' thrown while requesting HEAD https://huggingface.co/google/umt5-xxl/resolve/main/tokenizer_config.json
5
+ 12/29/2025 08:20:10 - WARNING - huggingface_hub.utils._http - '(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3ca70188-ebaa-40b0-a3ff-1473c60ab7d9)')' thrown while requesting HEAD https://huggingface.co/google/umt5-xxl/resolve/main/tokenizer_config.json
6
+ Retrying in 1s [Retry 1/5].
7
+ 12/29/2025 08:20:10 - WARNING - huggingface_hub.utils._http - Retrying in 1s [Retry 1/5].
8
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:19:59.444483356Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpk2lbu65l/port-683325.txt","pid":683325,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:19:59.445159843Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":683325}
3
+ {"time":"2025-12-29T08:19:59.445163741Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-683325-683564-770336178/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:19:59.630747774Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:19:59.636523927Z","level":"INFO","msg":"handleInformInit: received","streamId":"tvb7bjux","id":"1(@)"}
6
+ {"time":"2025-12-29T08:19:59.807596347Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tvb7bjux","id":"1(@)"}
7
+ {"time":"2025-12-29T08:20:13.475754205Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:19:59.636615677Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T08:19:59.807402325Z","level":"INFO","msg":"stream: created new stream","id":"tvb7bjux"}
3
+ {"time":"2025-12-29T08:19:59.807478253Z","level":"INFO","msg":"handler: started","stream_id":"tvb7bjux"}
4
+ {"time":"2025-12-29T08:19:59.807589456Z","level":"INFO","msg":"stream: started","id":"tvb7bjux"}
5
+ {"time":"2025-12-29T08:19:59.807608334Z","level":"INFO","msg":"sender: started","stream_id":"tvb7bjux"}
6
+ {"time":"2025-12-29T08:19:59.807611249Z","level":"INFO","msg":"writer: started","stream_id":"tvb7bjux"}
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Configure stats pid to 683325
3
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log
7
+ 2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log
8
+ 2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 08:19:59,630 INFO MainThread:683325 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 08:19:59,635 INFO MainThread:683325 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 08:19:59,636 INFO MainThread:683325 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 08:19:59,640 INFO MainThread:683325 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 08:20:00,091 INFO MainThread:683325 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 08:20:00,223 INFO MainThread:683325 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 08:20:00,224 INFO MainThread:683325 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
Meissonic/wandb/run-20251229_082208-d5bens3y/files/output.log ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:22:09 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:22:09 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 69.71it/s]
4
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
5
+ 12/29/2025 08:22:12 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
6
+ 12/29/2025 08:22:20 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
7
+ 12/29/2025 08:22:20 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
8
+ 12/29/2025 08:22:20 - INFO - __main__ - Getting compressed dimensions from precomputed features...
9
+ 12/29/2025 08:22:29 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=32, W'=32
10
+ 12/29/2025 08:22:29 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
11
+ 12/29/2025 08:22:29 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
12
+ 12/29/2025 08:22:29 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
13
+ 12/29/2025 08:22:46 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
14
+ 12/29/2025 08:22:46 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
15
+ 12/29/2025 08:22:48 - INFO - __main__ - βœ“ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
16
+ 12/29/2025 08:22:49 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
17
+ 12/29/2025 08:22:49 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
18
+ 12/29/2025 08:22:49 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
19
+ 12/29/2025 08:22:49 - INFO - __main__ - Creating dataloaders and lr_scheduler
20
+ 12/29/2025 08:22:49 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set
21
+ 12/29/2025 08:22:49 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
22
+ 12/29/2025 08:22:58 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set/metadata.json
23
+ 12/29/2025 08:22:58 - INFO - train.dataset_utils - Total samples in metadata: unknown
24
+ 12/29/2025 08:22:58 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 1019957 samples available
25
+ 12/29/2025 08:22:58 - INFO - train.dataset_utils - Index range: 0 to 1019956
26
+ 12/29/2025 08:22:58 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
27
+ 12/29/2025 08:22:58 - INFO - __main__ - Dataloader configuration:
28
+ 12/29/2025 08:22:58 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
29
+ 12/29/2025 08:22:58 - INFO - __main__ - - prefetch_factor: 2
30
+ 12/29/2025 08:22:58 - INFO - __main__ - - persistent_workers: True
31
+ 12/29/2025 08:22:58 - INFO - __main__ - - pin_memory: True
32
+ 12/29/2025 08:22:58 - INFO - __main__ - Preparing model, optimizer and dataloaders
33
+ Traceback (most recent call last):
34
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
35
+ main(parse_args())
36
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
37
+ model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
38
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
39
+ result = tuple(
40
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
41
+ self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
42
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
43
+ return self.prepare_model(obj, device_placement=device_placement)
44
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
45
+ model = torch.nn.parallel.DistributedDataParallel(
46
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
47
+ self._ddp_init_helper(
48
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
49
+ self.reducer = dist.Reducer(
50
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
51
+ [rank0]: Traceback (most recent call last):
52
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
53
+ [rank0]: main(parse_args())
54
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
55
+ [rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
56
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
57
+ [rank0]: result = tuple(
58
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
59
+ [rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
60
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
61
+ [rank0]: return self.prepare_model(obj, device_placement=device_placement)
62
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
63
+ [rank0]: model = torch.nn.parallel.DistributedDataParallel(
64
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
65
+ [rank0]: self._ddp_init_helper(
66
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
67
+ [rank0]: self.reducer = dist.Reducer(
68
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:22:08.633253613Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpfee4vdgx/port-684910.txt","pid":684910,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:22:08.633786607Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":684910}
3
+ {"time":"2025-12-29T08:22:08.633765139Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-684910-685159-1258026704/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:22:08.819292223Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:22:08.826487265Z","level":"INFO","msg":"handleInformInit: received","streamId":"d5bens3y","id":"1(@)"}
6
+ {"time":"2025-12-29T08:22:08.995050977Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"d5bens3y","id":"1(@)"}
7
+ {"time":"2025-12-29T08:23:10.182467655Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-29T08:23:10.182531417Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-12-29T08:23:10.182519187Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2025-12-29T08:23:10.182572054Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-12-29T08:23:10.182609016Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-684910-685159-1258026704/socket","Net":"unix"}}
12
+ {"time":"2025-12-29T08:23:10.552208267Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-29T08:23:10.552231257Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-29T08:23:10.552243636Z","level":"INFO","msg":"server is closed"}
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:22:08.826738283Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T08:22:08.994805914Z","level":"INFO","msg":"stream: created new stream","id":"d5bens3y"}
3
+ {"time":"2025-12-29T08:22:08.9949314Z","level":"INFO","msg":"handler: started","stream_id":"d5bens3y"}
4
+ {"time":"2025-12-29T08:22:08.995043335Z","level":"INFO","msg":"stream: started","id":"d5bens3y"}
5
+ {"time":"2025-12-29T08:22:08.995063351Z","level":"INFO","msg":"sender: started","stream_id":"d5bens3y"}
6
+ {"time":"2025-12-29T08:22:08.995066887Z","level":"INFO","msg":"writer: started","stream_id":"d5bens3y"}
7
+ {"time":"2025-12-29T08:23:10.182529884Z","level":"INFO","msg":"stream: closing","id":"d5bens3y"}
8
+ {"time":"2025-12-29T08:23:10.451151782Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-29T08:23:10.548788578Z","level":"INFO","msg":"handler: closed","stream_id":"d5bens3y"}
10
+ {"time":"2025-12-29T08:23:10.548905656Z","level":"INFO","msg":"sender: closed","stream_id":"d5bens3y"}
11
+ {"time":"2025-12-29T08:23:10.548914674Z","level":"INFO","msg":"stream: closed","id":"d5bens3y"}
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Configure stats pid to 684910
3
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log
7
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log
8
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 08:22:08,819 INFO MainThread:684910 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 08:22:08,825 INFO MainThread:684910 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 08:22:08,826 INFO MainThread:684910 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 08:22:08,832 INFO MainThread:684910 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 08:22:09,324 INFO MainThread:684910 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 08:22:09,506 INFO MainThread:684910 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 08:22:09,507 INFO MainThread:684910 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
23
+ 2025-12-29 08:23:10,182 INFO wandb-AsyncioManager-main:684910 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2025-12-29 08:23:10,182 INFO wandb-AsyncioManager-main:684910 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
Meissonic/wandb/run-20251229_082348-xdcob8vv/files/output.log ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:23:49 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:23:49 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 64.61it/s]
4
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
5
+ 12/29/2025 08:23:51 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
6
+ 12/29/2025 08:24:00 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
7
+ 12/29/2025 08:24:00 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
8
+ 12/29/2025 08:24:00 - INFO - __main__ - Getting compressed dimensions from precomputed features...
9
+ 12/29/2025 08:24:09 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=32, W'=32
10
+ 12/29/2025 08:24:09 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
11
+ 12/29/2025 08:24:09 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
12
+ 12/29/2025 08:24:09 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
13
+ 12/29/2025 08:24:26 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
14
+ 12/29/2025 08:24:26 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
15
+ 12/29/2025 08:24:28 - INFO - __main__ - βœ“ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
16
+ 12/29/2025 08:24:29 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
17
+ 12/29/2025 08:24:29 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
18
+ 12/29/2025 08:24:29 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
19
+ 12/29/2025 08:24:29 - INFO - __main__ - Creating dataloaders and lr_scheduler
20
+ 12/29/2025 08:24:29 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set
21
+ 12/29/2025 08:24:29 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
22
+ 12/29/2025 08:24:38 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set/metadata.json
23
+ 12/29/2025 08:24:38 - INFO - train.dataset_utils - Total samples in metadata: unknown
24
+ 12/29/2025 08:24:38 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 1019957 samples available
25
+ 12/29/2025 08:24:38 - INFO - train.dataset_utils - Index range: 0 to 1019956
26
+ 12/29/2025 08:24:38 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
27
+ 12/29/2025 08:24:38 - INFO - __main__ - Dataloader configuration:
28
+ 12/29/2025 08:24:38 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
29
+ 12/29/2025 08:24:38 - INFO - __main__ - - prefetch_factor: 2
30
+ 12/29/2025 08:24:38 - INFO - __main__ - - persistent_workers: True
31
+ 12/29/2025 08:24:38 - INFO - __main__ - - pin_memory: True
32
+ 12/29/2025 08:24:38 - INFO - __main__ - Preparing model, optimizer and dataloaders
33
+ Traceback (most recent call last):
34
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
35
+ main(parse_args())
36
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
37
+ model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
38
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
39
+ result = tuple(
40
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
41
+ self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
42
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
43
+ return self.prepare_model(obj, device_placement=device_placement)
44
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
45
+ model = torch.nn.parallel.DistributedDataParallel(
46
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
47
+ self._ddp_init_helper(
48
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
49
+ self.reducer = dist.Reducer(
50
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
51
+ [rank0]: Traceback (most recent call last):
52
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
53
+ [rank0]: main(parse_args())
54
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
55
+ [rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
56
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
57
+ [rank0]: result = tuple(
58
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
59
+ [rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
60
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
61
+ [rank0]: return self.prepare_model(obj, device_placement=device_placement)
62
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
63
+ [rank0]: model = torch.nn.parallel.DistributedDataParallel(
64
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
65
+ [rank0]: self._ddp_init_helper(
66
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
67
+ [rank0]: self.reducer = dist.Reducer(
68
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:23:48.48214766Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp_kibi_k0/port-687239.txt","pid":687239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:23:48.48261065Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":687239}
3
+ {"time":"2025-12-29T08:23:48.482606464Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-687239-687548-1521909327/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:23:48.668396575Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:23:48.67438215Z","level":"INFO","msg":"handleInformInit: received","streamId":"xdcob8vv","id":"1(@)"}
6
+ {"time":"2025-12-29T08:23:48.838417506Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"xdcob8vv","id":"1(@)"}
7
+ {"time":"2025-12-29T08:24:51.064143118Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-29T08:24:51.064216028Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-12-29T08:24:51.064271092Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-12-29T08:24:51.064228351Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-12-29T08:24:51.064361726Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-687239-687548-1521909327/socket","Net":"unix"}}
12
+ {"time":"2025-12-29T08:24:51.603614002Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-29T08:24:51.603644195Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-29T08:24:51.603660135Z","level":"INFO","msg":"server is closed"}
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:23:48.674533717Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-29T08:23:48.83819002Z","level":"INFO","msg":"stream: created new stream","id":"xdcob8vv"}
3
+ {"time":"2025-12-29T08:23:48.838277887Z","level":"INFO","msg":"handler: started","stream_id":"xdcob8vv"}
4
+ {"time":"2025-12-29T08:23:48.838409545Z","level":"INFO","msg":"stream: started","id":"xdcob8vv"}
5
+ {"time":"2025-12-29T08:23:48.838424189Z","level":"INFO","msg":"writer: started","stream_id":"xdcob8vv"}
6
+ {"time":"2025-12-29T08:23:48.838433456Z","level":"INFO","msg":"sender: started","stream_id":"xdcob8vv"}
7
+ {"time":"2025-12-29T08:24:51.064239479Z","level":"INFO","msg":"stream: closing","id":"xdcob8vv"}
8
+ {"time":"2025-12-29T08:24:51.333940412Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-29T08:24:51.600325077Z","level":"INFO","msg":"handler: closed","stream_id":"xdcob8vv"}
10
+ {"time":"2025-12-29T08:24:51.600456594Z","level":"INFO","msg":"sender: closed","stream_id":"xdcob8vv"}
11
+ {"time":"2025-12-29T08:24:51.600464276Z","level":"INFO","msg":"stream: closed","id":"xdcob8vv"}
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Configure stats pid to 687239
3
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
4
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
5
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log
7
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log
8
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():889] starting backend
12
+ 2025-12-29 08:23:48,668 INFO MainThread:687239 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-29 08:23:48,672 INFO MainThread:687239 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-29 08:23:48,674 INFO MainThread:687239 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-29 08:23:48,678 INFO MainThread:687239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-29 08:23:49,038 INFO MainThread:687239 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-29 08:23:49,166 INFO MainThread:687239 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-29 08:23:49,167 INFO MainThread:687239 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
23
+ 2025-12-29 08:24:51,064 INFO wandb-AsyncioManager-main:687239 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2025-12-29 08:24:51,064 INFO wandb-AsyncioManager-main:687239 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
Meissonic/wandb/run-20251229_082735-s2rbngfj/files/output.log ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 12/29/2025 08:27:36 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
2
+ 12/29/2025 08:27:36 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
3
+ Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 6/6 [00:00<00:00, 68.71it/s]
4
+ You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
5
+ 12/29/2025 08:27:38 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
6
+ 12/29/2025 08:27:39 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
7
+ 12/29/2025 08:27:39 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
8
+ 12/29/2025 08:27:39 - INFO - __main__ - Getting compressed dimensions from precomputed features...
9
+ 12/29/2025 08:27:40 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16
10
+ 12/29/2025 08:27:40 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
11
+ 12/29/2025 08:27:40 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
12
+ 12/29/2025 08:27:40 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
13
+ 12/29/2025 08:27:57 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
14
+ 12/29/2025 08:27:57 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
15
+ 12/29/2025 08:27:59 - INFO - __main__ - βœ“ Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
16
+ 12/29/2025 08:28:01 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
17
+ 12/29/2025 08:28:01 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
18
+ 12/29/2025 08:28:01 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
19
+ 12/29/2025 08:28:01 - INFO - __main__ - Creating dataloaders and lr_scheduler
20
+ 12/29/2025 08:28:01 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128
21
+ 12/29/2025 08:28:01 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
22
+ 12/29/2025 08:28:02 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128/metadata.json
23
+ 12/29/2025 08:28:02 - INFO - train.dataset_utils - Total samples in metadata: 1019957
24
+ 12/29/2025 08:28:02 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 128000 samples available
25
+ 12/29/2025 08:28:02 - INFO - train.dataset_utils - Index range: 0 to 127999
26
+ 12/29/2025 08:28:02 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
27
+ 12/29/2025 08:28:02 - INFO - __main__ - Dataloader configuration:
28
+ 12/29/2025 08:28:02 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
29
+ 12/29/2025 08:28:02 - INFO - __main__ - - prefetch_factor: 2
30
+ 12/29/2025 08:28:02 - INFO - __main__ - - persistent_workers: True
31
+ 12/29/2025 08:28:02 - INFO - __main__ - - pin_memory: True
32
+ 12/29/2025 08:28:02 - INFO - __main__ - Preparing model, optimizer and dataloaders
33
+ Traceback (most recent call last):
34
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1909, in <module>
35
+ main(parse_args())
36
+ File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
37
+ model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
38
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
39
+ result = tuple(
40
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
41
+ self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
42
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
43
+ return self.prepare_model(obj, device_placement=device_placement)
44
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
45
+ model = torch.nn.parallel.DistributedDataParallel(
46
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
47
+ self._ddp_init_helper(
48
+ File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
49
+ self.reducer = dist.Reducer(
50
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
51
+ [rank0]: Traceback (most recent call last):
52
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1909, in <module>
53
+ [rank0]: main(parse_args())
54
+ [rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
55
+ [rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
56
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
57
+ [rank0]: result = tuple(
58
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
59
+ [rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
60
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
61
+ [rank0]: return self.prepare_model(obj, device_placement=device_placement)
62
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
63
+ [rank0]: model = torch.nn.parallel.DistributedDataParallel(
64
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
65
+ [rank0]: self._ddp_init_helper(
66
+ [rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
67
+ [rank0]: self.reducer = dist.Reducer(
68
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Meissonic/wandb/run-20251229_082735-s2rbngfj/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-29T08:27:35.245761137Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpdee2lffa/port-691754.txt","pid":691754,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-29T08:27:35.24620287Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":691754}
3
+ {"time":"2025-12-29T08:27:35.246206125Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-691754-692079-1616011383/socket","Net":"unix"}}
4
+ {"time":"2025-12-29T08:27:35.432196378Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-29T08:27:35.439108806Z","level":"INFO","msg":"handleInformInit: received","streamId":"s2rbngfj","id":"1(@)"}
6
+ {"time":"2025-12-29T08:27:35.609262249Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"s2rbngfj","id":"1(@)"}
7
+ {"time":"2025-12-29T08:28:12.961224171Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-29T08:28:12.961266225Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2025-12-29T08:28:12.961261491Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2025-12-29T08:28:12.96133838Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-12-29T08:28:12.961339854Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-691754-692079-1616011383/socket","Net":"unix"}}
12
+ {"time":"2025-12-29T08:28:13.298524802Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-29T08:28:13.298553535Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-29T08:28:13.298566342Z","level":"INFO","msg":"server is closed"}