Upload code from /mnt/43.oT_eV
Browse filesThis view is limited to 50 files because it contains too many changes. Β
See raw diff
- .gitattributes +3 -0
- 128_128_17/video_codes.tar.zst +3 -0
- 256_256_17/video_codes.tar.zst +3 -0
- Meissonic/cosmos_test_output/comparison_video_0.mp4 +0 -0
- Meissonic/cosmos_test_output/comparison_video_1.mp4 +3 -0
- Meissonic/cosmos_test_output/comparison_video_2.mp4 +3 -0
- Meissonic/cosmos_test_output/comparison_video_3.mp4 +3 -0
- Meissonic/model/diffusion_pytorch_model.safetensors +3 -0
- Meissonic/src/__pycache__/pipeline.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc +0 -0
- Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc +0 -0
- Meissonic/src/__pycache__/scheduler.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/scheduler.cpython-313.pyc +0 -0
- Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc +0 -0
- Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc +0 -0
- Meissonic/src/__pycache__/transformer.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/transformer.cpython-313.pyc +0 -0
- Meissonic/src/__pycache__/transformer_video.cpython-310.pyc +0 -0
- Meissonic/src/__pycache__/transformer_video.cpython-313.pyc +0 -0
- Meissonic/src/__pycache__/transformer_video.cpython-314.pyc +0 -0
- Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc +0 -0
- Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc +0 -0
- Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc +0 -0
- Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc +0 -0
- Meissonic/wandb/debug-internal.log +11 -0
- Meissonic/wandb/debug.log +24 -0
- Meissonic/wandb/run-20251229_081634-hjn0m6c2/files/output.log +17 -0
- Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-core.log +14 -0
- Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log +11 -0
- Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log +24 -0
- Meissonic/wandb/run-20251229_081752-78ojckdj/files/output.log +17 -0
- Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-core.log +14 -0
- Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log +11 -0
- Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log +24 -0
- Meissonic/wandb/run-20251229_081959-tvb7bjux/files/output.log +8 -0
- Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-core.log +7 -0
- Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log +6 -0
- Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log +22 -0
- Meissonic/wandb/run-20251229_082208-d5bens3y/files/output.log +68 -0
- Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-core.log +14 -0
- Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log +11 -0
- Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log +24 -0
- Meissonic/wandb/run-20251229_082348-xdcob8vv/files/output.log +68 -0
- Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-core.log +14 -0
- Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log +11 -0
- Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log +24 -0
- Meissonic/wandb/run-20251229_082735-s2rbngfj/files/output.log +68 -0
- Meissonic/wandb/run-20251229_082735-s2rbngfj/logs/debug-core.log +14 -0
.gitattributes
CHANGED
|
@@ -876,3 +876,6 @@ Meissonic/wandb/run-20251229_093500-yyrdgepk/run-yyrdgepk.wandb filter=lfs diff=
|
|
| 876 |
OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
|
| 877 |
Wan2.1-T2V-1.3B/examples/i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
|
| 878 |
Wan2.1-T2V-1.3B/google/umt5-xxl/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 876 |
OpenVid1M_reorganized.csv filter=lfs diff=lfs merge=lfs -text
|
| 877 |
Wan2.1-T2V-1.3B/examples/i2v_input.JPG filter=lfs diff=lfs merge=lfs -text
|
| 878 |
Wan2.1-T2V-1.3B/google/umt5-xxl/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 879 |
+
Meissonic/cosmos_test_output/comparison_video_1.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 880 |
+
Meissonic/cosmos_test_output/comparison_video_2.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 881 |
+
Meissonic/cosmos_test_output/comparison_video_3.mp4 filter=lfs diff=lfs merge=lfs -text
|
128_128_17/video_codes.tar.zst
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09f283283fcbaa8e88678c39ffaf7b37d14c2f234798403f77fbda59ea65b5e0
|
| 3 |
+
size 2966606624
|
256_256_17/video_codes.tar.zst
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdb5b8c632876b41b319e069c021e517fe3ab6477b49e4ad5ed950646d58bcd5
|
| 3 |
+
size 11880937045
|
Meissonic/cosmos_test_output/comparison_video_0.mp4
ADDED
|
Binary file (36.3 kB). View file
|
|
|
Meissonic/cosmos_test_output/comparison_video_1.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7311b27e36333219d20c8d835432ecadf9ebe5977bcf760bc6706a85a95cabd
|
| 3 |
+
size 1089113
|
Meissonic/cosmos_test_output/comparison_video_2.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e02445cac3531ab68bda4ba1bc90ac570a7b423f78b9493471acb4d6e5f9a28
|
| 3 |
+
size 1618316
|
Meissonic/cosmos_test_output/comparison_video_3.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:017fcf1133dc553228724625c5ad6ec7f58f97ddc27c91201aa88a07423a76e2
|
| 3 |
+
size 931953
|
Meissonic/model/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96b6b242ca1c2f24e9d02cd6596066fab6d310e2d7538f33ae267cb18d957e8f
|
| 3 |
+
size 5676070424
|
Meissonic/src/__pycache__/pipeline.cpython-310.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
Meissonic/src/__pycache__/pipeline_video.cpython-310.pyc
ADDED
|
Binary file (27.9 kB). View file
|
|
|
Meissonic/src/__pycache__/pipeline_video.cpython-313.pyc
ADDED
|
Binary file (41.8 kB). View file
|
|
|
Meissonic/src/__pycache__/pipeline_video.cpython-314.pyc
ADDED
|
Binary file (44.8 kB). View file
|
|
|
Meissonic/src/__pycache__/scheduler.cpython-310.pyc
ADDED
|
Binary file (5.09 kB). View file
|
|
|
Meissonic/src/__pycache__/scheduler.cpython-313.pyc
ADDED
|
Binary file (9.32 kB). View file
|
|
|
Meissonic/src/__pycache__/scheduler_video.cpython-310.pyc
ADDED
|
Binary file (5.27 kB). View file
|
|
|
Meissonic/src/__pycache__/scheduler_video.cpython-313.pyc
ADDED
|
Binary file (9.87 kB). View file
|
|
|
Meissonic/src/__pycache__/scheduler_video.cpython-314.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
Meissonic/src/__pycache__/transformer.cpython-310.pyc
ADDED
|
Binary file (33 kB). View file
|
|
|
Meissonic/src/__pycache__/transformer.cpython-313.pyc
ADDED
|
Binary file (52 kB). View file
|
|
|
Meissonic/src/__pycache__/transformer_video.cpython-310.pyc
ADDED
|
Binary file (29.4 kB). View file
|
|
|
Meissonic/src/__pycache__/transformer_video.cpython-313.pyc
ADDED
|
Binary file (49.1 kB). View file
|
|
|
Meissonic/src/__pycache__/transformer_video.cpython-314.pyc
ADDED
|
Binary file (49 kB). View file
|
|
|
Meissonic/train/__pycache__/dataset_utils.cpython-310.pyc
ADDED
|
Binary file (28.2 kB). View file
|
|
|
Meissonic/train/__pycache__/dataset_utils.cpython-313.pyc
ADDED
|
Binary file (50 kB). View file
|
|
|
Meissonic/train/__pycache__/trainer_utils.cpython-310.pyc
ADDED
|
Binary file (1.27 kB). View file
|
|
|
Meissonic/train/__pycache__/trainer_utils.cpython-313.pyc
ADDED
|
Binary file (2.03 kB). View file
|
|
|
Meissonic/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T09:35:00.674748488Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T09:35:00.840745763Z","level":"INFO","msg":"stream: created new stream","id":"yyrdgepk"}
|
| 3 |
+
{"time":"2025-12-29T09:35:00.840887309Z","level":"INFO","msg":"handler: started","stream_id":"yyrdgepk"}
|
| 4 |
+
{"time":"2025-12-29T09:35:00.840989877Z","level":"INFO","msg":"stream: started","id":"yyrdgepk"}
|
| 5 |
+
{"time":"2025-12-29T09:35:00.841004187Z","level":"INFO","msg":"writer: started","stream_id":"yyrdgepk"}
|
| 6 |
+
{"time":"2025-12-29T09:35:00.841006253Z","level":"INFO","msg":"sender: started","stream_id":"yyrdgepk"}
|
| 7 |
+
{"time":"2025-12-29T09:42:02.535940574Z","level":"INFO","msg":"stream: closing","id":"yyrdgepk"}
|
| 8 |
+
{"time":"2025-12-29T09:42:02.752587654Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2025-12-29T09:42:02.857589578Z","level":"INFO","msg":"handler: closed","stream_id":"yyrdgepk"}
|
| 10 |
+
{"time":"2025-12-29T09:42:02.857716241Z","level":"INFO","msg":"sender: closed","stream_id":"yyrdgepk"}
|
| 11 |
+
{"time":"2025-12-29T09:42:02.857727173Z","level":"INFO","msg":"stream: closed","id":"yyrdgepk"}
|
Meissonic/wandb/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Configure stats pid to 843534
|
| 3 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_093500-yyrdgepk/logs/debug.log
|
| 7 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_093500-yyrdgepk/logs/debug-internal.log
|
| 8 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 09:35:00,410 INFO MainThread:843534 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 09:35:00,668 INFO MainThread:843534 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 09:35:00,673 INFO MainThread:843534 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 09:35:00,674 INFO MainThread:843534 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 09:35:00,678 INFO MainThread:843534 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 09:35:01,041 INFO MainThread:843534 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 09:35:01,126 INFO MainThread:843534 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 09:35:01,128 INFO MainThread:843534 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 09:35:01,130 INFO MainThread:843534 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 1, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 128, 'video_width': 128, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128', 'empty_embeds_path': None}
|
| 23 |
+
2025-12-29 09:42:02,535 INFO wandb-AsyncioManager-main:843534 [service_client.py:_forward_responses():80] Reached EOF.
|
| 24 |
+
2025-12-29 09:42:02,535 INFO wandb-AsyncioManager-main:843534 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
Meissonic/wandb/run-20251229_081634-hjn0m6c2/files/output.log
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:16:35 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:16:35 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 62.15it/s]
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
12/29/2025 08:16:38 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1892, in <module>
|
| 8 |
+
main(parse_args())
|
| 9 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 554, in main
|
| 10 |
+
dataset.tokenizer = tokenizer
|
| 11 |
+
UnboundLocalError: local variable 'dataset' referenced before assignment
|
| 12 |
+
[rank0]: Traceback (most recent call last):
|
| 13 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1892, in <module>
|
| 14 |
+
[rank0]: main(parse_args())
|
| 15 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 554, in main
|
| 16 |
+
[rank0]: dataset.tokenizer = tokenizer
|
| 17 |
+
[rank0]: UnboundLocalError: local variable 'dataset' referenced before assignment
|
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:16:34.925791368Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpjvxtgfwa/port-680831.txt","pid":680831,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:16:34.92651504Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":680831}
|
| 3 |
+
{"time":"2025-12-29T08:16:34.926493614Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-680831-681084-3226924194/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:16:35.112196944Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:16:35.118201645Z","level":"INFO","msg":"handleInformInit: received","streamId":"hjn0m6c2","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:16:35.284535005Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hjn0m6c2","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:16:38.409050659Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-29T08:16:38.409094413Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
+
{"time":"2025-12-29T08:16:38.409089535Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-29T08:16:38.409131426Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2025-12-29T08:16:38.409243761Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-680831-681084-3226924194/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-12-29T08:16:38.912785622Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-29T08:16:38.912803973Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-12-29T08:16:38.912818214Z","level":"INFO","msg":"server is closed"}
|
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:16:35.118294642Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T08:16:35.284331327Z","level":"INFO","msg":"stream: created new stream","id":"hjn0m6c2"}
|
| 3 |
+
{"time":"2025-12-29T08:16:35.28441448Z","level":"INFO","msg":"handler: started","stream_id":"hjn0m6c2"}
|
| 4 |
+
{"time":"2025-12-29T08:16:35.284528509Z","level":"INFO","msg":"stream: started","id":"hjn0m6c2"}
|
| 5 |
+
{"time":"2025-12-29T08:16:35.284552699Z","level":"INFO","msg":"sender: started","stream_id":"hjn0m6c2"}
|
| 6 |
+
{"time":"2025-12-29T08:16:35.284556048Z","level":"INFO","msg":"writer: started","stream_id":"hjn0m6c2"}
|
| 7 |
+
{"time":"2025-12-29T08:16:38.40910837Z","level":"INFO","msg":"stream: closing","id":"hjn0m6c2"}
|
| 8 |
+
{"time":"2025-12-29T08:16:38.726721311Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2025-12-29T08:16:38.907987768Z","level":"INFO","msg":"handler: closed","stream_id":"hjn0m6c2"}
|
| 10 |
+
{"time":"2025-12-29T08:16:38.908080631Z","level":"INFO","msg":"sender: closed","stream_id":"hjn0m6c2"}
|
| 11 |
+
{"time":"2025-12-29T08:16:38.908087916Z","level":"INFO","msg":"stream: closed","id":"hjn0m6c2"}
|
Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Configure stats pid to 680831
|
| 3 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug.log
|
| 7 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081634-hjn0m6c2/logs/debug-internal.log
|
| 8 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 08:16:34,856 INFO MainThread:680831 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 08:16:35,112 INFO MainThread:680831 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 08:16:35,116 INFO MainThread:680831 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 08:16:35,118 INFO MainThread:680831 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 08:16:35,123 INFO MainThread:680831 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 08:16:35,554 INFO MainThread:680831 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 08:16:35,679 INFO MainThread:680831 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 08:16:35,681 INFO MainThread:680831 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 08:16:35,682 INFO MainThread:680831 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
|
| 23 |
+
2025-12-29 08:16:38,409 INFO wandb-AsyncioManager-main:680831 [service_client.py:_forward_responses():80] Reached EOF.
|
| 24 |
+
2025-12-29 08:16:38,409 INFO wandb-AsyncioManager-main:680831 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
Meissonic/wandb/run-20251229_081752-78ojckdj/files/output.log
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:17:53 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:17:53 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 68.25it/s]
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
12/29/2025 08:17:55 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
|
| 6 |
+
Traceback (most recent call last):
|
| 7 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1891, in <module>
|
| 8 |
+
main(parse_args())
|
| 9 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 553, in main
|
| 10 |
+
dataset.tokenizer = tokenizer
|
| 11 |
+
UnboundLocalError: local variable 'dataset' referenced before assignment
|
| 12 |
+
[rank0]: Traceback (most recent call last):
|
| 13 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1891, in <module>
|
| 14 |
+
[rank0]: main(parse_args())
|
| 15 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 553, in main
|
| 16 |
+
[rank0]: dataset.tokenizer = tokenizer
|
| 17 |
+
[rank0]: UnboundLocalError: local variable 'dataset' referenced before assignment
|
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:17:52.415361788Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpoautak8q/port-681864.txt","pid":681864,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:17:52.415911531Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":681864}
|
| 3 |
+
{"time":"2025-12-29T08:17:52.415892317Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-681864-682101-615016650/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:17:52.600038892Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:17:52.605938403Z","level":"INFO","msg":"handleInformInit: received","streamId":"78ojckdj","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:17:52.775428685Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"78ojckdj","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:17:55.715872394Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-29T08:17:55.715918634Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
+
{"time":"2025-12-29T08:17:55.715913241Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-29T08:17:55.71601316Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-681864-682101-615016650/socket","Net":"unix"}}
|
| 11 |
+
{"time":"2025-12-29T08:17:55.716036224Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 12 |
+
{"time":"2025-12-29T08:17:56.359848916Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-29T08:17:56.359873934Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-12-29T08:17:56.359888804Z","level":"INFO","msg":"server is closed"}
|
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:17:52.606062282Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T08:17:52.77520788Z","level":"INFO","msg":"stream: created new stream","id":"78ojckdj"}
|
| 3 |
+
{"time":"2025-12-29T08:17:52.775295249Z","level":"INFO","msg":"handler: started","stream_id":"78ojckdj"}
|
| 4 |
+
{"time":"2025-12-29T08:17:52.775420221Z","level":"INFO","msg":"stream: started","id":"78ojckdj"}
|
| 5 |
+
{"time":"2025-12-29T08:17:52.775434881Z","level":"INFO","msg":"writer: started","stream_id":"78ojckdj"}
|
| 6 |
+
{"time":"2025-12-29T08:17:52.775434899Z","level":"INFO","msg":"sender: started","stream_id":"78ojckdj"}
|
| 7 |
+
{"time":"2025-12-29T08:17:55.715926892Z","level":"INFO","msg":"stream: closing","id":"78ojckdj"}
|
| 8 |
+
{"time":"2025-12-29T08:17:56.25572227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2025-12-29T08:17:56.355939724Z","level":"INFO","msg":"handler: closed","stream_id":"78ojckdj"}
|
| 10 |
+
{"time":"2025-12-29T08:17:56.35603204Z","level":"INFO","msg":"sender: closed","stream_id":"78ojckdj"}
|
| 11 |
+
{"time":"2025-12-29T08:17:56.356037202Z","level":"INFO","msg":"stream: closed","id":"78ojckdj"}
|
Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 08:17:52,347 INFO MainThread:681864 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Configure stats pid to 681864
|
| 3 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug.log
|
| 7 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081752-78ojckdj/logs/debug-internal.log
|
| 8 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 08:17:52,348 INFO MainThread:681864 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 08:17:52,600 INFO MainThread:681864 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 08:17:52,604 INFO MainThread:681864 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 08:17:52,605 INFO MainThread:681864 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 08:17:52,609 INFO MainThread:681864 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 08:17:52,979 INFO MainThread:681864 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 08:17:53,102 INFO MainThread:681864 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 08:17:53,103 INFO MainThread:681864 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 08:17:53,105 INFO MainThread:681864 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 08:17:53,106 INFO MainThread:681864 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
|
| 23 |
+
2025-12-29 08:17:55,715 INFO wandb-AsyncioManager-main:681864 [service_client.py:_forward_responses():80] Reached EOF.
|
| 24 |
+
2025-12-29 08:17:55,716 INFO wandb-AsyncioManager-main:681864 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
Meissonic/wandb/run-20251229_081959-tvb7bjux/files/output.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:20:00 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:20:00 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 68.27it/s]
|
| 4 |
+
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3ca70188-ebaa-40b0-a3ff-1473c60ab7d9)')' thrown while requesting HEAD https://huggingface.co/google/umt5-xxl/resolve/main/tokenizer_config.json
|
| 5 |
+
12/29/2025 08:20:10 - WARNING - huggingface_hub.utils._http - '(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3ca70188-ebaa-40b0-a3ff-1473c60ab7d9)')' thrown while requesting HEAD https://huggingface.co/google/umt5-xxl/resolve/main/tokenizer_config.json
|
| 6 |
+
Retrying in 1s [Retry 1/5].
|
| 7 |
+
12/29/2025 08:20:10 - WARNING - huggingface_hub.utils._http - Retrying in 1s [Retry 1/5].
|
| 8 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-core.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:19:59.444483356Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpk2lbu65l/port-683325.txt","pid":683325,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:19:59.445159843Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":683325}
|
| 3 |
+
{"time":"2025-12-29T08:19:59.445163741Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-683325-683564-770336178/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:19:59.630747774Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:19:59.636523927Z","level":"INFO","msg":"handleInformInit: received","streamId":"tvb7bjux","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:19:59.807596347Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tvb7bjux","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:20:13.475754205Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
|
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:19:59.636615677Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T08:19:59.807402325Z","level":"INFO","msg":"stream: created new stream","id":"tvb7bjux"}
|
| 3 |
+
{"time":"2025-12-29T08:19:59.807478253Z","level":"INFO","msg":"handler: started","stream_id":"tvb7bjux"}
|
| 4 |
+
{"time":"2025-12-29T08:19:59.807589456Z","level":"INFO","msg":"stream: started","id":"tvb7bjux"}
|
| 5 |
+
{"time":"2025-12-29T08:19:59.807608334Z","level":"INFO","msg":"sender: started","stream_id":"tvb7bjux"}
|
| 6 |
+
{"time":"2025-12-29T08:19:59.807611249Z","level":"INFO","msg":"writer: started","stream_id":"tvb7bjux"}
|
Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Configure stats pid to 683325
|
| 3 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug.log
|
| 7 |
+
2025-12-29 08:19:59,377 INFO MainThread:683325 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_081959-tvb7bjux/logs/debug-internal.log
|
| 8 |
+
2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 08:19:59,378 INFO MainThread:683325 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 08:19:59,630 INFO MainThread:683325 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 08:19:59,635 INFO MainThread:683325 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 08:19:59,636 INFO MainThread:683325 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 08:19:59,640 INFO MainThread:683325 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 08:20:00,091 INFO MainThread:683325 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 08:20:00,220 INFO MainThread:683325 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 08:20:00,223 INFO MainThread:683325 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 08:20:00,224 INFO MainThread:683325 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
|
Meissonic/wandb/run-20251229_082208-d5bens3y/files/output.log
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:22:09 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:22:09 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 69.71it/s]
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
12/29/2025 08:22:12 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
|
| 6 |
+
12/29/2025 08:22:20 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
|
| 7 |
+
12/29/2025 08:22:20 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
|
| 8 |
+
12/29/2025 08:22:20 - INFO - __main__ - Getting compressed dimensions from precomputed features...
|
| 9 |
+
12/29/2025 08:22:29 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=32, W'=32
|
| 10 |
+
12/29/2025 08:22:29 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
|
| 11 |
+
12/29/2025 08:22:29 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
|
| 12 |
+
12/29/2025 08:22:29 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
|
| 13 |
+
12/29/2025 08:22:46 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
|
| 14 |
+
12/29/2025 08:22:46 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 15 |
+
12/29/2025 08:22:48 - INFO - __main__ - β Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
|
| 16 |
+
12/29/2025 08:22:49 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
|
| 17 |
+
12/29/2025 08:22:49 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
|
| 18 |
+
12/29/2025 08:22:49 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
|
| 19 |
+
12/29/2025 08:22:49 - INFO - __main__ - Creating dataloaders and lr_scheduler
|
| 20 |
+
12/29/2025 08:22:49 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set
|
| 21 |
+
12/29/2025 08:22:49 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
|
| 22 |
+
12/29/2025 08:22:58 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set/metadata.json
|
| 23 |
+
12/29/2025 08:22:58 - INFO - train.dataset_utils - Total samples in metadata: unknown
|
| 24 |
+
12/29/2025 08:22:58 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 1019957 samples available
|
| 25 |
+
12/29/2025 08:22:58 - INFO - train.dataset_utils - Index range: 0 to 1019956
|
| 26 |
+
12/29/2025 08:22:58 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
|
| 27 |
+
12/29/2025 08:22:58 - INFO - __main__ - Dataloader configuration:
|
| 28 |
+
12/29/2025 08:22:58 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
|
| 29 |
+
12/29/2025 08:22:58 - INFO - __main__ - - prefetch_factor: 2
|
| 30 |
+
12/29/2025 08:22:58 - INFO - __main__ - - persistent_workers: True
|
| 31 |
+
12/29/2025 08:22:58 - INFO - __main__ - - pin_memory: True
|
| 32 |
+
12/29/2025 08:22:58 - INFO - __main__ - Preparing model, optimizer and dataloaders
|
| 33 |
+
Traceback (most recent call last):
|
| 34 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
|
| 35 |
+
main(parse_args())
|
| 36 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 37 |
+
model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 38 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 39 |
+
result = tuple(
|
| 40 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 41 |
+
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 42 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 43 |
+
return self.prepare_model(obj, device_placement=device_placement)
|
| 44 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 45 |
+
model = torch.nn.parallel.DistributedDataParallel(
|
| 46 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 47 |
+
self._ddp_init_helper(
|
| 48 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 49 |
+
self.reducer = dist.Reducer(
|
| 50 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 51 |
+
[rank0]: Traceback (most recent call last):
|
| 52 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
|
| 53 |
+
[rank0]: main(parse_args())
|
| 54 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 55 |
+
[rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 56 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 57 |
+
[rank0]: result = tuple(
|
| 58 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 59 |
+
[rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 60 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 61 |
+
[rank0]: return self.prepare_model(obj, device_placement=device_placement)
|
| 62 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 63 |
+
[rank0]: model = torch.nn.parallel.DistributedDataParallel(
|
| 64 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 65 |
+
[rank0]: self._ddp_init_helper(
|
| 66 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 67 |
+
[rank0]: self.reducer = dist.Reducer(
|
| 68 |
+
[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:22:08.633253613Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpfee4vdgx/port-684910.txt","pid":684910,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:22:08.633786607Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":684910}
|
| 3 |
+
{"time":"2025-12-29T08:22:08.633765139Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-684910-685159-1258026704/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:22:08.819292223Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:22:08.826487265Z","level":"INFO","msg":"handleInformInit: received","streamId":"d5bens3y","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:22:08.995050977Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"d5bens3y","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:23:10.182467655Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-29T08:23:10.182531417Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
+
{"time":"2025-12-29T08:23:10.182519187Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-29T08:23:10.182572054Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2025-12-29T08:23:10.182609016Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-684910-685159-1258026704/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-12-29T08:23:10.552208267Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-29T08:23:10.552231257Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-12-29T08:23:10.552243636Z","level":"INFO","msg":"server is closed"}
|
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:22:08.826738283Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T08:22:08.994805914Z","level":"INFO","msg":"stream: created new stream","id":"d5bens3y"}
|
| 3 |
+
{"time":"2025-12-29T08:22:08.9949314Z","level":"INFO","msg":"handler: started","stream_id":"d5bens3y"}
|
| 4 |
+
{"time":"2025-12-29T08:22:08.995043335Z","level":"INFO","msg":"stream: started","id":"d5bens3y"}
|
| 5 |
+
{"time":"2025-12-29T08:22:08.995063351Z","level":"INFO","msg":"sender: started","stream_id":"d5bens3y"}
|
| 6 |
+
{"time":"2025-12-29T08:22:08.995066887Z","level":"INFO","msg":"writer: started","stream_id":"d5bens3y"}
|
| 7 |
+
{"time":"2025-12-29T08:23:10.182529884Z","level":"INFO","msg":"stream: closing","id":"d5bens3y"}
|
| 8 |
+
{"time":"2025-12-29T08:23:10.451151782Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2025-12-29T08:23:10.548788578Z","level":"INFO","msg":"handler: closed","stream_id":"d5bens3y"}
|
| 10 |
+
{"time":"2025-12-29T08:23:10.548905656Z","level":"INFO","msg":"sender: closed","stream_id":"d5bens3y"}
|
| 11 |
+
{"time":"2025-12-29T08:23:10.548914674Z","level":"INFO","msg":"stream: closed","id":"d5bens3y"}
|
Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Configure stats pid to 684910
|
| 3 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug.log
|
| 7 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_082208-d5bens3y/logs/debug-internal.log
|
| 8 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 08:22:08,565 INFO MainThread:684910 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 08:22:08,819 INFO MainThread:684910 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 08:22:08,825 INFO MainThread:684910 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 08:22:08,826 INFO MainThread:684910 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 08:22:08,832 INFO MainThread:684910 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 08:22:09,324 INFO MainThread:684910 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 08:22:09,503 INFO MainThread:684910 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 08:22:09,506 INFO MainThread:684910 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 08:22:09,507 INFO MainThread:684910 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 2, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
|
| 23 |
+
2025-12-29 08:23:10,182 INFO wandb-AsyncioManager-main:684910 [service_client.py:_forward_responses():80] Reached EOF.
|
| 24 |
+
2025-12-29 08:23:10,182 INFO wandb-AsyncioManager-main:684910 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
Meissonic/wandb/run-20251229_082348-xdcob8vv/files/output.log
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:23:49 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:23:49 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 64.61it/s]
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
12/29/2025 08:23:51 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
|
| 6 |
+
12/29/2025 08:24:00 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
|
| 7 |
+
12/29/2025 08:24:00 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
|
| 8 |
+
12/29/2025 08:24:00 - INFO - __main__ - Getting compressed dimensions from precomputed features...
|
| 9 |
+
12/29/2025 08:24:09 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=32, W'=32
|
| 10 |
+
12/29/2025 08:24:09 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
|
| 11 |
+
12/29/2025 08:24:09 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
|
| 12 |
+
12/29/2025 08:24:09 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
|
| 13 |
+
12/29/2025 08:24:26 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
|
| 14 |
+
12/29/2025 08:24:26 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 15 |
+
12/29/2025 08:24:28 - INFO - __main__ - β Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
|
| 16 |
+
12/29/2025 08:24:29 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
|
| 17 |
+
12/29/2025 08:24:29 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
|
| 18 |
+
12/29/2025 08:24:29 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
|
| 19 |
+
12/29/2025 08:24:29 - INFO - __main__ - Creating dataloaders and lr_scheduler
|
| 20 |
+
12/29/2025 08:24:29 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set
|
| 21 |
+
12/29/2025 08:24:29 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
|
| 22 |
+
12/29/2025 08:24:38 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set/metadata.json
|
| 23 |
+
12/29/2025 08:24:38 - INFO - train.dataset_utils - Total samples in metadata: unknown
|
| 24 |
+
12/29/2025 08:24:38 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 1019957 samples available
|
| 25 |
+
12/29/2025 08:24:38 - INFO - train.dataset_utils - Index range: 0 to 1019956
|
| 26 |
+
12/29/2025 08:24:38 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
|
| 27 |
+
12/29/2025 08:24:38 - INFO - __main__ - Dataloader configuration:
|
| 28 |
+
12/29/2025 08:24:38 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
|
| 29 |
+
12/29/2025 08:24:38 - INFO - __main__ - - prefetch_factor: 2
|
| 30 |
+
12/29/2025 08:24:38 - INFO - __main__ - - persistent_workers: True
|
| 31 |
+
12/29/2025 08:24:38 - INFO - __main__ - - pin_memory: True
|
| 32 |
+
12/29/2025 08:24:38 - INFO - __main__ - Preparing model, optimizer and dataloaders
|
| 33 |
+
Traceback (most recent call last):
|
| 34 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
|
| 35 |
+
main(parse_args())
|
| 36 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 37 |
+
model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 38 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 39 |
+
result = tuple(
|
| 40 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 41 |
+
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 42 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 43 |
+
return self.prepare_model(obj, device_placement=device_placement)
|
| 44 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 45 |
+
model = torch.nn.parallel.DistributedDataParallel(
|
| 46 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 47 |
+
self._ddp_init_helper(
|
| 48 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 49 |
+
self.reducer = dist.Reducer(
|
| 50 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 51 |
+
[rank0]: Traceback (most recent call last):
|
| 52 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1894, in <module>
|
| 53 |
+
[rank0]: main(parse_args())
|
| 54 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 55 |
+
[rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 56 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 57 |
+
[rank0]: result = tuple(
|
| 58 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 59 |
+
[rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 60 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 61 |
+
[rank0]: return self.prepare_model(obj, device_placement=device_placement)
|
| 62 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 63 |
+
[rank0]: model = torch.nn.parallel.DistributedDataParallel(
|
| 64 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 65 |
+
[rank0]: self._ddp_init_helper(
|
| 66 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 67 |
+
[rank0]: self.reducer = dist.Reducer(
|
| 68 |
+
[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:23:48.48214766Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmp_kibi_k0/port-687239.txt","pid":687239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:23:48.48261065Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":687239}
|
| 3 |
+
{"time":"2025-12-29T08:23:48.482606464Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-687239-687548-1521909327/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:23:48.668396575Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:23:48.67438215Z","level":"INFO","msg":"handleInformInit: received","streamId":"xdcob8vv","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:23:48.838417506Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"xdcob8vv","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:24:51.064143118Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-29T08:24:51.064216028Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2025-12-29T08:24:51.064271092Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-29T08:24:51.064228351Z","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2025-12-29T08:24:51.064361726Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-687239-687548-1521909327/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-12-29T08:24:51.603614002Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-29T08:24:51.603644195Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-12-29T08:24:51.603660135Z","level":"INFO","msg":"server is closed"}
|
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:23:48.674533717Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
|
| 2 |
+
{"time":"2025-12-29T08:23:48.83819002Z","level":"INFO","msg":"stream: created new stream","id":"xdcob8vv"}
|
| 3 |
+
{"time":"2025-12-29T08:23:48.838277887Z","level":"INFO","msg":"handler: started","stream_id":"xdcob8vv"}
|
| 4 |
+
{"time":"2025-12-29T08:23:48.838409545Z","level":"INFO","msg":"stream: started","id":"xdcob8vv"}
|
| 5 |
+
{"time":"2025-12-29T08:23:48.838424189Z","level":"INFO","msg":"writer: started","stream_id":"xdcob8vv"}
|
| 6 |
+
{"time":"2025-12-29T08:23:48.838433456Z","level":"INFO","msg":"sender: started","stream_id":"xdcob8vv"}
|
| 7 |
+
{"time":"2025-12-29T08:24:51.064239479Z","level":"INFO","msg":"stream: closing","id":"xdcob8vv"}
|
| 8 |
+
{"time":"2025-12-29T08:24:51.333940412Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 9 |
+
{"time":"2025-12-29T08:24:51.600325077Z","level":"INFO","msg":"handler: closed","stream_id":"xdcob8vv"}
|
| 10 |
+
{"time":"2025-12-29T08:24:51.600456594Z","level":"INFO","msg":"sender: closed","stream_id":"xdcob8vv"}
|
| 11 |
+
{"time":"2025-12-29T08:24:51.600464276Z","level":"INFO","msg":"stream: closed","id":"xdcob8vv"}
|
Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
|
| 2 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Configure stats pid to 687239
|
| 3 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from /home/ubuntu/.config/wandb/settings
|
| 4 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from /mnt/Meissonic/wandb/settings
|
| 5 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_setup.py:_flush():80] Loading settings from environment variables
|
| 6 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /mnt/Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug.log
|
| 7 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /mnt/Meissonic/wandb/run-20251229_082348-xdcob8vv/logs/debug-internal.log
|
| 8 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():841] calling init triggers
|
| 9 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'_wandb': {}}
|
| 11 |
+
2025-12-29 08:23:48,415 INFO MainThread:687239 [wandb_init.py:init():889] starting backend
|
| 12 |
+
2025-12-29 08:23:48,668 INFO MainThread:687239 [wandb_init.py:init():892] sending inform_init request
|
| 13 |
+
2025-12-29 08:23:48,672 INFO MainThread:687239 [wandb_init.py:init():900] backend started and connected
|
| 14 |
+
2025-12-29 08:23:48,674 INFO MainThread:687239 [wandb_init.py:init():970] updated telemetry
|
| 15 |
+
2025-12-29 08:23:48,678 INFO MainThread:687239 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-12-29 08:23:49,038 INFO MainThread:687239 [wandb_init.py:init():1041] starting run threads in backend
|
| 17 |
+
2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_console_start():2521] atexit reg
|
| 18 |
+
2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2369] redirect: wrap_raw
|
| 19 |
+
2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2438] Wrapping output streams.
|
| 20 |
+
2025-12-29 08:23:49,163 INFO MainThread:687239 [wandb_run.py:_redirect():2461] Redirects installed.
|
| 21 |
+
2025-12-29 08:23:49,166 INFO MainThread:687239 [wandb_init.py:init():1081] run started, returning control to user process
|
| 22 |
+
2025-12-29 08:23:49,167 INFO MainThread:687239 [wandb_run.py:_config_callback():1396] config_cb None None {'text_encoder_architecture': 'umt5-xxl', 'instance_dataset': 'OpenVid1MDataset', 'training_from_scratch': True, 'pretrained_model_name_or_path': 'dummy', 'revision': None, 'variant': None, 'instance_data_dataset': None, 'instance_data_dir': '/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv', 'instance_data_image': None, 'use_8bit_adam': True, 'dataloader_num_workers': 8, 'dataloader_prefetch_factor': 2, 'allow_tf32': True, 'use_ema': False, 'ema_decay': 0.9999, 'ema_update_after_step': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.01, 'adam_epsilon': 1e-08, 'output_dir': './output_256x256_17f_2*4bs_4*8*8vqvae_0_2_ratio_lr3e-3', 'seed': 42, 'logging_dir': 'logs', 'max_train_steps': 100000, 'checkpointing_steps': 500, 'logging_steps': 10, 'checkpoints_total_limit': None, 'resume_from_checkpoint': None, 'train_batch_size': 1, 'gradient_accumulation_steps': 4, 'learning_rate': 0.003, 'scale_lr': False, 'lr_scheduler': 'constant', 'lr_warmup_steps': 0, 'validation_steps': 100, 'mixed_precision': 'bf16', 'report_to': 'wandb', 'validation_prompts': ['a cat playing', 'a girl walking', "The video features a man named David Schultz from Hamline University. He is dressed in a suit and tie, standing in front of a building with a tree in the background. The man appears to be speaking or presenting, as suggested by the context of the image. The style of the video is likely informative or educational, given the context of the man's attire and the setting. The video may be part of a news segment or a lecture series, as indicated by the man's professional appearance and the presence of a building that could be a university or academic institution.", "The video captures the interior of a car at a car show. The car features a striking orange and black color scheme, with the seats upholstered in orange leather and the door panels in black leather. The car's interior is well-lit, highlighting the details of the upholstery and the design of the door panels. The car is on display, with people walking around and observing it. The car show setting is bustling with activity, with other cars and people visible in the background. The video is a close-up shot of the car's interior, focusing on the details of the upholstery and the design of the door panels. The style of the video is realistic, capturing the car's interior in a clear and detailed manner."], 'resolution': 512, 'split_vae_encode': 1, 'min_masking_rate': 0.0, 'cond_dropout_prob': 0.0, 'max_grad_norm': 50.0, 'use_lora': False, 'text_encoder_use_lora': False, 'lora_r': 16, 'lora_alpha': 32, 'lora_target_modules': ['to_q', 'to_k', 'to_v'], 'text_encoder_lora_r': 16, 'text_encoder_lora_alpha': 32, 'text_encoder_lora_target_modules': ['to_q', 'to_k', 'to_v'], 'train_text_encoder': False, 'image_key': None, 'prompt_key': None, 'gradient_checkpointing': True, 'prompt_prefix': None, 'num_frames': 17, 'video_height': 256, 'video_width': 256, 'video_tokenizer_model_id': 'Cosmos-0.1-Tokenizer-DV4x8x8', 'wan_pretrained_path': '/mnt/Wan2.1-T2V-1.3B', 'freeze_wan_backbone': False, 'wan_backbone_lr_ratio': 0.2, 'use_precomputed_features': False, 'use_precomputed_video_only': True, 'features_dir': '/mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_256_256_full_set', 'empty_embeds_path': None}
|
| 23 |
+
2025-12-29 08:24:51,064 INFO wandb-AsyncioManager-main:687239 [service_client.py:_forward_responses():80] Reached EOF.
|
| 24 |
+
2025-12-29 08:24:51,064 INFO wandb-AsyncioManager-main:687239 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
Meissonic/wandb/run-20251229_082735-s2rbngfj/files/output.log
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
12/29/2025 08:27:36 - INFO - __main__ - Using precomputed video codes only - will encode text with UMT5-XXL at runtime
|
| 2 |
+
12/29/2025 08:27:36 - INFO - __main__ - Video tokenizer will be loaded only during validation/inference
|
| 3 |
+
Loading checkpoint shards: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 6/6 [00:00<00:00, 68.71it/s]
|
| 4 |
+
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
|
| 5 |
+
12/29/2025 08:27:38 - INFO - __main__ - Loaded text encoder: google/umt5-xxl (d_model=4096)
|
| 6 |
+
12/29/2025 08:27:39 - INFO - __main__ - Loaded from metadata: codebook_size=64000, mask_token_id=64000
|
| 7 |
+
12/29/2025 08:27:39 - INFO - __main__ - Minimal tokenizer created: mask_token_id=64000, codebook_size=64000
|
| 8 |
+
12/29/2025 08:27:39 - INFO - __main__ - Getting compressed dimensions from precomputed features...
|
| 9 |
+
12/29/2025 08:27:40 - INFO - __main__ - Got dimensions from metadata: F'=5, H'=16, W'=16
|
| 10 |
+
12/29/2025 08:27:40 - INFO - __main__ - Using actual text encoder dimension for umt5-xxl: 4096
|
| 11 |
+
12/29/2025 08:27:40 - INFO - __main__ - Loading Wan config from: /mnt/Wan2.1-T2V-1.3B
|
| 12 |
+
12/29/2025 08:27:40 - INFO - __main__ - Loaded Wan config: dim=1536, ffn_dim=8960, num_layers=30, num_heads=12
|
| 13 |
+
12/29/2025 08:27:57 - INFO - __main__ - Loading Wan pretrained weights from: /mnt/Wan2.1-T2V-1.3B
|
| 14 |
+
12/29/2025 08:27:57 - INFO - __main__ - Loading weights from local path: /mnt/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
|
| 15 |
+
12/29/2025 08:27:59 - INFO - __main__ - β Successfully loaded Wan pretrained weights into backbone (excluding text_embedding)
|
| 16 |
+
12/29/2025 08:28:01 - INFO - __main__ - Parameter counts: backbone=1,418,996,800, other=2,112,033, total=1,421,108,833
|
| 17 |
+
12/29/2025 08:28:01 - INFO - __main__ - Wan backbone lr = 0.000600 (base_lr * 0.2)
|
| 18 |
+
12/29/2025 08:28:01 - INFO - __main__ - Other parts (token_embedding, logits_head) lr = 0.003000
|
| 19 |
+
12/29/2025 08:28:01 - INFO - __main__ - Creating dataloaders and lr_scheduler
|
| 20 |
+
12/29/2025 08:28:01 - INFO - __main__ - Using pre-extracted video codes from: /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128
|
| 21 |
+
12/29/2025 08:28:01 - INFO - __main__ - Text will be encoded with UMT5-XXL at runtime
|
| 22 |
+
12/29/2025 08:28:02 - INFO - train.dataset_utils - Loaded metadata from /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128/metadata.json
|
| 23 |
+
12/29/2025 08:28:02 - INFO - train.dataset_utils - Total samples in metadata: 1019957
|
| 24 |
+
12/29/2025 08:28:02 - INFO - train.dataset_utils - PrecomputedVideoOnlyDataset: 128000 samples available
|
| 25 |
+
12/29/2025 08:28:02 - INFO - train.dataset_utils - Index range: 0 to 127999
|
| 26 |
+
12/29/2025 08:28:02 - INFO - __main__ - Using precomputed features - DataLoader settings: prefetch_factor=1, pin_memory=True
|
| 27 |
+
12/29/2025 08:28:02 - INFO - __main__ - Dataloader configuration:
|
| 28 |
+
12/29/2025 08:28:02 - INFO - __main__ - - num_workers: 8 (0 = single-threaded, recommended: 4-8 for video)
|
| 29 |
+
12/29/2025 08:28:02 - INFO - __main__ - - prefetch_factor: 2
|
| 30 |
+
12/29/2025 08:28:02 - INFO - __main__ - - persistent_workers: True
|
| 31 |
+
12/29/2025 08:28:02 - INFO - __main__ - - pin_memory: True
|
| 32 |
+
12/29/2025 08:28:02 - INFO - __main__ - Preparing model, optimizer and dataloaders
|
| 33 |
+
Traceback (most recent call last):
|
| 34 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1909, in <module>
|
| 35 |
+
main(parse_args())
|
| 36 |
+
File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 37 |
+
model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 38 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 39 |
+
result = tuple(
|
| 40 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 41 |
+
self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 42 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 43 |
+
return self.prepare_model(obj, device_placement=device_placement)
|
| 44 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 45 |
+
model = torch.nn.parallel.DistributedDataParallel(
|
| 46 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 47 |
+
self._ddp_init_helper(
|
| 48 |
+
File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 49 |
+
self.reducer = dist.Reducer(
|
| 50 |
+
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
| 51 |
+
[rank0]: Traceback (most recent call last):
|
| 52 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1909, in <module>
|
| 53 |
+
[rank0]: main(parse_args())
|
| 54 |
+
[rank0]: File "/mnt/Meissonic/train/train_mei_video.py", line 1323, in main
|
| 55 |
+
[rank0]: model, optimizer, lr_scheduler, train_dataloader, text_encoder = accelerator.prepare(
|
| 56 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1559, in prepare
|
| 57 |
+
[rank0]: result = tuple(
|
| 58 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1560, in <genexpr>
|
| 59 |
+
[rank0]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
|
| 60 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1402, in _prepare_one
|
| 61 |
+
[rank0]: return self.prepare_model(obj, device_placement=device_placement)
|
| 62 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/accelerate/accelerator.py", line 1847, in prepare_model
|
| 63 |
+
[rank0]: model = torch.nn.parallel.DistributedDataParallel(
|
| 64 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 873, in __init__
|
| 65 |
+
[rank0]: self._ddp_init_helper(
|
| 66 |
+
[rank0]: File "/home/ubuntu/miniconda3/envs/matrix-game2/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1222, in _ddp_init_helper
|
| 67 |
+
[rank0]: self.reducer = dist.Reducer(
|
| 68 |
+
[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 21.16 GiB. GPU 0 has a total capacity of 39.49 GiB of which 11.55 GiB is free. Including non-PyTorch memory, this process has 27.93 GiB memory in use. Of the allocated memory 26.46 GiB is allocated by PyTorch, and 411.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
|
Meissonic/wandb/run-20251229_082735-s2rbngfj/logs/debug-core.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-12-29T08:27:35.245761137Z","level":"INFO","msg":"main: starting server","port-filename":"/opt/dlami/nvme/tmp_user/tmpdee2lffa/port-691754.txt","pid":691754,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-12-29T08:27:35.24620287Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":691754}
|
| 3 |
+
{"time":"2025-12-29T08:27:35.246206125Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-691754-692079-1616011383/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-12-29T08:27:35.432196378Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-12-29T08:27:35.439108806Z","level":"INFO","msg":"handleInformInit: received","streamId":"s2rbngfj","id":"1(@)"}
|
| 6 |
+
{"time":"2025-12-29T08:27:35.609262249Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"s2rbngfj","id":"1(@)"}
|
| 7 |
+
{"time":"2025-12-29T08:28:12.961224171Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-12-29T08:28:12.961266225Z","level":"INFO","msg":"server is shutting down"}
|
| 9 |
+
{"time":"2025-12-29T08:28:12.961261491Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 10 |
+
{"time":"2025-12-29T08:28:12.96133838Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 11 |
+
{"time":"2025-12-29T08:28:12.961339854Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/opt/dlami/nvme/tmp_user/wandb-691754-692079-1616011383/socket","Net":"unix"}}
|
| 12 |
+
{"time":"2025-12-29T08:28:13.298524802Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
|
| 13 |
+
{"time":"2025-12-29T08:28:13.298553535Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
|
| 14 |
+
{"time":"2025-12-29T08:28:13.298566342Z","level":"INFO","msg":"server is closed"}
|