htlou commited on
Commit
0849b40
·
verified ·
1 Parent(s): d50f8c9

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. debug-internal.log +7 -0
  3. debug.log +24 -0
  4. run-20250329_003552-1mlhe6om/files/config.yaml +95 -0
  5. run-20250329_003552-1mlhe6om/files/output.log +21 -0
  6. run-20250329_003552-1mlhe6om/files/requirements.txt +167 -0
  7. run-20250329_003552-1mlhe6om/files/wandb-metadata.json +106 -0
  8. run-20250329_003552-1mlhe6om/files/wandb-summary.json +1 -0
  9. run-20250329_003552-1mlhe6om/logs/debug-core.log +13 -0
  10. run-20250329_003552-1mlhe6om/logs/debug-internal.log +16 -0
  11. run-20250329_003552-1mlhe6om/logs/debug.log +47 -0
  12. run-20250329_003552-1mlhe6om/run-1mlhe6om.wandb +0 -0
  13. run-20250329_003923-yubb37lj/files/output.log +16 -0
  14. run-20250329_003923-yubb37lj/files/requirements.txt +167 -0
  15. run-20250329_003923-yubb37lj/files/wandb-metadata.json +106 -0
  16. run-20250329_003923-yubb37lj/logs/debug-core.log +7 -0
  17. run-20250329_003923-yubb37lj/logs/debug-internal.log +7 -0
  18. run-20250329_003923-yubb37lj/logs/debug.log +22 -0
  19. run-20250329_003923-yubb37lj/run-yubb37lj.wandb +0 -0
  20. run-20250329_004923-vanwhj5e/files/output.log +60 -0
  21. run-20250329_004923-vanwhj5e/files/requirements.txt +167 -0
  22. run-20250329_004923-vanwhj5e/files/wandb-metadata.json +106 -0
  23. run-20250329_004923-vanwhj5e/files/wandb-summary.json +1 -0
  24. run-20250329_004923-vanwhj5e/logs/debug-core.log +7 -0
  25. run-20250329_004923-vanwhj5e/logs/debug-internal.log +9 -0
  26. run-20250329_004923-vanwhj5e/logs/debug.log +26 -0
  27. run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb +0 -0
  28. run-20250329_005139-6x2eqgtz/files/output.log +0 -0
  29. run-20250329_005139-6x2eqgtz/files/requirements.txt +167 -0
  30. run-20250329_005139-6x2eqgtz/files/wandb-metadata.json +35 -0
  31. run-20250329_005139-6x2eqgtz/logs/debug-core.log +7 -0
  32. run-20250329_005139-6x2eqgtz/logs/debug-internal.log +8 -0
  33. run-20250329_005139-6x2eqgtz/logs/debug.log +26 -0
  34. run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb +0 -0
  35. run-20250329_005425-3al6iztu/files/output.log +13 -0
  36. run-20250329_005425-3al6iztu/files/requirements.txt +167 -0
  37. run-20250329_005425-3al6iztu/files/wandb-metadata.json +35 -0
  38. run-20250329_005425-3al6iztu/logs/debug-core.log +7 -0
  39. run-20250329_005425-3al6iztu/logs/debug-internal.log +8 -0
  40. run-20250329_005425-3al6iztu/logs/debug.log +26 -0
  41. run-20250329_005425-3al6iztu/run-3al6iztu.wandb +0 -0
  42. run-20250329_005541-bq1jaffa/files/config.yaml +95 -0
  43. run-20250329_005541-bq1jaffa/files/output.log +22 -0
  44. run-20250329_005541-bq1jaffa/files/requirements.txt +167 -0
  45. run-20250329_005541-bq1jaffa/files/wandb-metadata.json +106 -0
  46. run-20250329_005541-bq1jaffa/files/wandb-summary.json +1 -0
  47. run-20250329_005541-bq1jaffa/logs/debug-core.log +15 -0
  48. run-20250329_005541-bq1jaffa/logs/debug-internal.log +16 -0
  49. run-20250329_005541-bq1jaffa/logs/debug.log +29 -0
  50. run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ run-20250329_010934-3x35hjks/run-3x35hjks.wandb filter=lfs diff=lfs merge=lfs -text
37
+ run-20250329_012205-co1ecmky/run-co1ecmky.wandb filter=lfs diff=lfs merge=lfs -text
debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T01:22:05.252520599+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-core.log"}
2
+ {"time":"2025-03-29T01:22:05.467254306+08:00","level":"INFO","msg":"created new stream","id":"co1ecmky"}
3
+ {"time":"2025-03-29T01:22:05.467309548+08:00","level":"INFO","msg":"stream: started","id":"co1ecmky"}
4
+ {"time":"2025-03-29T01:22:05.467331707+08:00","level":"INFO","msg":"handler: started","stream_id":"co1ecmky"}
5
+ {"time":"2025-03-29T01:22:05.467333162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"co1ecmky"}
6
+ {"time":"2025-03-29T01:22:05.467336174+08:00","level":"INFO","msg":"sender: started","stream_id":"co1ecmky"}
7
+ {"time":"2025-03-29T01:22:05.772490021+08:00","level":"INFO","msg":"Starting system monitor"}
debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Configure stats pid to 104999
3
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug.log
7
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-internal.log
8
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 01:22:05,249 INFO MainThread:104999 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 01:22:05,249 INFO MainThread:104999 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 01:22:05,251 INFO MainThread:104999 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 01:22:05,262 INFO MainThread:104999 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 01:22:05,770 INFO MainThread:104999 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 01:22:05,989 INFO MainThread:104999 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 01:22:05,992 INFO MainThread:104999 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/co1ecmky
24
+ 2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
run-20250329_003552-1mlhe6om/files/config.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.8
4
+ m: []
5
+ python_version: 3.11.11
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ "4": 3.11.11
42
+ "5": 0.19.8
43
+ "6": 4.51.0.dev0
44
+ "8":
45
+ - 5
46
+ "12": 0.19.8
47
+ "13": linux-x86_64
48
+ data_cfgs:
49
+ value:
50
+ eval_optional_args: []
51
+ load_multi_datasets: false
52
+ train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10
53
+ train_name: text-image-to-text
54
+ train_optional_args: []
55
+ train_split: train
56
+ train_template: MM_TI2T_LLAVA
57
+ logger_cfgs:
58
+ value:
59
+ log_project: align-anything
60
+ log_run_name: sft
61
+ log_type: wandb
62
+ output_dir: ../outputs/test_7B
63
+ save_total_limit: 6
64
+ model_cfgs:
65
+ value:
66
+ model_max_length: 2048
67
+ model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf
68
+ trust_remote_code: true
69
+ train_cfgs:
70
+ value:
71
+ adam_betas:
72
+ - 0.9
73
+ - 0.95
74
+ adam_epsilon: 1e-08
75
+ bf16: true
76
+ ds_cfgs: ds_z3_config.json
77
+ epochs: 3
78
+ eval_interval: 10
79
+ eval_strategy: epoch
80
+ fp16: false
81
+ freeze_language_model: false
82
+ freeze_mm_proj: false
83
+ freeze_vision_tower: true
84
+ gradient_accumulation_steps: 16
85
+ gradient_checkpointing: true
86
+ learning_rate: 2e-05
87
+ load_checkpoint: false
88
+ lr_scheduler_type: cosine
89
+ lr_warmup_ratio: 0.03
90
+ max_grad_norm: 1
91
+ per_device_eval_batch_size: 1
92
+ per_device_train_batch_size: 1
93
+ save_checkpoint: true
94
+ seed: 42
95
+ weight_decay: 0
run-20250329_003552-1mlhe6om/files/output.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
3
+ Current epoch: 0
4
+ Current epoch: 1
5
+ Current epoch: 2
6
+ Saving model to "../outputs/test_7B/slice_end" ...
7
+ Saving 16-bit model...
8
+ [2025-03-29 00:36:00,887] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
9
+ [2025-03-29 00:36:00,887] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
10
+ [2025-03-29 00:36:00,888] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
11
+ [2025-03-29 00:36:14,412] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
12
+ [2025-03-29 00:36:14,412] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
13
+ [2025-03-29 00:36:15,002] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
14
+ [2025-03-29 00:36:15,045] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt
15
+ [2025-03-29 00:36:15,045] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt...
16
+ [2025-03-29 00:36:15,195] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt.
17
+ [2025-03-29 00:36:20,367] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
18
+ [2025-03-29 00:36:26,225] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
19
+ [2025-03-29 00:36:26,228] [INFO] [engine.py:3672:_save_zero_checkpoint] zero checkpoint saved ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
20
+ [2025-03-29 00:36:26,608] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
21
+ Model saved!
run-20250329_003552-1mlhe6om/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_003552-1mlhe6om/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:35:52.296406Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
35
+ "cpu_count": 112,
36
+ "cpu_count_logical": 224,
37
+ "gpu": "NVIDIA H800",
38
+ "gpu_count": 8,
39
+ "disk": {
40
+ "/": {
41
+ "total": "1888556142592",
42
+ "used": "148607098880"
43
+ }
44
+ },
45
+ "memory": {
46
+ "total": "2164195454976"
47
+ },
48
+ "cpu": {
49
+ "count": 112,
50
+ "countLogical": 224
51
+ },
52
+ "gpu_nvidia": [
53
+ {
54
+ "name": "NVIDIA H800",
55
+ "memoryTotal": "85520809984",
56
+ "cudaCores": 16896,
57
+ "architecture": "Hopper"
58
+ },
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ }
101
+ ],
102
+ "slurm": {
103
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
104
+ },
105
+ "cudaVersion": "12.2"
106
+ }
run-20250329_003552-1mlhe6om/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":34}}
run-20250329_003552-1mlhe6om/logs/debug-core.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:35:51.700332688+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp4xrq9hq1/port-9551.txt","pid":9551,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:35:51.701486575+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":9551}
3
+ {"time":"2025-03-29T00:35:51.701420457+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41983,"Zone":""}}
4
+ {"time":"2025-03-29T00:35:51.880332094+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:61422"}
5
+ {"time":"2025-03-29T00:35:52.297979257+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"1mlhe6om","id":"127.0.0.1:61422"}
6
+ {"time":"2025-03-29T00:35:52.514464455+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"1mlhe6om","id":"127.0.0.1:61422"}
7
+ {"time":"2025-03-29T00:36:28.014384472+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:61422"}
8
+ {"time":"2025-03-29T00:36:28.014433444+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:61422"}
9
+ {"time":"2025-03-29T00:36:28.014448354+08:00","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2025-03-29T00:36:28.014489675+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:61422"}
11
+ {"time":"2025-03-29T00:36:28.023136355+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:61422"}
12
+ {"time":"2025-03-29T00:36:28.023152325+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:61422"}
13
+ {"time":"2025-03-29T00:36:28.023157944+08:00","level":"INFO","msg":"server is closed"}
run-20250329_003552-1mlhe6om/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:35:52.299334516+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:35:52.5143997+08:00","level":"INFO","msg":"created new stream","id":"1mlhe6om"}
3
+ {"time":"2025-03-29T00:35:52.514457632+08:00","level":"INFO","msg":"stream: started","id":"1mlhe6om"}
4
+ {"time":"2025-03-29T00:35:52.514478469+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"1mlhe6om"}
5
+ {"time":"2025-03-29T00:35:52.514486118+08:00","level":"INFO","msg":"handler: started","stream_id":"1mlhe6om"}
6
+ {"time":"2025-03-29T00:35:52.514497089+08:00","level":"INFO","msg":"sender: started","stream_id":"1mlhe6om"}
7
+ {"time":"2025-03-29T00:35:52.799983877+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-03-29T00:36:26.630995684+08:00","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-03-29T00:36:26.631709662+08:00","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-03-29T00:36:27.77584189+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-03-29T00:36:28.01449504+08:00","level":"INFO","msg":"stream: closing","id":"1mlhe6om"}
12
+ {"time":"2025-03-29T00:36:28.01455515+08:00","level":"INFO","msg":"handler: closed","stream_id":"1mlhe6om"}
13
+ {"time":"2025-03-29T00:36:28.014530377+08:00","level":"WARN","msg":"sender: received Exit record more than once, ignoring"}
14
+ {"time":"2025-03-29T00:36:28.014604384+08:00","level":"INFO","msg":"sender: closed","stream_id":"1mlhe6om"}
15
+ {"time":"2025-03-29T00:36:28.014567177+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"1mlhe6om"}
16
+ {"time":"2025-03-29T00:36:28.02303065+08:00","level":"INFO","msg":"stream: closed","id":"1mlhe6om"}
run-20250329_003552-1mlhe6om/logs/debug.log ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:35:52,291 INFO MainThread:9551 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Configure stats pid to 9551
3
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug.log
7
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug-internal.log
8
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:35:52,296 INFO MainThread:9551 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:35:52,296 INFO MainThread:9551 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:35:52,297 INFO MainThread:9551 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:35:52,308 INFO MainThread:9551 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:35:52,796 INFO MainThread:9551 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:35:52,949 INFO MainThread:9551 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 00:36:26,629 INFO MainThread:9551 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/1mlhe6om
24
+ 2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_restore():2328] restore done
27
+ 2025-03-29 00:36:27,631 INFO MainThread:9551 [wandb_run.py:_restore():2322] restore
28
+ 2025-03-29 00:36:27,631 INFO MainThread:9551 [wandb_run.py:_restore():2328] restore done
29
+ 2025-03-29 00:36:27,631 ERROR MainThread:9551 [wandb_run.py:_atexit_cleanup():2361] Problem finishing run
30
+ Traceback (most recent call last):
31
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup
32
+ self._on_finish()
33
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish
34
+ wait_with_progress(
35
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
36
+ return wait_all_with_progress(
37
+ ^^^^^^^^^^^^^^^^^^^^^^^
38
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
39
+ return asyncio_compat.run(progress_loop_with_timeout)
40
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
42
+ future = executor.submit(runner.run, fn)
43
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
44
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
45
+ raise RuntimeError('cannot schedule new futures after '
46
+ RuntimeError: cannot schedule new futures after interpreter shutdown
47
+ 2025-03-29 00:36:28,013 INFO MsgRouterThr:9551 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
run-20250329_003552-1mlhe6om/run-1mlhe6om.wandb ADDED
Binary file (15.9 kB). View file
 
run-20250329_003923-yubb37lj/files/output.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
3
+ Current epoch: 0
4
+ Current epoch: 1
5
+ Current epoch: 2
6
+ Saving model to "../outputs/test_7B/slice_end" ...
7
+ Saving 16-bit model...
8
+ [2025-03-29 00:39:32,108] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
9
+ [2025-03-29 00:39:32,108] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
10
+ [2025-03-29 00:39:32,109] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
11
+ [2025-03-29 00:39:46,748] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
12
+ [2025-03-29 00:39:46,748] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
13
+ [2025-03-29 00:39:47,316] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
14
+ [2025-03-29 00:39:47,344] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt
15
+ [2025-03-29 00:39:47,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt...
16
+ [2025-03-29 00:39:47,453] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt.
run-20250329_003923-yubb37lj/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_003923-yubb37lj/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:39:23.374186Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
35
+ "cpu_count": 112,
36
+ "cpu_count_logical": 224,
37
+ "gpu": "NVIDIA H800",
38
+ "gpu_count": 8,
39
+ "disk": {
40
+ "/": {
41
+ "total": "1888556142592",
42
+ "used": "148607471616"
43
+ }
44
+ },
45
+ "memory": {
46
+ "total": "2164195454976"
47
+ },
48
+ "cpu": {
49
+ "count": 112,
50
+ "countLogical": 224
51
+ },
52
+ "gpu_nvidia": [
53
+ {
54
+ "name": "NVIDIA H800",
55
+ "memoryTotal": "85520809984",
56
+ "cudaCores": 16896,
57
+ "architecture": "Hopper"
58
+ },
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ }
101
+ ],
102
+ "slurm": {
103
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
104
+ },
105
+ "cudaVersion": "12.2"
106
+ }
run-20250329_003923-yubb37lj/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:39:22.471053243+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjod75rq7/port-18738.txt","pid":18738,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:39:22.471952784+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":18738}
3
+ {"time":"2025-03-29T00:39:22.471948837+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42047,"Zone":""}}
4
+ {"time":"2025-03-29T00:39:22.652859041+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57374"}
5
+ {"time":"2025-03-29T00:39:23.375489118+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"yubb37lj","id":"127.0.0.1:57374"}
6
+ {"time":"2025-03-29T00:39:23.590375898+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yubb37lj","id":"127.0.0.1:57374"}
7
+ {"time":"2025-03-29T00:39:50.477044845+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
run-20250329_003923-yubb37lj/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:39:23.376972436+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:39:23.590312919+08:00","level":"INFO","msg":"created new stream","id":"yubb37lj"}
3
+ {"time":"2025-03-29T00:39:23.590368384+08:00","level":"INFO","msg":"stream: started","id":"yubb37lj"}
4
+ {"time":"2025-03-29T00:39:23.59039266+08:00","level":"INFO","msg":"handler: started","stream_id":"yubb37lj"}
5
+ {"time":"2025-03-29T00:39:23.59039881+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"yubb37lj"}
6
+ {"time":"2025-03-29T00:39:23.590398926+08:00","level":"INFO","msg":"sender: started","stream_id":"yubb37lj"}
7
+ {"time":"2025-03-29T00:39:23.847802173+08:00","level":"INFO","msg":"Starting system monitor"}
run-20250329_003923-yubb37lj/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Configure stats pid to 18738
3
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug.log
7
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug-internal.log
8
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:39:23,374 INFO MainThread:18738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:39:23,374 INFO MainThread:18738 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:39:23,375 INFO MainThread:18738 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:39:23,385 INFO MainThread:18738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:39:23,845 INFO MainThread:18738 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:39:23,998 INFO MainThread:18738 [wandb_init.py:init():1032] run started, returning control to user process
run-20250329_003923-yubb37lj/run-yubb37lj.wandb ADDED
File without changes
run-20250329_004923-vanwhj5e/files/output.log ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
3
+ Saving model to "../outputs/test_7B/slice_end" ...
4
+ Saving 16-bit model...
5
+ Traceback (most recent call last):
6
+ File "<frozen runpy>", line 198, in _run_module_as_main
7
+ File "<frozen runpy>", line 88, in _run_code
8
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
9
+ sys.exit(main())
10
+ ^^^^^^
11
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main
12
+ trainer.save()
13
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save
14
+ self.save_transformers(model=model, tag=tag)
15
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers
16
+ model.save_16bit_model(output_dir, save_filename=save_file_name)
17
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model
18
+ state_dict = self._zero3_consolidated_16bit_state_dict(
19
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict
21
+ get_layer_state_dict(self.module, prefix="")
22
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
23
+ get_layer_state_dict(child, prefix + name + ".")
24
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
25
+ get_layer_state_dict(child, prefix + name + ".")
26
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
27
+ get_layer_state_dict(child, prefix + name + ".")
28
+ [Previous line repeated 3 more times]
29
+ File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict
30
+ state_dict[key] = param.detach().cpu()
31
+ ^^^^^^^^^^^^^^^^^^^^
32
+ KeyboardInterrupt
33
+ [rank0]: Traceback (most recent call last):
34
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
35
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
36
+ [rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
37
+ [rank0]: sys.exit(main())
38
+ [rank0]: ^^^^^^
39
+ [rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main
40
+ [rank0]: trainer.save()
41
+ [rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save
42
+ [rank0]: self.save_transformers(model=model, tag=tag)
43
+ [rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers
44
+ [rank0]: model.save_16bit_model(output_dir, save_filename=save_file_name)
45
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model
46
+ [rank0]: state_dict = self._zero3_consolidated_16bit_state_dict(
47
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
48
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict
49
+ [rank0]: get_layer_state_dict(self.module, prefix="")
50
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
51
+ [rank0]: get_layer_state_dict(child, prefix + name + ".")
52
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
53
+ [rank0]: get_layer_state_dict(child, prefix + name + ".")
54
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
55
+ [rank0]: get_layer_state_dict(child, prefix + name + ".")
56
+ [rank0]: [Previous line repeated 3 more times]
57
+ [rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict
58
+ [rank0]: state_dict[key] = param.detach().cpu()
59
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^
60
+ [rank0]: KeyboardInterrupt
run-20250329_004923-vanwhj5e/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_004923-vanwhj5e/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:49:23.693460Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
35
+ "cpu_count": 112,
36
+ "cpu_count_logical": 224,
37
+ "gpu": "NVIDIA H800",
38
+ "gpu_count": 8,
39
+ "disk": {
40
+ "/": {
41
+ "total": "1888556142592",
42
+ "used": "148608499712"
43
+ }
44
+ },
45
+ "memory": {
46
+ "total": "2164195454976"
47
+ },
48
+ "cpu": {
49
+ "count": 112,
50
+ "countLogical": 224
51
+ },
52
+ "gpu_nvidia": [
53
+ {
54
+ "name": "NVIDIA H800",
55
+ "memoryTotal": "85520809984",
56
+ "cudaCores": 16896,
57
+ "architecture": "Hopper"
58
+ },
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ }
101
+ ],
102
+ "slurm": {
103
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
104
+ },
105
+ "cudaVersion": "12.2"
106
+ }
run-20250329_004923-vanwhj5e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":4}}
run-20250329_004923-vanwhj5e/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:49:23.0903988+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1bwuylg2/port-35456.txt","pid":35456,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:49:23.091269653+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":35456}
3
+ {"time":"2025-03-29T00:49:23.091244341+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":40795,"Zone":""}}
4
+ {"time":"2025-03-29T00:49:23.270609752+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:54946"}
5
+ {"time":"2025-03-29T00:49:23.694940245+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"vanwhj5e","id":"127.0.0.1:54946"}
6
+ {"time":"2025-03-29T00:49:23.912684251+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vanwhj5e","id":"127.0.0.1:54946"}
7
+ {"time":"2025-03-29T00:49:27.869501123+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
run-20250329_004923-vanwhj5e/logs/debug-internal.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:49:23.696239084+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:49:23.912592778+08:00","level":"INFO","msg":"created new stream","id":"vanwhj5e"}
3
+ {"time":"2025-03-29T00:49:23.91267373+08:00","level":"INFO","msg":"stream: started","id":"vanwhj5e"}
4
+ {"time":"2025-03-29T00:49:23.912692782+08:00","level":"INFO","msg":"handler: started","stream_id":"vanwhj5e"}
5
+ {"time":"2025-03-29T00:49:23.912700862+08:00","level":"INFO","msg":"sender: started","stream_id":"vanwhj5e"}
6
+ {"time":"2025-03-29T00:49:23.912703724+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vanwhj5e"}
7
+ {"time":"2025-03-29T00:49:24.229784705+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-03-29T00:49:27.854433357+08:00","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-03-29T00:49:27.855141086+08:00","level":"INFO","msg":"Stopped system monitor"}
run-20250329_004923-vanwhj5e/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Configure stats pid to 35456
3
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug.log
7
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-internal.log
8
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:49:23,693 INFO MainThread:35456 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:49:23,693 INFO MainThread:35456 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:49:23,694 INFO MainThread:35456 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:49:23,705 INFO MainThread:35456 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:49:24,227 INFO MainThread:35456 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:49:24,384 INFO MainThread:35456 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 00:49:27,852 INFO MainThread:35456 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/vanwhj5e
24
+ 2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-29 00:49:27,854 INFO MainThread:35456 [wandb_run.py:_restore():2328] restore done
run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb ADDED
File without changes
run-20250329_005139-6x2eqgtz/files/output.log ADDED
File without changes
run-20250329_005139-6x2eqgtz/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_005139-6x2eqgtz/files/wandb-metadata.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:51:39.067886Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python"
35
+ }
run-20250329_005139-6x2eqgtz/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:51:38.479231525+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpb6xx_4px/port-42596.txt","pid":42596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:51:38.480170741+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":42596}
3
+ {"time":"2025-03-29T00:51:38.480164974+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35441,"Zone":""}}
4
+ {"time":"2025-03-29T00:51:38.663800746+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:50004"}
5
+ {"time":"2025-03-29T00:51:39.069534671+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"6x2eqgtz","id":"127.0.0.1:50004"}
6
+ {"time":"2025-03-29T00:51:39.285743333+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"6x2eqgtz","id":"127.0.0.1:50004"}
7
+ {"time":"2025-03-29T00:51:40.320100827+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
run-20250329_005139-6x2eqgtz/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:51:39.071049577+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:51:39.285637637+08:00","level":"INFO","msg":"created new stream","id":"6x2eqgtz"}
3
+ {"time":"2025-03-29T00:51:39.285734961+08:00","level":"INFO","msg":"stream: started","id":"6x2eqgtz"}
4
+ {"time":"2025-03-29T00:51:39.285766391+08:00","level":"INFO","msg":"handler: started","stream_id":"6x2eqgtz"}
5
+ {"time":"2025-03-29T00:51:39.286029925+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6x2eqgtz"}
6
+ {"time":"2025-03-29T00:51:39.285781884+08:00","level":"INFO","msg":"sender: started","stream_id":"6x2eqgtz"}
7
+ {"time":"2025-03-29T00:51:39.613194812+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-03-29T00:51:39.846153883+08:00","level":"INFO","msg":"Stopping system monitor"}
run-20250329_005139-6x2eqgtz/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Configure stats pid to 42596
3
+ 2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug.log
7
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-internal.log
8
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:51:39,067 INFO MainThread:42596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:51:39,067 INFO MainThread:42596 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:51:39,068 INFO MainThread:42596 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:51:39,080 INFO MainThread:42596 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:51:39,610 INFO MainThread:42596 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:51:39,802 INFO MainThread:42596 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:51:39,805 INFO MainThread:42596 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/6x2eqgtz
24
+ 2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2328] restore done
run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb ADDED
File without changes
run-20250329_005425-3al6iztu/files/output.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]Traceback (most recent call last):
3
+ Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x1551083766d0>
4
+ File "<frozen runpy>", line 198, in _run_module_as_main
5
+ File "<frozen runpy>", line 88, in _run_code
6
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
7
+ sys.exit(main())
8
+ ^^^^^^
9
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main
10
+ trainer.train()
11
+ File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 140, in train
12
+ print('First batch: ', enumerate(self.train_dataloader)[0])
13
+ ~~~~~~~~~~~~~~~~~~~~
run-20250329_005425-3al6iztu/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_005425-3al6iztu/files/wandb-metadata.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:54:25.328152Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python"
35
+ }
run-20250329_005425-3al6iztu/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:54:24.691982358+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpszvpq4wi/port-48756.txt","pid":48756,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:54:24.692861871+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":48756}
3
+ {"time":"2025-03-29T00:54:24.692845205+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42415,"Zone":""}}
4
+ {"time":"2025-03-29T00:54:24.873664584+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37608"}
5
+ {"time":"2025-03-29T00:54:25.329808834+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"3al6iztu","id":"127.0.0.1:37608"}
6
+ {"time":"2025-03-29T00:54:25.545982861+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"3al6iztu","id":"127.0.0.1:37608"}
7
+ {"time":"2025-03-29T00:54:26.490223915+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
run-20250329_005425-3al6iztu/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:54:25.331350097+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:54:25.545872725+08:00","level":"INFO","msg":"created new stream","id":"3al6iztu"}
3
+ {"time":"2025-03-29T00:54:25.54597415+08:00","level":"INFO","msg":"stream: started","id":"3al6iztu"}
4
+ {"time":"2025-03-29T00:54:25.546010693+08:00","level":"INFO","msg":"sender: started","stream_id":"3al6iztu"}
5
+ {"time":"2025-03-29T00:54:25.546003597+08:00","level":"INFO","msg":"handler: started","stream_id":"3al6iztu"}
6
+ {"time":"2025-03-29T00:54:25.546055332+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"3al6iztu"}
7
+ {"time":"2025-03-29T00:54:25.844887265+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-03-29T00:54:26.062125748+08:00","level":"INFO","msg":"Stopping system monitor"}
run-20250329_005425-3al6iztu/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:54:25,324 INFO MainThread:48756 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Configure stats pid to 48756
3
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug.log
7
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-internal.log
8
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:54:25,327 INFO MainThread:48756 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:54:25,328 INFO MainThread:48756 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:54:25,329 INFO MainThread:48756 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:54:25,340 INFO MainThread:48756 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:54:25,842 INFO MainThread:48756 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:54:26,019 INFO MainThread:48756 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 00:54:26,029 INFO MainThread:48756 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/3al6iztu
24
+ 2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2328] restore done
run-20250329_005425-3al6iztu/run-3al6iztu.wandb ADDED
File without changes
run-20250329_005541-bq1jaffa/files/config.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.19.8
4
+ m: []
5
+ python_version: 3.11.11
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 5
10
+ - 11
11
+ - 41
12
+ - 49
13
+ - 51
14
+ - 53
15
+ - 55
16
+ - 63
17
+ - 71
18
+ - 83
19
+ - 98
20
+ - 105
21
+ "2":
22
+ - 1
23
+ - 5
24
+ - 11
25
+ - 41
26
+ - 49
27
+ - 51
28
+ - 53
29
+ - 55
30
+ - 63
31
+ - 71
32
+ - 83
33
+ - 98
34
+ - 105
35
+ "3":
36
+ - 2
37
+ - 13
38
+ - 16
39
+ - 23
40
+ - 55
41
+ "4": 3.11.11
42
+ "5": 0.19.8
43
+ "6": 4.51.0.dev0
44
+ "8":
45
+ - 5
46
+ "12": 0.19.8
47
+ "13": linux-x86_64
48
+ data_cfgs:
49
+ value:
50
+ eval_optional_args: []
51
+ load_multi_datasets: false
52
+ train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10
53
+ train_name: text-image-to-text
54
+ train_optional_args: []
55
+ train_split: train
56
+ train_template: MM_TI2T_LLAVA
57
+ logger_cfgs:
58
+ value:
59
+ log_project: align-anything
60
+ log_run_name: sft
61
+ log_type: wandb
62
+ output_dir: ../outputs/test_7B
63
+ save_total_limit: 6
64
+ model_cfgs:
65
+ value:
66
+ model_max_length: 2048
67
+ model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf
68
+ trust_remote_code: true
69
+ train_cfgs:
70
+ value:
71
+ adam_betas:
72
+ - 0.9
73
+ - 0.95
74
+ adam_epsilon: 1e-08
75
+ bf16: true
76
+ ds_cfgs: ds_z3_config.json
77
+ epochs: 3
78
+ eval_interval: 10
79
+ eval_strategy: epoch
80
+ fp16: false
81
+ freeze_language_model: false
82
+ freeze_mm_proj: false
83
+ freeze_vision_tower: true
84
+ gradient_accumulation_steps: 16
85
+ gradient_checkpointing: true
86
+ learning_rate: 2e-05
87
+ load_checkpoint: false
88
+ lr_scheduler_type: cosine
89
+ lr_warmup_ratio: 0.03
90
+ max_grad_norm: 1
91
+ per_device_eval_batch_size: 1
92
+ per_device_train_batch_size: 1
93
+ save_checkpoint: false
94
+ seed: 42
95
+ weight_decay: 0
run-20250329_005541-bq1jaffa/files/output.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
3
+ Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
4
+ <enumerate object at 0x154ff4f2b6f0>
5
+ Check if empty: False
6
+ First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
7
+ Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
8
+ <enumerate object at 0x155005bebd80>
9
+ Check if empty: False
10
+ First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
11
+ Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
12
+ <enumerate object at 0x154ff4f75080>
13
+ Check if empty: False
14
+ First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
15
+ Saving model to "../outputs/test_7B/slice_end" ...
16
+ Saving 16-bit model...
17
+ [2025-03-29 00:55:50,235] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
18
+ [2025-03-29 00:55:50,236] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
19
+ [2025-03-29 00:55:50,236] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
20
+ [2025-03-29 00:56:05,543] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
21
+ [2025-03-29 00:56:05,544] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
22
+ Model saved!
run-20250329_005541-bq1jaffa/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
run-20250329_005541-bq1jaffa/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T16:55:41.711696Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "MM_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_7B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "8",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.sft",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "email": "2200017789@stu.pku.edu.cn",
32
+ "root": "../outputs/test_7B",
33
+ "host": "dgx-092",
34
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
35
+ "cpu_count": 112,
36
+ "cpu_count_logical": 224,
37
+ "gpu": "NVIDIA H800",
38
+ "gpu_count": 8,
39
+ "disk": {
40
+ "/": {
41
+ "total": "1888556142592",
42
+ "used": "148609179648"
43
+ }
44
+ },
45
+ "memory": {
46
+ "total": "2164195454976"
47
+ },
48
+ "cpu": {
49
+ "count": 112,
50
+ "countLogical": 224
51
+ },
52
+ "gpu_nvidia": [
53
+ {
54
+ "name": "NVIDIA H800",
55
+ "memoryTotal": "85520809984",
56
+ "cudaCores": 16896,
57
+ "architecture": "Hopper"
58
+ },
59
+ {
60
+ "name": "NVIDIA H800",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper"
64
+ },
65
+ {
66
+ "name": "NVIDIA H800",
67
+ "memoryTotal": "85520809984",
68
+ "cudaCores": 16896,
69
+ "architecture": "Hopper"
70
+ },
71
+ {
72
+ "name": "NVIDIA H800",
73
+ "memoryTotal": "85520809984",
74
+ "cudaCores": 16896,
75
+ "architecture": "Hopper"
76
+ },
77
+ {
78
+ "name": "NVIDIA H800",
79
+ "memoryTotal": "85520809984",
80
+ "cudaCores": 16896,
81
+ "architecture": "Hopper"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper"
88
+ },
89
+ {
90
+ "name": "NVIDIA H800",
91
+ "memoryTotal": "85520809984",
92
+ "cudaCores": 16896,
93
+ "architecture": "Hopper"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper"
100
+ }
101
+ ],
102
+ "slurm": {
103
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
104
+ },
105
+ "cudaVersion": "12.2"
106
+ }
run-20250329_005541-bq1jaffa/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":24}}
run-20250329_005541-bq1jaffa/logs/debug-core.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:55:41.128572776+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpaadyf_35/port-52806.txt","pid":52806,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-29T00:55:41.129538675+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":52806}
3
+ {"time":"2025-03-29T00:55:41.129533087+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38349,"Zone":""}}
4
+ {"time":"2025-03-29T00:55:41.310059156+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46506"}
5
+ {"time":"2025-03-29T00:55:41.713050692+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
6
+ {"time":"2025-03-29T00:55:41.928489719+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
7
+ {"time":"2025-03-29T00:56:07.1453019+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
8
+ {"time":"2025-03-29T00:56:07.146579952+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
9
+ {"time":"2025-03-29T00:56:08.145760813+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:46506"}
10
+ {"time":"2025-03-29T00:56:08.145785955+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:46506"}
11
+ {"time":"2025-03-29T00:56:08.14579164+08:00","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-03-29T00:56:08.145814767+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:46506"}
13
+ {"time":"2025-03-29T00:56:08.145848309+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:46506"}
14
+ {"time":"2025-03-29T00:56:08.145850825+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:46506"}
15
+ {"time":"2025-03-29T00:56:08.145853752+08:00","level":"INFO","msg":"server is closed"}
run-20250329_005541-bq1jaffa/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-29T00:55:41.714661009+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-core.log"}
2
+ {"time":"2025-03-29T00:55:41.928417193+08:00","level":"INFO","msg":"created new stream","id":"bq1jaffa"}
3
+ {"time":"2025-03-29T00:55:41.928482583+08:00","level":"INFO","msg":"stream: started","id":"bq1jaffa"}
4
+ {"time":"2025-03-29T00:55:41.928501227+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq1jaffa"}
5
+ {"time":"2025-03-29T00:55:41.928513756+08:00","level":"INFO","msg":"sender: started","stream_id":"bq1jaffa"}
6
+ {"time":"2025-03-29T00:55:41.928511582+08:00","level":"INFO","msg":"handler: started","stream_id":"bq1jaffa"}
7
+ {"time":"2025-03-29T00:55:42.22838417+08:00","level":"INFO","msg":"Starting system monitor"}
8
+ {"time":"2025-03-29T00:56:06.166942314+08:00","level":"INFO","msg":"Stopping system monitor"}
9
+ {"time":"2025-03-29T00:56:06.16760159+08:00","level":"INFO","msg":"Stopped system monitor"}
10
+ {"time":"2025-03-29T00:56:06.921670341+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2025-03-29T00:56:07.143514035+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2025-03-29T00:56:07.145585911+08:00","level":"INFO","msg":"stream: closing","id":"bq1jaffa"}
13
+ {"time":"2025-03-29T00:56:07.145626123+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq1jaffa"}
14
+ {"time":"2025-03-29T00:56:07.145635477+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq1jaffa"}
15
+ {"time":"2025-03-29T00:56:07.145639618+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq1jaffa"}
16
+ {"time":"2025-03-29T00:56:07.146568555+08:00","level":"INFO","msg":"stream: closed","id":"bq1jaffa"}
run-20250329_005541-bq1jaffa/logs/debug.log ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-29 00:55:41,708 INFO MainThread:52806 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Configure stats pid to 52806
3
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug.log
7
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-internal.log
8
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():784] starting backend
12
+ 2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-29 00:55:41,711 INFO MainThread:52806 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-29 00:55:41,711 INFO MainThread:52806 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-29 00:55:41,712 INFO MainThread:52806 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-29 00:55:41,722 INFO MainThread:52806 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-29 00:55:42,226 INFO MainThread:52806 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-29 00:55:42,380 INFO MainThread:52806 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/bq1jaffa
24
+ 2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2328] restore done
27
+ 2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3956] rendering history
28
+ 2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3988] rendering summary
29
+ 2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_sync_info():3917] logging synced files
run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb ADDED
Binary file (15.4 kB). View file