Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- debug-internal.log +7 -0
- debug.log +24 -0
- run-20250329_003552-1mlhe6om/files/config.yaml +95 -0
- run-20250329_003552-1mlhe6om/files/output.log +21 -0
- run-20250329_003552-1mlhe6om/files/requirements.txt +167 -0
- run-20250329_003552-1mlhe6om/files/wandb-metadata.json +106 -0
- run-20250329_003552-1mlhe6om/files/wandb-summary.json +1 -0
- run-20250329_003552-1mlhe6om/logs/debug-core.log +13 -0
- run-20250329_003552-1mlhe6om/logs/debug-internal.log +16 -0
- run-20250329_003552-1mlhe6om/logs/debug.log +47 -0
- run-20250329_003552-1mlhe6om/run-1mlhe6om.wandb +0 -0
- run-20250329_003923-yubb37lj/files/output.log +16 -0
- run-20250329_003923-yubb37lj/files/requirements.txt +167 -0
- run-20250329_003923-yubb37lj/files/wandb-metadata.json +106 -0
- run-20250329_003923-yubb37lj/logs/debug-core.log +7 -0
- run-20250329_003923-yubb37lj/logs/debug-internal.log +7 -0
- run-20250329_003923-yubb37lj/logs/debug.log +22 -0
- run-20250329_003923-yubb37lj/run-yubb37lj.wandb +0 -0
- run-20250329_004923-vanwhj5e/files/output.log +60 -0
- run-20250329_004923-vanwhj5e/files/requirements.txt +167 -0
- run-20250329_004923-vanwhj5e/files/wandb-metadata.json +106 -0
- run-20250329_004923-vanwhj5e/files/wandb-summary.json +1 -0
- run-20250329_004923-vanwhj5e/logs/debug-core.log +7 -0
- run-20250329_004923-vanwhj5e/logs/debug-internal.log +9 -0
- run-20250329_004923-vanwhj5e/logs/debug.log +26 -0
- run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb +0 -0
- run-20250329_005139-6x2eqgtz/files/output.log +0 -0
- run-20250329_005139-6x2eqgtz/files/requirements.txt +167 -0
- run-20250329_005139-6x2eqgtz/files/wandb-metadata.json +35 -0
- run-20250329_005139-6x2eqgtz/logs/debug-core.log +7 -0
- run-20250329_005139-6x2eqgtz/logs/debug-internal.log +8 -0
- run-20250329_005139-6x2eqgtz/logs/debug.log +26 -0
- run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb +0 -0
- run-20250329_005425-3al6iztu/files/output.log +13 -0
- run-20250329_005425-3al6iztu/files/requirements.txt +167 -0
- run-20250329_005425-3al6iztu/files/wandb-metadata.json +35 -0
- run-20250329_005425-3al6iztu/logs/debug-core.log +7 -0
- run-20250329_005425-3al6iztu/logs/debug-internal.log +8 -0
- run-20250329_005425-3al6iztu/logs/debug.log +26 -0
- run-20250329_005425-3al6iztu/run-3al6iztu.wandb +0 -0
- run-20250329_005541-bq1jaffa/files/config.yaml +95 -0
- run-20250329_005541-bq1jaffa/files/output.log +22 -0
- run-20250329_005541-bq1jaffa/files/requirements.txt +167 -0
- run-20250329_005541-bq1jaffa/files/wandb-metadata.json +106 -0
- run-20250329_005541-bq1jaffa/files/wandb-summary.json +1 -0
- run-20250329_005541-bq1jaffa/logs/debug-core.log +15 -0
- run-20250329_005541-bq1jaffa/logs/debug-internal.log +16 -0
- run-20250329_005541-bq1jaffa/logs/debug.log +29 -0
- run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
run-20250329_010934-3x35hjks/run-3x35hjks.wandb filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
run-20250329_012205-co1ecmky/run-co1ecmky.wandb filter=lfs diff=lfs merge=lfs -text
|
debug-internal.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T01:22:05.252520599+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T01:22:05.467254306+08:00","level":"INFO","msg":"created new stream","id":"co1ecmky"}
|
| 3 |
+
{"time":"2025-03-29T01:22:05.467309548+08:00","level":"INFO","msg":"stream: started","id":"co1ecmky"}
|
| 4 |
+
{"time":"2025-03-29T01:22:05.467331707+08:00","level":"INFO","msg":"handler: started","stream_id":"co1ecmky"}
|
| 5 |
+
{"time":"2025-03-29T01:22:05.467333162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"co1ecmky"}
|
| 6 |
+
{"time":"2025-03-29T01:22:05.467336174+08:00","level":"INFO","msg":"sender: started","stream_id":"co1ecmky"}
|
| 7 |
+
{"time":"2025-03-29T01:22:05.772490021+08:00","level":"INFO","msg":"Starting system monitor"}
|
debug.log
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Configure stats pid to 104999
|
| 3 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug.log
|
| 7 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-internal.log
|
| 8 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 01:22:05,249 INFO MainThread:104999 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 01:22:05,249 INFO MainThread:104999 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 01:22:05,251 INFO MainThread:104999 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 01:22:05,262 INFO MainThread:104999 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 01:22:05,770 INFO MainThread:104999 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 01:22:05,989 INFO MainThread:104999 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 01:22:05,992 INFO MainThread:104999 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/co1ecmky
|
| 24 |
+
2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
run-20250329_003552-1mlhe6om/files/config.yaml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.8
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.11.11
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 5
|
| 10 |
+
- 11
|
| 11 |
+
- 41
|
| 12 |
+
- 49
|
| 13 |
+
- 51
|
| 14 |
+
- 53
|
| 15 |
+
- 55
|
| 16 |
+
- 63
|
| 17 |
+
- 71
|
| 18 |
+
- 83
|
| 19 |
+
- 98
|
| 20 |
+
- 105
|
| 21 |
+
"2":
|
| 22 |
+
- 1
|
| 23 |
+
- 5
|
| 24 |
+
- 11
|
| 25 |
+
- 41
|
| 26 |
+
- 49
|
| 27 |
+
- 51
|
| 28 |
+
- 53
|
| 29 |
+
- 55
|
| 30 |
+
- 63
|
| 31 |
+
- 71
|
| 32 |
+
- 83
|
| 33 |
+
- 98
|
| 34 |
+
- 105
|
| 35 |
+
"3":
|
| 36 |
+
- 2
|
| 37 |
+
- 13
|
| 38 |
+
- 16
|
| 39 |
+
- 23
|
| 40 |
+
- 55
|
| 41 |
+
"4": 3.11.11
|
| 42 |
+
"5": 0.19.8
|
| 43 |
+
"6": 4.51.0.dev0
|
| 44 |
+
"8":
|
| 45 |
+
- 5
|
| 46 |
+
"12": 0.19.8
|
| 47 |
+
"13": linux-x86_64
|
| 48 |
+
data_cfgs:
|
| 49 |
+
value:
|
| 50 |
+
eval_optional_args: []
|
| 51 |
+
load_multi_datasets: false
|
| 52 |
+
train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10
|
| 53 |
+
train_name: text-image-to-text
|
| 54 |
+
train_optional_args: []
|
| 55 |
+
train_split: train
|
| 56 |
+
train_template: MM_TI2T_LLAVA
|
| 57 |
+
logger_cfgs:
|
| 58 |
+
value:
|
| 59 |
+
log_project: align-anything
|
| 60 |
+
log_run_name: sft
|
| 61 |
+
log_type: wandb
|
| 62 |
+
output_dir: ../outputs/test_7B
|
| 63 |
+
save_total_limit: 6
|
| 64 |
+
model_cfgs:
|
| 65 |
+
value:
|
| 66 |
+
model_max_length: 2048
|
| 67 |
+
model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf
|
| 68 |
+
trust_remote_code: true
|
| 69 |
+
train_cfgs:
|
| 70 |
+
value:
|
| 71 |
+
adam_betas:
|
| 72 |
+
- 0.9
|
| 73 |
+
- 0.95
|
| 74 |
+
adam_epsilon: 1e-08
|
| 75 |
+
bf16: true
|
| 76 |
+
ds_cfgs: ds_z3_config.json
|
| 77 |
+
epochs: 3
|
| 78 |
+
eval_interval: 10
|
| 79 |
+
eval_strategy: epoch
|
| 80 |
+
fp16: false
|
| 81 |
+
freeze_language_model: false
|
| 82 |
+
freeze_mm_proj: false
|
| 83 |
+
freeze_vision_tower: true
|
| 84 |
+
gradient_accumulation_steps: 16
|
| 85 |
+
gradient_checkpointing: true
|
| 86 |
+
learning_rate: 2e-05
|
| 87 |
+
load_checkpoint: false
|
| 88 |
+
lr_scheduler_type: cosine
|
| 89 |
+
lr_warmup_ratio: 0.03
|
| 90 |
+
max_grad_norm: 1
|
| 91 |
+
per_device_eval_batch_size: 1
|
| 92 |
+
per_device_train_batch_size: 1
|
| 93 |
+
save_checkpoint: true
|
| 94 |
+
seed: 42
|
| 95 |
+
weight_decay: 0
|
run-20250329_003552-1mlhe6om/files/output.log
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
|
| 3 |
+
Current epoch: 0
|
| 4 |
+
Current epoch: 1
|
| 5 |
+
Current epoch: 2
|
| 6 |
+
Saving model to "../outputs/test_7B/slice_end" ...
|
| 7 |
+
Saving 16-bit model...
|
| 8 |
+
[2025-03-29 00:36:00,887] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
|
| 9 |
+
[2025-03-29 00:36:00,887] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
|
| 10 |
+
[2025-03-29 00:36:00,888] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
|
| 11 |
+
[2025-03-29 00:36:14,412] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
|
| 12 |
+
[2025-03-29 00:36:14,412] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
|
| 13 |
+
[2025-03-29 00:36:15,002] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
|
| 14 |
+
[2025-03-29 00:36:15,045] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt
|
| 15 |
+
[2025-03-29 00:36:15,045] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt...
|
| 16 |
+
[2025-03-29 00:36:15,195] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt.
|
| 17 |
+
[2025-03-29 00:36:20,367] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
| 18 |
+
[2025-03-29 00:36:26,225] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
| 19 |
+
[2025-03-29 00:36:26,228] [INFO] [engine.py:3672:_save_zero_checkpoint] zero checkpoint saved ../outputs/test_7B/slice_end/global_step0/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
|
| 20 |
+
[2025-03-29 00:36:26,608] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
|
| 21 |
+
Model saved!
|
run-20250329_003552-1mlhe6om/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_003552-1mlhe6om/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:35:52.296406Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
|
| 35 |
+
"cpu_count": 112,
|
| 36 |
+
"cpu_count_logical": 224,
|
| 37 |
+
"gpu": "NVIDIA H800",
|
| 38 |
+
"gpu_count": 8,
|
| 39 |
+
"disk": {
|
| 40 |
+
"/": {
|
| 41 |
+
"total": "1888556142592",
|
| 42 |
+
"used": "148607098880"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"memory": {
|
| 46 |
+
"total": "2164195454976"
|
| 47 |
+
},
|
| 48 |
+
"cpu": {
|
| 49 |
+
"count": 112,
|
| 50 |
+
"countLogical": 224
|
| 51 |
+
},
|
| 52 |
+
"gpu_nvidia": [
|
| 53 |
+
{
|
| 54 |
+
"name": "NVIDIA H800",
|
| 55 |
+
"memoryTotal": "85520809984",
|
| 56 |
+
"cudaCores": 16896,
|
| 57 |
+
"architecture": "Hopper"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H800",
|
| 61 |
+
"memoryTotal": "85520809984",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA H800",
|
| 67 |
+
"memoryTotal": "85520809984",
|
| 68 |
+
"cudaCores": 16896,
|
| 69 |
+
"architecture": "Hopper"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA H800",
|
| 73 |
+
"memoryTotal": "85520809984",
|
| 74 |
+
"cudaCores": 16896,
|
| 75 |
+
"architecture": "Hopper"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA H800",
|
| 79 |
+
"memoryTotal": "85520809984",
|
| 80 |
+
"cudaCores": 16896,
|
| 81 |
+
"architecture": "Hopper"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H800",
|
| 85 |
+
"memoryTotal": "85520809984",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"name": "NVIDIA H800",
|
| 91 |
+
"memoryTotal": "85520809984",
|
| 92 |
+
"cudaCores": 16896,
|
| 93 |
+
"architecture": "Hopper"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"name": "NVIDIA H800",
|
| 97 |
+
"memoryTotal": "85520809984",
|
| 98 |
+
"cudaCores": 16896,
|
| 99 |
+
"architecture": "Hopper"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"slurm": {
|
| 103 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
| 104 |
+
},
|
| 105 |
+
"cudaVersion": "12.2"
|
| 106 |
+
}
|
run-20250329_003552-1mlhe6om/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":34}}
|
run-20250329_003552-1mlhe6om/logs/debug-core.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:35:51.700332688+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp4xrq9hq1/port-9551.txt","pid":9551,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:35:51.701486575+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":9551}
|
| 3 |
+
{"time":"2025-03-29T00:35:51.701420457+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41983,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:35:51.880332094+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:61422"}
|
| 5 |
+
{"time":"2025-03-29T00:35:52.297979257+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"1mlhe6om","id":"127.0.0.1:61422"}
|
| 6 |
+
{"time":"2025-03-29T00:35:52.514464455+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"1mlhe6om","id":"127.0.0.1:61422"}
|
| 7 |
+
{"time":"2025-03-29T00:36:28.014384472+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:61422"}
|
| 8 |
+
{"time":"2025-03-29T00:36:28.014433444+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:61422"}
|
| 9 |
+
{"time":"2025-03-29T00:36:28.014448354+08:00","level":"INFO","msg":"server is shutting down"}
|
| 10 |
+
{"time":"2025-03-29T00:36:28.014489675+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:61422"}
|
| 11 |
+
{"time":"2025-03-29T00:36:28.023136355+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:61422"}
|
| 12 |
+
{"time":"2025-03-29T00:36:28.023152325+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:61422"}
|
| 13 |
+
{"time":"2025-03-29T00:36:28.023157944+08:00","level":"INFO","msg":"server is closed"}
|
run-20250329_003552-1mlhe6om/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:35:52.299334516+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:35:52.5143997+08:00","level":"INFO","msg":"created new stream","id":"1mlhe6om"}
|
| 3 |
+
{"time":"2025-03-29T00:35:52.514457632+08:00","level":"INFO","msg":"stream: started","id":"1mlhe6om"}
|
| 4 |
+
{"time":"2025-03-29T00:35:52.514478469+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"1mlhe6om"}
|
| 5 |
+
{"time":"2025-03-29T00:35:52.514486118+08:00","level":"INFO","msg":"handler: started","stream_id":"1mlhe6om"}
|
| 6 |
+
{"time":"2025-03-29T00:35:52.514497089+08:00","level":"INFO","msg":"sender: started","stream_id":"1mlhe6om"}
|
| 7 |
+
{"time":"2025-03-29T00:35:52.799983877+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-03-29T00:36:26.630995684+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 9 |
+
{"time":"2025-03-29T00:36:26.631709662+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 10 |
+
{"time":"2025-03-29T00:36:27.77584189+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 11 |
+
{"time":"2025-03-29T00:36:28.01449504+08:00","level":"INFO","msg":"stream: closing","id":"1mlhe6om"}
|
| 12 |
+
{"time":"2025-03-29T00:36:28.01455515+08:00","level":"INFO","msg":"handler: closed","stream_id":"1mlhe6om"}
|
| 13 |
+
{"time":"2025-03-29T00:36:28.014530377+08:00","level":"WARN","msg":"sender: received Exit record more than once, ignoring"}
|
| 14 |
+
{"time":"2025-03-29T00:36:28.014604384+08:00","level":"INFO","msg":"sender: closed","stream_id":"1mlhe6om"}
|
| 15 |
+
{"time":"2025-03-29T00:36:28.014567177+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"1mlhe6om"}
|
| 16 |
+
{"time":"2025-03-29T00:36:28.02303065+08:00","level":"INFO","msg":"stream: closed","id":"1mlhe6om"}
|
run-20250329_003552-1mlhe6om/logs/debug.log
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:35:52,291 INFO MainThread:9551 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Configure stats pid to 9551
|
| 3 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug.log
|
| 7 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_003552-1mlhe6om/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:35:52,292 INFO MainThread:9551 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:35:52,296 INFO MainThread:9551 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:35:52,296 INFO MainThread:9551 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:35:52,297 INFO MainThread:9551 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:35:52,308 INFO MainThread:9551 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:35:52,796 INFO MainThread:9551 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:35:52,947 INFO MainThread:9551 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:35:52,949 INFO MainThread:9551 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 00:36:26,629 INFO MainThread:9551 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/1mlhe6om
|
| 24 |
+
2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
| 25 |
+
2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_restore():2322] restore
|
| 26 |
+
2025-03-29 00:36:26,630 INFO MainThread:9551 [wandb_run.py:_restore():2328] restore done
|
| 27 |
+
2025-03-29 00:36:27,631 INFO MainThread:9551 [wandb_run.py:_restore():2322] restore
|
| 28 |
+
2025-03-29 00:36:27,631 INFO MainThread:9551 [wandb_run.py:_restore():2328] restore done
|
| 29 |
+
2025-03-29 00:36:27,631 ERROR MainThread:9551 [wandb_run.py:_atexit_cleanup():2361] Problem finishing run
|
| 30 |
+
Traceback (most recent call last):
|
| 31 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup
|
| 32 |
+
self._on_finish()
|
| 33 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish
|
| 34 |
+
wait_with_progress(
|
| 35 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress
|
| 36 |
+
return wait_all_with_progress(
|
| 37 |
+
^^^^^^^^^^^^^^^^^^^^^^^
|
| 38 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress
|
| 39 |
+
return asyncio_compat.run(progress_loop_with_timeout)
|
| 40 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 41 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run
|
| 42 |
+
future = executor.submit(runner.run, fn)
|
| 43 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 44 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/concurrent/futures/thread.py", line 169, in submit
|
| 45 |
+
raise RuntimeError('cannot schedule new futures after '
|
| 46 |
+
RuntimeError: cannot schedule new futures after interpreter shutdown
|
| 47 |
+
2025-03-29 00:36:28,013 INFO MsgRouterThr:9551 [mailbox.py:close():129] Closing mailbox, abandoning 1 handles.
|
run-20250329_003552-1mlhe6om/run-1mlhe6om.wandb
ADDED
|
Binary file (15.9 kB). View file
|
|
|
run-20250329_003923-yubb37lj/files/output.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
|
| 3 |
+
Current epoch: 0
|
| 4 |
+
Current epoch: 1
|
| 5 |
+
Current epoch: 2
|
| 6 |
+
Saving model to "../outputs/test_7B/slice_end" ...
|
| 7 |
+
Saving 16-bit model...
|
| 8 |
+
[2025-03-29 00:39:32,108] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
|
| 9 |
+
[2025-03-29 00:39:32,108] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
|
| 10 |
+
[2025-03-29 00:39:32,109] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
|
| 11 |
+
[2025-03-29 00:39:46,748] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
|
| 12 |
+
[2025-03-29 00:39:46,748] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
|
| 13 |
+
[2025-03-29 00:39:47,316] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
|
| 14 |
+
[2025-03-29 00:39:47,344] [INFO] [logging.py:107:log_dist] [Rank 0] Saving model checkpoint: ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt
|
| 15 |
+
[2025-03-29 00:39:47,345] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt...
|
| 16 |
+
[2025-03-29 00:39:47,453] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/global_step0/zero_pp_rank_0_mp_rank_00_model_states.pt.
|
run-20250329_003923-yubb37lj/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_003923-yubb37lj/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:39:23.374186Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
|
| 35 |
+
"cpu_count": 112,
|
| 36 |
+
"cpu_count_logical": 224,
|
| 37 |
+
"gpu": "NVIDIA H800",
|
| 38 |
+
"gpu_count": 8,
|
| 39 |
+
"disk": {
|
| 40 |
+
"/": {
|
| 41 |
+
"total": "1888556142592",
|
| 42 |
+
"used": "148607471616"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"memory": {
|
| 46 |
+
"total": "2164195454976"
|
| 47 |
+
},
|
| 48 |
+
"cpu": {
|
| 49 |
+
"count": 112,
|
| 50 |
+
"countLogical": 224
|
| 51 |
+
},
|
| 52 |
+
"gpu_nvidia": [
|
| 53 |
+
{
|
| 54 |
+
"name": "NVIDIA H800",
|
| 55 |
+
"memoryTotal": "85520809984",
|
| 56 |
+
"cudaCores": 16896,
|
| 57 |
+
"architecture": "Hopper"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H800",
|
| 61 |
+
"memoryTotal": "85520809984",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA H800",
|
| 67 |
+
"memoryTotal": "85520809984",
|
| 68 |
+
"cudaCores": 16896,
|
| 69 |
+
"architecture": "Hopper"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA H800",
|
| 73 |
+
"memoryTotal": "85520809984",
|
| 74 |
+
"cudaCores": 16896,
|
| 75 |
+
"architecture": "Hopper"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA H800",
|
| 79 |
+
"memoryTotal": "85520809984",
|
| 80 |
+
"cudaCores": 16896,
|
| 81 |
+
"architecture": "Hopper"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H800",
|
| 85 |
+
"memoryTotal": "85520809984",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"name": "NVIDIA H800",
|
| 91 |
+
"memoryTotal": "85520809984",
|
| 92 |
+
"cudaCores": 16896,
|
| 93 |
+
"architecture": "Hopper"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"name": "NVIDIA H800",
|
| 97 |
+
"memoryTotal": "85520809984",
|
| 98 |
+
"cudaCores": 16896,
|
| 99 |
+
"architecture": "Hopper"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"slurm": {
|
| 103 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
| 104 |
+
},
|
| 105 |
+
"cudaVersion": "12.2"
|
| 106 |
+
}
|
run-20250329_003923-yubb37lj/logs/debug-core.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:39:22.471053243+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjod75rq7/port-18738.txt","pid":18738,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:39:22.471952784+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":18738}
|
| 3 |
+
{"time":"2025-03-29T00:39:22.471948837+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42047,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:39:22.652859041+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57374"}
|
| 5 |
+
{"time":"2025-03-29T00:39:23.375489118+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"yubb37lj","id":"127.0.0.1:57374"}
|
| 6 |
+
{"time":"2025-03-29T00:39:23.590375898+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yubb37lj","id":"127.0.0.1:57374"}
|
| 7 |
+
{"time":"2025-03-29T00:39:50.477044845+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
|
run-20250329_003923-yubb37lj/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:39:23.376972436+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:39:23.590312919+08:00","level":"INFO","msg":"created new stream","id":"yubb37lj"}
|
| 3 |
+
{"time":"2025-03-29T00:39:23.590368384+08:00","level":"INFO","msg":"stream: started","id":"yubb37lj"}
|
| 4 |
+
{"time":"2025-03-29T00:39:23.59039266+08:00","level":"INFO","msg":"handler: started","stream_id":"yubb37lj"}
|
| 5 |
+
{"time":"2025-03-29T00:39:23.59039881+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"yubb37lj"}
|
| 6 |
+
{"time":"2025-03-29T00:39:23.590398926+08:00","level":"INFO","msg":"sender: started","stream_id":"yubb37lj"}
|
| 7 |
+
{"time":"2025-03-29T00:39:23.847802173+08:00","level":"INFO","msg":"Starting system monitor"}
|
run-20250329_003923-yubb37lj/logs/debug.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Configure stats pid to 18738
|
| 3 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug.log
|
| 7 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_003923-yubb37lj/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:39:23,371 INFO MainThread:18738 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:39:23,374 INFO MainThread:18738 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:39:23,374 INFO MainThread:18738 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:39:23,375 INFO MainThread:18738 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:39:23,385 INFO MainThread:18738 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:39:23,845 INFO MainThread:18738 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:39:23,996 INFO MainThread:18738 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:39:23,998 INFO MainThread:18738 [wandb_init.py:init():1032] run started, returning control to user process
|
run-20250329_003923-yubb37lj/run-yubb37lj.wandb
ADDED
|
File without changes
|
run-20250329_004923-vanwhj5e/files/output.log
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
|
| 3 |
+
Saving model to "../outputs/test_7B/slice_end" ...
|
| 4 |
+
Saving 16-bit model...
|
| 5 |
+
Traceback (most recent call last):
|
| 6 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 7 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 8 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
|
| 9 |
+
sys.exit(main())
|
| 10 |
+
^^^^^^
|
| 11 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main
|
| 12 |
+
trainer.save()
|
| 13 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save
|
| 14 |
+
self.save_transformers(model=model, tag=tag)
|
| 15 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers
|
| 16 |
+
model.save_16bit_model(output_dir, save_filename=save_file_name)
|
| 17 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model
|
| 18 |
+
state_dict = self._zero3_consolidated_16bit_state_dict(
|
| 19 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 20 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict
|
| 21 |
+
get_layer_state_dict(self.module, prefix="")
|
| 22 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 23 |
+
get_layer_state_dict(child, prefix + name + ".")
|
| 24 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 25 |
+
get_layer_state_dict(child, prefix + name + ".")
|
| 26 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 27 |
+
get_layer_state_dict(child, prefix + name + ".")
|
| 28 |
+
[Previous line repeated 3 more times]
|
| 29 |
+
File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict
|
| 30 |
+
state_dict[key] = param.detach().cpu()
|
| 31 |
+
^^^^^^^^^^^^^^^^^^^^
|
| 32 |
+
KeyboardInterrupt
|
| 33 |
+
[rank0]: Traceback (most recent call last):
|
| 34 |
+
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 35 |
+
[rank0]: File "<frozen runpy>", line 88, in _run_code
|
| 36 |
+
[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
|
| 37 |
+
[rank0]: sys.exit(main())
|
| 38 |
+
[rank0]: ^^^^^^
|
| 39 |
+
[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main
|
| 40 |
+
[rank0]: trainer.save()
|
| 41 |
+
[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save
|
| 42 |
+
[rank0]: self.save_transformers(model=model, tag=tag)
|
| 43 |
+
[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers
|
| 44 |
+
[rank0]: model.save_16bit_model(output_dir, save_filename=save_file_name)
|
| 45 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model
|
| 46 |
+
[rank0]: state_dict = self._zero3_consolidated_16bit_state_dict(
|
| 47 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 48 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict
|
| 49 |
+
[rank0]: get_layer_state_dict(self.module, prefix="")
|
| 50 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 51 |
+
[rank0]: get_layer_state_dict(child, prefix + name + ".")
|
| 52 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 53 |
+
[rank0]: get_layer_state_dict(child, prefix + name + ".")
|
| 54 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict
|
| 55 |
+
[rank0]: get_layer_state_dict(child, prefix + name + ".")
|
| 56 |
+
[rank0]: [Previous line repeated 3 more times]
|
| 57 |
+
[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict
|
| 58 |
+
[rank0]: state_dict[key] = param.detach().cpu()
|
| 59 |
+
[rank0]: ^^^^^^^^^^^^^^^^^^^^
|
| 60 |
+
[rank0]: KeyboardInterrupt
|
run-20250329_004923-vanwhj5e/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_004923-vanwhj5e/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:49:23.693460Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
|
| 35 |
+
"cpu_count": 112,
|
| 36 |
+
"cpu_count_logical": 224,
|
| 37 |
+
"gpu": "NVIDIA H800",
|
| 38 |
+
"gpu_count": 8,
|
| 39 |
+
"disk": {
|
| 40 |
+
"/": {
|
| 41 |
+
"total": "1888556142592",
|
| 42 |
+
"used": "148608499712"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"memory": {
|
| 46 |
+
"total": "2164195454976"
|
| 47 |
+
},
|
| 48 |
+
"cpu": {
|
| 49 |
+
"count": 112,
|
| 50 |
+
"countLogical": 224
|
| 51 |
+
},
|
| 52 |
+
"gpu_nvidia": [
|
| 53 |
+
{
|
| 54 |
+
"name": "NVIDIA H800",
|
| 55 |
+
"memoryTotal": "85520809984",
|
| 56 |
+
"cudaCores": 16896,
|
| 57 |
+
"architecture": "Hopper"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H800",
|
| 61 |
+
"memoryTotal": "85520809984",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA H800",
|
| 67 |
+
"memoryTotal": "85520809984",
|
| 68 |
+
"cudaCores": 16896,
|
| 69 |
+
"architecture": "Hopper"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA H800",
|
| 73 |
+
"memoryTotal": "85520809984",
|
| 74 |
+
"cudaCores": 16896,
|
| 75 |
+
"architecture": "Hopper"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA H800",
|
| 79 |
+
"memoryTotal": "85520809984",
|
| 80 |
+
"cudaCores": 16896,
|
| 81 |
+
"architecture": "Hopper"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H800",
|
| 85 |
+
"memoryTotal": "85520809984",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"name": "NVIDIA H800",
|
| 91 |
+
"memoryTotal": "85520809984",
|
| 92 |
+
"cudaCores": 16896,
|
| 93 |
+
"architecture": "Hopper"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"name": "NVIDIA H800",
|
| 97 |
+
"memoryTotal": "85520809984",
|
| 98 |
+
"cudaCores": 16896,
|
| 99 |
+
"architecture": "Hopper"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"slurm": {
|
| 103 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
| 104 |
+
},
|
| 105 |
+
"cudaVersion": "12.2"
|
| 106 |
+
}
|
run-20250329_004923-vanwhj5e/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":4}}
|
run-20250329_004923-vanwhj5e/logs/debug-core.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:49:23.0903988+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1bwuylg2/port-35456.txt","pid":35456,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:49:23.091269653+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":35456}
|
| 3 |
+
{"time":"2025-03-29T00:49:23.091244341+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":40795,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:49:23.270609752+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:54946"}
|
| 5 |
+
{"time":"2025-03-29T00:49:23.694940245+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"vanwhj5e","id":"127.0.0.1:54946"}
|
| 6 |
+
{"time":"2025-03-29T00:49:23.912684251+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vanwhj5e","id":"127.0.0.1:54946"}
|
| 7 |
+
{"time":"2025-03-29T00:49:27.869501123+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
|
run-20250329_004923-vanwhj5e/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:49:23.696239084+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:49:23.912592778+08:00","level":"INFO","msg":"created new stream","id":"vanwhj5e"}
|
| 3 |
+
{"time":"2025-03-29T00:49:23.91267373+08:00","level":"INFO","msg":"stream: started","id":"vanwhj5e"}
|
| 4 |
+
{"time":"2025-03-29T00:49:23.912692782+08:00","level":"INFO","msg":"handler: started","stream_id":"vanwhj5e"}
|
| 5 |
+
{"time":"2025-03-29T00:49:23.912700862+08:00","level":"INFO","msg":"sender: started","stream_id":"vanwhj5e"}
|
| 6 |
+
{"time":"2025-03-29T00:49:23.912703724+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vanwhj5e"}
|
| 7 |
+
{"time":"2025-03-29T00:49:24.229784705+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-03-29T00:49:27.854433357+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 9 |
+
{"time":"2025-03-29T00:49:27.855141086+08:00","level":"INFO","msg":"Stopped system monitor"}
|
run-20250329_004923-vanwhj5e/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Configure stats pid to 35456
|
| 3 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug.log
|
| 7 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:49:23,693 INFO MainThread:35456 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:49:23,693 INFO MainThread:35456 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:49:23,694 INFO MainThread:35456 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:49:23,705 INFO MainThread:35456 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:49:24,227 INFO MainThread:35456 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:49:24,384 INFO MainThread:35456 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 00:49:27,852 INFO MainThread:35456 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/vanwhj5e
|
| 24 |
+
2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
| 25 |
+
2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_restore():2322] restore
|
| 26 |
+
2025-03-29 00:49:27,854 INFO MainThread:35456 [wandb_run.py:_restore():2328] restore done
|
run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb
ADDED
|
File without changes
|
run-20250329_005139-6x2eqgtz/files/output.log
ADDED
|
File without changes
|
run-20250329_005139-6x2eqgtz/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_005139-6x2eqgtz/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:51:39.067886Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python"
|
| 35 |
+
}
|
run-20250329_005139-6x2eqgtz/logs/debug-core.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:51:38.479231525+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpb6xx_4px/port-42596.txt","pid":42596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:51:38.480170741+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":42596}
|
| 3 |
+
{"time":"2025-03-29T00:51:38.480164974+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35441,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:51:38.663800746+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:50004"}
|
| 5 |
+
{"time":"2025-03-29T00:51:39.069534671+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"6x2eqgtz","id":"127.0.0.1:50004"}
|
| 6 |
+
{"time":"2025-03-29T00:51:39.285743333+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"6x2eqgtz","id":"127.0.0.1:50004"}
|
| 7 |
+
{"time":"2025-03-29T00:51:40.320100827+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
|
run-20250329_005139-6x2eqgtz/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:51:39.071049577+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:51:39.285637637+08:00","level":"INFO","msg":"created new stream","id":"6x2eqgtz"}
|
| 3 |
+
{"time":"2025-03-29T00:51:39.285734961+08:00","level":"INFO","msg":"stream: started","id":"6x2eqgtz"}
|
| 4 |
+
{"time":"2025-03-29T00:51:39.285766391+08:00","level":"INFO","msg":"handler: started","stream_id":"6x2eqgtz"}
|
| 5 |
+
{"time":"2025-03-29T00:51:39.286029925+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6x2eqgtz"}
|
| 6 |
+
{"time":"2025-03-29T00:51:39.285781884+08:00","level":"INFO","msg":"sender: started","stream_id":"6x2eqgtz"}
|
| 7 |
+
{"time":"2025-03-29T00:51:39.613194812+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-03-29T00:51:39.846153883+08:00","level":"INFO","msg":"Stopping system monitor"}
|
run-20250329_005139-6x2eqgtz/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Configure stats pid to 42596
|
| 3 |
+
2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug.log
|
| 7 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:51:39,067 INFO MainThread:42596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:51:39,067 INFO MainThread:42596 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:51:39,068 INFO MainThread:42596 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:51:39,080 INFO MainThread:42596 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:51:39,610 INFO MainThread:42596 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:51:39,802 INFO MainThread:42596 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:51:39,805 INFO MainThread:42596 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/6x2eqgtz
|
| 24 |
+
2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
| 25 |
+
2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2322] restore
|
| 26 |
+
2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2328] restore done
|
run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb
ADDED
|
File without changes
|
run-20250329_005425-3al6iztu/files/output.log
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]Traceback (most recent call last):
|
| 3 |
+
Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x1551083766d0>
|
| 4 |
+
File "<frozen runpy>", line 198, in _run_module_as_main
|
| 5 |
+
File "<frozen runpy>", line 88, in _run_code
|
| 6 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in <module>
|
| 7 |
+
sys.exit(main())
|
| 8 |
+
^^^^^^
|
| 9 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main
|
| 10 |
+
trainer.train()
|
| 11 |
+
File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 140, in train
|
| 12 |
+
print('First batch: ', enumerate(self.train_dataloader)[0])
|
| 13 |
+
~~~~~~~~~~~~~~~~~~~~
|
run-20250329_005425-3al6iztu/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_005425-3al6iztu/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:54:25.328152Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python"
|
| 35 |
+
}
|
run-20250329_005425-3al6iztu/logs/debug-core.log
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:54:24.691982358+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpszvpq4wi/port-48756.txt","pid":48756,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:54:24.692861871+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":48756}
|
| 3 |
+
{"time":"2025-03-29T00:54:24.692845205+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42415,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:54:24.873664584+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37608"}
|
| 5 |
+
{"time":"2025-03-29T00:54:25.329808834+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"3al6iztu","id":"127.0.0.1:37608"}
|
| 6 |
+
{"time":"2025-03-29T00:54:25.545982861+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"3al6iztu","id":"127.0.0.1:37608"}
|
| 7 |
+
{"time":"2025-03-29T00:54:26.490223915+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
|
run-20250329_005425-3al6iztu/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:54:25.331350097+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:54:25.545872725+08:00","level":"INFO","msg":"created new stream","id":"3al6iztu"}
|
| 3 |
+
{"time":"2025-03-29T00:54:25.54597415+08:00","level":"INFO","msg":"stream: started","id":"3al6iztu"}
|
| 4 |
+
{"time":"2025-03-29T00:54:25.546010693+08:00","level":"INFO","msg":"sender: started","stream_id":"3al6iztu"}
|
| 5 |
+
{"time":"2025-03-29T00:54:25.546003597+08:00","level":"INFO","msg":"handler: started","stream_id":"3al6iztu"}
|
| 6 |
+
{"time":"2025-03-29T00:54:25.546055332+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"3al6iztu"}
|
| 7 |
+
{"time":"2025-03-29T00:54:25.844887265+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-03-29T00:54:26.062125748+08:00","level":"INFO","msg":"Stopping system monitor"}
|
run-20250329_005425-3al6iztu/logs/debug.log
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:54:25,324 INFO MainThread:48756 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Configure stats pid to 48756
|
| 3 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug.log
|
| 7 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:54:25,327 INFO MainThread:48756 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:54:25,328 INFO MainThread:48756 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:54:25,329 INFO MainThread:48756 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:54:25,340 INFO MainThread:48756 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:54:25,842 INFO MainThread:48756 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:54:26,019 INFO MainThread:48756 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 00:54:26,029 INFO MainThread:48756 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/3al6iztu
|
| 24 |
+
2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
| 25 |
+
2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2322] restore
|
| 26 |
+
2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2328] restore done
|
run-20250329_005425-3al6iztu/run-3al6iztu.wandb
ADDED
|
File without changes
|
run-20250329_005541-bq1jaffa/files/config.yaml
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.19.8
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.11.11
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 5
|
| 10 |
+
- 11
|
| 11 |
+
- 41
|
| 12 |
+
- 49
|
| 13 |
+
- 51
|
| 14 |
+
- 53
|
| 15 |
+
- 55
|
| 16 |
+
- 63
|
| 17 |
+
- 71
|
| 18 |
+
- 83
|
| 19 |
+
- 98
|
| 20 |
+
- 105
|
| 21 |
+
"2":
|
| 22 |
+
- 1
|
| 23 |
+
- 5
|
| 24 |
+
- 11
|
| 25 |
+
- 41
|
| 26 |
+
- 49
|
| 27 |
+
- 51
|
| 28 |
+
- 53
|
| 29 |
+
- 55
|
| 30 |
+
- 63
|
| 31 |
+
- 71
|
| 32 |
+
- 83
|
| 33 |
+
- 98
|
| 34 |
+
- 105
|
| 35 |
+
"3":
|
| 36 |
+
- 2
|
| 37 |
+
- 13
|
| 38 |
+
- 16
|
| 39 |
+
- 23
|
| 40 |
+
- 55
|
| 41 |
+
"4": 3.11.11
|
| 42 |
+
"5": 0.19.8
|
| 43 |
+
"6": 4.51.0.dev0
|
| 44 |
+
"8":
|
| 45 |
+
- 5
|
| 46 |
+
"12": 0.19.8
|
| 47 |
+
"13": linux-x86_64
|
| 48 |
+
data_cfgs:
|
| 49 |
+
value:
|
| 50 |
+
eval_optional_args: []
|
| 51 |
+
load_multi_datasets: false
|
| 52 |
+
train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10
|
| 53 |
+
train_name: text-image-to-text
|
| 54 |
+
train_optional_args: []
|
| 55 |
+
train_split: train
|
| 56 |
+
train_template: MM_TI2T_LLAVA
|
| 57 |
+
logger_cfgs:
|
| 58 |
+
value:
|
| 59 |
+
log_project: align-anything
|
| 60 |
+
log_run_name: sft
|
| 61 |
+
log_type: wandb
|
| 62 |
+
output_dir: ../outputs/test_7B
|
| 63 |
+
save_total_limit: 6
|
| 64 |
+
model_cfgs:
|
| 65 |
+
value:
|
| 66 |
+
model_max_length: 2048
|
| 67 |
+
model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf
|
| 68 |
+
trust_remote_code: true
|
| 69 |
+
train_cfgs:
|
| 70 |
+
value:
|
| 71 |
+
adam_betas:
|
| 72 |
+
- 0.9
|
| 73 |
+
- 0.95
|
| 74 |
+
adam_epsilon: 1e-08
|
| 75 |
+
bf16: true
|
| 76 |
+
ds_cfgs: ds_z3_config.json
|
| 77 |
+
epochs: 3
|
| 78 |
+
eval_interval: 10
|
| 79 |
+
eval_strategy: epoch
|
| 80 |
+
fp16: false
|
| 81 |
+
freeze_language_model: false
|
| 82 |
+
freeze_mm_proj: false
|
| 83 |
+
freeze_vision_tower: true
|
| 84 |
+
gradient_accumulation_steps: 16
|
| 85 |
+
gradient_checkpointing: true
|
| 86 |
+
learning_rate: 2e-05
|
| 87 |
+
load_checkpoint: false
|
| 88 |
+
lr_scheduler_type: cosine
|
| 89 |
+
lr_warmup_ratio: 0.03
|
| 90 |
+
max_grad_norm: 1
|
| 91 |
+
per_device_eval_batch_size: 1
|
| 92 |
+
per_device_train_batch_size: 1
|
| 93 |
+
save_checkpoint: false
|
| 94 |
+
seed: 42
|
| 95 |
+
weight_decay: 0
|
run-20250329_005541-bq1jaffa/files/output.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00<?, ?it/s]
|
| 3 |
+
Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
|
| 4 |
+
<enumerate object at 0x154ff4f2b6f0>
|
| 5 |
+
Check if empty: False
|
| 6 |
+
First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
|
| 7 |
+
Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
|
| 8 |
+
<enumerate object at 0x155005bebd80>
|
| 9 |
+
Check if empty: False
|
| 10 |
+
First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
|
| 11 |
+
Train dataloader: <torch.utils.data.dataloader.DataLoader object at 0x155108341d10>
|
| 12 |
+
<enumerate object at 0x154ff4f75080>
|
| 13 |
+
Check if empty: False
|
| 14 |
+
First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407}
|
| 15 |
+
Saving model to "../outputs/test_7B/slice_end" ...
|
| 16 |
+
Saving 16-bit model...
|
| 17 |
+
[2025-03-29 00:55:50,235] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved!
|
| 18 |
+
[2025-03-29 00:55:50,236] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0
|
| 19 |
+
[2025-03-29 00:55:50,236] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin...
|
| 20 |
+
[2025-03-29 00:56:05,543] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin.
|
| 21 |
+
[2025-03-29 00:56:05,544] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now!
|
| 22 |
+
Model saved!
|
run-20250329_005541-bq1jaffa/files/requirements.txt
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
maskrcnn_benchmark==0.0.0
|
| 2 |
+
webdataset==0.2.111
|
| 3 |
+
websockets==15.0.1
|
| 4 |
+
typer==0.15.2
|
| 5 |
+
blobfile==3.0.0
|
| 6 |
+
pooch==1.8.2
|
| 7 |
+
python-dateutil==2.9.0.post0
|
| 8 |
+
gmpy2==2.2.1
|
| 9 |
+
httpcore==1.0.7
|
| 10 |
+
charset-normalizer==3.3.2
|
| 11 |
+
torchlibrosa==0.1.0
|
| 12 |
+
multiprocess==0.70.16
|
| 13 |
+
Werkzeug==3.1.3
|
| 14 |
+
aiofiles==23.2.1
|
| 15 |
+
six==1.17.0
|
| 16 |
+
typing_extensions==4.12.2
|
| 17 |
+
psutil==7.0.0
|
| 18 |
+
frozenlist==1.5.0
|
| 19 |
+
einops==0.8.1
|
| 20 |
+
flash_attn==2.7.4.post1
|
| 21 |
+
PySocks==1.7.1
|
| 22 |
+
regex==2024.11.6
|
| 23 |
+
markdown-it-py==3.0.0
|
| 24 |
+
ruff==0.11.2
|
| 25 |
+
docker-pycreds==0.4.0
|
| 26 |
+
protobuf==5.29.4
|
| 27 |
+
resampy==0.4.3
|
| 28 |
+
aiohappyeyeballs==2.6.1
|
| 29 |
+
httpx==0.28.1
|
| 30 |
+
encodec==0.1.1
|
| 31 |
+
ffmpy==0.5.0
|
| 32 |
+
mkl_random==1.2.8
|
| 33 |
+
soxr==0.5.0.post1
|
| 34 |
+
absl-py==2.2.1
|
| 35 |
+
networkx==3.4.2
|
| 36 |
+
h5py==3.13.0
|
| 37 |
+
hjson==3.1.0
|
| 38 |
+
tensorboard==2.19.0
|
| 39 |
+
aiosignal==1.3.2
|
| 40 |
+
pip==25.0
|
| 41 |
+
triton==3.1.0
|
| 42 |
+
zipp==3.21.0
|
| 43 |
+
ftfy==6.3.1
|
| 44 |
+
attrs==25.3.0
|
| 45 |
+
requests==2.32.3
|
| 46 |
+
progressbar==2.5
|
| 47 |
+
sniffio==1.3.1
|
| 48 |
+
lxml==5.3.1
|
| 49 |
+
starlette==0.46.1
|
| 50 |
+
Markdown==3.7
|
| 51 |
+
mdurl==0.1.2
|
| 52 |
+
torchaudio==2.5.1
|
| 53 |
+
safetensors==0.5.3
|
| 54 |
+
opencv-python==4.6.0.66
|
| 55 |
+
torchvision==0.20.1
|
| 56 |
+
shellingham==1.5.4
|
| 57 |
+
gradio==5.23.1
|
| 58 |
+
timm==1.0.15
|
| 59 |
+
multidict==6.2.0
|
| 60 |
+
semantic-version==2.10.0
|
| 61 |
+
numba==0.60.0
|
| 62 |
+
gradio_client==1.8.0
|
| 63 |
+
pydantic_core==2.33.0
|
| 64 |
+
dill==0.3.8
|
| 65 |
+
msgpack==1.1.0
|
| 66 |
+
sentry-sdk==2.24.1
|
| 67 |
+
grpcio==1.71.0
|
| 68 |
+
cffi==1.17.1
|
| 69 |
+
PyYAML==6.0.2
|
| 70 |
+
tensorboard-data-server==0.7.2
|
| 71 |
+
fastapi==0.115.12
|
| 72 |
+
lazy_loader==0.4
|
| 73 |
+
mkl_fft==1.3.11
|
| 74 |
+
annotated-types==0.7.0
|
| 75 |
+
scikit-learn==1.6.1
|
| 76 |
+
wget==3.2
|
| 77 |
+
setuptools==75.8.0
|
| 78 |
+
certifi==2025.1.31
|
| 79 |
+
click==8.1.8
|
| 80 |
+
laion_clap==1.1.5
|
| 81 |
+
Pygments==2.19.1
|
| 82 |
+
tomlkit==0.13.2
|
| 83 |
+
idna==3.7
|
| 84 |
+
propcache==0.3.1
|
| 85 |
+
platformdirs==4.3.7
|
| 86 |
+
align-anything==0.0.1.dev0
|
| 87 |
+
deepspeed==0.16.5
|
| 88 |
+
smmap==5.0.2
|
| 89 |
+
pillow==11.1.0
|
| 90 |
+
typing-inspection==0.4.0
|
| 91 |
+
braceexpand==0.1.7
|
| 92 |
+
decorator==5.2.1
|
| 93 |
+
pandas==2.2.3
|
| 94 |
+
huggingface-hub==0.29.3
|
| 95 |
+
pyarrow==19.0.1
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
GitPython==3.1.44
|
| 98 |
+
xxhash==3.5.0
|
| 99 |
+
packaging==24.2
|
| 100 |
+
numpy==1.23.4
|
| 101 |
+
setproctitle==1.3.5
|
| 102 |
+
llvmlite==0.43.0
|
| 103 |
+
tiktoken==0.9.0
|
| 104 |
+
mpmath==1.3.0
|
| 105 |
+
nvidia-ml-py==12.570.86
|
| 106 |
+
pydantic==2.11.0
|
| 107 |
+
datasets==3.5.0
|
| 108 |
+
librosa==0.11.0
|
| 109 |
+
frechet_audio_distance==0.3.1
|
| 110 |
+
sympy==1.13.1
|
| 111 |
+
safehttpx==0.1.6
|
| 112 |
+
Jinja2==3.1.6
|
| 113 |
+
h11==0.14.0
|
| 114 |
+
aiohttp==3.11.14
|
| 115 |
+
diffusers==0.32.2
|
| 116 |
+
tqdm==4.67.1
|
| 117 |
+
filelock==3.13.1
|
| 118 |
+
transformers==4.51.0.dev0
|
| 119 |
+
scipy==1.10.1
|
| 120 |
+
audioread==3.0.1
|
| 121 |
+
sentencepiece==0.2.0
|
| 122 |
+
pytz==2025.2
|
| 123 |
+
tzdata==2025.2
|
| 124 |
+
python-multipart==0.0.20
|
| 125 |
+
urllib3==2.3.0
|
| 126 |
+
pycryptodomex==3.22.0
|
| 127 |
+
yarl==1.18.3
|
| 128 |
+
pydub==0.25.1
|
| 129 |
+
pycparser==2.22
|
| 130 |
+
soundfile==0.13.1
|
| 131 |
+
wcwidth==0.2.13
|
| 132 |
+
groovy==0.1.2
|
| 133 |
+
torch==2.5.1
|
| 134 |
+
anyio==4.9.0
|
| 135 |
+
wandb==0.19.8
|
| 136 |
+
joblib==1.4.2
|
| 137 |
+
fsspec==2024.12.0
|
| 138 |
+
peft==0.15.1
|
| 139 |
+
accelerate==1.5.2
|
| 140 |
+
py-cpuinfo==9.0.0
|
| 141 |
+
uvicorn==0.34.0
|
| 142 |
+
orjson==3.10.16
|
| 143 |
+
Brotli==1.0.9
|
| 144 |
+
rich==13.9.4
|
| 145 |
+
importlib_metadata==8.6.1
|
| 146 |
+
ninja==1.11.1.4
|
| 147 |
+
wheel==0.45.1
|
| 148 |
+
MarkupSafe==3.0.2
|
| 149 |
+
threadpoolctl==3.6.0
|
| 150 |
+
gitdb==4.0.12
|
| 151 |
+
mkl-service==2.4.0
|
| 152 |
+
typing_extensions==4.12.2
|
| 153 |
+
tomli==2.0.1
|
| 154 |
+
zipp==3.19.2
|
| 155 |
+
wheel==0.43.0
|
| 156 |
+
jaraco.text==3.12.1
|
| 157 |
+
packaging==24.2
|
| 158 |
+
autocommand==2.2.2
|
| 159 |
+
jaraco.functools==4.0.1
|
| 160 |
+
jaraco.collections==5.1.0
|
| 161 |
+
platformdirs==4.2.2
|
| 162 |
+
more-itertools==10.3.0
|
| 163 |
+
inflect==7.3.1
|
| 164 |
+
jaraco.context==5.3.0
|
| 165 |
+
typeguard==4.3.0
|
| 166 |
+
backports.tarfile==1.2.0
|
| 167 |
+
importlib_metadata==8.0.0
|
run-20250329_005541-bq1jaffa/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.11.11",
|
| 4 |
+
"startedAt": "2025-03-28T16:55:41.711696Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--local_rank=0",
|
| 7 |
+
"--model_name_or_path",
|
| 8 |
+
"/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf",
|
| 9 |
+
"--train_datasets",
|
| 10 |
+
"/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10",
|
| 11 |
+
"--train_template",
|
| 12 |
+
"MM_TI2T_LLAVA",
|
| 13 |
+
"--train_split",
|
| 14 |
+
"train",
|
| 15 |
+
"--train_name",
|
| 16 |
+
"text-image-to-text",
|
| 17 |
+
"--output_dir",
|
| 18 |
+
"../outputs/test_7B",
|
| 19 |
+
"--save_total_limit",
|
| 20 |
+
"6",
|
| 21 |
+
"--train_batch_size",
|
| 22 |
+
"8",
|
| 23 |
+
"--epochs",
|
| 24 |
+
"3"
|
| 25 |
+
],
|
| 26 |
+
"program": "-m align_anything.trainers.text_image_to_text.sft",
|
| 27 |
+
"git": {
|
| 28 |
+
"remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
|
| 29 |
+
"commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
|
| 30 |
+
},
|
| 31 |
+
"email": "2200017789@stu.pku.edu.cn",
|
| 32 |
+
"root": "../outputs/test_7B",
|
| 33 |
+
"host": "dgx-092",
|
| 34 |
+
"executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
|
| 35 |
+
"cpu_count": 112,
|
| 36 |
+
"cpu_count_logical": 224,
|
| 37 |
+
"gpu": "NVIDIA H800",
|
| 38 |
+
"gpu_count": 8,
|
| 39 |
+
"disk": {
|
| 40 |
+
"/": {
|
| 41 |
+
"total": "1888556142592",
|
| 42 |
+
"used": "148609179648"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"memory": {
|
| 46 |
+
"total": "2164195454976"
|
| 47 |
+
},
|
| 48 |
+
"cpu": {
|
| 49 |
+
"count": 112,
|
| 50 |
+
"countLogical": 224
|
| 51 |
+
},
|
| 52 |
+
"gpu_nvidia": [
|
| 53 |
+
{
|
| 54 |
+
"name": "NVIDIA H800",
|
| 55 |
+
"memoryTotal": "85520809984",
|
| 56 |
+
"cudaCores": 16896,
|
| 57 |
+
"architecture": "Hopper"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA H800",
|
| 61 |
+
"memoryTotal": "85520809984",
|
| 62 |
+
"cudaCores": 16896,
|
| 63 |
+
"architecture": "Hopper"
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "NVIDIA H800",
|
| 67 |
+
"memoryTotal": "85520809984",
|
| 68 |
+
"cudaCores": 16896,
|
| 69 |
+
"architecture": "Hopper"
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"name": "NVIDIA H800",
|
| 73 |
+
"memoryTotal": "85520809984",
|
| 74 |
+
"cudaCores": 16896,
|
| 75 |
+
"architecture": "Hopper"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "NVIDIA H800",
|
| 79 |
+
"memoryTotal": "85520809984",
|
| 80 |
+
"cudaCores": 16896,
|
| 81 |
+
"architecture": "Hopper"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"name": "NVIDIA H800",
|
| 85 |
+
"memoryTotal": "85520809984",
|
| 86 |
+
"cudaCores": 16896,
|
| 87 |
+
"architecture": "Hopper"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"name": "NVIDIA H800",
|
| 91 |
+
"memoryTotal": "85520809984",
|
| 92 |
+
"cudaCores": 16896,
|
| 93 |
+
"architecture": "Hopper"
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"name": "NVIDIA H800",
|
| 97 |
+
"memoryTotal": "85520809984",
|
| 98 |
+
"cudaCores": 16896,
|
| 99 |
+
"architecture": "Hopper"
|
| 100 |
+
}
|
| 101 |
+
],
|
| 102 |
+
"slurm": {
|
| 103 |
+
"conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
|
| 104 |
+
},
|
| 105 |
+
"cudaVersion": "12.2"
|
| 106 |
+
}
|
run-20250329_005541-bq1jaffa/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_wandb":{"runtime":24}}
|
run-20250329_005541-bq1jaffa/logs/debug-core.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:55:41.128572776+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpaadyf_35/port-52806.txt","pid":52806,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
|
| 2 |
+
{"time":"2025-03-29T00:55:41.129538675+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":52806}
|
| 3 |
+
{"time":"2025-03-29T00:55:41.129533087+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38349,"Zone":""}}
|
| 4 |
+
{"time":"2025-03-29T00:55:41.310059156+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46506"}
|
| 5 |
+
{"time":"2025-03-29T00:55:41.713050692+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
|
| 6 |
+
{"time":"2025-03-29T00:55:41.928489719+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
|
| 7 |
+
{"time":"2025-03-29T00:56:07.1453019+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
|
| 8 |
+
{"time":"2025-03-29T00:56:07.146579952+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"bq1jaffa","id":"127.0.0.1:46506"}
|
| 9 |
+
{"time":"2025-03-29T00:56:08.145760813+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:46506"}
|
| 10 |
+
{"time":"2025-03-29T00:56:08.145785955+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:46506"}
|
| 11 |
+
{"time":"2025-03-29T00:56:08.14579164+08:00","level":"INFO","msg":"server is shutting down"}
|
| 12 |
+
{"time":"2025-03-29T00:56:08.145814767+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:46506"}
|
| 13 |
+
{"time":"2025-03-29T00:56:08.145848309+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:46506"}
|
| 14 |
+
{"time":"2025-03-29T00:56:08.145850825+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:46506"}
|
| 15 |
+
{"time":"2025-03-29T00:56:08.145853752+08:00","level":"INFO","msg":"server is closed"}
|
run-20250329_005541-bq1jaffa/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-03-29T00:55:41.714661009+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-03-29T00:55:41.928417193+08:00","level":"INFO","msg":"created new stream","id":"bq1jaffa"}
|
| 3 |
+
{"time":"2025-03-29T00:55:41.928482583+08:00","level":"INFO","msg":"stream: started","id":"bq1jaffa"}
|
| 4 |
+
{"time":"2025-03-29T00:55:41.928501227+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq1jaffa"}
|
| 5 |
+
{"time":"2025-03-29T00:55:41.928513756+08:00","level":"INFO","msg":"sender: started","stream_id":"bq1jaffa"}
|
| 6 |
+
{"time":"2025-03-29T00:55:41.928511582+08:00","level":"INFO","msg":"handler: started","stream_id":"bq1jaffa"}
|
| 7 |
+
{"time":"2025-03-29T00:55:42.22838417+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-03-29T00:56:06.166942314+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 9 |
+
{"time":"2025-03-29T00:56:06.16760159+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 10 |
+
{"time":"2025-03-29T00:56:06.921670341+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 11 |
+
{"time":"2025-03-29T00:56:07.143514035+08:00","level":"INFO","msg":"handler: operation stats","stats":{}}
|
| 12 |
+
{"time":"2025-03-29T00:56:07.145585911+08:00","level":"INFO","msg":"stream: closing","id":"bq1jaffa"}
|
| 13 |
+
{"time":"2025-03-29T00:56:07.145626123+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq1jaffa"}
|
| 14 |
+
{"time":"2025-03-29T00:56:07.145635477+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq1jaffa"}
|
| 15 |
+
{"time":"2025-03-29T00:56:07.145639618+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq1jaffa"}
|
| 16 |
+
{"time":"2025-03-29T00:56:07.146568555+08:00","level":"INFO","msg":"stream: closed","id":"bq1jaffa"}
|
run-20250329_005541-bq1jaffa/logs/debug.log
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-03-29 00:55:41,708 INFO MainThread:52806 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
|
| 2 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Configure stats pid to 52806
|
| 3 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
|
| 4 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
|
| 5 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from environment variables
|
| 6 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug.log
|
| 7 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-internal.log
|
| 8 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():761] calling init triggers
|
| 9 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
|
| 11 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():784] starting backend
|
| 12 |
+
2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():788] sending inform_init request
|
| 13 |
+
2025-03-29 00:55:41,711 INFO MainThread:52806 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
|
| 14 |
+
2025-03-29 00:55:41,711 INFO MainThread:52806 [wandb_init.py:init():798] backend started and connected
|
| 15 |
+
2025-03-29 00:55:41,712 INFO MainThread:52806 [wandb_init.py:init():891] updated telemetry
|
| 16 |
+
2025-03-29 00:55:41,722 INFO MainThread:52806 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
|
| 17 |
+
2025-03-29 00:55:42,226 INFO MainThread:52806 [wandb_init.py:init():990] starting run threads in backend
|
| 18 |
+
2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_console_start():2375] atexit reg
|
| 19 |
+
2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2227] redirect: wrap_raw
|
| 20 |
+
2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2292] Wrapping output streams.
|
| 21 |
+
2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2315] Redirects installed.
|
| 22 |
+
2025-03-29 00:55:42,380 INFO MainThread:52806 [wandb_init.py:init():1032] run started, returning control to user process
|
| 23 |
+
2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/bq1jaffa
|
| 24 |
+
2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
|
| 25 |
+
2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2322] restore
|
| 26 |
+
2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2328] restore done
|
| 27 |
+
2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3956] rendering history
|
| 28 |
+
2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3988] rendering summary
|
| 29 |
+
2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_sync_info():3917] logging synced files
|
run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb
ADDED
|
Binary file (15.4 kB). View file
|
|
|