htlou commited on
Commit
e2fb6fa
·
verified ·
1 Parent(s): 0849b40

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  run-20250329_010934-3x35hjks/run-3x35hjks.wandb filter=lfs diff=lfs merge=lfs -text
37
  run-20250329_012205-co1ecmky/run-co1ecmky.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  run-20250329_010934-3x35hjks/run-3x35hjks.wandb filter=lfs diff=lfs merge=lfs -text
37
  run-20250329_012205-co1ecmky/run-co1ecmky.wandb filter=lfs diff=lfs merge=lfs -text
38
+ offline-run-20250328_203038-ni3gydc1/run-ni3gydc1.wandb filter=lfs diff=lfs merge=lfs -text
39
+ offline-run-20250328_203844-gc2qytpj/run-gc2qytpj.wandb filter=lfs diff=lfs merge=lfs -text
debug-internal.log CHANGED
@@ -1,7 +1,7 @@
1
- {"time":"2025-03-29T01:22:05.252520599+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-core.log"}
2
- {"time":"2025-03-29T01:22:05.467254306+08:00","level":"INFO","msg":"created new stream","id":"co1ecmky"}
3
- {"time":"2025-03-29T01:22:05.467309548+08:00","level":"INFO","msg":"stream: started","id":"co1ecmky"}
4
- {"time":"2025-03-29T01:22:05.467331707+08:00","level":"INFO","msg":"handler: started","stream_id":"co1ecmky"}
5
- {"time":"2025-03-29T01:22:05.467333162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"co1ecmky"}
6
- {"time":"2025-03-29T01:22:05.467336174+08:00","level":"INFO","msg":"sender: started","stream_id":"co1ecmky"}
7
- {"time":"2025-03-29T01:22:05.772490021+08:00","level":"INFO","msg":"Starting system monitor"}
 
1
+ {"time":"2025-03-28T20:38:44.692258573+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug-core.log"}
2
+ {"time":"2025-03-28T20:38:44.90653769+08:00","level":"INFO","msg":"created new stream","id":"gc2qytpj"}
3
+ {"time":"2025-03-28T20:38:44.90660149+08:00","level":"INFO","msg":"stream: started","id":"gc2qytpj"}
4
+ {"time":"2025-03-28T20:38:44.906615038+08:00","level":"INFO","msg":"handler: started","stream_id":"gc2qytpj"}
5
+ {"time":"2025-03-28T20:38:44.906633311+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"gc2qytpj"}
6
+ {"time":"2025-03-28T20:38:44.90663431+08:00","level":"INFO","msg":"sender: started","stream_id":"gc2qytpj"}
7
+ {"time":"2025-03-28T20:38:44.910439284+08:00","level":"INFO","msg":"Starting system monitor"}
debug.log CHANGED
@@ -1,24 +1,26 @@
1
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Configure stats pid to 104999
3
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug.log
7
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-internal.log
8
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():761] calling init triggers
9
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
- config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():784] starting backend
12
- 2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():788] sending inform_init request
13
- 2025-03-29 01:22:05,249 INFO MainThread:104999 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
- 2025-03-29 01:22:05,249 INFO MainThread:104999 [wandb_init.py:init():798] backend started and connected
15
- 2025-03-29 01:22:05,251 INFO MainThread:104999 [wandb_init.py:init():891] updated telemetry
16
- 2025-03-29 01:22:05,262 INFO MainThread:104999 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
- 2025-03-29 01:22:05,770 INFO MainThread:104999 [wandb_init.py:init():990] starting run threads in backend
18
- 2025-03-29 01:22:05,989 INFO MainThread:104999 [wandb_run.py:_console_start():2375] atexit reg
19
- 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
- 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
- 2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2315] Redirects installed.
22
- 2025-03-29 01:22:05,992 INFO MainThread:104999 [wandb_init.py:init():1032] run started, returning control to user process
23
- 2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/co1ecmky
24
- 2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
 
 
 
1
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Configure stats pid to 3871311
3
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug.log
7
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug-internal.log
8
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 1, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/hantao/datasets/AA_preference_vicuna-13b_cosi_cut/merged/top1-10', 'train_template': 'AA_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '../outputs/test_13B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-vicuna-13b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():784] starting backend
12
+ 2025-03-28 20:38:44,657 INFO MainThread:3871311 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-28 20:38:44,660 INFO MainThread:3871311 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-28 20:38:44,660 INFO MainThread:3871311 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-28 20:38:44,662 INFO MainThread:3871311 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-28 20:38:44,672 INFO MainThread:3871311 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-28 20:38:44,908 INFO MainThread:3871311 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-28 20:38:45,074 INFO MainThread:3871311 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_finish():2112] finishing run align-anything/gc2qytpj
24
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_restore():2328] restore done
offline-run-20250328_203038-ni3gydc1/files/output.log ADDED
File without changes
offline-run-20250328_203038-ni3gydc1/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
offline-run-20250328_203038-ni3gydc1/files/wandb-metadata.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T12:30:39.058990Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-vicuna-13b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/AA_preference_vicuna-13b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "AA_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_13B",
19
+ "--save_total_limit",
20
+ "3",
21
+ "--train_batch_size",
22
+ "1",
23
+ "--epochs",
24
+ "2"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.dpo",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "root": "../outputs/test_13B",
32
+ "host": "dgx-092",
33
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
34
+ "cpu_count": 112,
35
+ "cpu_count_logical": 224,
36
+ "gpu": "NVIDIA H800",
37
+ "gpu_count": 8,
38
+ "disk": {
39
+ "/": {
40
+ "total": "1888556142592",
41
+ "used": "148592041984"
42
+ }
43
+ },
44
+ "memory": {
45
+ "total": "2164195454976"
46
+ },
47
+ "cpu": {
48
+ "count": 112,
49
+ "countLogical": 224
50
+ },
51
+ "gpu_nvidia": [
52
+ {
53
+ "name": "NVIDIA H800",
54
+ "memoryTotal": "85520809984",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper"
57
+ },
58
+ {
59
+ "name": "NVIDIA H800",
60
+ "memoryTotal": "85520809984",
61
+ "cudaCores": 16896,
62
+ "architecture": "Hopper"
63
+ },
64
+ {
65
+ "name": "NVIDIA H800",
66
+ "memoryTotal": "85520809984",
67
+ "cudaCores": 16896,
68
+ "architecture": "Hopper"
69
+ },
70
+ {
71
+ "name": "NVIDIA H800",
72
+ "memoryTotal": "85520809984",
73
+ "cudaCores": 16896,
74
+ "architecture": "Hopper"
75
+ },
76
+ {
77
+ "name": "NVIDIA H800",
78
+ "memoryTotal": "85520809984",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper"
81
+ },
82
+ {
83
+ "name": "NVIDIA H800",
84
+ "memoryTotal": "85520809984",
85
+ "cudaCores": 16896,
86
+ "architecture": "Hopper"
87
+ },
88
+ {
89
+ "name": "NVIDIA H800",
90
+ "memoryTotal": "85520809984",
91
+ "cudaCores": 16896,
92
+ "architecture": "Hopper"
93
+ },
94
+ {
95
+ "name": "NVIDIA H800",
96
+ "memoryTotal": "85520809984",
97
+ "cudaCores": 16896,
98
+ "architecture": "Hopper"
99
+ }
100
+ ],
101
+ "slurm": {
102
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
103
+ },
104
+ "cudaVersion": "12.2"
105
+ }
offline-run-20250328_203038-ni3gydc1/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-28T20:30:38.872129247+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpmsjq2qrj/port-3856880.txt","pid":3856880,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-28T20:30:38.873015217+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3856880}
3
+ {"time":"2025-03-28T20:30:38.873005488+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44091,"Zone":""}}
4
+ {"time":"2025-03-28T20:30:39.052962944+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34304"}
5
+ {"time":"2025-03-28T20:30:39.059989776+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"ni3gydc1","id":"127.0.0.1:34304"}
6
+ {"time":"2025-03-28T20:30:39.456558723+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"ni3gydc1","id":"127.0.0.1:34304"}
7
+ {"time":"2025-03-28T20:36:33.911468496+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
offline-run-20250328_203038-ni3gydc1/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-28T20:30:39.242295984+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_13B/wandb/offline-run-20250328_203038-ni3gydc1/logs/debug-core.log"}
2
+ {"time":"2025-03-28T20:30:39.456487825+08:00","level":"INFO","msg":"created new stream","id":"ni3gydc1"}
3
+ {"time":"2025-03-28T20:30:39.456549716+08:00","level":"INFO","msg":"stream: started","id":"ni3gydc1"}
4
+ {"time":"2025-03-28T20:30:39.456572806+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ni3gydc1"}
5
+ {"time":"2025-03-28T20:30:39.456577964+08:00","level":"INFO","msg":"handler: started","stream_id":"ni3gydc1"}
6
+ {"time":"2025-03-28T20:30:39.456615028+08:00","level":"INFO","msg":"sender: started","stream_id":"ni3gydc1"}
7
+ {"time":"2025-03-28T20:30:39.462148507+08:00","level":"INFO","msg":"Starting system monitor"}
offline-run-20250328_203038-ni3gydc1/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-28 20:30:38,838 INFO MainThread:3856880 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_setup.py:_flush():67] Configure stats pid to 3856880
3
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_13B/wandb/offline-run-20250328_203038-ni3gydc1/logs/debug.log
7
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_13B/wandb/offline-run-20250328_203038-ni3gydc1/logs/debug-internal.log
8
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 2, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 1, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/hantao/datasets/AA_preference_vicuna-13b_cosi_cut/merged/top1-10', 'train_template': 'AA_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '../outputs/test_13B', 'cache_dir': {}, 'save_total_limit': 3}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-vicuna-13b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-28 20:30:38,839 INFO MainThread:3856880 [wandb_init.py:init():784] starting backend
12
+ 2025-03-28 20:30:39,053 INFO MainThread:3856880 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-28 20:30:39,058 INFO MainThread:3856880 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-28 20:30:39,058 INFO MainThread:3856880 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-28 20:30:39,060 INFO MainThread:3856880 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-28 20:30:39,069 INFO MainThread:3856880 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-28 20:30:39,458 INFO MainThread:3856880 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-28 20:30:39,624 INFO MainThread:3856880 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-28 20:30:39,624 INFO MainThread:3856880 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-28 20:30:39,624 INFO MainThread:3856880 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-28 20:30:39,624 INFO MainThread:3856880 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-28 20:30:39,627 INFO MainThread:3856880 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-28 20:36:33,901 INFO MainThread:3856880 [wandb_run.py:_finish():2112] finishing run align-anything/ni3gydc1
24
+ 2025-03-28 20:36:33,902 INFO MainThread:3856880 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-28 20:36:33,902 INFO MainThread:3856880 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-28 20:36:33,902 INFO MainThread:3856880 [wandb_run.py:_restore():2328] restore done
offline-run-20250328_203038-ni3gydc1/run-ni3gydc1.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7551a30274cb4766f54360f45c34e659bd36728d49e2f3ed7d96d44362f0c1ca
3
+ size 229376
offline-run-20250328_203844-gc2qytpj/files/output.log ADDED
File without changes
offline-run-20250328_203844-gc2qytpj/files/requirements.txt ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ maskrcnn_benchmark==0.0.0
2
+ webdataset==0.2.111
3
+ websockets==15.0.1
4
+ typer==0.15.2
5
+ blobfile==3.0.0
6
+ pooch==1.8.2
7
+ python-dateutil==2.9.0.post0
8
+ gmpy2==2.2.1
9
+ httpcore==1.0.7
10
+ charset-normalizer==3.3.2
11
+ torchlibrosa==0.1.0
12
+ multiprocess==0.70.16
13
+ Werkzeug==3.1.3
14
+ aiofiles==23.2.1
15
+ six==1.17.0
16
+ typing_extensions==4.12.2
17
+ psutil==7.0.0
18
+ frozenlist==1.5.0
19
+ einops==0.8.1
20
+ flash_attn==2.7.4.post1
21
+ PySocks==1.7.1
22
+ regex==2024.11.6
23
+ markdown-it-py==3.0.0
24
+ ruff==0.11.2
25
+ docker-pycreds==0.4.0
26
+ protobuf==5.29.4
27
+ resampy==0.4.3
28
+ aiohappyeyeballs==2.6.1
29
+ httpx==0.28.1
30
+ encodec==0.1.1
31
+ ffmpy==0.5.0
32
+ mkl_random==1.2.8
33
+ soxr==0.5.0.post1
34
+ absl-py==2.2.1
35
+ networkx==3.4.2
36
+ h5py==3.13.0
37
+ hjson==3.1.0
38
+ tensorboard==2.19.0
39
+ aiosignal==1.3.2
40
+ pip==25.0
41
+ triton==3.1.0
42
+ zipp==3.21.0
43
+ ftfy==6.3.1
44
+ attrs==25.3.0
45
+ requests==2.32.3
46
+ progressbar==2.5
47
+ sniffio==1.3.1
48
+ lxml==5.3.1
49
+ starlette==0.46.1
50
+ Markdown==3.7
51
+ mdurl==0.1.2
52
+ torchaudio==2.5.1
53
+ safetensors==0.5.3
54
+ opencv-python==4.6.0.66
55
+ torchvision==0.20.1
56
+ shellingham==1.5.4
57
+ gradio==5.23.1
58
+ timm==1.0.15
59
+ multidict==6.2.0
60
+ semantic-version==2.10.0
61
+ numba==0.60.0
62
+ gradio_client==1.8.0
63
+ pydantic_core==2.33.0
64
+ dill==0.3.8
65
+ msgpack==1.1.0
66
+ sentry-sdk==2.24.1
67
+ grpcio==1.71.0
68
+ cffi==1.17.1
69
+ PyYAML==6.0.2
70
+ tensorboard-data-server==0.7.2
71
+ fastapi==0.115.12
72
+ lazy_loader==0.4
73
+ mkl_fft==1.3.11
74
+ annotated-types==0.7.0
75
+ scikit-learn==1.6.1
76
+ wget==3.2
77
+ setuptools==75.8.0
78
+ certifi==2025.1.31
79
+ click==8.1.8
80
+ laion_clap==1.1.5
81
+ Pygments==2.19.1
82
+ tomlkit==0.13.2
83
+ idna==3.7
84
+ propcache==0.3.1
85
+ platformdirs==4.3.7
86
+ align-anything==0.0.1.dev0
87
+ deepspeed==0.16.5
88
+ smmap==5.0.2
89
+ pillow==11.1.0
90
+ typing-inspection==0.4.0
91
+ braceexpand==0.1.7
92
+ decorator==5.2.1
93
+ pandas==2.2.3
94
+ huggingface-hub==0.29.3
95
+ pyarrow==19.0.1
96
+ tokenizers==0.21.1
97
+ GitPython==3.1.44
98
+ xxhash==3.5.0
99
+ packaging==24.2
100
+ numpy==1.23.4
101
+ setproctitle==1.3.5
102
+ llvmlite==0.43.0
103
+ tiktoken==0.9.0
104
+ mpmath==1.3.0
105
+ nvidia-ml-py==12.570.86
106
+ pydantic==2.11.0
107
+ datasets==3.5.0
108
+ librosa==0.11.0
109
+ frechet_audio_distance==0.3.1
110
+ sympy==1.13.1
111
+ safehttpx==0.1.6
112
+ Jinja2==3.1.6
113
+ h11==0.14.0
114
+ aiohttp==3.11.14
115
+ diffusers==0.32.2
116
+ tqdm==4.67.1
117
+ filelock==3.13.1
118
+ transformers==4.51.0.dev0
119
+ scipy==1.10.1
120
+ audioread==3.0.1
121
+ sentencepiece==0.2.0
122
+ pytz==2025.2
123
+ tzdata==2025.2
124
+ python-multipart==0.0.20
125
+ urllib3==2.3.0
126
+ pycryptodomex==3.22.0
127
+ yarl==1.18.3
128
+ pydub==0.25.1
129
+ pycparser==2.22
130
+ soundfile==0.13.1
131
+ wcwidth==0.2.13
132
+ groovy==0.1.2
133
+ torch==2.5.1
134
+ anyio==4.9.0
135
+ wandb==0.19.8
136
+ joblib==1.4.2
137
+ fsspec==2024.12.0
138
+ peft==0.15.1
139
+ accelerate==1.5.2
140
+ py-cpuinfo==9.0.0
141
+ uvicorn==0.34.0
142
+ orjson==3.10.16
143
+ Brotli==1.0.9
144
+ rich==13.9.4
145
+ importlib_metadata==8.6.1
146
+ ninja==1.11.1.4
147
+ wheel==0.45.1
148
+ MarkupSafe==3.0.2
149
+ threadpoolctl==3.6.0
150
+ gitdb==4.0.12
151
+ mkl-service==2.4.0
152
+ typing_extensions==4.12.2
153
+ tomli==2.0.1
154
+ zipp==3.19.2
155
+ wheel==0.43.0
156
+ jaraco.text==3.12.1
157
+ packaging==24.2
158
+ autocommand==2.2.2
159
+ jaraco.functools==4.0.1
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ more-itertools==10.3.0
163
+ inflect==7.3.1
164
+ jaraco.context==5.3.0
165
+ typeguard==4.3.0
166
+ backports.tarfile==1.2.0
167
+ importlib_metadata==8.0.0
offline-run-20250328_203844-gc2qytpj/files/wandb-metadata.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.11",
4
+ "startedAt": "2025-03-28T12:38:44.660819Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/aifs4su/yaodong/hantao/models/llava-v1.6-vicuna-13b-hf",
9
+ "--train_datasets",
10
+ "/aifs4su/yaodong/hantao/datasets/AA_preference_vicuna-13b_cosi_cut/merged/top1-10",
11
+ "--train_template",
12
+ "AA_TI2T_LLAVA",
13
+ "--train_split",
14
+ "train",
15
+ "--train_name",
16
+ "text-image-to-text",
17
+ "--output_dir",
18
+ "../outputs/test_13B",
19
+ "--save_total_limit",
20
+ "6",
21
+ "--train_batch_size",
22
+ "2",
23
+ "--epochs",
24
+ "3"
25
+ ],
26
+ "program": "-m align_anything.trainers.text_image_to_text.dpo",
27
+ "git": {
28
+ "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git",
29
+ "commit": "106588f9802757a3283c1aff1f33ea9afd737f31"
30
+ },
31
+ "root": "../outputs/test_13B",
32
+ "host": "dgx-092",
33
+ "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python",
34
+ "cpu_count": 112,
35
+ "cpu_count_logical": 224,
36
+ "gpu": "NVIDIA H800",
37
+ "gpu_count": 8,
38
+ "disk": {
39
+ "/": {
40
+ "total": "1888556142592",
41
+ "used": "148592906240"
42
+ }
43
+ },
44
+ "memory": {
45
+ "total": "2164195454976"
46
+ },
47
+ "cpu": {
48
+ "count": 112,
49
+ "countLogical": 224
50
+ },
51
+ "gpu_nvidia": [
52
+ {
53
+ "name": "NVIDIA H800",
54
+ "memoryTotal": "85520809984",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper"
57
+ },
58
+ {
59
+ "name": "NVIDIA H800",
60
+ "memoryTotal": "85520809984",
61
+ "cudaCores": 16896,
62
+ "architecture": "Hopper"
63
+ },
64
+ {
65
+ "name": "NVIDIA H800",
66
+ "memoryTotal": "85520809984",
67
+ "cudaCores": 16896,
68
+ "architecture": "Hopper"
69
+ },
70
+ {
71
+ "name": "NVIDIA H800",
72
+ "memoryTotal": "85520809984",
73
+ "cudaCores": 16896,
74
+ "architecture": "Hopper"
75
+ },
76
+ {
77
+ "name": "NVIDIA H800",
78
+ "memoryTotal": "85520809984",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper"
81
+ },
82
+ {
83
+ "name": "NVIDIA H800",
84
+ "memoryTotal": "85520809984",
85
+ "cudaCores": 16896,
86
+ "architecture": "Hopper"
87
+ },
88
+ {
89
+ "name": "NVIDIA H800",
90
+ "memoryTotal": "85520809984",
91
+ "cudaCores": 16896,
92
+ "architecture": "Hopper"
93
+ },
94
+ {
95
+ "name": "NVIDIA H800",
96
+ "memoryTotal": "85520809984",
97
+ "cudaCores": 16896,
98
+ "architecture": "Hopper"
99
+ }
100
+ ],
101
+ "slurm": {
102
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf"
103
+ },
104
+ "cudaVersion": "12.2"
105
+ }
offline-run-20250328_203844-gc2qytpj/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-28T20:38:44.475652335+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpe73e3keo/port-3871311.txt","pid":3871311,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
2
+ {"time":"2025-03-28T20:38:44.476536664+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3871311}
3
+ {"time":"2025-03-28T20:38:44.476519981+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37967,"Zone":""}}
4
+ {"time":"2025-03-28T20:38:44.657114013+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34698"}
5
+ {"time":"2025-03-28T20:38:44.662194892+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"gc2qytpj","id":"127.0.0.1:34698"}
6
+ {"time":"2025-03-28T20:38:44.906611127+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"gc2qytpj","id":"127.0.0.1:34698"}
7
+ {"time":"2025-03-28T20:47:40.250303335+08:00","level":"INFO","msg":"received shutdown signal","signal":15}
offline-run-20250328_203844-gc2qytpj/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2025-03-28T20:38:44.692258573+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug-core.log"}
2
+ {"time":"2025-03-28T20:38:44.90653769+08:00","level":"INFO","msg":"created new stream","id":"gc2qytpj"}
3
+ {"time":"2025-03-28T20:38:44.90660149+08:00","level":"INFO","msg":"stream: started","id":"gc2qytpj"}
4
+ {"time":"2025-03-28T20:38:44.906615038+08:00","level":"INFO","msg":"handler: started","stream_id":"gc2qytpj"}
5
+ {"time":"2025-03-28T20:38:44.906633311+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"gc2qytpj"}
6
+ {"time":"2025-03-28T20:38:44.90663431+08:00","level":"INFO","msg":"sender: started","stream_id":"gc2qytpj"}
7
+ {"time":"2025-03-28T20:38:44.910439284+08:00","level":"INFO","msg":"Starting system monitor"}
offline-run-20250328_203844-gc2qytpj/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8
2
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Configure stats pid to 3871311
3
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings
4
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings
5
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_setup.py:_flush():67] Loading settings from environment variables
6
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug.log
7
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_13B/wandb/offline-run-20250328_203844-gc2qytpj/logs/debug-internal.log
8
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():761] calling init triggers
9
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():766] wandb.init called with sweep_config: {}
10
+ config: {'train_cfgs': {'save_checkpoint': True, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 1, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False}, 'data_cfgs': {'train_datasets': '/aifs4su/yaodong/hantao/datasets/AA_preference_vicuna-13b_cosi_cut/merged/top1-10', 'train_template': 'AA_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '../outputs/test_13B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-vicuna-13b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}}
11
+ 2025-03-28 20:38:44,447 INFO MainThread:3871311 [wandb_init.py:init():784] starting backend
12
+ 2025-03-28 20:38:44,657 INFO MainThread:3871311 [wandb_init.py:init():788] sending inform_init request
13
+ 2025-03-28 20:38:44,660 INFO MainThread:3871311 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
14
+ 2025-03-28 20:38:44,660 INFO MainThread:3871311 [wandb_init.py:init():798] backend started and connected
15
+ 2025-03-28 20:38:44,662 INFO MainThread:3871311 [wandb_init.py:init():891] updated telemetry
16
+ 2025-03-28 20:38:44,672 INFO MainThread:3871311 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout
17
+ 2025-03-28 20:38:44,908 INFO MainThread:3871311 [wandb_init.py:init():990] starting run threads in backend
18
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_console_start():2375] atexit reg
19
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2227] redirect: wrap_raw
20
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2292] Wrapping output streams.
21
+ 2025-03-28 20:38:45,071 INFO MainThread:3871311 [wandb_run.py:_redirect():2315] Redirects installed.
22
+ 2025-03-28 20:38:45,074 INFO MainThread:3871311 [wandb_init.py:init():1032] run started, returning control to user process
23
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_finish():2112] finishing run align-anything/gc2qytpj
24
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0
25
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_restore():2322] restore
26
+ 2025-03-28 20:47:40,230 INFO MainThread:3871311 [wandb_run.py:_restore():2328] restore done
offline-run-20250328_203844-gc2qytpj/run-gc2qytpj.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aa9b355d934125304017d0d212405953c1aba2e8d426c9e82a0b3097e7e3c89
3
+ size 393216