diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..42484b910be4649b7bce17b43bf27a2edef88220 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +run-20250329_010934-3x35hjks/run-3x35hjks.wandb filter=lfs diff=lfs merge=lfs -text +run-20250329_012205-co1ecmky/run-co1ecmky.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/debug-internal.log b/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..aad9315db59b0e49d398b5f74261c271e6a306a9 --- /dev/null +++ b/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:22:05.252520599+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-core.log"} +{"time":"2025-03-29T01:22:05.467254306+08:00","level":"INFO","msg":"created new stream","id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467309548+08:00","level":"INFO","msg":"stream: started","id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467331707+08:00","level":"INFO","msg":"handler: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467333162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467336174+08:00","level":"INFO","msg":"sender: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.772490021+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/debug.log b/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e339d8f002e195108d107e4808b55b264e5ae300 --- /dev/null +++ b/debug.log @@ -0,0 +1,24 @@ +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Configure stats pid to 104999 +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug.log +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-internal.log +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():761] calling init triggers +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():784] starting backend +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():788] sending inform_init request +2025-03-29 01:22:05,249 INFO MainThread:104999 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 01:22:05,249 INFO MainThread:104999 [wandb_init.py:init():798] backend started and connected +2025-03-29 01:22:05,251 INFO MainThread:104999 [wandb_init.py:init():891] updated telemetry +2025-03-29 01:22:05,262 INFO MainThread:104999 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 01:22:05,770 INFO MainThread:104999 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 01:22:05,989 INFO MainThread:104999 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 01:22:05,992 INFO MainThread:104999 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/co1ecmky +2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 diff --git a/run-20250329_003552-1mlhe6om/files/config.yaml b/run-20250329_003552-1mlhe6om/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a1390676bc738b079cf34450bb4a99d6b1c5f0f --- /dev/null +++ b/run-20250329_003552-1mlhe6om/files/config.yaml @@ -0,0 +1,95 @@ +_wandb: + value: + cli_version: 0.19.8 + m: [] + python_version: 3.11.11 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 23 + - 55 + "4": 3.11.11 + "5": 0.19.8 + "6": 4.51.0.dev0 + "8": + - 5 + "12": 0.19.8 + "13": linux-x86_64 +data_cfgs: + value: + eval_optional_args: [] + load_multi_datasets: false + train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10 + train_name: text-image-to-text + train_optional_args: [] + train_split: train + train_template: MM_TI2T_LLAVA +logger_cfgs: + value: + log_project: align-anything + log_run_name: sft + log_type: wandb + output_dir: ../outputs/test_7B + save_total_limit: 6 +model_cfgs: + value: + model_max_length: 2048 + model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf + trust_remote_code: true +train_cfgs: + value: + adam_betas: + - 0.9 + - 0.95 + adam_epsilon: 1e-08 + bf16: true + ds_cfgs: ds_z3_config.json + epochs: 3 + eval_interval: 10 + eval_strategy: epoch + fp16: false + freeze_language_model: false + freeze_mm_proj: false + freeze_vision_tower: true + gradient_accumulation_steps: 16 + gradient_checkpointing: true + learning_rate: 2e-05 + load_checkpoint: false + lr_scheduler_type: cosine + lr_warmup_ratio: 0.03 + max_grad_norm: 1 + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + save_checkpoint: true + seed: 42 + weight_decay: 0 diff --git a/run-20250329_003552-1mlhe6om/files/output.log b/run-20250329_003552-1mlhe6om/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..014d4bffbab2e465d2e95bbcda08452f4dd46d3c --- /dev/null +++ b/run-20250329_003552-1mlhe6om/files/output.log @@ -0,0 +1,21 @@ +***** Running training ***** +Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in + sys.exit(main()) + ^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main + trainer.save() + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save + self.save_transformers(model=model, tag=tag) + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers + model.save_16bit_model(output_dir, save_filename=save_file_name) + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model + state_dict = self._zero3_consolidated_16bit_state_dict( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict + get_layer_state_dict(self.module, prefix="") + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict + get_layer_state_dict(child, prefix + name + ".") + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict + get_layer_state_dict(child, prefix + name + ".") + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict + get_layer_state_dict(child, prefix + name + ".") + [Previous line repeated 3 more times] + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict + state_dict[key] = param.detach().cpu() + ^^^^^^^^^^^^^^^^^^^^ +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "", line 198, in _run_module_as_main +[rank0]: File "", line 88, in _run_code +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in +[rank0]: sys.exit(main()) +[rank0]: ^^^^^^ +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 96, in main +[rank0]: trainer.save() +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 228, in save +[rank0]: self.save_transformers(model=model, tag=tag) +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/base/supervised_trainer.py", line 435, in save_transformers +[rank0]: model.save_16bit_model(output_dir, save_filename=save_file_name) +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3815, in save_16bit_model +[rank0]: state_dict = self._zero3_consolidated_16bit_state_dict( +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3776, in _zero3_consolidated_16bit_state_dict +[rank0]: get_layer_state_dict(self.module, prefix="") +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict +[rank0]: get_layer_state_dict(child, prefix + name + ".") +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict +[rank0]: get_layer_state_dict(child, prefix + name + ".") +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3769, in get_layer_state_dict +[rank0]: get_layer_state_dict(child, prefix + name + ".") +[rank0]: [Previous line repeated 3 more times] +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 3757, in get_layer_state_dict +[rank0]: state_dict[key] = param.detach().cpu() +[rank0]: ^^^^^^^^^^^^^^^^^^^^ +[rank0]: KeyboardInterrupt diff --git a/run-20250329_004923-vanwhj5e/files/requirements.txt b/run-20250329_004923-vanwhj5e/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_004923-vanwhj5e/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_004923-vanwhj5e/files/wandb-metadata.json b/run-20250329_004923-vanwhj5e/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fc50513f283631e6c5a7103354f3b82945e04374 --- /dev/null +++ b/run-20250329_004923-vanwhj5e/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T16:49:23.693460Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148608499712" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_004923-vanwhj5e/files/wandb-summary.json b/run-20250329_004923-vanwhj5e/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c059977b34bd915d979f049d6de5e7dbe3ee842d --- /dev/null +++ b/run-20250329_004923-vanwhj5e/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":4}} \ No newline at end of file diff --git a/run-20250329_004923-vanwhj5e/logs/debug-core.log b/run-20250329_004923-vanwhj5e/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9e4835ba184cd6305d07730e7704b63006b82b6d --- /dev/null +++ b/run-20250329_004923-vanwhj5e/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T00:49:23.0903988+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1bwuylg2/port-35456.txt","pid":35456,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T00:49:23.091269653+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":35456} +{"time":"2025-03-29T00:49:23.091244341+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":40795,"Zone":""}} +{"time":"2025-03-29T00:49:23.270609752+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:54946"} +{"time":"2025-03-29T00:49:23.694940245+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"vanwhj5e","id":"127.0.0.1:54946"} +{"time":"2025-03-29T00:49:23.912684251+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vanwhj5e","id":"127.0.0.1:54946"} +{"time":"2025-03-29T00:49:27.869501123+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_004923-vanwhj5e/logs/debug-internal.log b/run-20250329_004923-vanwhj5e/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d5e6808748feb6394d7551c613b03536b5be5395 --- /dev/null +++ b/run-20250329_004923-vanwhj5e/logs/debug-internal.log @@ -0,0 +1,9 @@ +{"time":"2025-03-29T00:49:23.696239084+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-core.log"} +{"time":"2025-03-29T00:49:23.912592778+08:00","level":"INFO","msg":"created new stream","id":"vanwhj5e"} +{"time":"2025-03-29T00:49:23.91267373+08:00","level":"INFO","msg":"stream: started","id":"vanwhj5e"} +{"time":"2025-03-29T00:49:23.912692782+08:00","level":"INFO","msg":"handler: started","stream_id":"vanwhj5e"} +{"time":"2025-03-29T00:49:23.912700862+08:00","level":"INFO","msg":"sender: started","stream_id":"vanwhj5e"} +{"time":"2025-03-29T00:49:23.912703724+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"vanwhj5e"} +{"time":"2025-03-29T00:49:24.229784705+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T00:49:27.854433357+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-29T00:49:27.855141086+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/run-20250329_004923-vanwhj5e/logs/debug.log b/run-20250329_004923-vanwhj5e/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e2710208fb13968a784a573f954f13c193f9f950 --- /dev/null +++ b/run-20250329_004923-vanwhj5e/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Configure stats pid to 35456 +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug.log +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_004923-vanwhj5e/logs/debug-internal.log +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():761] calling init triggers +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():784] starting backend +2025-03-29 00:49:23,690 INFO MainThread:35456 [wandb_init.py:init():788] sending inform_init request +2025-03-29 00:49:23,693 INFO MainThread:35456 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 00:49:23,693 INFO MainThread:35456 [wandb_init.py:init():798] backend started and connected +2025-03-29 00:49:23,694 INFO MainThread:35456 [wandb_init.py:init():891] updated telemetry +2025-03-29 00:49:23,705 INFO MainThread:35456 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 00:49:24,227 INFO MainThread:35456 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 00:49:24,382 INFO MainThread:35456 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 00:49:24,384 INFO MainThread:35456 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 00:49:27,852 INFO MainThread:35456 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/vanwhj5e +2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 00:49:27,853 INFO MainThread:35456 [wandb_run.py:_restore():2322] restore +2025-03-29 00:49:27,854 INFO MainThread:35456 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb b/run-20250329_004923-vanwhj5e/run-vanwhj5e.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/run-20250329_005139-6x2eqgtz/files/output.log b/run-20250329_005139-6x2eqgtz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/run-20250329_005139-6x2eqgtz/files/requirements.txt b/run-20250329_005139-6x2eqgtz/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_005139-6x2eqgtz/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_005139-6x2eqgtz/files/wandb-metadata.json b/run-20250329_005139-6x2eqgtz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..045197e7486b8e54077e5090d3b277f1df63f324 --- /dev/null +++ b/run-20250329_005139-6x2eqgtz/files/wandb-metadata.json @@ -0,0 +1,35 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T16:51:39.067886Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python" +} \ No newline at end of file diff --git a/run-20250329_005139-6x2eqgtz/logs/debug-core.log b/run-20250329_005139-6x2eqgtz/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..4f0490ea3e1b1c96a9a2f63e14136d45843329e7 --- /dev/null +++ b/run-20250329_005139-6x2eqgtz/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T00:51:38.479231525+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpb6xx_4px/port-42596.txt","pid":42596,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T00:51:38.480170741+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":42596} +{"time":"2025-03-29T00:51:38.480164974+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":35441,"Zone":""}} +{"time":"2025-03-29T00:51:38.663800746+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:50004"} +{"time":"2025-03-29T00:51:39.069534671+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"6x2eqgtz","id":"127.0.0.1:50004"} +{"time":"2025-03-29T00:51:39.285743333+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"6x2eqgtz","id":"127.0.0.1:50004"} +{"time":"2025-03-29T00:51:40.320100827+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_005139-6x2eqgtz/logs/debug-internal.log b/run-20250329_005139-6x2eqgtz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a9aaac70bc70a8e61cd0a569b843b8ef6e7608ad --- /dev/null +++ b/run-20250329_005139-6x2eqgtz/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2025-03-29T00:51:39.071049577+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-core.log"} +{"time":"2025-03-29T00:51:39.285637637+08:00","level":"INFO","msg":"created new stream","id":"6x2eqgtz"} +{"time":"2025-03-29T00:51:39.285734961+08:00","level":"INFO","msg":"stream: started","id":"6x2eqgtz"} +{"time":"2025-03-29T00:51:39.285766391+08:00","level":"INFO","msg":"handler: started","stream_id":"6x2eqgtz"} +{"time":"2025-03-29T00:51:39.286029925+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"6x2eqgtz"} +{"time":"2025-03-29T00:51:39.285781884+08:00","level":"INFO","msg":"sender: started","stream_id":"6x2eqgtz"} +{"time":"2025-03-29T00:51:39.613194812+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T00:51:39.846153883+08:00","level":"INFO","msg":"Stopping system monitor"} diff --git a/run-20250329_005139-6x2eqgtz/logs/debug.log b/run-20250329_005139-6x2eqgtz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..365f7f1bcc0093c66cc90b4aefb406bec6bb1d6e --- /dev/null +++ b/run-20250329_005139-6x2eqgtz/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Configure stats pid to 42596 +2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 00:51:39,064 INFO MainThread:42596 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug.log +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005139-6x2eqgtz/logs/debug-internal.log +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():761] calling init triggers +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():784] starting backend +2025-03-29 00:51:39,065 INFO MainThread:42596 [wandb_init.py:init():788] sending inform_init request +2025-03-29 00:51:39,067 INFO MainThread:42596 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 00:51:39,067 INFO MainThread:42596 [wandb_init.py:init():798] backend started and connected +2025-03-29 00:51:39,068 INFO MainThread:42596 [wandb_init.py:init():891] updated telemetry +2025-03-29 00:51:39,080 INFO MainThread:42596 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 00:51:39,610 INFO MainThread:42596 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 00:51:39,802 INFO MainThread:42596 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 00:51:39,803 INFO MainThread:42596 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 00:51:39,805 INFO MainThread:42596 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/6x2eqgtz +2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2322] restore +2025-03-29 00:51:39,816 INFO MainThread:42596 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb b/run-20250329_005139-6x2eqgtz/run-6x2eqgtz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/run-20250329_005425-3al6iztu/files/output.log b/run-20250329_005425-3al6iztu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4cdeac6d10a9f770951e6b47bb139648fdaf5273 --- /dev/null +++ b/run-20250329_005425-3al6iztu/files/output.log @@ -0,0 +1,13 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in + sys.exit(main()) + ^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main + trainer.train() + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 140, in train + print('First batch: ', enumerate(self.train_dataloader)[0]) + ~~~~~~~~~~~~~~~~~~~~ diff --git a/run-20250329_005425-3al6iztu/files/requirements.txt b/run-20250329_005425-3al6iztu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_005425-3al6iztu/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_005425-3al6iztu/files/wandb-metadata.json b/run-20250329_005425-3al6iztu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d740cd512717d9232f674b01c6ff0b16992c744f --- /dev/null +++ b/run-20250329_005425-3al6iztu/files/wandb-metadata.json @@ -0,0 +1,35 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T16:54:25.328152Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python" +} \ No newline at end of file diff --git a/run-20250329_005425-3al6iztu/logs/debug-core.log b/run-20250329_005425-3al6iztu/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..aed055c640527986aeb9157bc76e0c879bcb54e4 --- /dev/null +++ b/run-20250329_005425-3al6iztu/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T00:54:24.691982358+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpszvpq4wi/port-48756.txt","pid":48756,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T00:54:24.692861871+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":48756} +{"time":"2025-03-29T00:54:24.692845205+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42415,"Zone":""}} +{"time":"2025-03-29T00:54:24.873664584+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37608"} +{"time":"2025-03-29T00:54:25.329808834+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"3al6iztu","id":"127.0.0.1:37608"} +{"time":"2025-03-29T00:54:25.545982861+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"3al6iztu","id":"127.0.0.1:37608"} +{"time":"2025-03-29T00:54:26.490223915+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_005425-3al6iztu/logs/debug-internal.log b/run-20250329_005425-3al6iztu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f782bd4e34ecbdb1e488f907f1c0023315039eaa --- /dev/null +++ b/run-20250329_005425-3al6iztu/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2025-03-29T00:54:25.331350097+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-core.log"} +{"time":"2025-03-29T00:54:25.545872725+08:00","level":"INFO","msg":"created new stream","id":"3al6iztu"} +{"time":"2025-03-29T00:54:25.54597415+08:00","level":"INFO","msg":"stream: started","id":"3al6iztu"} +{"time":"2025-03-29T00:54:25.546010693+08:00","level":"INFO","msg":"sender: started","stream_id":"3al6iztu"} +{"time":"2025-03-29T00:54:25.546003597+08:00","level":"INFO","msg":"handler: started","stream_id":"3al6iztu"} +{"time":"2025-03-29T00:54:25.546055332+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"3al6iztu"} +{"time":"2025-03-29T00:54:25.844887265+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T00:54:26.062125748+08:00","level":"INFO","msg":"Stopping system monitor"} diff --git a/run-20250329_005425-3al6iztu/logs/debug.log b/run-20250329_005425-3al6iztu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..40cc9a359d37ebdb0bc6f26b4394e222efb3b148 --- /dev/null +++ b/run-20250329_005425-3al6iztu/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 00:54:25,324 INFO MainThread:48756 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Configure stats pid to 48756 +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug.log +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005425-3al6iztu/logs/debug-internal.log +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():761] calling init triggers +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():784] starting backend +2025-03-29 00:54:25,325 INFO MainThread:48756 [wandb_init.py:init():788] sending inform_init request +2025-03-29 00:54:25,327 INFO MainThread:48756 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 00:54:25,328 INFO MainThread:48756 [wandb_init.py:init():798] backend started and connected +2025-03-29 00:54:25,329 INFO MainThread:48756 [wandb_init.py:init():891] updated telemetry +2025-03-29 00:54:25,340 INFO MainThread:48756 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 00:54:25,842 INFO MainThread:48756 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 00:54:26,016 INFO MainThread:48756 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 00:54:26,019 INFO MainThread:48756 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 00:54:26,029 INFO MainThread:48756 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/3al6iztu +2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2322] restore +2025-03-29 00:54:26,030 INFO MainThread:48756 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_005425-3al6iztu/run-3al6iztu.wandb b/run-20250329_005425-3al6iztu/run-3al6iztu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/run-20250329_005541-bq1jaffa/files/config.yaml b/run-20250329_005541-bq1jaffa/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0034e884827829202e5342ad55c39ebf85d93c49 --- /dev/null +++ b/run-20250329_005541-bq1jaffa/files/config.yaml @@ -0,0 +1,95 @@ +_wandb: + value: + cli_version: 0.19.8 + m: [] + python_version: 3.11.11 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 23 + - 55 + "4": 3.11.11 + "5": 0.19.8 + "6": 4.51.0.dev0 + "8": + - 5 + "12": 0.19.8 + "13": linux-x86_64 +data_cfgs: + value: + eval_optional_args: [] + load_multi_datasets: false + train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10 + train_name: text-image-to-text + train_optional_args: [] + train_split: train + train_template: MM_TI2T_LLAVA +logger_cfgs: + value: + log_project: align-anything + log_run_name: sft + log_type: wandb + output_dir: ../outputs/test_7B + save_total_limit: 6 +model_cfgs: + value: + model_max_length: 2048 + model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf + trust_remote_code: true +train_cfgs: + value: + adam_betas: + - 0.9 + - 0.95 + adam_epsilon: 1e-08 + bf16: true + ds_cfgs: ds_z3_config.json + epochs: 3 + eval_interval: 10 + eval_strategy: epoch + fp16: false + freeze_language_model: false + freeze_mm_proj: false + freeze_vision_tower: true + gradient_accumulation_steps: 16 + gradient_checkpointing: true + learning_rate: 2e-05 + load_checkpoint: false + lr_scheduler_type: cosine + lr_warmup_ratio: 0.03 + max_grad_norm: 1 + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + save_checkpoint: false + seed: 42 + weight_decay: 0 diff --git a/run-20250329_005541-bq1jaffa/files/output.log b/run-20250329_005541-bq1jaffa/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..12ff47246df2c9dccfdb76c89d80ae88c331077f --- /dev/null +++ b/run-20250329_005541-bq1jaffa/files/output.log @@ -0,0 +1,22 @@ +***** Running training ***** +Resuming from checkpoint 3/3 epoch : 0%| | 0/7326 [00:00 + +Check if empty: False +First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407} +Train dataloader: + +Check if empty: False +First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407} +Train dataloader: + +Check if empty: False +First data: {'id': 182941, 'image': 'ocr/0001/00000300.jpg', 'conversations': [{'from': 'human', 'value': '\nDo you have the ability to recognize and translate textual information in images?\nA. The image shows the front cover of a book titled "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nB. The image displays the product label of a herbal tea called "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nC. The image represents a poster promoting a workshop on "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nD. "THE CALMING COLLECTION PRESENTS GOODBYE WORRIES TRAIN YOUR MIND TO QUIET YOUR THOUGHTS ANYTIME."\nAnswer with the option\'s letter from the given choices directly.'}, {'from': 'gpt', 'value': 'D'}], 'cosi': 70.53595100308407} +Saving model to "../outputs/test_7B/slice_end" ... +Saving 16-bit model... +[2025-03-29 00:55:50,235] [INFO] [logging.py:107:log_dist] [Rank 0] [Torch] Checkpoint global_step0 is about to be saved! +[2025-03-29 00:55:50,236] [INFO] [engine.py:3831:save_16bit_model] Saving model weights to ../outputs/test_7B/slice_end/pytorch_model.bin, tag: global_step0 +[2025-03-29 00:55:50,236] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/test_7B/slice_end/pytorch_model.bin... +[2025-03-29 00:56:05,543] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/test_7B/slice_end/pytorch_model.bin. +[2025-03-29 00:56:05,544] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step0 is ready now! +Model saved! diff --git a/run-20250329_005541-bq1jaffa/files/requirements.txt b/run-20250329_005541-bq1jaffa/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_005541-bq1jaffa/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_005541-bq1jaffa/files/wandb-metadata.json b/run-20250329_005541-bq1jaffa/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..eba397f483ddcd57a1622b3d32170cab2d405c63 --- /dev/null +++ b/run-20250329_005541-bq1jaffa/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T16:55:41.711696Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148609179648" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_005541-bq1jaffa/files/wandb-summary.json b/run-20250329_005541-bq1jaffa/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..fea5e15dda6eb8b3ffcccab00b24d57fa587c95b --- /dev/null +++ b/run-20250329_005541-bq1jaffa/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":24}} \ No newline at end of file diff --git a/run-20250329_005541-bq1jaffa/logs/debug-core.log b/run-20250329_005541-bq1jaffa/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9146da7f9681fd299b6430f8086bae1bd1ca0d91 --- /dev/null +++ b/run-20250329_005541-bq1jaffa/logs/debug-core.log @@ -0,0 +1,15 @@ +{"time":"2025-03-29T00:55:41.128572776+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpaadyf_35/port-52806.txt","pid":52806,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T00:55:41.129538675+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":52806} +{"time":"2025-03-29T00:55:41.129533087+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":38349,"Zone":""}} +{"time":"2025-03-29T00:55:41.310059156+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:55:41.713050692+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"bq1jaffa","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:55:41.928489719+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bq1jaffa","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:07.1453019+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"bq1jaffa","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:07.146579952+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"bq1jaffa","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.145760813+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.145785955+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.14579164+08:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-03-29T00:56:08.145814767+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.145848309+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.145850825+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:46506"} +{"time":"2025-03-29T00:56:08.145853752+08:00","level":"INFO","msg":"server is closed"} diff --git a/run-20250329_005541-bq1jaffa/logs/debug-internal.log b/run-20250329_005541-bq1jaffa/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..40d7a34a74a6a3f722976f6af2930a18c3fc9fee --- /dev/null +++ b/run-20250329_005541-bq1jaffa/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2025-03-29T00:55:41.714661009+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-core.log"} +{"time":"2025-03-29T00:55:41.928417193+08:00","level":"INFO","msg":"created new stream","id":"bq1jaffa"} +{"time":"2025-03-29T00:55:41.928482583+08:00","level":"INFO","msg":"stream: started","id":"bq1jaffa"} +{"time":"2025-03-29T00:55:41.928501227+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:55:41.928513756+08:00","level":"INFO","msg":"sender: started","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:55:41.928511582+08:00","level":"INFO","msg":"handler: started","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:55:42.22838417+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T00:56:06.166942314+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-29T00:56:06.16760159+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-03-29T00:56:06.921670341+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-03-29T00:56:07.143514035+08:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2025-03-29T00:56:07.145585911+08:00","level":"INFO","msg":"stream: closing","id":"bq1jaffa"} +{"time":"2025-03-29T00:56:07.145626123+08:00","level":"INFO","msg":"handler: closed","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:56:07.145635477+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:56:07.145639618+08:00","level":"INFO","msg":"sender: closed","stream_id":"bq1jaffa"} +{"time":"2025-03-29T00:56:07.146568555+08:00","level":"INFO","msg":"stream: closed","id":"bq1jaffa"} diff --git a/run-20250329_005541-bq1jaffa/logs/debug.log b/run-20250329_005541-bq1jaffa/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5ae8c20ed880ce6a3ff26656ac428abda8cc9c0b --- /dev/null +++ b/run-20250329_005541-bq1jaffa/logs/debug.log @@ -0,0 +1,29 @@ +2025-03-29 00:55:41,708 INFO MainThread:52806 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Configure stats pid to 52806 +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug.log +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005541-bq1jaffa/logs/debug-internal.log +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():761] calling init triggers +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():784] starting backend +2025-03-29 00:55:41,709 INFO MainThread:52806 [wandb_init.py:init():788] sending inform_init request +2025-03-29 00:55:41,711 INFO MainThread:52806 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 00:55:41,711 INFO MainThread:52806 [wandb_init.py:init():798] backend started and connected +2025-03-29 00:55:41,712 INFO MainThread:52806 [wandb_init.py:init():891] updated telemetry +2025-03-29 00:55:41,722 INFO MainThread:52806 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 00:55:42,226 INFO MainThread:52806 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 00:55:42,378 INFO MainThread:52806 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 00:55:42,380 INFO MainThread:52806 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/bq1jaffa +2025-03-29 00:56:06,165 INFO MainThread:52806 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2322] restore +2025-03-29 00:56:06,166 INFO MainThread:52806 [wandb_run.py:_restore():2328] restore done +2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3956] rendering history +2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_history_summary_info():3988] rendering summary +2025-03-29 00:56:07,144 INFO MainThread:52806 [wandb_run.py:_footer_sync_info():3917] logging synced files diff --git a/run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb b/run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb new file mode 100644 index 0000000000000000000000000000000000000000..d19ce321c0731b5c93ff80cd5b7fbd6680f73b64 Binary files /dev/null and b/run-20250329_005541-bq1jaffa/run-bq1jaffa.wandb differ diff --git a/run-20250329_005923-ri8qcnsm/files/config.yaml b/run-20250329_005923-ri8qcnsm/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0034e884827829202e5342ad55c39ebf85d93c49 --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/files/config.yaml @@ -0,0 +1,95 @@ +_wandb: + value: + cli_version: 0.19.8 + m: [] + python_version: 3.11.11 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 23 + - 55 + "4": 3.11.11 + "5": 0.19.8 + "6": 4.51.0.dev0 + "8": + - 5 + "12": 0.19.8 + "13": linux-x86_64 +data_cfgs: + value: + eval_optional_args: [] + load_multi_datasets: false + train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10 + train_name: text-image-to-text + train_optional_args: [] + train_split: train + train_template: MM_TI2T_LLAVA +logger_cfgs: + value: + log_project: align-anything + log_run_name: sft + log_type: wandb + output_dir: ../outputs/test_7B + save_total_limit: 6 +model_cfgs: + value: + model_max_length: 2048 + model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf + trust_remote_code: true +train_cfgs: + value: + adam_betas: + - 0.9 + - 0.95 + adam_epsilon: 1e-08 + bf16: true + ds_cfgs: ds_z3_config.json + epochs: 3 + eval_interval: 10 + eval_strategy: epoch + fp16: false + freeze_language_model: false + freeze_mm_proj: false + freeze_vision_tower: true + gradient_accumulation_steps: 16 + gradient_checkpointing: true + learning_rate: 2e-05 + load_checkpoint: false + lr_scheduler_type: cosine + lr_warmup_ratio: 0.03 + max_grad_norm: 1 + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + save_checkpoint: false + seed: 42 + weight_decay: 0 diff --git a/run-20250329_005923-ri8qcnsm/files/output.log b/run-20250329_005923-ri8qcnsm/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7a53c4f1e9bbf0f4ae88457362b757d51f263944 --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/files/output.log @@ -0,0 +1,56 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in + sys.exit(main()) + ^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main + trainer.train() + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 140, in train + print('First batch: ', next(iter(self.train_dataloader))) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 701, in __next__ + data = self._next_data() + ^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 757, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch + return self.collate_fn(data) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/datasets/text_image_to_text/supervised.py", line 179, in __call__ + multi_modal_padding = self.processor( + ^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/processing_llava_next.py", line 162, in __call__ + image_size = next(image_sizes) + ^^^^^^^^^^^^^^^^^ +StopIteration +[rank0]: Traceback (most recent call last): +[rank0]: File "", line 198, in _run_module_as_main +[rank0]: File "", line 88, in _run_code +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in +[rank0]: sys.exit(main()) +[rank0]: ^^^^^^ +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main +[rank0]: trainer.train() +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 140, in train +[rank0]: print('First batch: ', next(iter(self.train_dataloader))) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 701, in __next__ +[rank0]: data = self._next_data() +[rank0]: ^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 757, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch +[rank0]: return self.collate_fn(data) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/datasets/text_image_to_text/supervised.py", line 179, in __call__ +[rank0]: multi_modal_padding = self.processor( +[rank0]: ^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/processing_llava_next.py", line 162, in __call__ +[rank0]: image_size = next(image_sizes) +[rank0]: ^^^^^^^^^^^^^^^^^ +[rank0]: StopIteration diff --git a/run-20250329_005923-ri8qcnsm/files/requirements.txt b/run-20250329_005923-ri8qcnsm/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_005923-ri8qcnsm/files/wandb-metadata.json b/run-20250329_005923-ri8qcnsm/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ad9533b7739d6375e64dcdc33e726fa74985f9b6 --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T16:59:23.847239Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148609388544" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_005923-ri8qcnsm/files/wandb-summary.json b/run-20250329_005923-ri8qcnsm/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/run-20250329_005923-ri8qcnsm/logs/debug-core.log b/run-20250329_005923-ri8qcnsm/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..4d1e90446ffd0dd806dbbc8f8ff96f4f221980ca --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T00:59:23.271226442+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp0vu2vr0n/port-61239.txt","pid":61239,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T00:59:23.272384135+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":61239} +{"time":"2025-03-29T00:59:23.272379059+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34401,"Zone":""}} +{"time":"2025-03-29T00:59:23.452682211+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:35112"} +{"time":"2025-03-29T00:59:23.848757276+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"ri8qcnsm","id":"127.0.0.1:35112"} +{"time":"2025-03-29T00:59:24.064445723+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"ri8qcnsm","id":"127.0.0.1:35112"} +{"time":"2025-03-29T00:59:25.292121321+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_005923-ri8qcnsm/logs/debug-internal.log b/run-20250329_005923-ri8qcnsm/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5fa507e99f953bf5301f0062cb400be44b4a9f6e --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/logs/debug-internal.log @@ -0,0 +1,9 @@ +{"time":"2025-03-29T00:59:23.850425099+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_005923-ri8qcnsm/logs/debug-core.log"} +{"time":"2025-03-29T00:59:24.064339392+08:00","level":"INFO","msg":"created new stream","id":"ri8qcnsm"} +{"time":"2025-03-29T00:59:24.06443795+08:00","level":"INFO","msg":"stream: started","id":"ri8qcnsm"} +{"time":"2025-03-29T00:59:24.064469266+08:00","level":"INFO","msg":"handler: started","stream_id":"ri8qcnsm"} +{"time":"2025-03-29T00:59:24.064476586+08:00","level":"INFO","msg":"sender: started","stream_id":"ri8qcnsm"} +{"time":"2025-03-29T00:59:24.064515434+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"ri8qcnsm"} +{"time":"2025-03-29T00:59:24.365807489+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T00:59:24.674011224+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-29T00:59:24.674607355+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/run-20250329_005923-ri8qcnsm/logs/debug.log b/run-20250329_005923-ri8qcnsm/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5daf9158402a5e368d829a4bf1f8862343f6949a --- /dev/null +++ b/run-20250329_005923-ri8qcnsm/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 00:59:23,843 INFO MainThread:61239 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_setup.py:_flush():67] Configure stats pid to 61239 +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_005923-ri8qcnsm/logs/debug.log +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_005923-ri8qcnsm/logs/debug-internal.log +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:init():761] calling init triggers +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:init():784] starting backend +2025-03-29 00:59:23,844 INFO MainThread:61239 [wandb_init.py:init():788] sending inform_init request +2025-03-29 00:59:23,847 INFO MainThread:61239 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 00:59:23,847 INFO MainThread:61239 [wandb_init.py:init():798] backend started and connected +2025-03-29 00:59:23,848 INFO MainThread:61239 [wandb_init.py:init():891] updated telemetry +2025-03-29 00:59:23,859 INFO MainThread:61239 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 00:59:24,362 INFO MainThread:61239 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 00:59:24,541 INFO MainThread:61239 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 00:59:24,541 INFO MainThread:61239 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 00:59:24,541 INFO MainThread:61239 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 00:59:24,541 INFO MainThread:61239 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 00:59:24,544 INFO MainThread:61239 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 00:59:24,631 INFO MainThread:61239 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/ri8qcnsm +2025-03-29 00:59:24,631 INFO MainThread:61239 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 00:59:24,631 INFO MainThread:61239 [wandb_run.py:_restore():2322] restore +2025-03-29 00:59:24,631 INFO MainThread:61239 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_005923-ri8qcnsm/run-ri8qcnsm.wandb b/run-20250329_005923-ri8qcnsm/run-ri8qcnsm.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/run-20250329_010208-a0hu2wd5/files/config.yaml b/run-20250329_010208-a0hu2wd5/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0034e884827829202e5342ad55c39ebf85d93c49 --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/files/config.yaml @@ -0,0 +1,95 @@ +_wandb: + value: + cli_version: 0.19.8 + m: [] + python_version: 3.11.11 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 51 + - 53 + - 55 + - 63 + - 71 + - 83 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + - 23 + - 55 + "4": 3.11.11 + "5": 0.19.8 + "6": 4.51.0.dev0 + "8": + - 5 + "12": 0.19.8 + "13": linux-x86_64 +data_cfgs: + value: + eval_optional_args: [] + load_multi_datasets: false + train_datasets: /aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10 + train_name: text-image-to-text + train_optional_args: [] + train_split: train + train_template: MM_TI2T_LLAVA +logger_cfgs: + value: + log_project: align-anything + log_run_name: sft + log_type: wandb + output_dir: ../outputs/test_7B + save_total_limit: 6 +model_cfgs: + value: + model_max_length: 2048 + model_name_or_path: /aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf + trust_remote_code: true +train_cfgs: + value: + adam_betas: + - 0.9 + - 0.95 + adam_epsilon: 1e-08 + bf16: true + ds_cfgs: ds_z3_config.json + epochs: 3 + eval_interval: 10 + eval_strategy: epoch + fp16: false + freeze_language_model: false + freeze_mm_proj: false + freeze_vision_tower: true + gradient_accumulation_steps: 16 + gradient_checkpointing: true + learning_rate: 2e-05 + load_checkpoint: false + lr_scheduler_type: cosine + lr_warmup_ratio: 0.03 + max_grad_norm: 1 + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + save_checkpoint: false + seed: 42 + weight_decay: 0 diff --git a/run-20250329_010208-a0hu2wd5/files/output.log b/run-20250329_010208-a0hu2wd5/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..618390150ebc75de1c926b00b436432cb1cad766 --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/files/output.log @@ -0,0 +1,3 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 diff --git a/run-20250329_010208-a0hu2wd5/files/requirements.txt b/run-20250329_010208-a0hu2wd5/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd434ed56b038dff142c2a66ee591bbfc57f256 --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/files/requirements.txt @@ -0,0 +1,167 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +typing_extensions==4.12.2 +psutil==7.0.0 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +protobuf==5.29.4 +resampy==0.4.3 +aiohappyeyeballs==2.6.1 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +attrs==25.3.0 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +gradio==5.23.1 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +gradio_client==1.8.0 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +tokenizers==0.21.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +datasets==3.5.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +transformers==4.51.0.dev0 +scipy==1.10.1 +audioread==3.0.1 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +soundfile==0.13.1 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +peft==0.15.1 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +uvicorn==0.34.0 +orjson==3.10.16 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +MarkupSafe==3.0.2 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_010208-a0hu2wd5/files/wandb-metadata.json b/run-20250329_010208-a0hu2wd5/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..478772942e8149396fe546e6d5c86861060c29fb --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/files/wandb-metadata.json @@ -0,0 +1,35 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T17:02:08.019014Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python" +} \ No newline at end of file diff --git a/run-20250329_010208-a0hu2wd5/files/wandb-summary.json b/run-20250329_010208-a0hu2wd5/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/run-20250329_010208-a0hu2wd5/logs/debug-core.log b/run-20250329_010208-a0hu2wd5/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..4f51b5e199ced93fcdfac49c32787b3f736baf33 --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/logs/debug-core.log @@ -0,0 +1,14 @@ +{"time":"2025-03-29T01:02:07.424732633+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp0h8id8py/port-68772.txt","pid":68772,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T01:02:07.426096763+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":68772} +{"time":"2025-03-29T01:02:07.426093248+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":46481,"Zone":""}} +{"time":"2025-03-29T01:02:07.606217367+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:08.020688051+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"a0hu2wd5","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:08.237972639+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"a0hu2wd5","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.007075929+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.00721001+08:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-03-29T01:02:10.007206204+08:00","level":"INFO","msg":"connection: closing","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.007280283+08:00","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.296320318+08:00","level":"ERROR","msg":"processOutgoingData: flush error","error":"write tcp 127.0.0.1:46481->127.0.0.1:49406: use of closed network connection","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.297319605+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.297328949+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:49406"} +{"time":"2025-03-29T01:02:10.297332559+08:00","level":"INFO","msg":"server is closed"} diff --git a/run-20250329_010208-a0hu2wd5/logs/debug-internal.log b/run-20250329_010208-a0hu2wd5/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1051178f3790406c1a1cd7bf3560fc999ebf64ed --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2025-03-29T01:02:08.02261839+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_010208-a0hu2wd5/logs/debug-core.log"} +{"time":"2025-03-29T01:02:08.237877625+08:00","level":"INFO","msg":"created new stream","id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:08.237964406+08:00","level":"INFO","msg":"stream: started","id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:08.237985493+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:08.23799238+08:00","level":"INFO","msg":"sender: started","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:08.238003087+08:00","level":"INFO","msg":"handler: started","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:08.542675379+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T01:02:08.742090676+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-29T01:02:08.793967069+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-03-29T01:02:10.007179604+08:00","level":"INFO","msg":"stream: closing","id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:10.007246271+08:00","level":"WARN","msg":"sender: received Exit record more than once, ignoring"} +{"time":"2025-03-29T01:02:10.071313359+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-03-29T01:02:10.296267321+08:00","level":"INFO","msg":"handler: closed","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:10.296316506+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:10.296356535+08:00","level":"INFO","msg":"sender: closed","stream_id":"a0hu2wd5"} +{"time":"2025-03-29T01:02:10.297246635+08:00","level":"INFO","msg":"stream: closed","id":"a0hu2wd5"} diff --git a/run-20250329_010208-a0hu2wd5/logs/debug.log b/run-20250329_010208-a0hu2wd5/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..927029a88f5d854e9c73ace3bf854c653c4cef26 --- /dev/null +++ b/run-20250329_010208-a0hu2wd5/logs/debug.log @@ -0,0 +1,47 @@ +2025-03-29 01:02:08,015 INFO MainThread:68772 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_setup.py:_flush():67] Configure stats pid to 68772 +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_010208-a0hu2wd5/logs/debug.log +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_010208-a0hu2wd5/logs/debug-internal.log +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:init():761] calling init triggers +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:init():784] starting backend +2025-03-29 01:02:08,016 INFO MainThread:68772 [wandb_init.py:init():788] sending inform_init request +2025-03-29 01:02:08,018 INFO MainThread:68772 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 01:02:08,018 INFO MainThread:68772 [wandb_init.py:init():798] backend started and connected +2025-03-29 01:02:08,020 INFO MainThread:68772 [wandb_init.py:init():891] updated telemetry +2025-03-29 01:02:08,031 INFO MainThread:68772 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 01:02:08,539 INFO MainThread:68772 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 01:02:08,696 INFO MainThread:68772 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 01:02:08,697 INFO MainThread:68772 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 01:02:08,697 INFO MainThread:68772 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 01:02:08,697 INFO MainThread:68772 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 01:02:08,699 INFO MainThread:68772 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 01:02:08,724 INFO MainThread:68772 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/a0hu2wd5 +2025-03-29 01:02:08,724 INFO MainThread:68772 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 01:02:08,725 INFO MainThread:68772 [wandb_run.py:_restore():2322] restore +2025-03-29 01:02:08,725 INFO MainThread:68772 [wandb_run.py:_restore():2328] restore done +2025-03-29 01:02:09,725 INFO MainThread:68772 [wandb_run.py:_restore():2322] restore +2025-03-29 01:02:09,725 INFO MainThread:68772 [wandb_run.py:_restore():2328] restore done +2025-03-29 01:02:09,725 ERROR MainThread:68772 [wandb_run.py:_atexit_cleanup():2361] Problem finishing run +Traceback (most recent call last): + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2352, in _atexit_cleanup + self._on_finish() + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2609, in _on_finish + wait_with_progress( + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress + return wait_all_with_progress( + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress + return asyncio_compat.run(progress_loop_with_timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run + future = executor.submit(runner.run, fn) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/concurrent/futures/thread.py", line 169, in submit + raise RuntimeError('cannot schedule new futures after ' +RuntimeError: cannot schedule new futures after interpreter shutdown +2025-03-29 01:02:10,006 INFO MsgRouterThr:68772 [mailbox.py:close():129] Closing mailbox, abandoning 2 handles. diff --git a/run-20250329_010208-a0hu2wd5/run-a0hu2wd5.wandb b/run-20250329_010208-a0hu2wd5/run-a0hu2wd5.wandb new file mode 100644 index 0000000000000000000000000000000000000000..ffb7648ce2db0ef0068fc20d3719740ab45e210a Binary files /dev/null and b/run-20250329_010208-a0hu2wd5/run-a0hu2wd5.wandb differ diff --git a/run-20250329_010934-3x35hjks/files/output.log b/run-20250329_010934-3x35hjks/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f7d425c70c130543b5fdc45b923673c7d907f40f --- /dev/null +++ b/run-20250329_010934-3x35hjks/files/output.log @@ -0,0 +1,97 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 2048). Running this sequence through the model will result in indexing errors +/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None + warnings.warn( +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +Training 1/3 epoch (loss 0.0352): 7%|████████████████▍ | 479/7326 [08:24<2:10:36, 1.14s/it]Traceback (most recent call last): +[2025-03-29 01:12:32,143] [INFO] [logging.py:107:log_dist] [Rank 0] step=10, skipped=0, lr=[1.5384615384615387e-05, 1.5384615384615387e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2025-03-29 01:12:32,143] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=10, RunningAvgSamplesPerSec=10.10993134859456, CurrSamplesPerSec=10.887130614655668, MemAllocated=14.6GB, MaxMemAllocated=19.68GB +[2025-03-29 01:15:16,130] [INFO] [logging.py:107:log_dist] [Rank 0] step=20, skipped=0, lr=[1.9987846310933768e-05, 1.9987846310933768e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2025-03-29 01:15:16,130] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=20, RunningAvgSamplesPerSec=10.37800408898891, CurrSamplesPerSec=10.557907378763698, MemAllocated=14.6GB, MaxMemAllocated=19.68GB + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in + sys.exit(main()) + ^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main + trainer.train() + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 139, in train + for batch_idx, batch in enumerate(self.train_dataloader): + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 701, in __next__ + data = self._next_data() + ^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 757, in _next_data + data = self._dataset_fetcher.fetch(index) # may raise StopIteration + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch + return self.collate_fn(data) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/datasets/text_image_to_text/supervised.py", line 179, in __call__ + multi_modal_padding = self.processor( + ^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/processing_llava_next.py", line 141, in __call__ + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/image_processing_utils.py", line 42, in __call__ + return self.preprocess(images, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/image_processing_llava_next.py", line 675, in preprocess + images = [to_numpy_array(image) for image in images] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/image_processing_llava_next.py", line 675, in + images = [to_numpy_array(image) for image in images] + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/image_utils.py", line 337, in to_numpy_array + return np.array(img) + ^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/PIL/Image.py", line 747, in __array_interface__ + new["data"] = self.tobytes() + ^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/PIL/Image.py", line 809, in tobytes + bytes_consumed, errcode, data = e.encode(bufsize) + ^^^^^^^^^^^^^^^^^ +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "", line 198, in _run_module_as_main +[rank0]: File "", line 88, in _run_code +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in +[rank0]: sys.exit(main()) +[rank0]: ^^^^^^ +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main +[rank0]: trainer.train() +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 139, in train +[rank0]: for batch_idx, batch in enumerate(self.train_dataloader): +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 701, in __next__ +[rank0]: data = self._next_data() +[rank0]: ^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 757, in _next_data +[rank0]: data = self._dataset_fetcher.fetch(index) # may raise StopIteration +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch +[rank0]: return self.collate_fn(data) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/hantao/align-anything/align_anything/datasets/text_image_to_text/supervised.py", line 179, in __call__ +[rank0]: multi_modal_padding = self.processor( +[rank0]: ^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/processing_llava_next.py", line 141, in __call__ +[rank0]: image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/image_processing_utils.py", line 42, in __call__ +[rank0]: return self.preprocess(images, **kwargs) +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/image_processing_llava_next.py", line 675, in preprocess +[rank0]: images = [to_numpy_array(image) for image in images] +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/image_processing_llava_next.py", line 675, in +[rank0]: images = [to_numpy_array(image) for image in images] +[rank0]: ^^^^^^^^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/image_utils.py", line 337, in to_numpy_array +[rank0]: return np.array(img) +[rank0]: ^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/PIL/Image.py", line 747, in __array_interface__ +[rank0]: new["data"] = self.tobytes() +[rank0]: ^^^^^^^^^^^^^^ +[rank0]: File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/PIL/Image.py", line 809, in tobytes +[rank0]: bytes_consumed, errcode, data = e.encode(bufsize) +[rank0]: ^^^^^^^^^^^^^^^^^ +[rank0]: KeyboardInterrupt diff --git a/run-20250329_010934-3x35hjks/files/requirements.txt b/run-20250329_010934-3x35hjks/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..83b485a0b59e151549cdb2ba387e6b6f4dfe8ef6 --- /dev/null +++ b/run-20250329_010934-3x35hjks/files/requirements.txt @@ -0,0 +1,183 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +matplotlib==3.10.1 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +contourpy==1.3.1 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +sse-starlette==2.2.1 +typing_extensions==4.12.2 +psutil==7.0.0 +kiwisolver==1.4.8 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +pyparsing==3.2.3 +protobuf==5.29.4 +resampy==0.4.3 +tokenizers==0.21.0 +aiohappyeyeballs==2.6.1 +llamafactory==0.9.3.dev0 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +gradio==5.21.0 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +peft==0.15.0 +attrs==25.3.0 +trl==0.9.6 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +MarkupSafe==2.1.5 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +gradio_client==1.7.2 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +termcolor==2.5.0 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +fonttools==4.56.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +shtab==1.7.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +tyro==0.8.14 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +scipy==1.10.1 +audioread==3.0.1 +fire==0.7.0 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +datasets==3.4.1 +soundfile==0.13.1 +transformers==4.50.0 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +docstring_parser==0.16 +cycler==0.12.1 +uvicorn==0.34.0 +orjson==3.10.16 +av==14.2.0 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +llamafactory==0.9.3.dev0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_010934-3x35hjks/files/wandb-metadata.json b/run-20250329_010934-3x35hjks/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2b778ed03ee8f45f9fe72a951107f036d970e5ce --- /dev/null +++ b/run-20250329_010934-3x35hjks/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T17:09:34.736558Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148610625536" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_010934-3x35hjks/logs/debug-core.log b/run-20250329_010934-3x35hjks/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..d8d632d05ff9e6f26beebb7865959a41e7a1371a --- /dev/null +++ b/run-20250329_010934-3x35hjks/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:09:34.14317334+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp3rt3wng5/port-81257.txt","pid":81257,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T01:09:34.144083097+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":81257} +{"time":"2025-03-29T01:09:34.144078566+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44293,"Zone":""}} +{"time":"2025-03-29T01:09:34.325040121+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:34274"} +{"time":"2025-03-29T01:09:34.737947334+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"3x35hjks","id":"127.0.0.1:34274"} +{"time":"2025-03-29T01:09:34.951643328+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"3x35hjks","id":"127.0.0.1:34274"} +{"time":"2025-03-29T01:18:00.391846314+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_010934-3x35hjks/logs/debug-internal.log b/run-20250329_010934-3x35hjks/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ca80586195c39a4d943ee3c4a72ed8ee068729a2 --- /dev/null +++ b/run-20250329_010934-3x35hjks/logs/debug-internal.log @@ -0,0 +1,9 @@ +{"time":"2025-03-29T01:09:34.739162327+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_010934-3x35hjks/logs/debug-core.log"} +{"time":"2025-03-29T01:09:34.951574206+08:00","level":"INFO","msg":"created new stream","id":"3x35hjks"} +{"time":"2025-03-29T01:09:34.951635597+08:00","level":"INFO","msg":"stream: started","id":"3x35hjks"} +{"time":"2025-03-29T01:09:34.951657365+08:00","level":"INFO","msg":"handler: started","stream_id":"3x35hjks"} +{"time":"2025-03-29T01:09:34.95166162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"3x35hjks"} +{"time":"2025-03-29T01:09:34.951678817+08:00","level":"INFO","msg":"sender: started","stream_id":"3x35hjks"} +{"time":"2025-03-29T01:09:35.344804071+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-29T01:18:00.362781618+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-29T01:18:00.363356131+08:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/run-20250329_010934-3x35hjks/logs/debug.log b/run-20250329_010934-3x35hjks/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..98b8904cf18d279cfdfe68afb35f21846b8579f5 --- /dev/null +++ b/run-20250329_010934-3x35hjks/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_setup.py:_flush():67] Configure stats pid to 81257 +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_010934-3x35hjks/logs/debug.log +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_010934-3x35hjks/logs/debug-internal.log +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:init():761] calling init triggers +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:init():784] starting backend +2025-03-29 01:09:34,733 INFO MainThread:81257 [wandb_init.py:init():788] sending inform_init request +2025-03-29 01:09:34,736 INFO MainThread:81257 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 01:09:34,736 INFO MainThread:81257 [wandb_init.py:init():798] backend started and connected +2025-03-29 01:09:34,737 INFO MainThread:81257 [wandb_init.py:init():891] updated telemetry +2025-03-29 01:09:34,747 INFO MainThread:81257 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 01:09:35,342 INFO MainThread:81257 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 01:09:35,529 INFO MainThread:81257 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 01:09:35,530 INFO MainThread:81257 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 01:09:35,530 INFO MainThread:81257 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 01:09:35,530 INFO MainThread:81257 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 01:09:35,532 INFO MainThread:81257 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 01:18:00,361 INFO MainThread:81257 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/3x35hjks +2025-03-29 01:18:00,361 INFO MainThread:81257 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 01:18:00,362 INFO MainThread:81257 [wandb_run.py:_restore():2322] restore +2025-03-29 01:18:00,362 INFO MainThread:81257 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_010934-3x35hjks/run-3x35hjks.wandb b/run-20250329_010934-3x35hjks/run-3x35hjks.wandb new file mode 100644 index 0000000000000000000000000000000000000000..16b182660ffa35b9695293c854b53e7fb0bbcc05 --- /dev/null +++ b/run-20250329_010934-3x35hjks/run-3x35hjks.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:018d7de6a6385986532bce0154bacf9c0124ca5140a3a936efe3f7b7f796aee2 +size 720896 diff --git a/run-20250329_012009-usfepy7k/files/output.log b/run-20250329_012009-usfepy7k/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f265860d422f19125e8332eb172a6372b4950c7e --- /dev/null +++ b/run-20250329_012009-usfepy7k/files/output.log @@ -0,0 +1,154 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 2048). Running this sequence through the model will result in indexing errors +/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None + warnings.warn( +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +Training 1/3 epoch (loss 9.1875): 1%|█▋ | 50/7326 [00:59<2:04:31, 1.03s/it]Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 100, in + sys.exit(main()) + ^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_image_to_text/sft.py", line 95, in main + trainer.train() + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 143, in train + info = self.train_step(batch) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 102, in train_step + loss = self.loss(sft_batch)['loss'] + ^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/hantao/align-anything/align_anything/trainers/text_to_text/sft.py", line 97, in loss + outputs = self.model(**self.infer_batch(sft_batch)) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 2030, in forward + loss = self.module(*inputs, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner + result = forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/llava_next/modeling_llava_next.py", line 652, in forward + outputs = self.language_model( + ^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner + result = forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py", line 842, in forward + outputs = self.model( + ^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner + result = forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py", line 554, in forward + layer_outputs = self._gradient_checkpointing_func( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/_compile.py", line 32, in inner + return disable_fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 489, in checkpoint + return CheckpointFunction.apply(function, preserve, *args) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/autograd/function.py", line 575, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/checkpoint.py", line 264, in forward + outputs = run_function(*args) + ^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner + result = forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py", line 247, in forward + hidden_states, self_attn_weights = self.self_attn( + ^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner + result = forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/transformers/models/mistral/modeling_mistral.py", line 162, in forward + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl + return inner() + ^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1779, in inner + args_result = hook(self, args) + ^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 292, in _pre_forward_module_hook + self.pre_sub_module_forward_function(module) + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 467, in pre_sub_module_forward_function + param_coordinator.fetch_sub_module(sub_module, forward=True) + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn + return fn(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 334, in fetch_sub_module + self.__inflight_param_registry.pop(param).wait(handle_dependency=not fast_fetch) + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 732, in wait + handle.wait(handle_dependency) + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn + ret_val = func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 707, in wait + param.data = instrument_w_nvtx(torch.cat)(partitions).view(param.ds_shape) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +KeyboardInterrupt diff --git a/run-20250329_012009-usfepy7k/files/requirements.txt b/run-20250329_012009-usfepy7k/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..83b485a0b59e151549cdb2ba387e6b6f4dfe8ef6 --- /dev/null +++ b/run-20250329_012009-usfepy7k/files/requirements.txt @@ -0,0 +1,183 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +matplotlib==3.10.1 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +contourpy==1.3.1 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +sse-starlette==2.2.1 +typing_extensions==4.12.2 +psutil==7.0.0 +kiwisolver==1.4.8 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +pyparsing==3.2.3 +protobuf==5.29.4 +resampy==0.4.3 +tokenizers==0.21.0 +aiohappyeyeballs==2.6.1 +llamafactory==0.9.3.dev0 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +gradio==5.21.0 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +peft==0.15.0 +attrs==25.3.0 +trl==0.9.6 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +MarkupSafe==2.1.5 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +gradio_client==1.7.2 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +termcolor==2.5.0 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +fonttools==4.56.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +shtab==1.7.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +tyro==0.8.14 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +scipy==1.10.1 +audioread==3.0.1 +fire==0.7.0 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +datasets==3.4.1 +soundfile==0.13.1 +transformers==4.50.0 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +docstring_parser==0.16 +cycler==0.12.1 +uvicorn==0.34.0 +orjson==3.10.16 +av==14.2.0 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +llamafactory==0.9.3.dev0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_012009-usfepy7k/files/wandb-metadata.json b/run-20250329_012009-usfepy7k/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0764b5b73ef9c697d0f5595299ec3fdaa12378 --- /dev/null +++ b/run-20250329_012009-usfepy7k/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T17:20:09.516632Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148611805184" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_012009-usfepy7k/logs/debug-core.log b/run-20250329_012009-usfepy7k/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..9db7a48f965c6291c53b3c1793067fd76600d7d0 --- /dev/null +++ b/run-20250329_012009-usfepy7k/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:20:08.940791145+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmph3v6xxw7/port-98024.txt","pid":98024,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T01:20:08.941761996+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":98024} +{"time":"2025-03-29T01:20:08.941758889+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41829,"Zone":""}} +{"time":"2025-03-29T01:20:09.121369295+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:35188"} +{"time":"2025-03-29T01:20:09.518160947+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"usfepy7k","id":"127.0.0.1:35188"} +{"time":"2025-03-29T01:20:09.733908769+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"usfepy7k","id":"127.0.0.1:35188"} +{"time":"2025-03-29T01:21:10.549825176+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_012009-usfepy7k/logs/debug-internal.log b/run-20250329_012009-usfepy7k/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c99db30f80ffc42e1e8f909e3e04c06d85fcd4e4 --- /dev/null +++ b/run-20250329_012009-usfepy7k/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:20:09.519462752+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012009-usfepy7k/logs/debug-core.log"} +{"time":"2025-03-29T01:20:09.733842228+08:00","level":"INFO","msg":"created new stream","id":"usfepy7k"} +{"time":"2025-03-29T01:20:09.733900575+08:00","level":"INFO","msg":"stream: started","id":"usfepy7k"} +{"time":"2025-03-29T01:20:09.733916678+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"usfepy7k"} +{"time":"2025-03-29T01:20:09.73393085+08:00","level":"INFO","msg":"handler: started","stream_id":"usfepy7k"} +{"time":"2025-03-29T01:20:09.733938004+08:00","level":"INFO","msg":"sender: started","stream_id":"usfepy7k"} +{"time":"2025-03-29T01:20:10.029395847+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/run-20250329_012009-usfepy7k/logs/debug.log b/run-20250329_012009-usfepy7k/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..67ab8d13736e451dfad67d34865a769afdb3df4a --- /dev/null +++ b/run-20250329_012009-usfepy7k/logs/debug.log @@ -0,0 +1,26 @@ +2025-03-29 01:20:09,513 INFO MainThread:98024 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 01:20:09,513 INFO MainThread:98024 [wandb_setup.py:_flush():67] Configure stats pid to 98024 +2025-03-29 01:20:09,513 INFO MainThread:98024 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 01:20:09,513 INFO MainThread:98024 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 01:20:09,513 INFO MainThread:98024 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012009-usfepy7k/logs/debug.log +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012009-usfepy7k/logs/debug-internal.log +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:init():761] calling init triggers +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:init():784] starting backend +2025-03-29 01:20:09,514 INFO MainThread:98024 [wandb_init.py:init():788] sending inform_init request +2025-03-29 01:20:09,516 INFO MainThread:98024 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 01:20:09,516 INFO MainThread:98024 [wandb_init.py:init():798] backend started and connected +2025-03-29 01:20:09,517 INFO MainThread:98024 [wandb_init.py:init():891] updated telemetry +2025-03-29 01:20:09,528 INFO MainThread:98024 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 01:20:10,027 INFO MainThread:98024 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 01:20:10,199 INFO MainThread:98024 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 01:20:10,199 INFO MainThread:98024 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 01:20:10,199 INFO MainThread:98024 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 01:20:10,199 INFO MainThread:98024 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 01:20:10,201 INFO MainThread:98024 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 01:21:10,537 INFO MainThread:98024 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/usfepy7k +2025-03-29 01:21:10,537 INFO MainThread:98024 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 +2025-03-29 01:21:10,537 INFO MainThread:98024 [wandb_run.py:_restore():2322] restore +2025-03-29 01:21:10,537 INFO MainThread:98024 [wandb_run.py:_restore():2328] restore done diff --git a/run-20250329_012009-usfepy7k/run-usfepy7k.wandb b/run-20250329_012009-usfepy7k/run-usfepy7k.wandb new file mode 100644 index 0000000000000000000000000000000000000000..cd6e8d59dd3bce5b604ec7ffc6a3d2f3a20be1b9 Binary files /dev/null and b/run-20250329_012009-usfepy7k/run-usfepy7k.wandb differ diff --git a/run-20250329_012205-co1ecmky/files/output.log b/run-20250329_012205-co1ecmky/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..33b323ddd5ff48f32ae8d138a2554209bc1c9ce0 --- /dev/null +++ b/run-20250329_012205-co1ecmky/files/output.log @@ -0,0 +1,8 @@ +***** Running training ***** +Resuming from checkpoint 1/3 epoch : 0%| | 0/7326 [00:00 2048). Running this sequence through the model will result in indexing errors +/aifs4su/yaodong/miniconda3/envs/hantao_llama/lib/python3.11/site-packages/torch/utils/checkpoint.py:87: UserWarning: None of the inputs have requires_grad=True. Gradients will be None + warnings.warn( +`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. +Training 1/3 epoch (loss 0.6562): 4%|█████████ | 265/7326 [04:44<2:02:28, 1.04s/it]Traceback (most recent call last): +[2025-03-29 01:25:01,535] [INFO] [logging.py:107:log_dist] [Rank 0] step=10, skipped=0, lr=[1.5384615384615387e-05, 1.5384615384615387e-05], mom=[[0.9, 0.95], [0.9, 0.95]] +[2025-03-29 01:25:01,536] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=10, RunningAvgSamplesPerSec=10.529360575746388, CurrSamplesPerSec=11.550168683891114, MemAllocated=14.6GB, MaxMemAllocated=19.68GB diff --git a/run-20250329_012205-co1ecmky/files/requirements.txt b/run-20250329_012205-co1ecmky/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..83b485a0b59e151549cdb2ba387e6b6f4dfe8ef6 --- /dev/null +++ b/run-20250329_012205-co1ecmky/files/requirements.txt @@ -0,0 +1,183 @@ +maskrcnn_benchmark==0.0.0 +webdataset==0.2.111 +websockets==15.0.1 +typer==0.15.2 +blobfile==3.0.0 +pooch==1.8.2 +matplotlib==3.10.1 +python-dateutil==2.9.0.post0 +gmpy2==2.2.1 +httpcore==1.0.7 +charset-normalizer==3.3.2 +torchlibrosa==0.1.0 +contourpy==1.3.1 +multiprocess==0.70.16 +Werkzeug==3.1.3 +aiofiles==23.2.1 +six==1.17.0 +sse-starlette==2.2.1 +typing_extensions==4.12.2 +psutil==7.0.0 +kiwisolver==1.4.8 +frozenlist==1.5.0 +einops==0.8.1 +flash_attn==2.7.4.post1 +PySocks==1.7.1 +regex==2024.11.6 +markdown-it-py==3.0.0 +ruff==0.11.2 +docker-pycreds==0.4.0 +pyparsing==3.2.3 +protobuf==5.29.4 +resampy==0.4.3 +tokenizers==0.21.0 +aiohappyeyeballs==2.6.1 +llamafactory==0.9.3.dev0 +httpx==0.28.1 +encodec==0.1.1 +ffmpy==0.5.0 +mkl_random==1.2.8 +soxr==0.5.0.post1 +gradio==5.21.0 +absl-py==2.2.1 +networkx==3.4.2 +h5py==3.13.0 +hjson==3.1.0 +tensorboard==2.19.0 +aiosignal==1.3.2 +pip==25.0 +triton==3.1.0 +zipp==3.21.0 +ftfy==6.3.1 +peft==0.15.0 +attrs==25.3.0 +trl==0.9.6 +requests==2.32.3 +progressbar==2.5 +sniffio==1.3.1 +lxml==5.3.1 +starlette==0.46.1 +Markdown==3.7 +mdurl==0.1.2 +torchaudio==2.5.1 +safetensors==0.5.3 +opencv-python==4.6.0.66 +torchvision==0.20.1 +shellingham==1.5.4 +timm==1.0.15 +multidict==6.2.0 +semantic-version==2.10.0 +numba==0.60.0 +MarkupSafe==2.1.5 +pydantic_core==2.33.0 +dill==0.3.8 +msgpack==1.1.0 +sentry-sdk==2.24.1 +grpcio==1.71.0 +cffi==1.17.1 +gradio_client==1.7.2 +PyYAML==6.0.2 +tensorboard-data-server==0.7.2 +termcolor==2.5.0 +fastapi==0.115.12 +lazy_loader==0.4 +mkl_fft==1.3.11 +annotated-types==0.7.0 +scikit-learn==1.6.1 +wget==3.2 +setuptools==75.8.0 +certifi==2025.1.31 +click==8.1.8 +laion_clap==1.1.5 +Pygments==2.19.1 +tomlkit==0.13.2 +idna==3.7 +propcache==0.3.1 +platformdirs==4.3.7 +align-anything==0.0.1.dev0 +deepspeed==0.16.5 +smmap==5.0.2 +pillow==11.1.0 +fonttools==4.56.0 +typing-inspection==0.4.0 +braceexpand==0.1.7 +decorator==5.2.1 +shtab==1.7.1 +pandas==2.2.3 +huggingface-hub==0.29.3 +pyarrow==19.0.1 +GitPython==3.1.44 +xxhash==3.5.0 +packaging==24.2 +numpy==1.23.4 +setproctitle==1.3.5 +llvmlite==0.43.0 +tiktoken==0.9.0 +mpmath==1.3.0 +nvidia-ml-py==12.570.86 +pydantic==2.11.0 +librosa==0.11.0 +frechet_audio_distance==0.3.1 +sympy==1.13.1 +safehttpx==0.1.6 +Jinja2==3.1.6 +tyro==0.8.14 +h11==0.14.0 +aiohttp==3.11.14 +diffusers==0.32.2 +tqdm==4.67.1 +filelock==3.13.1 +scipy==1.10.1 +audioread==3.0.1 +fire==0.7.0 +sentencepiece==0.2.0 +pytz==2025.2 +tzdata==2025.2 +python-multipart==0.0.20 +urllib3==2.3.0 +pycryptodomex==3.22.0 +yarl==1.18.3 +pydub==0.25.1 +pycparser==2.22 +datasets==3.4.1 +soundfile==0.13.1 +transformers==4.50.0 +wcwidth==0.2.13 +groovy==0.1.2 +torch==2.5.1 +anyio==4.9.0 +wandb==0.19.8 +joblib==1.4.2 +fsspec==2024.12.0 +accelerate==1.5.2 +py-cpuinfo==9.0.0 +docstring_parser==0.16 +cycler==0.12.1 +uvicorn==0.34.0 +orjson==3.10.16 +av==14.2.0 +Brotli==1.0.9 +rich==13.9.4 +importlib_metadata==8.6.1 +ninja==1.11.1.4 +wheel==0.45.1 +threadpoolctl==3.6.0 +gitdb==4.0.12 +mkl-service==2.4.0 +llamafactory==0.9.3.dev0 +typing_extensions==4.12.2 +tomli==2.0.1 +zipp==3.19.2 +wheel==0.43.0 +jaraco.text==3.12.1 +packaging==24.2 +autocommand==2.2.2 +jaraco.functools==4.0.1 +jaraco.collections==5.1.0 +platformdirs==4.2.2 +more-itertools==10.3.0 +inflect==7.3.1 +jaraco.context==5.3.0 +typeguard==4.3.0 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 diff --git a/run-20250329_012205-co1ecmky/files/wandb-metadata.json b/run-20250329_012205-co1ecmky/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fe1f9de4a14c60d51e08e4280ece3e8614621193 --- /dev/null +++ b/run-20250329_012205-co1ecmky/files/wandb-metadata.json @@ -0,0 +1,106 @@ +{ + "os": "Linux-5.15.0-1040-nvidia-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-03-28T17:22:05.249856Z", + "args": [ + "--local_rank=0", + "--model_name_or_path", + "/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf", + "--train_datasets", + "/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10", + "--train_template", + "MM_TI2T_LLAVA", + "--train_split", + "train", + "--train_name", + "text-image-to-text", + "--output_dir", + "../outputs/test_7B", + "--save_total_limit", + "6", + "--train_batch_size", + "8", + "--epochs", + "3" + ], + "program": "-m align_anything.trainers.text_image_to_text.sft", + "git": { + "remote": "git@github.com-hantao:PKU-Alignment/align-anything.git", + "commit": "106588f9802757a3283c1aff1f33ea9afd737f31" + }, + "email": "2200017789@stu.pku.edu.cn", + "root": "../outputs/test_7B", + "host": "dgx-092", + "executable": "/aifs4su/yaodong/miniconda3/envs/hantao_llama/bin/python", + "cpu_count": 112, + "cpu_count_logical": 224, + "gpu": "NVIDIA H800", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888556142592", + "used": "148611952640" + } + }, + "memory": { + "total": "2164195454976" + }, + "cpu": { + "count": 112, + "countLogical": 224 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + }, + { + "name": "NVIDIA H800", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper" + } + ], + "slurm": { + "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf" + }, + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/run-20250329_012205-co1ecmky/logs/debug-core.log b/run-20250329_012205-co1ecmky/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..e1243a0b2270c45d2f396ac5b2a762f661712882 --- /dev/null +++ b/run-20250329_012205-co1ecmky/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:22:04.66534089+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpv6n8wy7c/port-104999.txt","pid":104999,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-03-29T01:22:04.666328183+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":104999} +{"time":"2025-03-29T01:22:04.666331722+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44323,"Zone":""}} +{"time":"2025-03-29T01:22:04.84500083+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:50136"} +{"time":"2025-03-29T01:22:05.251233529+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"co1ecmky","id":"127.0.0.1:50136"} +{"time":"2025-03-29T01:22:05.467317188+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"co1ecmky","id":"127.0.0.1:50136"} +{"time":"2025-03-29T01:26:51.27808375+08:00","level":"INFO","msg":"received shutdown signal","signal":15} diff --git a/run-20250329_012205-co1ecmky/logs/debug-internal.log b/run-20250329_012205-co1ecmky/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..aad9315db59b0e49d398b5f74261c271e6a306a9 --- /dev/null +++ b/run-20250329_012205-co1ecmky/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-03-29T01:22:05.252520599+08:00","level":"INFO","msg":"stream: starting","core version":"0.19.8","symlink path":"../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-core.log"} +{"time":"2025-03-29T01:22:05.467254306+08:00","level":"INFO","msg":"created new stream","id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467309548+08:00","level":"INFO","msg":"stream: started","id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467331707+08:00","level":"INFO","msg":"handler: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467333162+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.467336174+08:00","level":"INFO","msg":"sender: started","stream_id":"co1ecmky"} +{"time":"2025-03-29T01:22:05.772490021+08:00","level":"INFO","msg":"Starting system monitor"} diff --git a/run-20250329_012205-co1ecmky/logs/debug.log b/run-20250329_012205-co1ecmky/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e339d8f002e195108d107e4808b55b264e5ae300 --- /dev/null +++ b/run-20250329_012205-co1ecmky/logs/debug.log @@ -0,0 +1,24 @@ +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Current SDK version is 0.19.8 +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Configure stats pid to 104999 +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /home/yangyaodong/.config/wandb/settings +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from /aifs4su/yaodong/hantao/align-anything/scripts/wandb/settings +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():647] Logging user logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug.log +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:setup_run_log_directory():648] Logging internal logs to ../outputs/test_7B/wandb/run-20250329_012205-co1ecmky/logs/debug-internal.log +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():761] calling init triggers +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():766] wandb.init called with sweep_config: {} +config: {'train_cfgs': {'save_checkpoint': False, 'load_checkpoint': False, 'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 16, 'gradient_checkpointing': True, 'learning_rate': 2e-05, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'adam_epsilon': 1e-08, 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'freeze_mm_proj': False, 'freeze_vision_tower': True, 'freeze_language_model': False, 'max_grad_norm': 1.0}, 'data_cfgs': {'load_multi_datasets': False, 'train_datasets': '/aifs4su/yaodong/hantao/datasets/MMInstruct-GPT4V_mistral-7b_cosi_cut/merged/top1-10', 'train_template': 'MM_TI2T_LLAVA', 'train_size': {}, 'train_split': 'train', 'train_name': 'text-image-to-text', 'train_data_files': {}, 'train_optional_args': [], 'eval_datasets': {}, 'eval_template': {}, 'eval_name': {}, 'eval_size': {}, 'eval_split': {}, 'eval_subset': {}, 'eval_data_files': {}, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'sft', 'output_dir': '../outputs/test_7B', 'cache_dir': {}, 'save_total_limit': 6}, 'model_cfgs': {'model_name_or_path': '/aifs4su/yaodong/hantao/models/llava-v1.6-mistral-7b-hf', 'trust_remote_code': True, 'model_max_length': 2048}, 'special_tokens': {}, '_wandb': {}} +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():784] starting backend +2025-03-29 01:22:05,246 INFO MainThread:104999 [wandb_init.py:init():788] sending inform_init request +2025-03-29 01:22:05,249 INFO MainThread:104999 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-29 01:22:05,249 INFO MainThread:104999 [wandb_init.py:init():798] backend started and connected +2025-03-29 01:22:05,251 INFO MainThread:104999 [wandb_init.py:init():891] updated telemetry +2025-03-29 01:22:05,262 INFO MainThread:104999 [wandb_init.py:init():915] communicating run to backend with 90.0 second timeout +2025-03-29 01:22:05,770 INFO MainThread:104999 [wandb_init.py:init():990] starting run threads in backend +2025-03-29 01:22:05,989 INFO MainThread:104999 [wandb_run.py:_console_start():2375] atexit reg +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2227] redirect: wrap_raw +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2292] Wrapping output streams. +2025-03-29 01:22:05,990 INFO MainThread:104999 [wandb_run.py:_redirect():2315] Redirects installed. +2025-03-29 01:22:05,992 INFO MainThread:104999 [wandb_init.py:init():1032] run started, returning control to user process +2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_finish():2112] finishing run htlou/align-anything/co1ecmky +2025-03-29 01:26:51,278 INFO MainThread:104999 [wandb_run.py:_atexit_cleanup():2340] got exitcode: 0 diff --git a/run-20250329_012205-co1ecmky/run-co1ecmky.wandb b/run-20250329_012205-co1ecmky/run-co1ecmky.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1876e7ed887df2bdeea227e2d76423f864a7ce0f --- /dev/null +++ b/run-20250329_012205-co1ecmky/run-co1ecmky.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e85fc17c9a06974495f61773295cc5e9c2dc95685211002f23903df63eebd4c4 +size 393216