diff --git a/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json b/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a1a29128a90880d67e34a4f37ea8c990e90750d0 --- /dev/null +++ b/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:07:59.121382Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719200272384" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json b/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log b/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f09d4cd23477aa305709b7457bafeb273e57072b --- /dev/null +++ b/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:07:59.123018178-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:07:59.123029468-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-core.log"} +{"time":"2024-10-30T01:07:59.228528967-04:00","level":"INFO","msg":"created new stream","id":"dim9v1es"} +{"time":"2024-10-30T01:07:59.228567837-04:00","level":"INFO","msg":"stream: started","id":"dim9v1es"} +{"time":"2024-10-30T01:07:59.228581067-04:00","level":"INFO","msg":"sender: started","stream_id":"dim9v1es"} +{"time":"2024-10-30T01:07:59.228568237-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"dim9v1es"}} +{"time":"2024-10-30T01:07:59.228568187-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"dim9v1es"}} +{"time":"2024-10-30T01:07:59.441316995-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:07:59.536719185-04:00","level":"INFO","msg":"stream: closing","id":"dim9v1es"} +{"time":"2024-10-30T01:07:59.536770865-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:07:59.53739974-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:08:00.081295733-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:08:00.206167083-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"dim9v1es"}} +{"time":"2024-10-30T01:08:00.206226274-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"dim9v1es"}} +{"time":"2024-10-30T01:08:00.206291324-04:00","level":"INFO","msg":"sender: closed","stream_id":"dim9v1es"} +{"time":"2024-10-30T01:08:00.206325864-04:00","level":"INFO","msg":"stream: closed","id":"dim9v1es"} diff --git a/wandb/run-20241030_010759-dim9v1es/logs/debug.log b/wandb/run-20241030_010759-dim9v1es/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..834e7bac02fbbec4cd3d5b338bde8b88b91b6a7b --- /dev/null +++ b/wandb/run-20241030_010759-dim9v1es/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Configure stats pid to 322462 +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug.log +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():671] starting backend +2024-10-30 01:07:59,119 INFO MainThread:322462 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:07:59,120 INFO MainThread:322462 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:07:59,121 INFO MainThread:322462 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:07:59,124 INFO MainThread:322462 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:07:59,156 INFO MainThread:322462 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:07:59,438 INFO MainThread:322462 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:07:59,533 INFO MainThread:322462 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:07:59,535 INFO MainThread:322462 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:07:59,535 INFO MainThread:322462 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:07:59,536 WARNING MsgRouterThr:322462 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/output.log b/wandb/run-20241030_012617-yt7vh1dq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e53ae3d6909872986257e38503a288ad80db241f --- /dev/null +++ b/wandb/run-20241030_012617-yt7vh1dq/files/output.log @@ -0,0 +1,2 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.36s/it] +Map: 11%|██████████████ | 2000/18140 [00:06<00:52, 308.60 examples/s] diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt b/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json b/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2c54bd4724db32a1cda3272082e3d60eb81487a5 --- /dev/null +++ b/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:26:17.324794Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1709772775424" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log b/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..70a031dec399cd9ca958fbd5469b4158a7dc6646 --- /dev/null +++ b/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:26:17.327161166-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:26:17.327175976-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-core.log"} +{"time":"2024-10-30T01:26:17.435118413-04:00","level":"INFO","msg":"created new stream","id":"yt7vh1dq"} +{"time":"2024-10-30T01:26:17.435165823-04:00","level":"INFO","msg":"stream: started","id":"yt7vh1dq"} +{"time":"2024-10-30T01:26:17.435237323-04:00","level":"INFO","msg":"sender: started","stream_id":"yt7vh1dq"} +{"time":"2024-10-30T01:26:17.435183773-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"yt7vh1dq"}} +{"time":"2024-10-30T01:26:17.435252003-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"yt7vh1dq"}} +{"time":"2024-10-30T01:26:17.695977809-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log b/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7ed3b0a4b1d0f4e8ed1af0fead97f3848cd4ecd9 --- /dev/null +++ b/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Configure stats pid to 332624 +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():671] starting backend +2024-10-30 01:26:17,323 INFO MainThread:332624 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:26:17,324 INFO MainThread:332624 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:26:17,324 INFO MainThread:332624 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:26:17,328 INFO MainThread:332624 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:26:17,385 INFO MainThread:332624 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:26:17,692 INFO MainThread:332624 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:26:17,844 INFO MainThread:332624 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:26:17,849 INFO MainThread:332624 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:26:17,850 INFO MainThread:332624 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} diff --git a/wandb/run-20241030_013141-bkcoggdw/files/config.yaml b/wandb/run-20241030_013141-bkcoggdw/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d471293c627dbaeaf5be47c709016b3dfbf28c6c --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_013141-bkcoggdw/files/output.log b/wandb/run-20241030_013141-bkcoggdw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d0ecc4c27edb1cb376892301d35b62359512b1a3 --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/files/output.log @@ -0,0 +1,62 @@ +wandb: 500 encountered ({"errors":[{"message":"An internal error occurred. Please contact support.","path":["upsertBucket"]}],"data":{"upsertBucket":null}}), retrying request +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 11.2MB/s] +Downloading shards: 0%| | 0/2 [01:04 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1381, in _hf_hub_download_to_cache_dir + _download_to_tmp_and_move( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1915, in _download_to_tmp_and_move + http_get( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 558, in http_get + return http_get( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1140, in __exit__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1275, in close + self._decr_instances(self) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 696, in _decr_instances + with cls._lock: + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 110, in __enter__ + def __enter__(self): +KeyboardInterrupt diff --git a/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt b/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json b/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..384dc5d9c3ccca87869fb92ab5b8c911825746ed --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:31:41.693480Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1709824425984" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json b/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b155621703e444311fe2da9b782a2b70b5491169 --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":94}} \ No newline at end of file diff --git a/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log b/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d69bd8925e81bf8fd31ec3529e2edc6cf85e29a7 --- /dev/null +++ b/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2024-10-30T01:31:41.69578299-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:31:41.6958064-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-bkcoggdw/logs/debug-core.log"} +{"time":"2024-10-30T01:31:41.804007808-04:00","level":"INFO","msg":"created new stream","id":"bkcoggdw"} +{"time":"2024-10-30T01:31:41.804049448-04:00","level":"INFO","msg":"stream: started","id":"bkcoggdw"} +{"time":"2024-10-30T01:31:41.804096128-04:00","level":"INFO","msg":"sender: started","stream_id":"bkcoggdw"} +{"time":"2024-10-30T01:31:41.804097678-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"bkcoggdw"}} +{"time":"2024-10-30T01:31:41.804074928-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"bkcoggdw"}} +{"time":"2024-10-30T01:31:41.90989171-04:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T01:31:44.314369215-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:33:16.64084496-04:00","level":"INFO","msg":"stream: closing","id":"bkcoggdw"} +{"time":"2024-10-30T01:33:16.64087647-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:33:16.641390294-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json b/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf16452ad5800bb82814bb30ae5db662ec9588f --- /dev/null +++ b/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:28:52.925806Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831611904" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log b/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d44c542ac2782d7a17ea50d32c95ef95ea8035f8 --- /dev/null +++ b/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T22:58:33.52447176-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T22:58:33.52448387-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug-core.log"} +{"time":"2024-10-30T22:58:33.631043427-04:00","level":"INFO","msg":"created new stream","id":"giupspdj"} +{"time":"2024-10-30T22:58:33.631075407-04:00","level":"INFO","msg":"stream: started","id":"giupspdj"} +{"time":"2024-10-30T22:58:33.631121257-04:00","level":"INFO","msg":"sender: started","stream_id":"giupspdj"} +{"time":"2024-10-30T22:58:33.631092947-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"giupspdj"}} +{"time":"2024-10-30T22:58:33.631105957-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"giupspdj"}} +{"time":"2024-10-30T22:58:33.831702761-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241031_001055-32u9qnul/files/output.log b/wandb/run-20241031_001055-32u9qnul/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d6ca833b35444b28034c5e68bd2d8c659b61d6e5 --- /dev/null +++ b/wandb/run-20241031_001055-32u9qnul/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.84s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 00:11:03,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 00:11:12,645] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.372655630111694 seconds diff --git a/wandb/run-20241031_001055-32u9qnul/files/requirements.txt b/wandb/run-20241031_001055-32u9qnul/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_001055-32u9qnul/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json b/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ab8a9f5a3a84eedc7d3300c02e6d025fff21f3 --- /dev/null +++ b/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T04:10:55.973455Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1728856920064" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json b/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2dd5d1779d154c5240ad450a940c656286731617 --- /dev/null +++ b/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T04:10:55.613835Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1728850759680" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log b/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5e0ff061b071aa5ae140dd99663db4ac6ac4da04 --- /dev/null +++ b/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-31T00:10:55.615453654-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T00:10:55.615464774-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_001055-sr4xke8e/logs/debug-core.log"} +{"time":"2024-10-31T00:10:55.72181439-04:00","level":"INFO","msg":"created new stream","id":"sr4xke8e"} +{"time":"2024-10-31T00:10:55.7218437-04:00","level":"INFO","msg":"stream: started","id":"sr4xke8e"} +{"time":"2024-10-31T00:10:55.721915701-04:00","level":"INFO","msg":"sender: started","stream_id":"sr4xke8e"} +{"time":"2024-10-31T00:10:55.721914011-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sr4xke8e"}} +{"time":"2024-10-31T00:10:55.721899881-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sr4xke8e"}} +{"time":"2024-10-31T00:10:55.919527304-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt b/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json b/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d8fd0987a40149c07002f44e26d4340781782d10 --- /dev/null +++ b/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T15:47:00.289124Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753158594560" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt b/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json b/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..446eee014fa68da60f0b4df9a048fe33cc073063 --- /dev/null +++ b/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T15:47:00.243502Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753158594560" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log b/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..fd446dc1e65013e9fdccc866f9587219ddfb6e68 --- /dev/null +++ b/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-31T11:47:00.246260836-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T11:47:00.246281016-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-core.log"} +{"time":"2024-10-31T11:47:00.352833535-04:00","level":"INFO","msg":"created new stream","id":"q0d78n2b"} +{"time":"2024-10-31T11:47:00.352859865-04:00","level":"INFO","msg":"stream: started","id":"q0d78n2b"} +{"time":"2024-10-31T11:47:00.352931156-04:00","level":"INFO","msg":"sender: started","stream_id":"q0d78n2b"} +{"time":"2024-10-31T11:47:00.352897086-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q0d78n2b"}} +{"time":"2024-10-31T11:47:00.352894256-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q0d78n2b"}} +{"time":"2024-10-31T11:47:00.611011859-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241031_114700-q0d78n2b/logs/debug.log b/wandb/run-20241031_114700-q0d78n2b/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a408b9f0693fe259c47f5be0304df1a0969ed1b9 --- /dev/null +++ b/wandb/run-20241031_114700-q0d78n2b/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Configure stats pid to 554145 +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug.log +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():621] calling init triggers +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():671] starting backend +2024-10-31 11:47:00,241 INFO MainThread:554145 [wandb_init.py:init():675] sending inform_init request +2024-10-31 11:47:00,243 INFO MainThread:554145 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 11:47:00,243 INFO MainThread:554145 [wandb_init.py:init():688] backend started and connected +2024-10-31 11:47:00,247 INFO MainThread:554145 [wandb_init.py:init():783] updated telemetry +2024-10-31 11:47:00,278 INFO MainThread:554145 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 11:47:00,608 INFO MainThread:554145 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 11:47:00,695 INFO MainThread:554145 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 11:47:00,696 INFO MainThread:554145 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 11:47:00,697 INFO MainThread:554145 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 0.0001} diff --git a/wandb/run-20241031_122005-nip14lm6/files/config.yaml b/wandb/run-20241031_122005-nip14lm6/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..725c8381c5f9fe81efa0c182e9fe88850f0f19e9 --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241031_122005-nip14lm6/files/output.log b/wandb/run-20241031_122005-nip14lm6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7932a49196018ff934fd534bc05f01d11a0e95d8 --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/files/output.log @@ -0,0 +1,35 @@ +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 8.51MB/s] +Downloading shards: 0%| | 0/2 [00:22 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241031_122005-nip14lm6/files/requirements.txt b/wandb/run-20241031_122005-nip14lm6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json b/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..56b499dd5cc4956cb199e725267386b547157f1d --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T16:20:05.846194Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753159847936" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json b/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..15f6b8e9049a55292dab131278b3f2fc1f52e50d --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23}} \ No newline at end of file diff --git a/wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb b/wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_012613-k6o0lha8/files/output.log b/wandb/run-20241101_012613-k6o0lha8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d --- /dev/null +++ b/wandb/run-20241101_012613-k6o0lha8/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder + builder_instance: DatasetBuilder = builder_cls( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__ + self.config, self.config_id = self._create_builder_config( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config + raise ValueError( +ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0'] diff --git a/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json b/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6e96a1dfccdea4e071278636df0097986c8a027a --- /dev/null +++ b/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T05:26:13.051361Z", + "args": [ + "--perturbation", + "shuffle_nodeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753992237056" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb b/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..22c8b57f9fb7484be6fc7d6cdbf69f3414a83821 Binary files /dev/null and b/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb differ diff --git a/wandb/run-20241101_012733-9v55tr72/files/output.log b/wandb/run-20241101_012733-9v55tr72/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1274fd7be7f3cdff563854e73f6319af33003741 --- /dev/null +++ b/wandb/run-20241101_012733-9v55tr72/files/output.log @@ -0,0 +1,196 @@ +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:04<00:00, 225385.84it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:00<00:00, 2536102.07it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16426/16426 [00:00<00:00, 27033.16it/s] +Generating train split: 16425 examples [00:08, 1830.77 examples/s]█████████████████████████████████████████████████████████ | 14150/16426 [00:00<00:00, 29025.68it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:05<00:00, 206269.15it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:00<00:00, 2626307.34it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17014/17014 [00:00<00:00, 24718.45it/s] +Generating validation split: 17013 examples [00:10, 1685.09 examples/s]█████████████████████████████████▏ | 12320/17014 [00:00<00:00, 26810.27it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:04<00:00, 250719.52it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:00<00:00, 3139247.02it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15439/15439 [00:00<00:00, 27030.75it/s] +Generating test split: 15438 examples [00:08, 1840.00 examples/s]█████████████████████████████████████████████████████████████▉ | 13736/15439 [00:00<00:00, 34826.46it/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.36s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.45s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:54<00:00, 303.11 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:57<00:00, 297.31 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 01:32:35,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 01:32:46,120] [INFO] [comm.py:652:init_distributed] cdb=None +[2024-11-01 01:32:46,120] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.5455732345581055 seconds +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + +{'loss': 3.0928, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0} +{'loss': 3.0737, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0} +{'loss': 3.1154, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01} +{'loss': 3.1109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01} +{'loss': 3.1179, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01} +{'loss': 3.089, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01} +{'loss': 3.1042, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02} +{'loss': 3.109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02} +{'loss': 3.097, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02} +{'loss': 3.1119, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02} + +{'eval_loss': 3.1238040924072266, 'eval_runtime': 12.4211, 'eval_samples_per_second': 48.305, 'eval_steps_per_second': 1.047, 'epoch': 0.02} +{'loss': 3.0899, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02} +{'loss': 3.1001, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03} +{'loss': 3.118, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03} +{'loss': 3.1069, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03} +{'loss': 3.0758, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.03} +{'loss': 3.0993, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.04} +{'loss': 3.0696, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04} +{'loss': 3.1161, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04} +{'loss': 3.0935, 'grad_norm': 8.783886909484863, 'learning_rate': 4.9945175438596495e-06, 'epoch': 0.04} +{'loss': 3.0074, 'grad_norm': 5.36458158493042, 'learning_rate': 4.992690058479532e-06, 'epoch': 0.04} +{'eval_loss': 2.966599225997925, 'eval_runtime': 11.5387, 'eval_samples_per_second': 51.999, 'eval_steps_per_second': 1.127, 'epoch': 0.04} +{'loss': 2.9491, 'grad_norm': 4.384786605834961, 'learning_rate': 4.990862573099415e-06, 'epoch': 0.05} +{'loss': 2.919, 'grad_norm': 5.37711238861084, 'learning_rate': 4.989035087719299e-06, 'epoch': 0.05} +{'loss': 2.8708, 'grad_norm': 4.1505208015441895, 'learning_rate': 4.987207602339182e-06, 'epoch': 0.05} +{'loss': 2.8378, 'grad_norm': 2.6863813400268555, 'learning_rate': 4.985380116959065e-06, 'epoch': 0.05} +{'loss': 2.8356, 'grad_norm': 2.7242753505706787, 'learning_rate': 4.983552631578948e-06, 'epoch': 0.05} +{'loss': 2.7966, 'grad_norm': 2.6609349250793457, 'learning_rate': 4.9817251461988304e-06, 'epoch': 0.06} +{'loss': 2.7441, 'grad_norm': 2.2204103469848633, 'learning_rate': 4.979897660818714e-06, 'epoch': 0.06} +{'loss': 2.7614, 'grad_norm': 2.374406099319458, 'learning_rate': 4.978070175438597e-06, 'epoch': 0.06} +{'loss': 2.7582, 'grad_norm': 2.696918249130249, 'learning_rate': 4.9762426900584795e-06, 'epoch': 0.06} +{'loss': 2.6845, 'grad_norm': 1.8222397565841675, 'learning_rate': 4.974415204678363e-06, 'epoch': 0.07} +{'eval_loss': 2.758892297744751, 'eval_runtime': 11.5398, 'eval_samples_per_second': 51.994, 'eval_steps_per_second': 1.127, 'epoch': 0.07} +{'loss': 2.6874, 'grad_norm': 2.076284408569336, 'learning_rate': 4.972587719298246e-06, 'epoch': 0.07} +{'loss': 2.693, 'grad_norm': 2.0412065982818604, 'learning_rate': 4.970760233918129e-06, 'epoch': 0.07} +{'loss': 2.6601, 'grad_norm': 1.8842229843139648, 'learning_rate': 4.968932748538012e-06, 'epoch': 0.07} +{'loss': 2.6749, 'grad_norm': 1.756975531578064, 'learning_rate': 4.967105263157895e-06, 'epoch': 0.07} +{'loss': 2.6141, 'grad_norm': 2.0640175342559814, 'learning_rate': 4.9652777777777786e-06, 'epoch': 0.08} +{'loss': 2.608, 'grad_norm': 1.5173723697662354, 'learning_rate': 4.963450292397661e-06, 'epoch': 0.08} +{'loss': 2.5623, 'grad_norm': 1.8280211687088013, 'learning_rate': 4.961622807017544e-06, 'epoch': 0.08} +{'loss': 2.6046, 'grad_norm': 1.990080714225769, 'learning_rate': 4.959795321637428e-06, 'epoch': 0.08} +{'loss': 2.5329, 'grad_norm': 1.4346381425857544, 'learning_rate': 4.95796783625731e-06, 'epoch': 0.09} +{'loss': 2.5307, 'grad_norm': 1.45533287525177, 'learning_rate': 4.956140350877193e-06, 'epoch': 0.09} +{'eval_loss': 2.629573345184326, 'eval_runtime': 11.5669, 'eval_samples_per_second': 51.872, 'eval_steps_per_second': 1.124, 'epoch': 0.09} +{'loss': 2.5591, 'grad_norm': 1.5484964847564697, 'learning_rate': 4.954312865497076e-06, 'epoch': 0.09} +{'loss': 2.5403, 'grad_norm': 1.3261419534683228, 'learning_rate': 4.9524853801169595e-06, 'epoch': 0.09} +{'loss': 2.5176, 'grad_norm': 1.4234470129013062, 'learning_rate': 4.950657894736843e-06, 'epoch': 0.09} +{'loss': 2.4846, 'grad_norm': 1.5438008308410645, 'learning_rate': 4.948830409356726e-06, 'epoch': 0.1} +{'loss': 2.4678, 'grad_norm': 1.4391041994094849, 'learning_rate': 4.947002923976609e-06, 'epoch': 0.1} +{'loss': 2.5105, 'grad_norm': 1.7514405250549316, 'learning_rate': 4.945175438596491e-06, 'epoch': 0.1} +{'loss': 2.5033, 'grad_norm': 1.2241393327713013, 'learning_rate': 4.943347953216375e-06, 'epoch': 0.1} +{'loss': 2.4888, 'grad_norm': 1.4796929359436035, 'learning_rate': 4.941520467836258e-06, 'epoch': 0.11} +{'loss': 2.497, 'grad_norm': 1.3036240339279175, 'learning_rate': 4.9396929824561404e-06, 'epoch': 0.11} +{'loss': 2.4566, 'grad_norm': 1.309809923171997, 'learning_rate': 4.937865497076024e-06, 'epoch': 0.11} +{'eval_loss': 2.558666944503784, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.11} +{'loss': 2.4567, 'grad_norm': 1.414117455482483, 'learning_rate': 4.936038011695907e-06, 'epoch': 0.11} +{'loss': 2.49, 'grad_norm': 1.4788432121276855, 'learning_rate': 4.9342105263157895e-06, 'epoch': 0.11} +{'loss': 2.4243, 'grad_norm': 1.4120174646377563, 'learning_rate': 4.932383040935672e-06, 'epoch': 0.12} +{'loss': 2.4309, 'grad_norm': 1.6317367553710938, 'learning_rate': 4.930555555555556e-06, 'epoch': 0.12} +{'loss': 2.4456, 'grad_norm': 1.1397351026535034, 'learning_rate': 4.9287280701754395e-06, 'epoch': 0.12} +{'loss': 2.4707, 'grad_norm': 1.6220897436141968, 'learning_rate': 4.926900584795322e-06, 'epoch': 0.12} +{'loss': 2.4705, 'grad_norm': 1.2757837772369385, 'learning_rate': 4.925073099415205e-06, 'epoch': 0.12} +{'loss': 2.4143, 'grad_norm': 1.3141602277755737, 'learning_rate': 4.9232456140350886e-06, 'epoch': 0.13} +{'loss': 2.4199, 'grad_norm': 1.5668749809265137, 'learning_rate': 4.921418128654971e-06, 'epoch': 0.13} +{'loss': 2.4307, 'grad_norm': 1.4094359874725342, 'learning_rate': 4.919590643274854e-06, 'epoch': 0.13} +{'eval_loss': 2.517282247543335, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.13} +{'loss': 2.3747, 'grad_norm': 1.752899169921875, 'learning_rate': 4.917763157894737e-06, 'epoch': 0.13} +{'loss': 2.4056, 'grad_norm': 1.739943027496338, 'learning_rate': 4.91593567251462e-06, 'epoch': 0.14} +{'loss': 2.4286, 'grad_norm': 1.6286025047302246, 'learning_rate': 4.914108187134503e-06, 'epoch': 0.14} +{'loss': 2.4063, 'grad_norm': 1.542277455329895, 'learning_rate': 4.912280701754386e-06, 'epoch': 0.14} +{'loss': 2.412, 'grad_norm': 1.8128482103347778, 'learning_rate': 4.9104532163742695e-06, 'epoch': 0.14} +{'loss': 2.4342, 'grad_norm': 1.3743454217910767, 'learning_rate': 4.908625730994152e-06, 'epoch': 0.14} +{'loss': 2.3785, 'grad_norm': 2.225510835647583, 'learning_rate': 4.906798245614036e-06, 'epoch': 0.15} +{'loss': 2.4023, 'grad_norm': 1.531154990196228, 'learning_rate': 4.904970760233919e-06, 'epoch': 0.15} +{'loss': 2.4038, 'grad_norm': 1.983007788658142, 'learning_rate': 4.903143274853801e-06, 'epoch': 0.15} +{'loss': 2.3977, 'grad_norm': 1.4333405494689941, 'learning_rate': 4.901315789473685e-06, 'epoch': 0.15} +{'eval_loss': 2.488751173019409, 'eval_runtime': 11.6055, 'eval_samples_per_second': 51.7, 'eval_steps_per_second': 1.12, 'epoch': 0.15} +{'loss': 2.381, 'grad_norm': 1.7076454162597656, 'learning_rate': 4.899488304093568e-06, 'epoch': 0.16} +{'loss': 2.3719, 'grad_norm': 1.627768874168396, 'learning_rate': 4.8976608187134504e-06, 'epoch': 0.16} +{'loss': 2.3685, 'grad_norm': 1.3088836669921875, 'learning_rate': 4.895833333333333e-06, 'epoch': 0.16} +{'loss': 2.3684, 'grad_norm': 1.7792292833328247, 'learning_rate': 4.894005847953217e-06, 'epoch': 0.16} +{'loss': 2.377, 'grad_norm': 1.4323128461837769, 'learning_rate': 4.8921783625731e-06, 'epoch': 0.16} +{'loss': 2.366, 'grad_norm': 1.5406019687652588, 'learning_rate': 4.890350877192983e-06, 'epoch': 0.17} +{'loss': 2.3304, 'grad_norm': 1.864188313484192, 'learning_rate': 4.888523391812866e-06, 'epoch': 0.17} +{'loss': 2.3666, 'grad_norm': 1.6635836362838745, 'learning_rate': 4.886695906432749e-06, 'epoch': 0.17} +{'loss': 2.3656, 'grad_norm': 1.360572099685669, 'learning_rate': 4.884868421052632e-06, 'epoch': 0.17} +{'loss': 2.3807, 'grad_norm': 1.5489475727081299, 'learning_rate': 4.883040935672515e-06, 'epoch': 0.18} +{'eval_loss': 2.464202880859375, 'eval_runtime': 11.5982, 'eval_samples_per_second': 51.732, 'eval_steps_per_second': 1.121, 'epoch': 0.18} +{'loss': 2.351, 'grad_norm': 1.4594776630401611, 'learning_rate': 4.881213450292398e-06, 'epoch': 0.18} +{'loss': 2.3653, 'grad_norm': 1.4087573289871216, 'learning_rate': 4.879385964912281e-06, 'epoch': 0.18} +{'loss': 2.3573, 'grad_norm': 2.222598075866699, 'learning_rate': 4.877558479532164e-06, 'epoch': 0.18} +{'loss': 2.4051, 'grad_norm': 1.8786218166351318, 'learning_rate': 4.875730994152047e-06, 'epoch': 0.18} +{'loss': 2.3461, 'grad_norm': 1.4465943574905396, 'learning_rate': 4.8739035087719296e-06, 'epoch': 0.19} +{'loss': 2.3144, 'grad_norm': 1.9490894079208374, 'learning_rate': 4.872076023391813e-06, 'epoch': 0.19} +{'loss': 2.3444, 'grad_norm': 1.7288326025009155, 'learning_rate': 4.870248538011697e-06, 'epoch': 0.19} +{'loss': 2.3569, 'grad_norm': 1.7530410289764404, 'learning_rate': 4.8684210526315795e-06, 'epoch': 0.19} +{'loss': 2.3743, 'grad_norm': 1.4135267734527588, 'learning_rate': 4.866593567251462e-06, 'epoch': 0.19} +{'loss': 2.3217, 'grad_norm': 1.8368803262710571, 'learning_rate': 4.864766081871346e-06, 'epoch': 0.2} +{'eval_loss': 2.450228452682495, 'eval_runtime': 11.5967, 'eval_samples_per_second': 51.739, 'eval_steps_per_second': 1.121, 'epoch': 0.2} +{'loss': 2.3678, 'grad_norm': 1.3603743314743042, 'learning_rate': 4.862938596491229e-06, 'epoch': 0.2} +{'loss': 2.3663, 'grad_norm': 1.9931479692459106, 'learning_rate': 4.861111111111111e-06, 'epoch': 0.2} +{'loss': 2.3366, 'grad_norm': 1.4983241558074951, 'learning_rate': 4.859283625730994e-06, 'epoch': 0.2} +{'loss': 2.3388, 'grad_norm': 1.9140528440475464, 'learning_rate': 4.857456140350878e-06, 'epoch': 0.21} +{'loss': 2.3373, 'grad_norm': 1.4306626319885254, 'learning_rate': 4.8556286549707604e-06, 'epoch': 0.21} +{'loss': 2.3393, 'grad_norm': 1.8524028062820435, 'learning_rate': 4.853801169590643e-06, 'epoch': 0.21} +{'loss': 2.3076, 'grad_norm': 1.4418741464614868, 'learning_rate': 4.851973684210527e-06, 'epoch': 0.21} +{'loss': 2.3428, 'grad_norm': 1.6648645401000977, 'learning_rate': 4.8501461988304095e-06, 'epoch': 0.21} +{'loss': 2.3321, 'grad_norm': 1.887403130531311, 'learning_rate': 4.848318713450293e-06, 'epoch': 0.22} +{'loss': 2.3366, 'grad_norm': 1.9936954975128174, 'learning_rate': 4.846491228070176e-06, 'epoch': 0.22} +{'eval_loss': 2.4382259845733643, 'eval_runtime': 11.6173, 'eval_samples_per_second': 51.647, 'eval_steps_per_second': 1.119, 'epoch': 0.22} +{'loss': 2.3472, 'grad_norm': 1.8773130178451538, 'learning_rate': 4.844663742690059e-06, 'epoch': 0.22} +{'loss': 2.3142, 'grad_norm': 1.8776212930679321, 'learning_rate': 4.842836257309942e-06, 'epoch': 0.22} +{'loss': 2.3106, 'grad_norm': 2.305266857147217, 'learning_rate': 4.841008771929825e-06, 'epoch': 0.23} +{'loss': 2.3276, 'grad_norm': 2.13682222366333, 'learning_rate': 4.839181286549708e-06, 'epoch': 0.23} +{'loss': 2.3762, 'grad_norm': 1.4358876943588257, 'learning_rate': 4.8373538011695905e-06, 'epoch': 0.23} +{'loss': 2.3149, 'grad_norm': 1.7932581901550293, 'learning_rate': 4.835526315789474e-06, 'epoch': 0.23} +{'loss': 2.275, 'grad_norm': 1.6192528009414673, 'learning_rate': 4.833698830409358e-06, 'epoch': 0.23} +{'loss': 2.2949, 'grad_norm': 2.0717737674713135, 'learning_rate': 4.83187134502924e-06, 'epoch': 0.24} +{'loss': 2.3171, 'grad_norm': 1.6378692388534546, 'learning_rate': 4.830043859649123e-06, 'epoch': 0.24} +{'loss': 2.3051, 'grad_norm': 1.669114112854004, 'learning_rate': 4.828216374269007e-06, 'epoch': 0.24} +{'eval_loss': 2.4258029460906982, 'eval_runtime': 11.6018, 'eval_samples_per_second': 51.716, 'eval_steps_per_second': 1.121, 'epoch': 0.24} +{'loss': 2.3045, 'grad_norm': 1.3886950016021729, 'learning_rate': 4.8263888888888895e-06, 'epoch': 0.24} +{'loss': 2.3076, 'grad_norm': 1.7219699621200562, 'learning_rate': 4.824561403508772e-06, 'epoch': 0.25} +{'loss': 2.2857, 'grad_norm': 1.4992568492889404, 'learning_rate': 4.822733918128655e-06, 'epoch': 0.25} +{'loss': 2.3096, 'grad_norm': 1.7140436172485352, 'learning_rate': 4.820906432748539e-06, 'epoch': 0.25} +{'loss': 2.3194, 'grad_norm': 1.6086301803588867, 'learning_rate': 4.819078947368421e-06, 'epoch': 0.25} +{'loss': 2.3498, 'grad_norm': 1.5028151273727417, 'learning_rate': 4.817251461988304e-06, 'epoch': 0.25} +{'loss': 2.2867, 'grad_norm': 1.7474697828292847, 'learning_rate': 4.815423976608188e-06, 'epoch': 0.26} +{'loss': 2.2662, 'grad_norm': 1.6730782985687256, 'learning_rate': 4.8135964912280704e-06, 'epoch': 0.26} +{'loss': 2.2926, 'grad_norm': 2.4538962841033936, 'learning_rate': 4.811769005847954e-06, 'epoch': 0.26} +{'loss': 2.3217, 'grad_norm': 1.6457512378692627, 'learning_rate': 4.809941520467837e-06, 'epoch': 0.26} +{'eval_loss': 2.408306121826172, 'eval_runtime': 11.6092, 'eval_samples_per_second': 51.683, 'eval_steps_per_second': 1.12, 'epoch': 0.26} +{'loss': 2.2943, 'grad_norm': 1.8019167184829712, 'learning_rate': 4.8081140350877195e-06, 'epoch': 0.27} +{'loss': 2.2945, 'grad_norm': 1.3667467832565308, 'learning_rate': 4.806286549707603e-06, 'epoch': 0.27} +{'loss': 2.2925, 'grad_norm': 1.6296675205230713, 'learning_rate': 4.804459064327486e-06, 'epoch': 0.27} +{'loss': 2.2902, 'grad_norm': 1.9956955909729004, 'learning_rate': 4.802631578947369e-06, 'epoch': 0.27} +{'loss': 2.2738, 'grad_norm': 1.850484848022461, 'learning_rate': 4.800804093567251e-06, 'epoch': 0.27} +{'loss': 2.2809, 'grad_norm': 1.682741403579712, 'learning_rate': 4.798976608187135e-06, 'epoch': 0.28} +{'loss': 2.2944, 'grad_norm': 1.5462265014648438, 'learning_rate': 4.797149122807018e-06, 'epoch': 0.28} +{'loss': 2.3055, 'grad_norm': 1.6992024183273315, 'learning_rate': 4.7953216374269005e-06, 'epoch': 0.28} +{'loss': 2.2811, 'grad_norm': 2.0903217792510986, 'learning_rate': 4.793494152046784e-06, 'epoch': 0.28} +{'loss': 2.3058, 'grad_norm': 1.9676622152328491, 'learning_rate': 4.791666666666668e-06, 'epoch': 0.28} +{'eval_loss': 2.40899658203125, 'eval_runtime': 11.5988, 'eval_samples_per_second': 51.729, 'eval_steps_per_second': 1.121, 'epoch': 0.28} +{'loss': 2.3126, 'grad_norm': 1.6649582386016846, 'learning_rate': 4.78983918128655e-06, 'epoch': 0.29} +{'loss': 2.2789, 'grad_norm': 2.4453353881835938, 'learning_rate': 4.788011695906433e-06, 'epoch': 0.29} +{'loss': 2.3292, 'grad_norm': 2.011908769607544, 'learning_rate': 4.786184210526316e-06, 'epoch': 0.29} +{'loss': 2.2913, 'grad_norm': 1.4906234741210938, 'learning_rate': 4.7843567251461995e-06, 'epoch': 0.29} +{'loss': 2.2931, 'grad_norm': 2.1027095317840576, 'learning_rate': 4.782529239766082e-06, 'epoch': 0.3} +{'loss': 2.3031, 'grad_norm': 1.4204366207122803, 'learning_rate': 4.780701754385965e-06, 'epoch': 0.3} +{'loss': 2.3196, 'grad_norm': 2.1822638511657715, 'learning_rate': 4.778874269005848e-06, 'epoch': 0.3} +{'loss': 2.2743, 'grad_norm': 1.7422493696212769, 'learning_rate': 4.777046783625731e-06, 'epoch': 0.3} +{'loss': 2.285, 'grad_norm': 1.6661350727081299, 'learning_rate': 4.775219298245615e-06, 'epoch': 0.3} +{'loss': 2.295, 'grad_norm': 1.7462584972381592, 'learning_rate': 4.773391812865498e-06, 'epoch': 0.31} +{'eval_loss': 2.4030065536499023, 'eval_runtime': 11.608, 'eval_samples_per_second': 51.688, 'eval_steps_per_second': 1.12, 'epoch': 0.31} +{'loss': 2.2568, 'grad_norm': 1.5412518978118896, 'learning_rate': 4.7715643274853804e-06, 'epoch': 0.31} +{'loss': 2.2638, 'grad_norm': 1.5836228132247925, 'learning_rate': 4.769736842105264e-06, 'epoch': 0.31} +{'loss': 2.2783, 'grad_norm': 1.7100133895874023, 'learning_rate': 4.767909356725147e-06, 'epoch': 0.31} +{'loss': 2.3331, 'grad_norm': 1.7988075017929077, 'learning_rate': 4.7660818713450295e-06, 'epoch': 0.32} +{'loss': 2.297, 'grad_norm': 1.66475510597229, 'learning_rate': 4.764254385964912e-06, 'epoch': 0.32} +{'loss': 2.2671, 'grad_norm': 1.811797857284546, 'learning_rate': 4.762426900584796e-06, 'epoch': 0.32} +{'loss': 2.2555, 'grad_norm': 1.4660041332244873, 'learning_rate': 4.760599415204679e-06, 'epoch': 0.32} +{'loss': 2.2699, 'grad_norm': 2.041257381439209, 'learning_rate': 4.758771929824561e-06, 'epoch': 0.32} +{'loss': 2.2772, 'grad_norm': 1.6798973083496094, 'learning_rate': 4.756944444444445e-06, 'epoch': 0.33} diff --git a/wandb/run-20241101_012733-9v55tr72/files/requirements.txt b/wandb/run-20241101_012733-9v55tr72/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_012733-9v55tr72/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json b/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..81f4e96e0da386e846c29c764f9b5648d64b82ed --- /dev/null +++ b/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T05:27:33.891704Z", + "args": [ + "--perturbation", + "shuffle_nondeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753992261632" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log b/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e82bcf5af30ba4419f4145672e1330c5342eee52 --- /dev/null +++ b/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T01:27:33.89478546-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:27:33.89479698-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-core.log"} +{"time":"2024-11-01T01:27:34.000399053-04:00","level":"INFO","msg":"created new stream","id":"9v55tr72"} +{"time":"2024-11-01T01:27:34.000437843-04:00","level":"INFO","msg":"stream: started","id":"9v55tr72"} +{"time":"2024-11-01T01:27:34.000488243-04:00","level":"INFO","msg":"sender: started","stream_id":"9v55tr72"} +{"time":"2024-11-01T01:27:34.000483083-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"9v55tr72"}} +{"time":"2024-11-01T01:27:34.000469223-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"9v55tr72"}} +{"time":"2024-11-01T01:27:34.182959789-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_012733-9v55tr72/logs/debug.log b/wandb/run-20241101_012733-9v55tr72/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..db741336f5379f9f7eb42dda8ab3cf65ebec06ff --- /dev/null +++ b/wandb/run-20241101_012733-9v55tr72/logs/debug.log @@ -0,0 +1,29 @@ +2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Configure stats pid to 678552 +2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:27:33,888 INFO MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug.log +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():671] starting backend +2024-11-01 01:27:33,889 INFO MainThread:678552 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:27:33,891 INFO MainThread:678552 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:27:33,891 INFO MainThread:678552 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:27:33,894 INFO MainThread:678552 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:27:33,912 INFO MainThread:678552 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:27:34,178 INFO MainThread:678552 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:27:34,264 INFO MainThread:678552 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:27:34,266 INFO MainThread:678552 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:27:34,266 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 01:33:19,616 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 150, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-01 01:33:19,623 INFO MainThread:678552 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - > +2024-11-01 01:33:19,623 INFO MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None diff --git a/wandb/run-20241101_094656-ae4hctp0/files/output.log b/wandb/run-20241101_094656-ae4hctp0/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..127126551236cb66b04fdb9d1bbf00e4210b9038 --- /dev/null +++ b/wandb/run-20241101_094656-ae4hctp0/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.71s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 09:47:04,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 09:47:14,377] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.517135381698608 seconds diff --git a/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt b/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json b/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ddfb439dc3cbb516110be7211d21973744242988 --- /dev/null +++ b/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T13:46:56.380225Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754695659520" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_094656-ae4hctp0/logs/debug.log b/wandb/run-20241101_094656-ae4hctp0/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..fc8b27166a694bd466abb7311225057a611a3c3a --- /dev/null +++ b/wandb/run-20241101_094656-ae4hctp0/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Configure stats pid to 786690 +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug.log +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():621] calling init triggers +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():671] starting backend +2024-11-01 09:46:56,378 INFO MainThread:786690 [wandb_init.py:init():675] sending inform_init request +2024-11-01 09:46:56,379 INFO MainThread:786690 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 09:46:56,380 INFO MainThread:786690 [wandb_init.py:init():688] backend started and connected +2024-11-01 09:46:56,383 INFO MainThread:786690 [wandb_init.py:init():783] updated telemetry +2024-11-01 09:46:56,411 INFO MainThread:786690 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 09:46:56,711 INFO MainThread:786690 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 09:46:56,846 INFO MainThread:786690 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 09:46:56,847 INFO MainThread:786690 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 09:46:56,849 INFO MainThread:786690 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 09:46:56,849 INFO MainThread:786690 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb b/wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_200517-77b12390/files/output.log b/wandb/run-20241101_200517-77b12390/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..73134a1b145f9b9eae01e20f4bdcd927010d9ab3 --- /dev/null +++ b/wandb/run-20241101_200517-77b12390/files/output.log @@ -0,0 +1,57 @@ +Downloading shards: 0%| | 0/2 [00:00 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close + self.display(pos=0) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display + self.sp(self.__str__() if msg is None else msg) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 459, in print_status + fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0))) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 452, in fp_write + fp.write(str(s)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner + return func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write + cb(data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in + lambda data: self._console_raw_callback("stderr", data), + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 403, in wrapper_fn + return func(self, *args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1547, in _console_raw_callback + self._backend.interface.publish_output_raw(name, data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 721, in publish_output_raw + self._publish_output_raw(o) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 79, in _publish_output_raw + self._publish(rec) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 50, in _publish + self._assign(record) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 45, in _assign + def _assign(self, record: Any) -> None: +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 173, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json b/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..37f79f63e00320deea3302d6743afafd21b3338e --- /dev/null +++ b/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-02T00:05:17.462510Z", + "args": [ + "--perturbation", + "shuffle_nondeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754801557504" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_200517-77b12390/files/wandb-summary.json b/wandb/run-20241101_200517-77b12390/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c437ff1a48b0e53a8cdd36dcd584a8e6b22b4bc2 --- /dev/null +++ b/wandb/run-20241101_200517-77b12390/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":7}} \ No newline at end of file diff --git a/wandb/run-20241101_200517-77b12390/logs/debug-internal.log b/wandb/run-20241101_200517-77b12390/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6ac4efda0e718cdf43d1a08e2192d5cc5b75c282 --- /dev/null +++ b/wandb/run-20241101_200517-77b12390/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-01T20:05:17.45897271-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:05:17.45899154-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug-core.log"} +{"time":"2024-11-01T20:05:17.566867501-04:00","level":"INFO","msg":"created new stream","id":"77b12390"} +{"time":"2024-11-01T20:05:17.566910622-04:00","level":"INFO","msg":"stream: started","id":"77b12390"} +{"time":"2024-11-01T20:05:17.567008312-04:00","level":"INFO","msg":"sender: started","stream_id":"77b12390"} +{"time":"2024-11-01T20:05:17.566939652-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"77b12390"}} +{"time":"2024-11-01T20:05:17.566955952-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"77b12390"}} +{"time":"2024-11-01T20:05:17.736070962-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T20:05:25.266366833-04:00","level":"INFO","msg":"stream: closing","id":"77b12390"} +{"time":"2024-11-01T20:05:25.266402524-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T20:05:25.266957688-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241101_200517-77b12390/logs/debug.log b/wandb/run-20241101_200517-77b12390/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..71947adb65477bd25fab337dc910ef1e17f6f6d1 --- /dev/null +++ b/wandb/run-20241101_200517-77b12390/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Configure stats pid to 870381 +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:05:17,453 INFO MainThread:870381 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug.log +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug-internal.log +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:init():671] starting backend +2024-11-01 20:05:17,454 INFO MainThread:870381 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:05:17,456 INFO MainThread:870381 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:05:17,456 INFO MainThread:870381 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:05:17,465 INFO MainThread:870381 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:05:17,492 INFO MainThread:870381 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:05:17,733 INFO MainThread:870381 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:05:17,821 INFO MainThread:870381 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:05:17,821 INFO MainThread:870381 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:05:17,821 INFO MainThread:870381 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:05:17,821 INFO MainThread:870381 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:05:17,822 INFO MainThread:870381 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:05:17,822 INFO MainThread:870381 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:05:25,266 WARNING MsgRouterThr:870381 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_200517-iopieyi0/run-iopieyi0.wandb b/wandb/run-20241101_200517-iopieyi0/run-iopieyi0.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb b/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..84712ce5952c0d074021247a0bfdf12463bdccbb Binary files /dev/null and b/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb differ diff --git a/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log b/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3bc4d3693d6886135e94299dfb271daaf90d3682 --- /dev/null +++ b/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-05T16:18:32.541242362-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:18:32.541252362-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug-core.log"} +{"time":"2024-11-05T16:18:32.64870613-05:00","level":"INFO","msg":"created new stream","id":"sl992h9i"} +{"time":"2024-11-05T16:18:32.648925761-05:00","level":"INFO","msg":"stream: started","id":"sl992h9i"} +{"time":"2024-11-05T16:18:32.648950381-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sl992h9i"}} +{"time":"2024-11-05T16:18:32.649202603-05:00","level":"INFO","msg":"sender: started","stream_id":"sl992h9i"} +{"time":"2024-11-05T16:18:32.649258273-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sl992h9i"}} +{"time":"2024-11-05T16:18:32.866729655-05:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241105_161832-sl992h9i/logs/debug.log b/wandb/run-20241105_161832-sl992h9i/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..919a7094c2ccc3c0af70ed62eb8a0c87e1a32bb3 --- /dev/null +++ b/wandb/run-20241105_161832-sl992h9i/logs/debug.log @@ -0,0 +1,29 @@ +2024-11-05 16:18:32,537 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:18:32,537 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Configure stats pid to 1773597 +2024-11-05 16:18:32,537 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:18:32,537 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:18:32,537 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug.log +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:init():671] starting backend +2024-11-05 16:18:32,538 INFO MainThread:1773597 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:18:32,539 INFO MainThread:1773597 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:18:32,539 INFO MainThread:1773597 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:18:32,541 INFO MainThread:1773597 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:18:32,575 INFO MainThread:1773597 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:18:32,863 INFO MainThread:1773597 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:18:32,949 INFO MainThread:1773597 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:18:32,949 INFO MainThread:1773597 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:18:32,950 INFO MainThread:1773597 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:18:32,950 INFO MainThread:1773597 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:18:32,951 INFO MainThread:1773597 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:18:32,951 INFO MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:24:36,566 INFO MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-05 16:24:36,569 INFO MainThread:1773597 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - > +2024-11-05 16:24:36,569 INFO MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None diff --git a/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt b/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log b/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5a35297ade645a0097ec42bed9de6cd507afa7bc --- /dev/null +++ b/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-05 16:28:24,422 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:28:24,422 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Configure stats pid to 1777856 +2024-11-05 16:28:24,422 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:28:24,422 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:init():671] starting backend +2024-11-05 16:28:24,423 INFO MainThread:1777856 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:28:24,425 INFO MainThread:1777856 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:28:24,425 INFO MainThread:1777856 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:28:24,428 INFO MainThread:1777856 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:28:24,454 INFO MainThread:1777856 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:28:24,752 INFO MainThread:1777856 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:28:24,846 INFO MainThread:1777856 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:28:24,846 INFO MainThread:1777856 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:28:24,847 INFO MainThread:1777856 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:28:24,847 INFO MainThread:1777856 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:28:24,849 INFO MainThread:1777856 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:28:24,850 INFO MainThread:1777856 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241105_162858-6py0unak/run-6py0unak.wandb b/wandb/run-20241105_162858-6py0unak/run-6py0unak.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml b/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c9885a80c6c6551af0939982b4765da8541cf4 --- /dev/null +++ b/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_even_odd +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/output.log b/wandb/run-20241106_234111-v7pqfaqj/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..67dc001e59e5b45c00aba7db4d17418174736f80 --- /dev/null +++ b/wandb/run-20241106_234111-v7pqfaqj/files/output.log @@ -0,0 +1,43 @@ +Downloading shards: 0%| | 0/2 [00:00 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close + self.display(pos=0) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display + self.sp(self.__str__() if msg is None else msg) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 458, in print_status + len_s = disp_len(s) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 383, in disp_len + return _text_width(RE_ANSI.sub('', data)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 375, in _text_width + return sum(2 if east_asian_width(ch) in 'FW' else 1 for ch in str(s)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 375, in + return sum(2 if east_asian_width(ch) in 'FW' else 1 for ch in str(s)) +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt b/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log b/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..9200cd1a9604def4926562b9dfa5f33ba3955450 --- /dev/null +++ b/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-06T23:41:11.203613171-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T23:41:11.203623461-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug-core.log"} +{"time":"2024-11-06T23:41:11.309763781-05:00","level":"INFO","msg":"created new stream","id":"v7pqfaqj"} +{"time":"2024-11-06T23:41:11.309797181-05:00","level":"INFO","msg":"stream: started","id":"v7pqfaqj"} +{"time":"2024-11-06T23:41:11.309856822-05:00","level":"INFO","msg":"sender: started","stream_id":"v7pqfaqj"} +{"time":"2024-11-06T23:41:11.309818252-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"v7pqfaqj"}} +{"time":"2024-11-06T23:41:11.309881832-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"v7pqfaqj"}} +{"time":"2024-11-06T23:41:11.514969656-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T23:43:02.304539765-05:00","level":"INFO","msg":"stream: closing","id":"v7pqfaqj"} +{"time":"2024-11-06T23:43:02.304582275-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T23:43:02.305136719-05:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log b/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..80b6cf6e464cfbe74e794b683479d1a6701f6942 --- /dev/null +++ b/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 23:41:11,199 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 23:41:11,199 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Configure stats pid to 1997395 +2024-11-06 23:41:11,199 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 23:41:11,199 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:init():621] calling init triggers +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:init():671] starting backend +2024-11-06 23:41:11,200 INFO MainThread:1997395 [wandb_init.py:init():675] sending inform_init request +2024-11-06 23:41:11,201 INFO MainThread:1997395 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 23:41:11,201 INFO MainThread:1997395 [wandb_init.py:init():688] backend started and connected +2024-11-06 23:41:11,204 INFO MainThread:1997395 [wandb_init.py:init():783] updated telemetry +2024-11-06 23:41:11,227 INFO MainThread:1997395 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 23:41:11,511 INFO MainThread:1997395 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 23:41:11,624 INFO MainThread:1997395 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 23:41:11,624 INFO MainThread:1997395 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 23:41:11,624 INFO MainThread:1997395 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 23:41:11,624 INFO MainThread:1997395 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 23:41:11,626 INFO MainThread:1997395 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 23:41:11,626 INFO MainThread:1997395 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_even_odd', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 23:43:02,304 WARNING MsgRouterThr:1997395 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb b/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb new file mode 100644 index 0000000000000000000000000000000000000000..bae7aa0efd8cc8a9f486b401d7153ee3eb875b8e Binary files /dev/null and b/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb differ diff --git a/wandb/run-20241113_180154-y8wvn5hq/files/output.log b/wandb/run-20241113_180154-y8wvn5hq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..ab4a8210003ba08325765d147693863e16b8737c --- /dev/null +++ b/wandb/run-20241113_180154-y8wvn5hq/files/output.log @@ -0,0 +1,29 @@ +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:04<00:00, 269079.17it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:00<00:00, 3537253.85it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17520/17520 [00:00<00:00, 38005.50it/s] +Generating train split: 17519 examples [00:07, 2284.97 examples/s]██████████████████████████████████████████████████████████████████████████▊ | 13440/17520 [00:00<00:00, 38164.07it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:04<00:00, 251453.58it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:00<00:00, 3190550.09it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18141/18141 [00:00<00:00, 36805.97it/s] +Generating validation split: 18140 examples [00:08, 2245.37 examples/s]████████████████████████████████████████████████████████████████████████████████████▏ | 15397/18141 [00:00<00:00, 38602.13it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:03<00:00, 287511.18it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:00<00:00, 3701774.08it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16483/16483 [00:00<00:00, 37332.58it/s] +Generating test split: 16482 examples [00:06, 2366.18 examples/s]████████████████████████████████████████████████████████████████ | 11548/16483 [00:00<00:00, 36946.41it/s] +config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 589kB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.15s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.03s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:49<00:00, 353.64 examples/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:50<00:00, 358.38 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-13 18:06:38,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-13 18:06:49,762] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 12.725266218185425 seconds diff --git a/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json b/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9d488cac5a4f0dab8caee1a17cc3c0fff6cb57cf --- /dev/null +++ b/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-13T23:01:54.134835Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py", + "codePath": "train/train_ftp.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_ftp.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1744802353152" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log b/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..926e2cb335a135cbb4c32f6d1692e15bda6950fb --- /dev/null +++ b/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-13T18:01:54.13856869-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-13T18:01:54.13860319-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241113_180154-y8wvn5hq/logs/debug-core.log"} +{"time":"2024-11-13T18:01:54.252154361-05:00","level":"INFO","msg":"created new stream","id":"y8wvn5hq"} +{"time":"2024-11-13T18:01:54.252214081-05:00","level":"INFO","msg":"stream: started","id":"y8wvn5hq"} +{"time":"2024-11-13T18:01:54.252242311-05:00","level":"INFO","msg":"sender: started","stream_id":"y8wvn5hq"} +{"time":"2024-11-13T18:01:54.252224001-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"y8wvn5hq"}} +{"time":"2024-11-13T18:01:54.252245641-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"y8wvn5hq"}} +{"time":"2024-11-13T18:01:54.482362144-05:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241115_125218-rrve0rbk/files/output.log b/wandb/run-20241115_125218-rrve0rbk/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5aa223281be6a039d3f567b0f3eb7240c3890288 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/files/output.log @@ -0,0 +1,23 @@ +config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 258kB/s] +model.safetensors.index.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 18.4MB/s] +model-00001-of-00002.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.0MB/s] +Downloading shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:33<00:00, 76.60s/it] +Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00, 1.92s/it] +generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 54.9kB/s] +Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:43<00:00, 377.79 examples/s] +Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:44<00:00, 380.17 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-15 12:56:26,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-15 12:56:31,912] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.30732536315918 seconds diff --git a/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt b/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json b/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..35ad49ba614cfe26b6352dac7279e1f68c441251 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-15T17:52:18.165441Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py", + "codePath": "train/train_ftp.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_ftp.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1762493665280" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log b/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a89ebf414e2306400790ea6a8eb6b033b98fd456 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log @@ -0,0 +1,10 @@ +{"time":"2024-11-15T12:52:18.166300035-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-15T12:52:18.166310145-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug-core.log"} +{"time":"2024-11-15T12:52:18.275265339-05:00","level":"INFO","msg":"created new stream","id":"rrve0rbk"} +{"time":"2024-11-15T12:52:18.275299929-05:00","level":"INFO","msg":"stream: started","id":"rrve0rbk"} +{"time":"2024-11-15T12:52:18.275345179-05:00","level":"INFO","msg":"sender: started","stream_id":"rrve0rbk"} +{"time":"2024-11-15T12:52:18.275322849-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"rrve0rbk"}} +{"time":"2024-11-15T12:52:18.275333929-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"rrve0rbk"}} +{"time":"2024-11-15T12:52:18.531175062-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-15T14:35:03.896023097-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/rrve0rbk/file_stream"} +{"time":"2024-11-15T14:52:44.59452379-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.81.0.110:36132->35.186.228.49:443: read: connection reset by peer"} diff --git a/wandb/run-20241115_125218-rrve0rbk/logs/debug.log b/wandb/run-20241115_125218-rrve0rbk/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3914487e1e37f02bfb67fef6a9685bd9d8d63767 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Configure stats pid to 2609855 +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'} +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-15 12:52:18,162 INFO MainThread:2609855 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug.log +2024-11-15 12:52:18,163 INFO MainThread:2609855 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log +2024-11-15 12:52:18,163 INFO MainThread:2609855 [wandb_init.py:init():621] calling init triggers +2024-11-15 12:52:18,163 INFO MainThread:2609855 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-15 12:52:18,163 INFO MainThread:2609855 [wandb_init.py:init():671] starting backend +2024-11-15 12:52:18,163 INFO MainThread:2609855 [wandb_init.py:init():675] sending inform_init request +2024-11-15 12:52:18,164 INFO MainThread:2609855 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-15 12:52:18,165 INFO MainThread:2609855 [wandb_init.py:init():688] backend started and connected +2024-11-15 12:52:18,167 INFO MainThread:2609855 [wandb_init.py:init():783] updated telemetry +2024-11-15 12:52:18,184 INFO MainThread:2609855 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-15 12:52:18,528 INFO MainThread:2609855 [wandb_init.py:init():867] starting run threads in backend +2024-11-15 12:52:18,618 INFO MainThread:2609855 [wandb_run.py:_console_start():2463] atexit reg +2024-11-15 12:52:18,618 INFO MainThread:2609855 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-15 12:52:18,618 INFO MainThread:2609855 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-15 12:52:18,618 INFO MainThread:2609855 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-15 12:52:18,620 INFO MainThread:2609855 [wandb_init.py:init():911] run started, returning control to user process +2024-11-15 12:52:18,620 INFO MainThread:2609855 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log b/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8d2dc365ac0331eccaf0b9f80876093d91786fd3 --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2024-11-28T16:15:54.215655506-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-28T16:15:54.215678516-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug-core.log"} +{"time":"2024-11-28T16:15:54.426700134-05:00","level":"INFO","msg":"created new stream","id":"907lsb28"} +{"time":"2024-11-28T16:15:54.426746584-05:00","level":"INFO","msg":"stream: started","id":"907lsb28"} +{"time":"2024-11-28T16:15:54.426791414-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"907lsb28"}} +{"time":"2024-11-28T16:15:54.427138797-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"907lsb28"}} +{"time":"2024-11-28T16:15:54.427186257-05:00","level":"INFO","msg":"sender: started","stream_id":"907lsb28"} +{"time":"2024-11-28T16:15:54.603855087-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-28T22:30:24.964340473-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/907lsb28/file_stream"} +{"time":"2024-11-29T03:53:49.678920958-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/907lsb28/file_stream"} +{"time":"2024-11-29T06:59:15.382511638-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-29T06:59:15.388445668-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-29T06:59:15.845898959-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-29T06:59:16.007005818-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-29T06:59:17.020469811-05:00","level":"INFO","msg":"stream: closing","id":"907lsb28"} +{"time":"2024-11-29T06:59:17.020502291-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"907lsb28"}} +{"time":"2024-11-29T06:59:17.020523081-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"907lsb28"}} +{"time":"2024-11-29T06:59:17.020579451-05:00","level":"INFO","msg":"sender: closed","stream_id":"907lsb28"} +{"time":"2024-11-29T06:59:17.020631992-05:00","level":"INFO","msg":"stream: closed","id":"907lsb28"} diff --git a/wandb/run-20241128_161554-907lsb28/logs/debug.log b/wandb/run-20241128_161554-907lsb28/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8379c6a57899a14a89219105461c35b590bd8360 --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Configure stats pid to 3101596 +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug.log +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:init():671] starting backend +2024-11-28 16:15:54,208 INFO MainThread:3101596 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:15:54,210 INFO MainThread:3101596 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:15:54,210 INFO MainThread:3101596 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:15:54,216 INFO MainThread:3101596 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:15:54,251 INFO MainThread:3101596 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:15:54,600 INFO MainThread:3101596 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:15:54,714 INFO MainThread:3101596 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:15:54,714 INFO MainThread:3101596 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:15:54,714 INFO MainThread:3101596 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:15:54,714 INFO MainThread:3101596 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:15:54,716 INFO MainThread:3101596 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:15:54,716 INFO MainThread:3101596 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-29 06:59:15,374 INFO MainThread:3101596 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/907lsb28 +2024-11-29 06:59:15,381 INFO MainThread:3101596 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 06:59:15,381 INFO MainThread:3101596 [wandb_run.py:_restore():2408] restore +2024-11-29 06:59:15,382 INFO MainThread:3101596 [wandb_run.py:_restore():2414] restore done +2024-11-29 06:59:17,010 INFO MainThread:3101596 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 06:59:17,010 INFO MainThread:3101596 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 06:59:17,019 INFO MainThread:3101596 [wandb_run.py:_footer_sync_info():3934] logging synced files