diff --git a/wandb/run-20241030_010306-2i7gykhq/files/config.yaml b/wandb/run-20241030_010306-2i7gykhq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_010306-2i7gykhq/files/output.log b/wandb/run-20241030_010306-2i7gykhq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1777f063b107f651dddc063d1d3a3fa80015bf9e --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/files/output.log @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 162, in + dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" +AttributeError: 'Namespace' object has no attribute 'train_zset' diff --git a/wandb/run-20241030_010306-2i7gykhq/files/wandb-metadata.json b/wandb/run-20241030_010306-2i7gykhq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac51d06d88a1e80a5abff1f3e747d66ca660b2e --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:03:06.008901Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719200055296" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_010306-2i7gykhq/files/wandb-summary.json b/wandb/run-20241030_010306-2i7gykhq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241030_010306-2i7gykhq/logs/debug-internal.log b/wandb/run-20241030_010306-2i7gykhq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..121bf0ae69c34ca75c4de2231dbbcd4987959b1a --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:03:06.0117365-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:03:06.01174823-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010306-2i7gykhq/logs/debug-core.log"} +{"time":"2024-10-30T01:03:06.120361557-04:00","level":"INFO","msg":"created new stream","id":"2i7gykhq"} +{"time":"2024-10-30T01:03:06.120402537-04:00","level":"INFO","msg":"stream: started","id":"2i7gykhq"} +{"time":"2024-10-30T01:03:06.120481838-04:00","level":"INFO","msg":"sender: started","stream_id":"2i7gykhq"} +{"time":"2024-10-30T01:03:06.120465068-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"2i7gykhq"}} +{"time":"2024-10-30T01:03:06.120437948-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"2i7gykhq"}} +{"time":"2024-10-30T01:03:06.343082168-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:03:06.442934695-04:00","level":"INFO","msg":"stream: closing","id":"2i7gykhq"} +{"time":"2024-10-30T01:03:06.442971726-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:03:06.443431019-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:03:08.147923119-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:03:08.245191018-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"2i7gykhq"}} +{"time":"2024-10-30T01:03:08.245294648-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"2i7gykhq"}} +{"time":"2024-10-30T01:03:08.245303978-04:00","level":"INFO","msg":"sender: closed","stream_id":"2i7gykhq"} +{"time":"2024-10-30T01:03:08.245465209-04:00","level":"INFO","msg":"stream: closed","id":"2i7gykhq"} diff --git a/wandb/run-20241030_010306-2i7gykhq/logs/debug.log b/wandb/run-20241030_010306-2i7gykhq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..46af9ea442f18f142eb6992aaf4e922a6bca06ad --- /dev/null +++ b/wandb/run-20241030_010306-2i7gykhq/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Configure stats pid to 320661 +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010306-2i7gykhq/logs/debug.log +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010306-2i7gykhq/logs/debug-internal.log +2024-10-30 01:03:06,006 INFO MainThread:320661 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:03:06,007 INFO MainThread:320661 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:03:06,007 INFO MainThread:320661 [wandb_init.py:init():671] starting backend +2024-10-30 01:03:06,007 INFO MainThread:320661 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:03:06,008 INFO MainThread:320661 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:03:06,008 INFO MainThread:320661 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:03:06,012 INFO MainThread:320661 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:03:06,044 INFO MainThread:320661 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:03:06,339 INFO MainThread:320661 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:03:06,439 INFO MainThread:320661 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:03:06,439 INFO MainThread:320661 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:03:06,439 INFO MainThread:320661 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:03:06,439 INFO MainThread:320661 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:03:06,441 INFO MainThread:320661 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:03:06,441 INFO MainThread:320661 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:03:06,443 WARNING MsgRouterThr:320661 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_010306-2i7gykhq/run-2i7gykhq.wandb b/wandb/run-20241030_010306-2i7gykhq/run-2i7gykhq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c5cef69f663328729cb91ba6e651c311a79a75b3 Binary files /dev/null and b/wandb/run-20241030_010306-2i7gykhq/run-2i7gykhq.wandb differ diff --git a/wandb/run-20241030_011509-3dp0dtmk/files/requirements.txt b/wandb/run-20241030_011509-3dp0dtmk/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_011509-3dp0dtmk/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_011509-3dp0dtmk/logs/debug-internal.log b/wandb/run-20241030_011509-3dp0dtmk/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..008e76d71d0ca79400287b6802db912a85fa2d77 --- /dev/null +++ b/wandb/run-20241030_011509-3dp0dtmk/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:15:09.513233148-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:15:09.513243808-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-3dp0dtmk/logs/debug-core.log"} +{"time":"2024-10-30T01:15:09.61864921-04:00","level":"INFO","msg":"created new stream","id":"3dp0dtmk"} +{"time":"2024-10-30T01:15:09.618680701-04:00","level":"INFO","msg":"stream: started","id":"3dp0dtmk"} +{"time":"2024-10-30T01:15:09.618709051-04:00","level":"INFO","msg":"sender: started","stream_id":"3dp0dtmk"} +{"time":"2024-10-30T01:15:09.618691701-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"3dp0dtmk"}} +{"time":"2024-10-30T01:15:09.618713711-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"3dp0dtmk"}} +{"time":"2024-10-30T01:15:09.827809374-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_011509-cqcwsj7s/files/output.log b/wandb/run-20241030_011509-cqcwsj7s/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..79645183b08c6dbe04298204e06c345f13c58936 --- /dev/null +++ b/wandb/run-20241030_011509-cqcwsj7s/files/output.log @@ -0,0 +1,15 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.30s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:52<00:00, 334.96 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:52<00:00, 343.59 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 01:17:00,868] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 01:17:07,837] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.274901390075684 seconds diff --git a/wandb/run-20241030_011509-cqcwsj7s/files/requirements.txt b/wandb/run-20241030_011509-cqcwsj7s/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_011509-cqcwsj7s/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_011509-cqcwsj7s/files/wandb-metadata.json b/wandb/run-20241030_011509-cqcwsj7s/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..daecce1cc2c5bdb8bf63088aa1e48c02a4d28cf9 --- /dev/null +++ b/wandb/run-20241030_011509-cqcwsj7s/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:15:09.348320Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719287033856" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_011509-cqcwsj7s/logs/debug-internal.log b/wandb/run-20241030_011509-cqcwsj7s/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..35a62b1438243ad79bff208c7ab10611551e05f6 --- /dev/null +++ b/wandb/run-20241030_011509-cqcwsj7s/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:15:09.350927711-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:15:09.350949441-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-cqcwsj7s/logs/debug-core.log"} +{"time":"2024-10-30T01:15:09.4574055-04:00","level":"INFO","msg":"created new stream","id":"cqcwsj7s"} +{"time":"2024-10-30T01:15:09.457454561-04:00","level":"INFO","msg":"stream: started","id":"cqcwsj7s"} +{"time":"2024-10-30T01:15:09.457507131-04:00","level":"INFO","msg":"sender: started","stream_id":"cqcwsj7s"} +{"time":"2024-10-30T01:15:09.457476101-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"cqcwsj7s"}} +{"time":"2024-10-30T01:15:09.457490051-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"cqcwsj7s"}} +{"time":"2024-10-30T01:15:09.678893618-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_222932-l8nv7d2l/files/wandb-metadata.json b/wandb/run-20241030_222932-l8nv7d2l/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..da1154a6e7ccb36ff86f8db1750a118268411bdd --- /dev/null +++ b/wandb/run-20241030_222932-l8nv7d2l/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T02:29:32.386006Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710969503744" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_222932-lsfm0d2q/files/output.log b/wandb/run-20241030_222932-lsfm0d2q/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5c20053cb585af3422629e7cfea2d2125281acf1 --- /dev/null +++ b/wandb/run-20241030_222932-lsfm0d2q/files/output.log @@ -0,0 +1,43 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.35s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 22:29:54,126] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 22:30:03,508] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.208661794662476 seconds +wandb: WARNING Fatal error while uploading data. Some run data will not be synced, but it will still be written to disk. Use `wandb sync` at the end of the run to try uploading. +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 186, in backward + self.engine.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2020, in backward + self.optimizer.backward(loss, retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2063, in backward + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward + scaled_loss.backward(retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward + torch.autograd.backward( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/autograd/__init__.py", line 200, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt diff --git a/wandb/run-20241030_231835-i7aqcsql/files/output.log b/wandb/run-20241030_231835-i7aqcsql/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a8471b4a35729313c039722545faba00c07654de --- /dev/null +++ b/wandb/run-20241030_231835-i7aqcsql/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.86s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 23:18:43,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 23:18:52,678] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.939827919006348 seconds diff --git a/wandb/run-20241030_231835-i7aqcsql/files/requirements.txt b/wandb/run-20241030_231835-i7aqcsql/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_231835-i7aqcsql/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_231835-i7aqcsql/logs/debug.log b/wandb/run-20241030_231835-i7aqcsql/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1e2ff464b9dfbe03d81c381c2a5995575842b322 --- /dev/null +++ b/wandb/run-20241030_231835-i7aqcsql/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Configure stats pid to 457828 +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 23:18:35,491 INFO MainThread:457828 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_231835-i7aqcsql/logs/debug.log +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_231835-i7aqcsql/logs/debug-internal.log +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:init():621] calling init triggers +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:init():671] starting backend +2024-10-30 23:18:35,492 INFO MainThread:457828 [wandb_init.py:init():675] sending inform_init request +2024-10-30 23:18:35,493 INFO MainThread:457828 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 23:18:35,493 INFO MainThread:457828 [wandb_init.py:init():688] backend started and connected +2024-10-30 23:18:35,496 INFO MainThread:457828 [wandb_init.py:init():783] updated telemetry +2024-10-30 23:18:35,522 INFO MainThread:457828 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 23:18:35,838 INFO MainThread:457828 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 23:18:35,950 INFO MainThread:457828 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 23:18:35,950 INFO MainThread:457828 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 23:18:35,950 INFO MainThread:457828 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 23:18:35,950 INFO MainThread:457828 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 23:18:35,952 INFO MainThread:457828 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 23:18:35,952 INFO MainThread:457828 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} diff --git a/wandb/run-20241031_000839-mehxid7z/files/output.log b/wandb/run-20241031_000839-mehxid7z/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6f875965839eb160a80dc05e9f4dc3b519f35cbe --- /dev/null +++ b/wandb/run-20241031_000839-mehxid7z/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.31s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 00:09:00,064] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 00:09:09,744] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.402003526687622 seconds diff --git a/wandb/run-20241031_122114-2k9672ya/files/config.yaml b/wandb/run-20241031_122114-2k9672ya/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edfa239fe9670ab026a3223bac576877fd80297e --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241031_122114-2k9672ya/files/output.log b/wandb/run-20241031_122114-2k9672ya/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d8e84a59bfb2c40d25567a17b4bd117a05b74e96 --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/files/output.log @@ -0,0 +1,17 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:08<00:00, 64.47s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.61s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 12:23:30,593] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 12:23:38,777] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.0573859214782715 seconds diff --git a/wandb/run-20241031_122114-2k9672ya/files/wandb-metadata.json b/wandb/run-20241031_122114-2k9672ya/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3b1004e82d49bf06e4289700e34f5247424cd4c6 --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T16:21:14.171538Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753159966720" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_122114-2k9672ya/files/wandb-summary.json b/wandb/run-20241031_122114-2k9672ya/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f78ea47926dfc1c734e02453aa9e6ce3692f2be4 --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":32015}} \ No newline at end of file diff --git a/wandb/run-20241031_122114-2k9672ya/logs/debug-internal.log b/wandb/run-20241031_122114-2k9672ya/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..26806a00be0e2b5da64aad6f074ce0e7a667a817 --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/logs/debug-internal.log @@ -0,0 +1,22 @@ +{"time":"2024-10-31T12:21:14.17421403-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T12:21:14.17422767-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122114-2k9672ya/logs/debug-core.log"} +{"time":"2024-10-31T12:21:14.283263175-04:00","level":"INFO","msg":"created new stream","id":"2k9672ya"} +{"time":"2024-10-31T12:21:14.283309355-04:00","level":"INFO","msg":"stream: started","id":"2k9672ya"} +{"time":"2024-10-31T12:21:14.283383545-04:00","level":"INFO","msg":"sender: started","stream_id":"2k9672ya"} +{"time":"2024-10-31T12:21:14.283382385-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"2k9672ya"}} +{"time":"2024-10-31T12:21:14.283377385-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"2k9672ya"}} +{"time":"2024-10-31T12:21:14.488458951-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T15:06:23.896231311-04:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/2k9672ya/file_stream"} +{"time":"2024-10-31T16:36:28.430733687-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-10-31T17:53:37.431874892-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-10-31T18:43:04.954227971-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-10-31T18:47:37.271013751-04:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/2k9672ya/file_stream"} +{"time":"2024-10-31T21:14:49.439472129-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T21:14:49.51063436-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-31T21:14:50.160847475-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-31T21:14:50.291547228-04:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-10-31T21:14:51.421940751-04:00","level":"INFO","msg":"stream: closing","id":"2k9672ya"} +{"time":"2024-10-31T21:14:51.421967541-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"2k9672ya"}} +{"time":"2024-10-31T21:14:51.421996271-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"2k9672ya"}} +{"time":"2024-10-31T21:14:51.422018811-04:00","level":"INFO","msg":"sender: closed","stream_id":"2k9672ya"} +{"time":"2024-10-31T21:14:51.422075131-04:00","level":"INFO","msg":"stream: closed","id":"2k9672ya"} diff --git a/wandb/run-20241031_122114-2k9672ya/logs/debug.log b/wandb/run-20241031_122114-2k9672ya/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..887e030f4dcdbc2e3284796d5bc8623170528d85 --- /dev/null +++ b/wandb/run-20241031_122114-2k9672ya/logs/debug.log @@ -0,0 +1,33 @@ +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Configure stats pid to 558434 +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122114-2k9672ya/logs/debug.log +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122114-2k9672ya/logs/debug-internal.log +2024-10-31 12:21:14,168 INFO MainThread:558434 [wandb_init.py:init():621] calling init triggers +2024-10-31 12:21:14,169 INFO MainThread:558434 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 12:21:14,169 INFO MainThread:558434 [wandb_init.py:init():671] starting backend +2024-10-31 12:21:14,169 INFO MainThread:558434 [wandb_init.py:init():675] sending inform_init request +2024-10-31 12:21:14,170 INFO MainThread:558434 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 12:21:14,171 INFO MainThread:558434 [wandb_init.py:init():688] backend started and connected +2024-10-31 12:21:14,173 INFO MainThread:558434 [wandb_init.py:init():783] updated telemetry +2024-10-31 12:21:14,203 INFO MainThread:558434 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 12:21:14,485 INFO MainThread:558434 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 12:21:14,573 INFO MainThread:558434 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 12:21:14,573 INFO MainThread:558434 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 12:21:14,573 INFO MainThread:558434 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 12:21:14,573 INFO MainThread:558434 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 12:21:14,575 INFO MainThread:558434 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 12:21:14,575 INFO MainThread:558434 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-10-31 21:14:49,288 INFO MainThread:558434 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/2k9672ya +2024-10-31 21:14:49,400 INFO MainThread:558434 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-31 21:14:49,400 INFO MainThread:558434 [wandb_run.py:_restore():2408] restore +2024-10-31 21:14:49,400 INFO MainThread:558434 [wandb_run.py:_restore():2414] restore done +2024-10-31 21:14:51,293 INFO MainThread:558434 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-31 21:14:51,293 INFO MainThread:558434 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-31 21:14:51,421 INFO MainThread:558434 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241101_012438-tr39r2kv/files/config.yaml b/wandb/run-20241101_012438-tr39r2kv/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d25ee0e4092474360045faaf0959cfc1b3e91d --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: shuffle_nodeterministic +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241101_012438-tr39r2kv/files/output.log b/wandb/run-20241101_012438-tr39r2kv/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder + builder_instance: DatasetBuilder = builder_cls( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__ + self.config, self.config_id = self._create_builder_config( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config + raise ValueError( +ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0'] diff --git a/wandb/run-20241101_012438-tr39r2kv/files/wandb-summary.json b/wandb/run-20241101_012438-tr39r2kv/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241101_012438-tr39r2kv/logs/debug-internal.log b/wandb/run-20241101_012438-tr39r2kv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e488620ea654b942569b3c200530ef215a164131 --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-01T01:24:38.169173693-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:24:38.169187803-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-tr39r2kv/logs/debug-core.log"} +{"time":"2024-11-01T01:24:38.280395547-04:00","level":"INFO","msg":"created new stream","id":"tr39r2kv"} +{"time":"2024-11-01T01:24:38.280440227-04:00","level":"INFO","msg":"stream: started","id":"tr39r2kv"} +{"time":"2024-11-01T01:24:38.280482807-04:00","level":"INFO","msg":"sender: started","stream_id":"tr39r2kv"} +{"time":"2024-11-01T01:24:38.280484247-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"tr39r2kv"}} +{"time":"2024-11-01T01:24:38.280477617-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"tr39r2kv"}} +{"time":"2024-11-01T01:24:38.465343943-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T01:24:38.590950117-04:00","level":"INFO","msg":"stream: closing","id":"tr39r2kv"} +{"time":"2024-11-01T01:24:38.590992697-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T01:24:38.63883528-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-01T01:24:39.16923887-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-01T01:24:39.290094771-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"tr39r2kv"}} +{"time":"2024-11-01T01:24:39.290144111-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"tr39r2kv"}} +{"time":"2024-11-01T01:24:39.290152561-04:00","level":"INFO","msg":"sender: closed","stream_id":"tr39r2kv"} +{"time":"2024-11-01T01:24:39.290211281-04:00","level":"INFO","msg":"stream: closed","id":"tr39r2kv"} diff --git a/wandb/run-20241101_012733-4u8e027p/logs/debug-internal.log b/wandb/run-20241101_012733-4u8e027p/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8f781b554c19b531186108fa3c459206885956b2 --- /dev/null +++ b/wandb/run-20241101_012733-4u8e027p/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T01:27:33.995935712-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:27:33.995950772-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-4u8e027p/logs/debug-core.log"} +{"time":"2024-11-01T01:27:34.10392569-04:00","level":"INFO","msg":"created new stream","id":"4u8e027p"} +{"time":"2024-11-01T01:27:34.103978121-04:00","level":"INFO","msg":"stream: started","id":"4u8e027p"} +{"time":"2024-11-01T01:27:34.104114652-04:00","level":"INFO","msg":"sender: started","stream_id":"4u8e027p"} +{"time":"2024-11-01T01:27:34.104042611-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"4u8e027p"}} +{"time":"2024-11-01T01:27:34.104027311-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"4u8e027p"}} +{"time":"2024-11-01T01:27:34.323754805-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_012733-e3zsr634/files/requirements.txt b/wandb/run-20241101_012733-e3zsr634/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_012733-e3zsr634/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_012733-e3zsr634/logs/debug-internal.log b/wandb/run-20241101_012733-e3zsr634/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..774525a72232a67392753ea541acd16b7838cd21 --- /dev/null +++ b/wandb/run-20241101_012733-e3zsr634/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T01:27:33.960171934-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:27:33.960186144-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-e3zsr634/logs/debug-core.log"} +{"time":"2024-11-01T01:27:34.065020141-04:00","level":"INFO","msg":"created new stream","id":"e3zsr634"} +{"time":"2024-11-01T01:27:34.065052661-04:00","level":"INFO","msg":"stream: started","id":"e3zsr634"} +{"time":"2024-11-01T01:27:34.065102351-04:00","level":"INFO","msg":"sender: started","stream_id":"e3zsr634"} +{"time":"2024-11-01T01:27:34.065089571-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"e3zsr634"}} +{"time":"2024-11-01T01:27:34.065106681-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"e3zsr634"}} +{"time":"2024-11-01T01:27:34.21774298-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_012733-e3zsr634/logs/debug.log b/wandb/run-20241101_012733-e3zsr634/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..518ce38972566eac6278c3079ea32b5a0b29cdd3 --- /dev/null +++ b/wandb/run-20241101_012733-e3zsr634/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Configure stats pid to 678554 +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-e3zsr634/logs/debug.log +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-e3zsr634/logs/debug-internal.log +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:init():671] starting backend +2024-11-01 01:27:33,956 INFO MainThread:678554 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:27:33,957 INFO MainThread:678554 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:27:33,958 INFO MainThread:678554 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:27:33,961 INFO MainThread:678554 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:27:33,985 INFO MainThread:678554 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:27:34,215 INFO MainThread:678554 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:27:34,304 INFO MainThread:678554 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:27:34,304 INFO MainThread:678554 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:27:34,305 INFO MainThread:678554 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:27:34,305 INFO MainThread:678554 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:27:34,306 INFO MainThread:678554 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:27:34,306 INFO MainThread:678554 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_200502-28ivel81/files/requirements.txt b/wandb/run-20241101_200502-28ivel81/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_200502-28ivel81/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_200502-28ivel81/logs/debug-internal.log b/wandb/run-20241101_200502-28ivel81/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d395ec72ebe2c9f3335efa89534f750d79ec2c18 --- /dev/null +++ b/wandb/run-20241101_200502-28ivel81/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T20:05:02.696528925-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:05:02.696551646-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200502-28ivel81/logs/debug-core.log"} +{"time":"2024-11-01T20:05:02.804187905-04:00","level":"INFO","msg":"created new stream","id":"28ivel81"} +{"time":"2024-11-01T20:05:02.804240726-04:00","level":"INFO","msg":"stream: started","id":"28ivel81"} +{"time":"2024-11-01T20:05:02.804338597-04:00","level":"INFO","msg":"sender: started","stream_id":"28ivel81"} +{"time":"2024-11-01T20:05:02.804329676-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"28ivel81"}} +{"time":"2024-11-01T20:05:02.804273736-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"28ivel81"}} +{"time":"2024-11-01T20:05:03.033544975-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_200502-28ivel81/logs/debug.log b/wandb/run-20241101_200502-28ivel81/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..fd95d3018f6069a4de4116688fde68244e1422da --- /dev/null +++ b/wandb/run-20241101_200502-28ivel81/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Configure stats pid to 869511 +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:05:02,690 INFO MainThread:869511 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200502-28ivel81/logs/debug.log +2024-11-01 20:05:02,691 INFO MainThread:869511 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200502-28ivel81/logs/debug-internal.log +2024-11-01 20:05:02,691 INFO MainThread:869511 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:05:02,691 INFO MainThread:869511 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:05:02,691 INFO MainThread:869511 [wandb_init.py:init():671] starting backend +2024-11-01 20:05:02,691 INFO MainThread:869511 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:05:02,693 INFO MainThread:869511 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:05:02,693 INFO MainThread:869511 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:05:02,698 INFO MainThread:869511 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:05:02,727 INFO MainThread:869511 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:05:03,030 INFO MainThread:869511 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:05:03,121 INFO MainThread:869511 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:05:03,121 INFO MainThread:869511 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:05:03,121 INFO MainThread:869511 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:05:03,121 INFO MainThread:869511 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:05:03,123 INFO MainThread:869511 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:05:03,123 INFO MainThread:869511 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_200517-f3b150ss/files/config.yaml b/wandb/run-20241101_200517-f3b150ss/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e34fec43368c51fb4bd10a24a21dd490ecdba44 --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_nondeterministic +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241101_200517-f3b150ss/files/output.log b/wandb/run-20241101_200517-f3b150ss/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..567601afa4d674b46a1e8af443003d6fa581422e --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/files/output.log @@ -0,0 +1,42 @@ +Downloading shards: 0%| | 0/2 [00:07 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241101_200517-f3b150ss/files/requirements.txt b/wandb/run-20241101_200517-f3b150ss/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_200517-f3b150ss/files/wandb-metadata.json b/wandb/run-20241101_200517-f3b150ss/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..709817a213c680ef11df8ce27ce892b11118c858 --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-02T00:05:17.187494Z", + "args": [ + "--perturbation", + "shuffle_nondeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754801557504" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_200517-f3b150ss/files/wandb-summary.json b/wandb/run-20241101_200517-f3b150ss/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..779a0b5a17ed7980591fb713a2a4db505a51ed4b --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":8}} \ No newline at end of file diff --git a/wandb/run-20241101_200517-f3b150ss/logs/debug-internal.log b/wandb/run-20241101_200517-f3b150ss/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6ef1fbc80072416b1b6fd63187d432148d3d75fd --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-01T20:05:17.189781964-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:05:17.189800414-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-f3b150ss/logs/debug-core.log"} +{"time":"2024-11-01T20:05:17.295261458-04:00","level":"INFO","msg":"created new stream","id":"f3b150ss"} +{"time":"2024-11-01T20:05:17.295289028-04:00","level":"INFO","msg":"stream: started","id":"f3b150ss"} +{"time":"2024-11-01T20:05:17.295344069-04:00","level":"INFO","msg":"sender: started","stream_id":"f3b150ss"} +{"time":"2024-11-01T20:05:17.295315769-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"f3b150ss"}} +{"time":"2024-11-01T20:05:17.295333749-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"f3b150ss"}} +{"time":"2024-11-01T20:05:17.655340154-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T20:05:25.26316459-04:00","level":"INFO","msg":"stream: closing","id":"f3b150ss"} +{"time":"2024-11-01T20:05:25.263209061-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T20:05:25.263959546-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241101_200517-f3b150ss/logs/debug.log b/wandb/run-20241101_200517-f3b150ss/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1888fa74810b37a50dc7fac718917da5eb0b91cd --- /dev/null +++ b/wandb/run-20241101_200517-f3b150ss/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Configure stats pid to 870385 +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:05:17,184 INFO MainThread:870385 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-f3b150ss/logs/debug.log +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-f3b150ss/logs/debug-internal.log +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:init():671] starting backend +2024-11-01 20:05:17,185 INFO MainThread:870385 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:05:17,186 INFO MainThread:870385 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:05:17,187 INFO MainThread:870385 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:05:17,191 INFO MainThread:870385 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:05:17,223 INFO MainThread:870385 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:05:17,652 INFO MainThread:870385 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:05:17,739 INFO MainThread:870385 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:05:17,739 INFO MainThread:870385 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:05:17,739 INFO MainThread:870385 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:05:17,739 INFO MainThread:870385 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:05:17,741 INFO MainThread:870385 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:05:17,741 INFO MainThread:870385 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:05:25,263 WARNING MsgRouterThr:870385 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_200517-f3b150ss/run-f3b150ss.wandb b/wandb/run-20241101_200517-f3b150ss/run-f3b150ss.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_201926-5y6ulxig/logs/debug-internal.log b/wandb/run-20241101_201926-5y6ulxig/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5adf4203cb69a74d0c70858a1eedfc605477a64f --- /dev/null +++ b/wandb/run-20241101_201926-5y6ulxig/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T20:19:26.872916513-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:19:26.872928923-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201926-5y6ulxig/logs/debug-core.log"} +{"time":"2024-11-01T20:19:26.979140297-04:00","level":"INFO","msg":"created new stream","id":"5y6ulxig"} +{"time":"2024-11-01T20:19:26.979176487-04:00","level":"INFO","msg":"stream: started","id":"5y6ulxig"} +{"time":"2024-11-01T20:19:26.979208097-04:00","level":"INFO","msg":"sender: started","stream_id":"5y6ulxig"} +{"time":"2024-11-01T20:19:26.979191997-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"5y6ulxig"}} +{"time":"2024-11-01T20:19:26.979249518-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"5y6ulxig"}} +{"time":"2024-11-01T20:19:27.191677696-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241105_155905-e2ilsfdb/files/output.log b/wandb/run-20241105_155905-e2ilsfdb/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4889313e9210ddc0bf0e53a0b8deb84b34f76597 --- /dev/null +++ b/wandb/run-20241105_155905-e2ilsfdb/files/output.log @@ -0,0 +1,19 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1323, in mkdir + self._accessor.mkdir(self, mode) +FileNotFoundError: [Errno 2] No such file or directory: '/home/chunhui/.cache/huggingface/datasets/babylm_dataset_test/babylm_shuffle_deterministic21_10M_seed0/0.0.0' + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 165, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2096, in load_dataset + builder_instance.download_and_prepare( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 855, in download_and_prepare + Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1327, in mkdir + self.parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1323, in mkdir + self._accessor.mkdir(self, mode) +OSError: [Errno 28] No space left on device: '/home/chunhui/.cache/huggingface/datasets/babylm_dataset_test/babylm_shuffle_deterministic21_10M_seed0' diff --git a/wandb/run-20241105_160217-dgnjdt5g/files/requirements.txt b/wandb/run-20241105_160217-dgnjdt5g/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_160217-dgnjdt5g/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_160217-dgnjdt5g/logs/debug.log b/wandb/run-20241105_160217-dgnjdt5g/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..068e33786bf91baa462b928a930038dda88487a3 --- /dev/null +++ b/wandb/run-20241105_160217-dgnjdt5g/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Configure stats pid to 1770493 +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:02:17,745 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_160217-dgnjdt5g/logs/debug.log +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_160217-dgnjdt5g/logs/debug-internal.log +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:init():671] starting backend +2024-11-05 16:02:17,746 INFO MainThread:1770493 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:02:17,748 INFO MainThread:1770493 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:02:17,748 INFO MainThread:1770493 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:02:17,753 INFO MainThread:1770493 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:02:17,779 INFO MainThread:1770493 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:02:22,979 INFO MainThread:1770493 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:02:23,067 INFO MainThread:1770493 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:02:23,067 INFO MainThread:1770493 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:02:23,067 INFO MainThread:1770493 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:02:23,067 INFO MainThread:1770493 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:02:23,068 INFO MainThread:1770493 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:02:23,068 INFO MainThread:1770493 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:02:23,092 WARNING MsgRouterThr:1770493 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_160217-dgnjdt5g/run-dgnjdt5g.wandb b/wandb/run-20241105_160217-dgnjdt5g/run-dgnjdt5g.wandb new file mode 100644 index 0000000000000000000000000000000000000000..3ac3c6633a3c1470440afd2c90fe778f5561b76f Binary files /dev/null and b/wandb/run-20241105_160217-dgnjdt5g/run-dgnjdt5g.wandb differ diff --git a/wandb/run-20241105_162858-1nu0waui/files/config.yaml b/wandb/run-20241105_162858-1nu0waui/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3771760e607bdb2dcb978de96a047ebe2475f806 --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic57 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_162858-1nu0waui/files/output.log b/wandb/run-20241105_162858-1nu0waui/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..ff314b514802207b2268559411cd06112eae1aae --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/files/output.log @@ -0,0 +1,34 @@ +Downloading shards: 0%| | 0/2 [00:00 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241105_162858-1nu0waui/files/requirements.txt b/wandb/run-20241105_162858-1nu0waui/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_162858-1nu0waui/files/wandb-metadata.json b/wandb/run-20241105_162858-1nu0waui/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..711441f94d796cdff516b7a1c44a2a091c691a1a --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:28:58.904440Z", + "args": [ + "--perturbation", + "shuffle_deterministic57", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785811787776" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_162858-1nu0waui/files/wandb-summary.json b/wandb/run-20241105_162858-1nu0waui/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..15f6b8e9049a55292dab131278b3f2fc1f52e50d --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23}} \ No newline at end of file diff --git a/wandb/run-20241105_162858-1nu0waui/logs/debug-internal.log b/wandb/run-20241105_162858-1nu0waui/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e5a9b229b3a0b7318b5400be361a605477e25a50 --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2024-11-05T16:28:58.906741712-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:28:58.906754542-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-1nu0waui/logs/debug-core.log"} +{"time":"2024-11-05T16:28:59.013096012-05:00","level":"INFO","msg":"created new stream","id":"1nu0waui"} +{"time":"2024-11-05T16:28:59.013126942-05:00","level":"INFO","msg":"stream: started","id":"1nu0waui"} +{"time":"2024-11-05T16:28:59.013184862-05:00","level":"INFO","msg":"sender: started","stream_id":"1nu0waui"} +{"time":"2024-11-05T16:28:59.013172112-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"1nu0waui"}} +{"time":"2024-11-05T16:28:59.013153072-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"1nu0waui"}} +{"time":"2024-11-05T16:28:59.207230376-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T16:29:22.455795463-05:00","level":"INFO","msg":"stream: closing","id":"1nu0waui"} +{"time":"2024-11-05T16:29:22.455835113-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T16:29:22.456425345-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-05T16:29:22.669742218-05:00","level":"INFO","msg":"api: retrying HTTP error","status":503,"url":"https://api.wandb.ai/graphql"} diff --git a/wandb/run-20241105_162858-1nu0waui/logs/debug.log b/wandb/run-20241105_162858-1nu0waui/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..084dcddd2e39108b53ff40c90384abad8c4b4c45 --- /dev/null +++ b/wandb/run-20241105_162858-1nu0waui/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Configure stats pid to 1778374 +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-1nu0waui/logs/debug.log +2024-11-05 16:28:58,902 INFO MainThread:1778374 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-1nu0waui/logs/debug-internal.log +2024-11-05 16:28:58,903 INFO MainThread:1778374 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:28:58,903 INFO MainThread:1778374 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:28:58,903 INFO MainThread:1778374 [wandb_init.py:init():671] starting backend +2024-11-05 16:28:58,903 INFO MainThread:1778374 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:28:58,903 INFO MainThread:1778374 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:28:58,904 INFO MainThread:1778374 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:28:58,906 INFO MainThread:1778374 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:28:58,929 INFO MainThread:1778374 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:28:59,204 INFO MainThread:1778374 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:28:59,296 INFO MainThread:1778374 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:28:59,296 INFO MainThread:1778374 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:28:59,296 INFO MainThread:1778374 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:28:59,296 INFO MainThread:1778374 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:28:59,297 INFO MainThread:1778374 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:28:59,298 INFO MainThread:1778374 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic57', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:29:22,456 WARNING MsgRouterThr:1778374 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_162858-1nu0waui/run-1nu0waui.wandb b/wandb/run-20241105_162858-1nu0waui/run-1nu0waui.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241106_232725-hsmv8meh/logs/debug-internal.log b/wandb/run-20241106_232725-hsmv8meh/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a26dd0e01fead766842fe4fc5aefaf54eeb257b5 --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T23:27:25.78433188-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T23:27:25.7843414-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_232725-hsmv8meh/logs/debug-core.log"} +{"time":"2024-11-06T23:27:25.890097892-05:00","level":"INFO","msg":"created new stream","id":"hsmv8meh"} +{"time":"2024-11-06T23:27:25.890126002-05:00","level":"INFO","msg":"stream: started","id":"hsmv8meh"} +{"time":"2024-11-06T23:27:25.890147893-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"hsmv8meh"}} +{"time":"2024-11-06T23:27:25.890140363-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"hsmv8meh"}} +{"time":"2024-11-06T23:27:25.890198573-05:00","level":"INFO","msg":"sender: started","stream_id":"hsmv8meh"} +{"time":"2024-11-06T23:27:26.370297065-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T23:27:50.241907349-05:00","level":"INFO","msg":"stream: closing","id":"hsmv8meh"} +{"time":"2024-11-06T23:27:50.24200176-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T23:27:50.242800557-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T23:27:50.577659756-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T23:27:50.701933317-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"hsmv8meh"}} +{"time":"2024-11-06T23:27:50.701954557-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"hsmv8meh"}} +{"time":"2024-11-06T23:27:50.701969877-05:00","level":"INFO","msg":"sender: closed","stream_id":"hsmv8meh"} +{"time":"2024-11-06T23:27:50.702059768-05:00","level":"INFO","msg":"stream: closed","id":"hsmv8meh"} diff --git a/wandb/run-20241106_233142-ph2ppnwr/files/wandb-metadata.json b/wandb/run-20241106_233142-ph2ppnwr/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f29657f35af10e9409951e0bc67d3bf84996cdb9 --- /dev/null +++ b/wandb/run-20241106_233142-ph2ppnwr/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T04:31:42.030779Z", + "args": [ + "--perturbation", + "shuffle_even_odd", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1775601938432" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_233142-ph2ppnwr/run-ph2ppnwr.wandb b/wandb/run-20241106_233142-ph2ppnwr/run-ph2ppnwr.wandb new file mode 100644 index 0000000000000000000000000000000000000000..036bc91fe85a2227e7f9f819ac48aa91d94a62ec Binary files /dev/null and b/wandb/run-20241106_233142-ph2ppnwr/run-ph2ppnwr.wandb differ diff --git a/wandb/run-20241106_233846-1aof2sz3/files/config.yaml b/wandb/run-20241106_233846-1aof2sz3/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c9885a80c6c6551af0939982b4765da8541cf4 --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_even_odd +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_233846-1aof2sz3/files/output.log b/wandb/run-20241106_233846-1aof2sz3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..42d9f74d442048fd78123ebe1634f63a808e5e4c --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c4456-6b39b3eb0ed959de47a01ea0;a80081a7-d00d-4bb2-abfd-076e2520190a) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c4456-6b39b3eb0ed959de47a01ea0;a80081a7-d00d-4bb2-abfd-076e2520190a) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_233846-1aof2sz3/files/wandb-metadata.json b/wandb/run-20241106_233846-1aof2sz3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..bfad93e7ec3e082a040d9c34d85109c7147a1c83 --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T04:38:46.176627Z", + "args": [ + "--perturbation", + "shuffle_even_odd", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1775602020352" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_233846-1aof2sz3/files/wandb-summary.json b/wandb/run-20241106_233846-1aof2sz3/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241106_233846-1aof2sz3/logs/debug-internal.log b/wandb/run-20241106_233846-1aof2sz3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6b9d6d17c3b7e10ee1aded0e285caf120ce4bfdb --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T23:38:46.179668978-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T23:38:46.179682638-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_233846-1aof2sz3/logs/debug-core.log"} +{"time":"2024-11-06T23:38:46.287876966-05:00","level":"INFO","msg":"created new stream","id":"1aof2sz3"} +{"time":"2024-11-06T23:38:46.287903726-05:00","level":"INFO","msg":"stream: started","id":"1aof2sz3"} +{"time":"2024-11-06T23:38:46.287992387-05:00","level":"INFO","msg":"sender: started","stream_id":"1aof2sz3"} +{"time":"2024-11-06T23:38:46.287938217-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"1aof2sz3"}} +{"time":"2024-11-06T23:38:46.287992277-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"1aof2sz3"}} +{"time":"2024-11-06T23:38:46.455015907-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T23:38:46.694055382-05:00","level":"INFO","msg":"stream: closing","id":"1aof2sz3"} +{"time":"2024-11-06T23:38:46.694087012-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T23:38:46.694390345-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T23:38:47.274087-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T23:38:47.395139989-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"1aof2sz3"}} +{"time":"2024-11-06T23:38:47.3951777-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"1aof2sz3"}} +{"time":"2024-11-06T23:38:47.39518511-05:00","level":"INFO","msg":"sender: closed","stream_id":"1aof2sz3"} +{"time":"2024-11-06T23:38:47.39523195-05:00","level":"INFO","msg":"stream: closed","id":"1aof2sz3"} diff --git a/wandb/run-20241106_233846-1aof2sz3/logs/debug.log b/wandb/run-20241106_233846-1aof2sz3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7e4c05f957f8899bb2c760af5c8e1e47045cd643 --- /dev/null +++ b/wandb/run-20241106_233846-1aof2sz3/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 23:38:46,174 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 23:38:46,174 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Configure stats pid to 1996623 +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_233846-1aof2sz3/logs/debug.log +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_233846-1aof2sz3/logs/debug-internal.log +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:init():621] calling init triggers +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:init():671] starting backend +2024-11-06 23:38:46,175 INFO MainThread:1996623 [wandb_init.py:init():675] sending inform_init request +2024-11-06 23:38:46,176 INFO MainThread:1996623 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 23:38:46,176 INFO MainThread:1996623 [wandb_init.py:init():688] backend started and connected +2024-11-06 23:38:46,179 INFO MainThread:1996623 [wandb_init.py:init():783] updated telemetry +2024-11-06 23:38:46,205 INFO MainThread:1996623 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 23:38:46,451 INFO MainThread:1996623 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 23:38:46,560 INFO MainThread:1996623 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 23:38:46,560 INFO MainThread:1996623 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 23:38:46,560 INFO MainThread:1996623 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 23:38:46,560 INFO MainThread:1996623 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 23:38:46,562 INFO MainThread:1996623 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 23:38:46,562 INFO MainThread:1996623 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_even_odd', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 23:38:46,694 WARNING MsgRouterThr:1996623 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_233846-1aof2sz3/run-1aof2sz3.wandb b/wandb/run-20241106_233846-1aof2sz3/run-1aof2sz3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0e36ea797977813a5e7367465a2e84e2ad770fda Binary files /dev/null and b/wandb/run-20241106_233846-1aof2sz3/run-1aof2sz3.wandb differ diff --git a/wandb/run-20241106_234348-l3eig11b/files/requirements.txt b/wandb/run-20241106_234348-l3eig11b/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241106_234348-l3eig11b/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241128_161554-ol74k8mz/files/output.log b/wandb/run-20241128_161554-ol74k8mz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4b216b1d25482a73e0ee4b5cf4418d9d35001618 --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/files/output.log @@ -0,0 +1,18 @@ +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:47<00:00, 365.01 examples/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:49<00:00, 367.32 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-28 16:18:34,747] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-28 16:18:40,263] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Creating extension directory /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 27.980506658554077 seconds diff --git a/wandb/run-20241128_161554-ol74k8mz/files/wandb-metadata.json b/wandb/run-20241128_161554-ol74k8mz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a093b5b6b44bd20186c0231ba2941f6aa240036b --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-28T21:15:54.210209Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py", + "codePath": "train/train_llama_1B.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_llama_1B.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1723122159616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161554-ol74k8mz/logs/debug-internal.log b/wandb/run-20241128_161554-ol74k8mz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..98ebade869b6e1c39a195f76276b1657e00c04e1 --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2024-11-28T16:15:54.215655116-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-28T16:15:54.215688566-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-ol74k8mz/logs/debug-core.log"} +{"time":"2024-11-28T16:15:54.427360088-05:00","level":"INFO","msg":"created new stream","id":"ol74k8mz"} +{"time":"2024-11-28T16:15:54.427418469-05:00","level":"INFO","msg":"stream: started","id":"ol74k8mz"} +{"time":"2024-11-28T16:15:54.427535709-05:00","level":"INFO","msg":"sender: started","stream_id":"ol74k8mz"} +{"time":"2024-11-28T16:15:54.42755547-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ol74k8mz"}} +{"time":"2024-11-28T16:15:54.427502569-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ol74k8mz"}} +{"time":"2024-11-28T16:15:54.612688778-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-29T06:59:15.424945857-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-29T06:59:15.435945052-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-29T06:59:15.810441427-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-29T06:59:16.00727262-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-29T06:59:17.02043111-05:00","level":"INFO","msg":"stream: closing","id":"ol74k8mz"} +{"time":"2024-11-29T06:59:17.020461011-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"ol74k8mz"}} +{"time":"2024-11-29T06:59:17.020494901-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"ol74k8mz"}} +{"time":"2024-11-29T06:59:17.020513291-05:00","level":"INFO","msg":"sender: closed","stream_id":"ol74k8mz"} +{"time":"2024-11-29T06:59:17.020586862-05:00","level":"INFO","msg":"stream: closed","id":"ol74k8mz"} diff --git a/wandb/run-20241128_161554-ol74k8mz/logs/debug.log b/wandb/run-20241128_161554-ol74k8mz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d6b9d5a18800da8c0190cb22bc39792be57a0ed6 --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Configure stats pid to 3101595 +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:15:54,206 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-ol74k8mz/logs/debug.log +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-ol74k8mz/logs/debug-internal.log +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:init():671] starting backend +2024-11-28 16:15:54,207 INFO MainThread:3101595 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:15:54,209 INFO MainThread:3101595 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:15:54,209 INFO MainThread:3101595 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:15:54,213 INFO MainThread:3101595 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:15:54,242 INFO MainThread:3101595 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:15:54,609 INFO MainThread:3101595 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:15:54,714 INFO MainThread:3101595 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:15:54,714 INFO MainThread:3101595 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:15:54,714 INFO MainThread:3101595 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:15:54,715 INFO MainThread:3101595 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:15:54,716 INFO MainThread:3101595 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:15:54,717 INFO MainThread:3101595 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-29 06:59:15,374 INFO MainThread:3101595 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/ol74k8mz +2024-11-29 06:59:15,381 INFO MainThread:3101595 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 06:59:15,382 INFO MainThread:3101595 [wandb_run.py:_restore():2408] restore +2024-11-29 06:59:15,382 INFO MainThread:3101595 [wandb_run.py:_restore():2414] restore done +2024-11-29 06:59:17,010 INFO MainThread:3101595 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 06:59:17,010 INFO MainThread:3101595 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 06:59:17,019 INFO MainThread:3101595 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241128_161638-a0iw6rlo/files/config.yaml b/wandb/run-20241128_161638-a0iw6rlo/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66cd8e9cf28a9a270dffc8a3ae97763620fd12ae --- /dev/null +++ b/wandb/run-20241128_161638-a0iw6rlo/files/config.yaml @@ -0,0 +1,531 @@ +_name_or_path: + value: meta-llama/Llama-3.2-1B +_wandb: + value: + cli_version: 0.18.5 + m: + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 7 + - 13 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.5 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +batch_size: + value: 3 +begin_suppress_tokens: + value: null +bf16: + value: false +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: deepspeed_config/train_dp_config.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +epoch: + value: 3 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: steps +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 2 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +half_precision_backend: + value: auto +head_dim: + value: 64 +hidden_act: + value: silu +hidden_size: + value: 2048 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 5e-06 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: ./logs +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr: + value: 5e-06 +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 1235814400 +model_type: + value: llama +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 16 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./checkpoints/Llama-3.2-1B/babylm_reverse_full_10M_seed0/runs +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 3 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +perturbation: + value: reverse_full +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: ./checkpoints/Llama-3.2-1B/babylm_reverse_full_10M_seed0/runs +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 0 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: null +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +train_set: + value: 10M +transformers_version: + value: 4.45.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: true +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.1 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20241128_161638-a0iw6rlo/files/wandb-summary.json b/wandb/run-20241128_161638-a0iw6rlo/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1fc4bf7165999448e21e37f1f613d334ba9238f4 --- /dev/null +++ b/wandb/run-20241128_161638-a0iw6rlo/files/wandb-summary.json @@ -0,0 +1 @@ +{"eval/steps_per_second":2.176,"train/epoch":2.9984591679506933,"train/global_step":2919,"total_flos":3.141491282946294e+17,"train_runtime":48371.3651,"_step":3210,"train/learning_rate":3.235629996193377e-08,"_timestamp":1.7328771582926176e+09,"train/loss":1.3643,"train/grad_norm":3.831026792526245,"_wandb":{"runtime":48559},"eval/samples_per_second":51.802,"train_loss":1.5549957860859749,"train_samples_per_second":1.087,"eval/runtime":19.3044,"train_steps_per_second":0.06,"eval/loss":1.817489743232727,"_runtime":48559.862478679} \ No newline at end of file diff --git a/wandb/run-20241129_235322-dmnv987j/files/wandb-metadata.json b/wandb/run-20241129_235322-dmnv987j/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9a98327b8a3ed808efdb05e5ff6fc12e89219870 --- /dev/null +++ b/wandb/run-20241129_235322-dmnv987j/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-30T04:53:22.698479Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py", + "codePath": "train/train_gpt2.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_gpt2.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719454396416" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file