diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..360988054fd2a2b662a34f0e3273281c5a006212 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_011509-cqcwsj7s/run-cqcwsj7s.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_013141-v317zdzd/run-v317zdzd.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_012733-4u8e027p/run-4u8e027p.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_012733-e3zsr634/run-e3zsr634.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_011509-3dp0dtmk/run-3dp0dtmk.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241129_235322-dmnv987j/run-dmnv987j.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/wandb/run-20241030_010306-uhzyjdga/files/config.yaml b/wandb/run-20241030_010306-uhzyjdga/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_010306-uhzyjdga/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_010306-uhzyjdga/files/output.log b/wandb/run-20241030_010306-uhzyjdga/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1777f063b107f651dddc063d1d3a3fa80015bf9e --- /dev/null +++ b/wandb/run-20241030_010306-uhzyjdga/files/output.log @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 162, in + dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" +AttributeError: 'Namespace' object has no attribute 'train_zset' diff --git a/wandb/run-20241030_011509-3dp0dtmk/run-3dp0dtmk.wandb b/wandb/run-20241030_011509-3dp0dtmk/run-3dp0dtmk.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c2b80ad7055c78d44e854369f657531aa7ee185c --- /dev/null +++ b/wandb/run-20241030_011509-3dp0dtmk/run-3dp0dtmk.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774953708a5c0869fb723f0b2ad7b4b7f78c8dee37c2a8c29ac293e0b287dea8 +size 163840 diff --git a/wandb/run-20241030_011509-cqcwsj7s/run-cqcwsj7s.wandb b/wandb/run-20241030_011509-cqcwsj7s/run-cqcwsj7s.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a168de04dcfa4916fd6bb47a31838402cef59139 --- /dev/null +++ b/wandb/run-20241030_011509-cqcwsj7s/run-cqcwsj7s.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcc31a33c557e349e62443d6987df97fb3737a5ae668a6b9912256109d629edb +size 163840 diff --git a/wandb/run-20241030_011509-oh4ul0xu/files/output.log b/wandb/run-20241030_011509-oh4ul0xu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6d9f5a60096fcddcd8ce0f25d31326fe4caf217e --- /dev/null +++ b/wandb/run-20241030_011509-oh4ul0xu/files/output.log @@ -0,0 +1,18 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.03s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:53<00:00, 325.05 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:55<00:00, 326.84 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 01:17:04,609] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 01:17:10,763] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.44950008392334 seconds diff --git a/wandb/run-20241030_011509-oh4ul0xu/files/requirements.txt b/wandb/run-20241030_011509-oh4ul0xu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_011509-oh4ul0xu/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_011509-oh4ul0xu/files/wandb-metadata.json b/wandb/run-20241030_011509-oh4ul0xu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8168040bcf50dc8a4d1ffaa8cdd8b2539cabe89c --- /dev/null +++ b/wandb/run-20241030_011509-oh4ul0xu/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:15:09.449635Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719287033856" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_011509-oh4ul0xu/logs/debug-internal.log b/wandb/run-20241030_011509-oh4ul0xu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..cc389473101cb4d8c3fe4a0fa830c2ae49f0f701 --- /dev/null +++ b/wandb/run-20241030_011509-oh4ul0xu/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:15:09.452194345-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:15:09.452230726-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011509-oh4ul0xu/logs/debug-core.log"} +{"time":"2024-10-30T01:15:09.561903697-04:00","level":"INFO","msg":"created new stream","id":"oh4ul0xu"} +{"time":"2024-10-30T01:15:09.561951377-04:00","level":"INFO","msg":"stream: started","id":"oh4ul0xu"} +{"time":"2024-10-30T01:15:09.562009358-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"oh4ul0xu"}} +{"time":"2024-10-30T01:15:09.562149939-04:00","level":"INFO","msg":"sender: started","stream_id":"oh4ul0xu"} +{"time":"2024-10-30T01:15:09.561992248-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"oh4ul0xu"}} +{"time":"2024-10-30T01:15:09.766220198-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_012617-44bz0olo/logs/debug-internal.log b/wandb/run-20241030_012617-44bz0olo/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a14078764699e439b0906c1a63edee54930a433b --- /dev/null +++ b/wandb/run-20241030_012617-44bz0olo/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:26:17.464896093-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:26:17.464907623-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-44bz0olo/logs/debug-core.log"} +{"time":"2024-10-30T01:26:17.672983424-04:00","level":"INFO","msg":"created new stream","id":"44bz0olo"} +{"time":"2024-10-30T01:26:17.673056684-04:00","level":"INFO","msg":"stream: started","id":"44bz0olo"} +{"time":"2024-10-30T01:26:17.673116215-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"44bz0olo"}} +{"time":"2024-10-30T01:26:17.673194555-04:00","level":"INFO","msg":"sender: started","stream_id":"44bz0olo"} +{"time":"2024-10-30T01:26:17.673137025-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"44bz0olo"}} +{"time":"2024-10-30T01:26:17.912768248-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_012617-44bz0olo/logs/debug.log b/wandb/run-20241030_012617-44bz0olo/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0318c4338569ae180a43495c43f9a123129277c4 --- /dev/null +++ b/wandb/run-20241030_012617-44bz0olo/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Configure stats pid to 332628 +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-44bz0olo/logs/debug.log +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-44bz0olo/logs/debug-internal.log +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:26:17,456 INFO MainThread:332628 [wandb_init.py:init():671] starting backend +2024-10-30 01:26:17,457 INFO MainThread:332628 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:26:17,457 INFO MainThread:332628 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:26:17,464 INFO MainThread:332628 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:26:17,467 INFO MainThread:332628 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:26:17,533 INFO MainThread:332628 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:26:17,904 INFO MainThread:332628 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:26:18,043 INFO MainThread:332628 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:26:18,044 INFO MainThread:332628 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:26:18,044 INFO MainThread:332628 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:26:18,044 INFO MainThread:332628 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:26:18,045 INFO MainThread:332628 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:26:18,046 INFO MainThread:332628 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} diff --git a/wandb/run-20241030_012617-591457tl/files/output.log b/wandb/run-20241030_012617-591457tl/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..36339c627c64eb688d73101b66bbef271c125d35 --- /dev/null +++ b/wandb/run-20241030_012617-591457tl/files/output.log @@ -0,0 +1,2 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.41s/it] +Map: 11%|██████████████ | 2000/18140 [00:06<00:53, 303.69 examples/s] diff --git a/wandb/run-20241030_012617-591457tl/files/requirements.txt b/wandb/run-20241030_012617-591457tl/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_012617-591457tl/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_013141-v317zdzd/run-v317zdzd.wandb b/wandb/run-20241030_013141-v317zdzd/run-v317zdzd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..78af3d1cd6d439b84b63f0784a3d0a79727beb43 --- /dev/null +++ b/wandb/run-20241030_013141-v317zdzd/run-v317zdzd.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f6e594f9ede19fbbf5de0a7bceb685142e9e3215a6556ae2b238e47e7aba5a +size 131072 diff --git a/wandb/run-20241030_222932-lsfm0d2q/files/requirements.txt b/wandb/run-20241030_222932-lsfm0d2q/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_222932-lsfm0d2q/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_231835-tt0m4qwl/files/output.log b/wandb/run-20241030_231835-tt0m4qwl/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9c48a1ce5646decd99338bc3b2cf9b43bea78b4a --- /dev/null +++ b/wandb/run-20241030_231835-tt0m4qwl/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.72s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 23:18:42,992] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 23:18:52,859] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.861032247543335 seconds diff --git a/wandb/run-20241030_231835-tt0m4qwl/files/requirements.txt b/wandb/run-20241030_231835-tt0m4qwl/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_231835-tt0m4qwl/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_000839-mehxid7z/files/wandb-metadata.json b/wandb/run-20241031_000839-mehxid7z/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..0a42f392ddc98b723fa2d41d1250ac0b2bd766ce --- /dev/null +++ b/wandb/run-20241031_000839-mehxid7z/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T04:08:39.206832Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1727270539264" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_000839-mehxid7z/logs/debug.log b/wandb/run-20241031_000839-mehxid7z/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..e49b54cac4e1d1261ca6836f23b87c1a79cef65b --- /dev/null +++ b/wandb/run-20241031_000839-mehxid7z/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Configure stats pid to 477295 +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_000839-mehxid7z/logs/debug.log +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_000839-mehxid7z/logs/debug-internal.log +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:init():621] calling init triggers +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:init():671] starting backend +2024-10-31 00:08:39,204 INFO MainThread:477295 [wandb_init.py:init():675] sending inform_init request +2024-10-31 00:08:39,206 INFO MainThread:477295 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 00:08:39,206 INFO MainThread:477295 [wandb_init.py:init():688] backend started and connected +2024-10-31 00:08:39,210 INFO MainThread:477295 [wandb_init.py:init():783] updated telemetry +2024-10-31 00:08:39,267 INFO MainThread:477295 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 00:08:39,494 INFO MainThread:477295 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 00:08:39,592 INFO MainThread:477295 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 00:08:39,593 INFO MainThread:477295 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 00:08:39,593 INFO MainThread:477295 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 00:08:39,593 INFO MainThread:477295 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 00:08:39,594 INFO MainThread:477295 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 00:08:39,595 INFO MainThread:477295 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 1e-05} diff --git a/wandb/run-20241031_001055-u3ke9zn3/files/output.log b/wandb/run-20241031_001055-u3ke9zn3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0745ee97193ba93d18ed513130eb5eb032b07a38 --- /dev/null +++ b/wandb/run-20241031_001055-u3ke9zn3/files/output.log @@ -0,0 +1,36 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.85s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 00:11:03,867] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 00:11:12,798] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.4102067947387695 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 220, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 195, in backward + self.engine.step() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2213, in step + self._take_model_step(lr_kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2119, in _take_model_step + self.optimizer.step() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1878, in step + fp32_partition.to(get_accelerator().current_device_name()).data) +KeyboardInterrupt +Error in atexit._run_exitfuncs: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 27, in is_nfs_path diff --git a/wandb/run-20241031_001055-u3ke9zn3/files/requirements.txt b/wandb/run-20241031_001055-u3ke9zn3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_001055-u3ke9zn3/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_001055-u3ke9zn3/files/wandb-metadata.json b/wandb/run-20241031_001055-u3ke9zn3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2f275a429127d52a66176682bec07ed2ab878199 --- /dev/null +++ b/wandb/run-20241031_001055-u3ke9zn3/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T04:10:55.936073Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1728856920064" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_001055-u3ke9zn3/logs/debug.log b/wandb/run-20241031_001055-u3ke9zn3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..92f72d60cab0da8ecb3b9fe85779e9dabd0fe485 --- /dev/null +++ b/wandb/run-20241031_001055-u3ke9zn3/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Configure stats pid to 479385 +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_001055-u3ke9zn3/logs/debug.log +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_001055-u3ke9zn3/logs/debug-internal.log +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:init():621] calling init triggers +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:init():671] starting backend +2024-10-31 00:10:55,934 INFO MainThread:479385 [wandb_init.py:init():675] sending inform_init request +2024-10-31 00:10:55,935 INFO MainThread:479385 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 00:10:55,935 INFO MainThread:479385 [wandb_init.py:init():688] backend started and connected +2024-10-31 00:10:55,939 INFO MainThread:479385 [wandb_init.py:init():783] updated telemetry +2024-10-31 00:10:55,980 INFO MainThread:479385 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 00:10:56,328 INFO MainThread:479385 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 00:10:56,445 INFO MainThread:479385 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 00:10:56,445 INFO MainThread:479385 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 00:10:56,445 INFO MainThread:479385 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 00:10:56,445 INFO MainThread:479385 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 00:10:56,447 INFO MainThread:479385 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 00:10:56,447 INFO MainThread:479385 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 1e-05} diff --git a/wandb/run-20241031_122005-xvi7ifef/files/output.log b/wandb/run-20241031_122005-xvi7ifef/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..eb4aa943e49e5a53f1083045c310c2f58cb52f69 --- /dev/null +++ b/wandb/run-20241031_122005-xvi7ifef/files/output.log @@ -0,0 +1,38 @@ +Downloading shards: 0%| | 0/2 [00:22 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241031_122005-xvi7ifef/files/wandb-metadata.json b/wandb/run-20241031_122005-xvi7ifef/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..0eb9727242a098785c6e41f22a78e40c84a3881b --- /dev/null +++ b/wandb/run-20241031_122005-xvi7ifef/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T16:20:05.891835Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753159847936" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_122005-xvi7ifef/files/wandb-summary.json b/wandb/run-20241031_122005-xvi7ifef/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..15f6b8e9049a55292dab131278b3f2fc1f52e50d --- /dev/null +++ b/wandb/run-20241031_122005-xvi7ifef/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23}} \ No newline at end of file diff --git a/wandb/run-20241031_122005-xvi7ifef/logs/debug-internal.log b/wandb/run-20241031_122005-xvi7ifef/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..72868e514c6af9bd93343919572ba490d4706259 --- /dev/null +++ b/wandb/run-20241031_122005-xvi7ifef/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-10-31T12:20:05.893701815-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T12:20:05.893713785-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-xvi7ifef/logs/debug-core.log"} +{"time":"2024-10-31T12:20:06.001382077-04:00","level":"INFO","msg":"created new stream","id":"xvi7ifef"} +{"time":"2024-10-31T12:20:06.001427307-04:00","level":"INFO","msg":"stream: started","id":"xvi7ifef"} +{"time":"2024-10-31T12:20:06.001462087-04:00","level":"INFO","msg":"sender: started","stream_id":"xvi7ifef"} +{"time":"2024-10-31T12:20:06.001438197-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"xvi7ifef"}} +{"time":"2024-10-31T12:20:06.001453797-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"xvi7ifef"}} +{"time":"2024-10-31T12:20:06.189578398-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T12:20:29.345121727-04:00","level":"INFO","msg":"stream: closing","id":"xvi7ifef"} +{"time":"2024-10-31T12:20:29.345240347-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T12:20:29.346116817-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241031_122005-xvi7ifef/logs/debug.log b/wandb/run-20241031_122005-xvi7ifef/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..948a3fecb4d5977b50cf4c376769b1e6ba433026 --- /dev/null +++ b/wandb/run-20241031_122005-xvi7ifef/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Configure stats pid to 557187 +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 12:20:05,888 INFO MainThread:557187 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-xvi7ifef/logs/debug.log +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-xvi7ifef/logs/debug-internal.log +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:init():621] calling init triggers +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:init():671] starting backend +2024-10-31 12:20:05,889 INFO MainThread:557187 [wandb_init.py:init():675] sending inform_init request +2024-10-31 12:20:05,891 INFO MainThread:557187 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 12:20:05,891 INFO MainThread:557187 [wandb_init.py:init():688] backend started and connected +2024-10-31 12:20:05,894 INFO MainThread:557187 [wandb_init.py:init():783] updated telemetry +2024-10-31 12:20:05,922 INFO MainThread:557187 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 12:20:06,186 INFO MainThread:557187 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 12:20:06,278 INFO MainThread:557187 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 12:20:06,278 INFO MainThread:557187 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 12:20:06,278 INFO MainThread:557187 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 12:20:06,278 INFO MainThread:557187 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 12:20:06,279 INFO MainThread:557187 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 12:20:06,280 INFO MainThread:557187 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-10-31 12:20:29,345 WARNING MsgRouterThr:557187 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241031_122005-xvi7ifef/run-xvi7ifef.wandb b/wandb/run-20241031_122005-xvi7ifef/run-xvi7ifef.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_012438-tr39r2kv/files/wandb-metadata.json b/wandb/run-20241101_012438-tr39r2kv/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..912d2418f0d249366f891979328b3b61316f3867 --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/files/wandb-metadata.json @@ -0,0 +1,29 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T05:24:38.166620Z", + "args": [ + "--perturbation", + "shuffle_nodeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py" +} \ No newline at end of file diff --git a/wandb/run-20241101_012438-tr39r2kv/logs/debug.log b/wandb/run-20241101_012438-tr39r2kv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..71d40c38e844d2a5ff2fb5324b44b5333899b93b --- /dev/null +++ b/wandb/run-20241101_012438-tr39r2kv/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 01:24:38,164 INFO MainThread:676350 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:24:38,164 INFO MainThread:676350 [wandb_setup.py:_flush():79] Configure stats pid to 676350 +2024-11-01 01:24:38,164 INFO MainThread:676350 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:24:38,164 INFO MainThread:676350 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:24:38,164 INFO MainThread:676350 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-tr39r2kv/logs/debug.log +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-tr39r2kv/logs/debug-internal.log +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:init():671] starting backend +2024-11-01 01:24:38,165 INFO MainThread:676350 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:24:38,166 INFO MainThread:676350 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:24:38,166 INFO MainThread:676350 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:24:38,169 INFO MainThread:676350 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:24:38,201 INFO MainThread:676350 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:24:38,462 INFO MainThread:676350 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:24:38,576 INFO MainThread:676350 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:24:38,577 INFO MainThread:676350 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:24:38,577 INFO MainThread:676350 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:24:38,577 INFO MainThread:676350 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:24:38,579 INFO MainThread:676350 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:24:38,579 INFO MainThread:676350 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 01:24:38,591 WARNING MsgRouterThr:676350 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_012612-2icf8uyp/files/output.log b/wandb/run-20241101_012612-2icf8uyp/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d --- /dev/null +++ b/wandb/run-20241101_012612-2icf8uyp/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder + builder_instance: DatasetBuilder = builder_cls( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__ + self.config, self.config_id = self._create_builder_config( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config + raise ValueError( +ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0'] diff --git a/wandb/run-20241101_012612-2icf8uyp/files/wandb-summary.json b/wandb/run-20241101_012612-2icf8uyp/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241101_012612-2icf8uyp/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241101_012612-2icf8uyp/logs/debug-internal.log b/wandb/run-20241101_012612-2icf8uyp/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6705ce98e5c4ab44c092ef980709029c69c34cdd --- /dev/null +++ b/wandb/run-20241101_012612-2icf8uyp/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-01T01:26:12.959111943-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:26:12.959129113-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-2icf8uyp/logs/debug-core.log"} +{"time":"2024-11-01T01:26:13.06672005-04:00","level":"INFO","msg":"created new stream","id":"2icf8uyp"} +{"time":"2024-11-01T01:26:13.06676589-04:00","level":"INFO","msg":"stream: started","id":"2icf8uyp"} +{"time":"2024-11-01T01:26:13.067468275-04:00","level":"INFO","msg":"sender: started","stream_id":"2icf8uyp"} +{"time":"2024-11-01T01:26:13.06678695-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"2icf8uyp"}} +{"time":"2024-11-01T01:26:13.0668113-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"2icf8uyp"}} +{"time":"2024-11-01T01:26:13.261986126-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T01:26:13.37198602-04:00","level":"INFO","msg":"stream: closing","id":"2icf8uyp"} +{"time":"2024-11-01T01:26:13.37204529-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T01:26:13.372541984-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-01T01:26:14.110429588-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-01T01:26:14.235735809-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"2icf8uyp"}} +{"time":"2024-11-01T01:26:14.235805009-04:00","level":"INFO","msg":"sender: closed","stream_id":"2icf8uyp"} +{"time":"2024-11-01T01:26:14.235790089-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"2icf8uyp"}} +{"time":"2024-11-01T01:26:14.236052171-04:00","level":"INFO","msg":"stream: closed","id":"2icf8uyp"} diff --git a/wandb/run-20241101_012612-kwpxhz4q/files/output.log b/wandb/run-20241101_012612-kwpxhz4q/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d --- /dev/null +++ b/wandb/run-20241101_012612-kwpxhz4q/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder + builder_instance: DatasetBuilder = builder_cls( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__ + self.config, self.config_id = self._create_builder_config( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config + raise ValueError( +ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0'] diff --git a/wandb/run-20241101_012612-kwpxhz4q/logs/debug.log b/wandb/run-20241101_012612-kwpxhz4q/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..daa51bfbd5a490a916d46f7f3fa93ccec9e3fb1d --- /dev/null +++ b/wandb/run-20241101_012612-kwpxhz4q/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Configure stats pid to 677638 +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-kwpxhz4q/logs/debug.log +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-kwpxhz4q/logs/debug-internal.log +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:init():671] starting backend +2024-11-01 01:26:12,562 INFO MainThread:677638 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:26:12,563 INFO MainThread:677638 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:26:12,564 INFO MainThread:677638 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:26:12,568 INFO MainThread:677638 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:26:12,593 INFO MainThread:677638 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:26:12,862 INFO MainThread:677638 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:26:12,965 INFO MainThread:677638 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:26:12,965 INFO MainThread:677638 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:26:12,965 INFO MainThread:677638 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:26:12,965 INFO MainThread:677638 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:26:12,967 INFO MainThread:677638 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:26:12,967 INFO MainThread:677638 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 01:26:12,976 WARNING MsgRouterThr:677638 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_012612-kwpxhz4q/run-kwpxhz4q.wandb b/wandb/run-20241101_012612-kwpxhz4q/run-kwpxhz4q.wandb new file mode 100644 index 0000000000000000000000000000000000000000..33dae860115e9dcad6e2ee6e4ccf74d91cba63fe Binary files /dev/null and b/wandb/run-20241101_012612-kwpxhz4q/run-kwpxhz4q.wandb differ diff --git a/wandb/run-20241101_012733-4u8e027p/run-4u8e027p.wandb b/wandb/run-20241101_012733-4u8e027p/run-4u8e027p.wandb new file mode 100644 index 0000000000000000000000000000000000000000..8a89c7c1384cc3cf64870a047af398f38f280a14 --- /dev/null +++ b/wandb/run-20241101_012733-4u8e027p/run-4u8e027p.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8186a12a300f9a4c63bfd615e8ba5e7dfb9ff87f03aa3b2e0db9927c1fbd2c93 +size 1015808 diff --git a/wandb/run-20241101_012733-e3zsr634/run-e3zsr634.wandb b/wandb/run-20241101_012733-e3zsr634/run-e3zsr634.wandb new file mode 100644 index 0000000000000000000000000000000000000000..1f57d70cef586843baf1781f2a122e3e2972307a --- /dev/null +++ b/wandb/run-20241101_012733-e3zsr634/run-e3zsr634.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94ee42471a30396c5c6bc1d3008739f6570cb77323089d6b7050c5abd4de1101 +size 1015808 diff --git a/wandb/run-20241101_012734-m18lsdzn/files/output.log b/wandb/run-20241101_012734-m18lsdzn/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a2a0ae11d384b7c425f45978183f6a8c31411683 --- /dev/null +++ b/wandb/run-20241101_012734-m18lsdzn/files/output.log @@ -0,0 +1,19 @@ +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 4.24MB/s] +model-00001-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.0MB/s] +model-00002-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.36s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.43s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:52<00:00, 314.14 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:53<00:00, 317.54 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 01:32:29,634] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 01:32:39,928] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.3903725147247314 seconds diff --git a/wandb/run-20241101_200502-28ivel81/run-28ivel81.wandb b/wandb/run-20241101_200502-28ivel81/run-28ivel81.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_200502-8iosiep6/files/output.log b/wandb/run-20241101_200502-8iosiep6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..04ca77dae62c42467f2e3713ee4aa0dba0d01be1 --- /dev/null +++ b/wandb/run-20241101_200502-8iosiep6/files/output.log @@ -0,0 +1 @@ +Loading checkpoint shards: 0%| | 0/2 [00:00 + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2096, in load_dataset + builder_instance.download_and_prepare( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 855, in download_and_prepare + Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1327, in mkdir + self.parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1323, in mkdir + self._accessor.mkdir(self, mode) +OSError: [Errno 28] No space left on device: '/home/chunhui/.cache/huggingface/datasets/babylm_dataset_test/babylm_shuffle_deterministic21_10M_seed0' diff --git a/wandb/run-20241105_160217-dgnjdt5g/files/wandb-summary.json b/wandb/run-20241105_160217-dgnjdt5g/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a59211b910c7b68e6827eb6c887d30d98244727c --- /dev/null +++ b/wandb/run-20241105_160217-dgnjdt5g/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":5}} \ No newline at end of file diff --git a/wandb/run-20241105_162824-8jiuqja7/files/output.log b/wandb/run-20241105_162824-8jiuqja7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f2166ab915d79f46bacdae161b09bf30c92ebfd4 --- /dev/null +++ b/wandb/run-20241105_162824-8jiuqja7/files/output.log @@ -0,0 +1,42 @@ +wandb: 500 encountered ({"errors":[{"message":"An internal error occurred. Please contact support.","path":["upsertBucket"]}],"data":{"upsertBucket":null}}), retrying request +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.96s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:47<00:00, 359.85 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-05 16:29:22,117] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-05 16:29:30,513] [INFO] [comm.py:652:init_distributed] cdb=None +[2024-11-05 16:29:30,513] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.061752796173096 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 222, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2204, in _inner_training_loop + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 1344, in prepare + result = self._prepare_deepspeed(*args) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 1851, in _prepare_deepspeed + engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/__init__.py", line 193, in initialize + engine = DeepSpeedEngine(args=args, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 313, in __init__ + self._configure_optimizer(optimizer, model_parameters) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1302, in _configure_optimizer + self.optimizer = self._configure_zero_optimizer(basic_optimizer) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1560, in _configure_zero_optimizer + optimizer = DeepSpeedZeroOptimizer( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 326, in __init__ + param.cpu_data = param.data.cpu() +KeyboardInterrupt diff --git a/wandb/run-20241105_162824-8jiuqja7/files/requirements.txt b/wandb/run-20241105_162824-8jiuqja7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_162824-8jiuqja7/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_162824-8jiuqja7/files/wandb-metadata.json b/wandb/run-20241105_162824-8jiuqja7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8ea95654101cd6f2ab443f6b531976dcda2c98fc --- /dev/null +++ b/wandb/run-20241105_162824-8jiuqja7/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:28:24.373116Z", + "args": [ + "--perturbation", + "shuffle_deterministic21", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785662992384" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_162824-8jiuqja7/logs/debug-internal.log b/wandb/run-20241105_162824-8jiuqja7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..950888fa461be63cc6b0f0dd9fc2fe13bb114b77 --- /dev/null +++ b/wandb/run-20241105_162824-8jiuqja7/logs/debug-internal.log @@ -0,0 +1,9 @@ +{"time":"2024-11-05T16:28:24.376765683-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:28:24.376792993-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-8jiuqja7/logs/debug-core.log"} +{"time":"2024-11-05T16:28:24.486846318-05:00","level":"INFO","msg":"created new stream","id":"8jiuqja7"} +{"time":"2024-11-05T16:28:24.486902458-05:00","level":"INFO","msg":"stream: started","id":"8jiuqja7"} +{"time":"2024-11-05T16:28:24.488329528-05:00","level":"INFO","msg":"sender: started","stream_id":"8jiuqja7"} +{"time":"2024-11-05T16:28:24.487012259-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"8jiuqja7"}} +{"time":"2024-11-05T16:28:24.486949588-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"8jiuqja7"}} +{"time":"2024-11-05T16:28:24.606025727-05:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-11-05T16:28:27.20885453-05:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241105_162824-8jiuqja7/logs/debug.log b/wandb/run-20241105_162824-8jiuqja7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..151a5abe8e4322ad8c49a0c1c40fff4c64235a53 --- /dev/null +++ b/wandb/run-20241105_162824-8jiuqja7/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Configure stats pid to 1777855 +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:28:24,369 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-8jiuqja7/logs/debug.log +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-8jiuqja7/logs/debug-internal.log +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:init():671] starting backend +2024-11-05 16:28:24,370 INFO MainThread:1777855 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:28:24,372 INFO MainThread:1777855 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:28:24,372 INFO MainThread:1777855 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:28:24,375 INFO MainThread:1777855 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:28:24,403 INFO MainThread:1777855 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:28:27,205 INFO MainThread:1777855 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:28:27,293 INFO MainThread:1777855 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:28:27,293 INFO MainThread:1777855 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:28:27,293 INFO MainThread:1777855 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:28:27,293 INFO MainThread:1777855 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:28:27,295 INFO MainThread:1777855 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:28:27,295 INFO MainThread:1777855 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241105_162824-8jiuqja7/run-8jiuqja7.wandb b/wandb/run-20241105_162824-8jiuqja7/run-8jiuqja7.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6a1e34a577b8f4e6ca0b11bea7e1a5048e23f18d Binary files /dev/null and b/wandb/run-20241105_162824-8jiuqja7/run-8jiuqja7.wandb differ diff --git a/wandb/run-20241106_224416-qm85h10r/files/output.log b/wandb/run-20241106_224416-qm85h10r/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1f755f05b1d26cb867188f7e454ce8fab4c6d457 --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/files/output.log @@ -0,0 +1,39 @@ +Downloading shards: 0%| | 0/2 [00:00 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close + self.display(pos=0) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display + self.sp(self.__str__() if msg is None else msg) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1151, in __str__ + return self.format_meter(**self.format_dict) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1460, in format_dict + 'bar_format': self.bar_format, 'postfix': self.postfix, +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241106_224416-qm85h10r/files/requirements.txt b/wandb/run-20241106_224416-qm85h10r/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241106_224416-qm85h10r/files/wandb-metadata.json b/wandb/run-20241106_224416-qm85h10r/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..534106e4beceba4fa729ff77f96db9bd484fa21f --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T03:44:16.911575Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1774852624384" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_224416-qm85h10r/files/wandb-summary.json b/wandb/run-20241106_224416-qm85h10r/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1d52051e315a7a21a9d9e5a40a517408bb086162 --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2}} \ No newline at end of file diff --git a/wandb/run-20241106_224416-qm85h10r/logs/debug-internal.log b/wandb/run-20241106_224416-qm85h10r/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f0f144b5cc185a409f748f3bfa67f3b31713df88 --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-06T22:44:16.913757251-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T22:44:16.913770151-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224416-qm85h10r/logs/debug-core.log"} +{"time":"2024-11-06T22:44:17.021816611-05:00","level":"INFO","msg":"created new stream","id":"qm85h10r"} +{"time":"2024-11-06T22:44:17.021877401-05:00","level":"INFO","msg":"stream: started","id":"qm85h10r"} +{"time":"2024-11-06T22:44:17.021910891-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qm85h10r"}} +{"time":"2024-11-06T22:44:17.022030212-05:00","level":"INFO","msg":"sender: started","stream_id":"qm85h10r"} +{"time":"2024-11-06T22:44:17.022004502-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"qm85h10r"}} +{"time":"2024-11-06T22:44:17.191726778-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T22:44:19.091829758-05:00","level":"INFO","msg":"stream: closing","id":"qm85h10r"} +{"time":"2024-11-06T22:44:19.091884889-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T22:44:19.092976857-05:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241106_224416-qm85h10r/logs/debug.log b/wandb/run-20241106_224416-qm85h10r/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3bba01ed41ae91c05ff4b3d81be35fc9842df471 --- /dev/null +++ b/wandb/run-20241106_224416-qm85h10r/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Configure stats pid to 1982914 +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 22:44:16,908 INFO MainThread:1982914 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224416-qm85h10r/logs/debug.log +2024-11-06 22:44:16,909 INFO MainThread:1982914 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224416-qm85h10r/logs/debug-internal.log +2024-11-06 22:44:16,909 INFO MainThread:1982914 [wandb_init.py:init():621] calling init triggers +2024-11-06 22:44:16,909 INFO MainThread:1982914 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 22:44:16,909 INFO MainThread:1982914 [wandb_init.py:init():671] starting backend +2024-11-06 22:44:16,909 INFO MainThread:1982914 [wandb_init.py:init():675] sending inform_init request +2024-11-06 22:44:16,910 INFO MainThread:1982914 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 22:44:16,911 INFO MainThread:1982914 [wandb_init.py:init():688] backend started and connected +2024-11-06 22:44:16,914 INFO MainThread:1982914 [wandb_init.py:init():783] updated telemetry +2024-11-06 22:44:16,935 INFO MainThread:1982914 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 22:44:17,188 INFO MainThread:1982914 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 22:44:17,283 INFO MainThread:1982914 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 22:44:17,283 INFO MainThread:1982914 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 22:44:17,283 INFO MainThread:1982914 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 22:44:17,283 INFO MainThread:1982914 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 22:44:17,286 INFO MainThread:1982914 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 22:44:17,287 INFO MainThread:1982914 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 22:44:19,092 WARNING MsgRouterThr:1982914 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_232725-hsmv8meh/files/config.yaml b/wandb/run-20241106_232725-hsmv8meh/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c9885a80c6c6551af0939982b4765da8541cf4 --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_even_odd +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_232725-hsmv8meh/files/output.log b/wandb/run-20241106_232725-hsmv8meh/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..d532cae2029e21ae571681250504af85ad04247b --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c41c6-757c33fc348a60132a184376;77614ac9-5777-4710-b972-37cdafbebed6) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c41c6-757c33fc348a60132a184376;77614ac9-5777-4710-b972-37cdafbebed6) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_232725-hsmv8meh/files/wandb-metadata.json b/wandb/run-20241106_232725-hsmv8meh/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..547d204eb25bc577a6f2d77e57b57df66b6c8083 --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T04:27:25.782570Z", + "args": [ + "--perturbation", + "shuffle_even_odd", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1775367774208" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_232725-hsmv8meh/files/wandb-summary.json b/wandb/run-20241106_232725-hsmv8meh/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..fea5e15dda6eb8b3ffcccab00b24d57fa587c95b --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":24}} \ No newline at end of file diff --git a/wandb/run-20241106_232725-hsmv8meh/logs/debug.log b/wandb/run-20241106_232725-hsmv8meh/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3a3c758ba0dec81aae3c69426dafd61500a37ae3 --- /dev/null +++ b/wandb/run-20241106_232725-hsmv8meh/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Configure stats pid to 1993251 +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 23:27:25,780 INFO MainThread:1993251 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_232725-hsmv8meh/logs/debug.log +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_232725-hsmv8meh/logs/debug-internal.log +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:init():621] calling init triggers +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:init():671] starting backend +2024-11-06 23:27:25,781 INFO MainThread:1993251 [wandb_init.py:init():675] sending inform_init request +2024-11-06 23:27:25,782 INFO MainThread:1993251 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 23:27:25,782 INFO MainThread:1993251 [wandb_init.py:init():688] backend started and connected +2024-11-06 23:27:25,785 INFO MainThread:1993251 [wandb_init.py:init():783] updated telemetry +2024-11-06 23:27:25,809 INFO MainThread:1993251 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 23:27:26,367 INFO MainThread:1993251 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 23:27:26,457 INFO MainThread:1993251 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 23:27:26,457 INFO MainThread:1993251 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 23:27:26,457 INFO MainThread:1993251 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 23:27:26,458 INFO MainThread:1993251 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 23:27:26,459 INFO MainThread:1993251 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 23:27:26,459 INFO MainThread:1993251 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_even_odd', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 23:27:50,242 WARNING MsgRouterThr:1993251 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_232725-hsmv8meh/run-hsmv8meh.wandb b/wandb/run-20241106_232725-hsmv8meh/run-hsmv8meh.wandb new file mode 100644 index 0000000000000000000000000000000000000000..861e7193bd2ebdb9500f7b94d7fcf22d3c0da246 Binary files /dev/null and b/wandb/run-20241106_232725-hsmv8meh/run-hsmv8meh.wandb differ diff --git a/wandb/run-20241128_161554-ol74k8mz/files/config.yaml b/wandb/run-20241128_161554-ol74k8mz/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d07c4054eb6cea6e4e93d3fb630156b4b6a6798 --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241128_161554-ol74k8mz/files/wandb-summary.json b/wandb/run-20241128_161554-ol74k8mz/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6aab003b4b8541a72d07e8d6bea8280b1dfec1 --- /dev/null +++ b/wandb/run-20241128_161554-ol74k8mz/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":53001}} \ No newline at end of file diff --git a/wandb/run-20241128_161638-a0iw6rlo/files/wandb-metadata.json b/wandb/run-20241128_161638-a0iw6rlo/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e586077cdf1deae80375be1dc88fef6e86e907d9 --- /dev/null +++ b/wandb/run-20241128_161638-a0iw6rlo/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-28T21:16:38.430249Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py", + "codePath": "train/train_llama_1B.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_llama_1B.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1723122212864" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161638-a0iw6rlo/logs/debug.log b/wandb/run-20241128_161638-a0iw6rlo/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0a5abfdd9f8e2a535511aa2a3903d7acbfcdffac --- /dev/null +++ b/wandb/run-20241128_161638-a0iw6rlo/logs/debug.log @@ -0,0 +1,36 @@ +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Configure stats pid to 3102256 +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-a0iw6rlo/logs/debug.log +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-a0iw6rlo/logs/debug-internal.log +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:init():671] starting backend +2024-11-28 16:16:38,428 INFO MainThread:3102256 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:16:38,429 INFO MainThread:3102256 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:16:38,430 INFO MainThread:3102256 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:16:38,434 INFO MainThread:3102256 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:16:38,463 INFO MainThread:3102256 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:16:38,699 INFO MainThread:3102256 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:16:38,789 INFO MainThread:3102256 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:16:38,789 INFO MainThread:3102256 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:16:38,789 INFO MainThread:3102256 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:16:38,789 INFO MainThread:3102256 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:16:38,791 INFO MainThread:3102256 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:16:38,791 INFO MainThread:3102256 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-28 16:19:46,930 INFO MainThread:3102256 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 2048, 'intermediate_size': 8192, 'num_hidden_layers': 16, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-1B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-1B/babylm_reverse_full_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-1B/babylm_reverse_full_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-28 16:19:46,932 INFO MainThread:3102256 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1235814400 - > +2024-11-28 16:19:46,932 INFO MainThread:3102256 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 1235814400 None +2024-11-29 05:45:58,294 INFO MainThread:3102256 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/a0iw6rlo +2024-11-29 05:45:58,398 INFO MainThread:3102256 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 05:45:58,398 INFO MainThread:3102256 [wandb_run.py:_restore():2408] restore +2024-11-29 05:45:58,398 INFO MainThread:3102256 [wandb_run.py:_restore():2414] restore done +2024-11-29 05:46:00,762 INFO MainThread:3102256 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 05:46:00,763 INFO MainThread:3102256 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 05:46:00,772 INFO MainThread:3102256 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241129_235241-3ftwr60m/files/output.log b/wandb/run-20241129_235241-3ftwr60m/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2928f959b1c495a52efaf828d2ea2428c2949fa2 --- /dev/null +++ b/wandb/run-20241129_235241-3ftwr60m/files/output.log @@ -0,0 +1,44 @@ +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:32<00:00, 539.49 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:35<00:00, 512.06 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-29 23:54:16,754] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-29 23:54:19,739] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Creating extension directory /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 28.528657913208008 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py", line 114, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 186, in backward + self.engine.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2020, in backward + self.optimizer.backward(loss, retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2063, in backward + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward + scaled_loss.backward(retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward + torch.autograd.backward( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/autograd/__init__.py", line 200, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt diff --git a/wandb/run-20241129_235241-3ftwr60m/files/wandb-metadata.json b/wandb/run-20241129_235241-3ftwr60m/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ef2930193a1a1b20a423538f00a05b76ff96a1ec --- /dev/null +++ b/wandb/run-20241129_235241-3ftwr60m/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-30T04:52:41.156259Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py", + "codePath": "train/train_gpt2.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_gpt2.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719074058240" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241129_235241-3ftwr60m/logs/debug.log b/wandb/run-20241129_235241-3ftwr60m/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..27c0be6f00d9756bf45c3be066570e6f288ee6f0 --- /dev/null +++ b/wandb/run-20241129_235241-3ftwr60m/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Configure stats pid to 3200480 +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_gpt2.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_gpt2.py'} +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241129_235241-3ftwr60m/logs/debug.log +2024-11-29 23:52:41,154 INFO MainThread:3200480 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241129_235241-3ftwr60m/logs/debug-internal.log +2024-11-29 23:52:41,155 INFO MainThread:3200480 [wandb_init.py:init():621] calling init triggers +2024-11-29 23:52:41,155 INFO MainThread:3200480 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-29 23:52:41,155 INFO MainThread:3200480 [wandb_init.py:init():671] starting backend +2024-11-29 23:52:41,155 INFO MainThread:3200480 [wandb_init.py:init():675] sending inform_init request +2024-11-29 23:52:41,155 INFO MainThread:3200480 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-29 23:52:41,156 INFO MainThread:3200480 [wandb_init.py:init():688] backend started and connected +2024-11-29 23:52:41,159 INFO MainThread:3200480 [wandb_init.py:init():783] updated telemetry +2024-11-29 23:52:41,179 INFO MainThread:3200480 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-29 23:52:41,457 INFO MainThread:3200480 [wandb_init.py:init():867] starting run threads in backend +2024-11-29 23:52:41,554 INFO MainThread:3200480 [wandb_run.py:_console_start():2463] atexit reg +2024-11-29 23:52:41,554 INFO MainThread:3200480 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-29 23:52:41,554 INFO MainThread:3200480 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-29 23:52:41,554 INFO MainThread:3200480 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-29 23:52:41,556 INFO MainThread:3200480 [wandb_init.py:init():911] run started, returning control to user process +2024-11-29 23:52:41,557 INFO MainThread:3200480 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241129_235322-dmnv987j/logs/debug-core.log b/wandb/run-20241129_235322-dmnv987j/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..f91cd6b02de6a6ec78409443d6d3e29fe713b2b3 --- /dev/null +++ b/wandb/run-20241129_235322-dmnv987j/logs/debug-core.log @@ -0,0 +1,24 @@ +{"time":"2024-11-29T23:53:22.347749866-05:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp381bxy7m/port-3201035.txt","pid":3201035,"debug":false,"disable-analytics":false} +{"time":"2024-11-29T23:53:22.347781046-05:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2024-11-29T23:53:22.348347541-05:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3201035} +{"time":"2024-11-29T23:53:22.348352301-05:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":37153,"Zone":""}} +{"time":"2024-11-29T23:53:22.516481832-05:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp247q5ejk/port-3201036.txt","pid":3201036,"debug":false,"disable-analytics":false} +{"time":"2024-11-29T23:53:22.516525052-05:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2024-11-29T23:53:22.517164117-05:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3201036} +{"time":"2024-11-29T23:53:22.517175547-05:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":42113,"Zone":""}} +{"time":"2024-11-29T23:53:22.541083849-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37500"} +{"time":"2024-11-29T23:53:22.543159876-05:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp8356drfi/port-3201034.txt","pid":3201034,"debug":false,"disable-analytics":false} +{"time":"2024-11-29T23:53:22.543210716-05:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2024-11-29T23:53:22.54369121-05:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3201034} +{"time":"2024-11-29T23:53:22.5436952-05:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39837,"Zone":""}} +{"time":"2024-11-29T23:53:22.700622841-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"dmnv987j","id":"127.0.0.1:37500"} +{"time":"2024-11-29T23:53:22.709600433-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46774"} +{"time":"2024-11-29T23:53:22.736569859-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:53958"} +{"time":"2024-11-29T23:53:22.80628015-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"dmnv987j","id":"127.0.0.1:37500"} +{"time":"2024-11-29T23:53:22.871288412-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"bxqdruiw","id":"127.0.0.1:46774"} +{"time":"2024-11-29T23:53:22.887605973-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"uoj0g6xp","id":"127.0.0.1:53958"} +{"time":"2024-11-29T23:53:22.97810181-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"bxqdruiw","id":"127.0.0.1:46774"} +{"time":"2024-11-29T23:53:22.993992888-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"uoj0g6xp","id":"127.0.0.1:53958"} +{"time":"2024-11-30T00:02:09.611158081-05:00","level":"INFO","msg":"Parent process exited, terminating service process."} +{"time":"2024-11-30T00:02:09.78251932-05:00","level":"INFO","msg":"Parent process exited, terminating service process."} +{"time":"2024-11-30T00:02:09.933472721-05:00","level":"INFO","msg":"Parent process exited, terminating service process."} diff --git a/wandb/run-20241129_235322-dmnv987j/run-dmnv987j.wandb b/wandb/run-20241129_235322-dmnv987j/run-dmnv987j.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f5cb92174045149b6bf6d70860a5bf7ad8d3f9f5 --- /dev/null +++ b/wandb/run-20241129_235322-dmnv987j/run-dmnv987j.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ed7fb9e059a50ebad73536226d260027d181a8fa45042daec7e5fb6dc3081aa +size 425984