diff --git a/.gitattributes b/.gitattributes index fc11d84836fbe4639b2a81fa3379877fa4a4b75f..4a58d2912f3c3acfe06a6707ffad50fbcbadd049 100644 --- a/.gitattributes +++ b/.gitattributes @@ -92,3 +92,8 @@ wandb/run-20241106_232725-f16bcfrx/run-f16bcfrx.wandb filter=lfs diff=lfs merge= wandb/run-20241105_163244-59l4qxgx/run-59l4qxgx.wandb filter=lfs diff=lfs merge=lfs -text wandb/run-20241101_202058-ptl7coag/run-ptl7coag.wandb filter=lfs diff=lfs merge=lfs -text wandb/run-20241129_235322-bxqdruiw/run-bxqdruiw.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_233740-np98q8en/run-np98q8en.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_202058-hjyig8so/run-hjyig8so.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_012733-3tsgnm2p/run-3tsgnm2p.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_112852-av3r7rx8/run-av3r7rx8.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241115_125218-rrve0rbk/run-rrve0rbk.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/wandb/run-20241030_013339-pahr4hk1/files/output.log b/wandb/run-20241030_013339-pahr4hk1/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25f7745e230d4f0d6a9ee016537fd6c75f7292e2 --- /dev/null +++ b/wandb/run-20241030_013339-pahr4hk1/files/output.log @@ -0,0 +1,21 @@ +config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 279kB/s] +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 18.9MB/s] +model-00001-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:57<00:00, 42.1MB/s] +model-00002-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.19s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.10s/it] +generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 74.7kB/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:57<00:00, 302.93 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:57<00:00, 316.26 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 01:38:13,749] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 01:38:21,729] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.509259939193726 seconds diff --git a/wandb/run-20241030_013339-pahr4hk1/files/requirements.txt b/wandb/run-20241030_013339-pahr4hk1/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_013339-pahr4hk1/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_013339-pahr4hk1/files/wandb-metadata.json b/wandb/run-20241030_013339-pahr4hk1/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..164d966dc1945d88de7e661aa2f20976a21d3f9d --- /dev/null +++ b/wandb/run-20241030_013339-pahr4hk1/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:33:39.715818Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710081847296" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_013339-pahr4hk1/logs/debug-internal.log b/wandb/run-20241030_013339-pahr4hk1/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3b74ecd94f1299b97283f907e5d8c384b84039a4 --- /dev/null +++ b/wandb/run-20241030_013339-pahr4hk1/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:33:39.718219018-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:33:39.718234308-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-pahr4hk1/logs/debug-core.log"} +{"time":"2024-10-30T01:33:39.825323399-04:00","level":"INFO","msg":"created new stream","id":"pahr4hk1"} +{"time":"2024-10-30T01:33:39.825359099-04:00","level":"INFO","msg":"stream: started","id":"pahr4hk1"} +{"time":"2024-10-30T01:33:39.825400679-04:00","level":"INFO","msg":"sender: started","stream_id":"pahr4hk1"} +{"time":"2024-10-30T01:33:39.825385829-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"pahr4hk1"}} +{"time":"2024-10-30T01:33:39.825383059-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"pahr4hk1"}} +{"time":"2024-10-30T01:33:39.990823292-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_013339-pahr4hk1/logs/debug.log b/wandb/run-20241030_013339-pahr4hk1/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..75ee7e25d57d739f65af2a17445144eb6825985d --- /dev/null +++ b/wandb/run-20241030_013339-pahr4hk1/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 01:33:39,713 INFO MainThread:337258 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Configure stats pid to 337258 +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-pahr4hk1/logs/debug.log +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-pahr4hk1/logs/debug-internal.log +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:init():671] starting backend +2024-10-30 01:33:39,714 INFO MainThread:337258 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:33:39,715 INFO MainThread:337258 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:33:39,715 INFO MainThread:337258 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:33:39,718 INFO MainThread:337258 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:33:39,748 INFO MainThread:337258 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:33:39,986 INFO MainThread:337258 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:33:40,076 INFO MainThread:337258 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:33:40,076 INFO MainThread:337258 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:33:40,076 INFO MainThread:337258 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:33:40,076 INFO MainThread:337258 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:33:40,077 INFO MainThread:337258 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:33:40,078 INFO MainThread:337258 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} diff --git a/wandb/run-20241030_112852-av3r7rx8/run-av3r7rx8.wandb b/wandb/run-20241030_112852-av3r7rx8/run-av3r7rx8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e1889d60cd6b1335417e7f194da470a45cc0426d --- /dev/null +++ b/wandb/run-20241030_112852-av3r7rx8/run-av3r7rx8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82826fe5130b74ba5b11ab4c02aeb2ce26500de1060a76c495fae27c1f928f5f +size 14185176 diff --git a/wandb/run-20241030_112852-mfvd6tgw/files/config.yaml b/wandb/run-20241030_112852-mfvd6tgw/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f639ffb58b2b21d07229ba50bd66c486bf92404a --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/files/config.yaml @@ -0,0 +1,48 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_112852-mfvd6tgw/files/output.log b/wandb/run-20241030_112852-mfvd6tgw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6087df3e4060a5013ac65c15ed315a59a9c97d23 --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/files/output.log @@ -0,0 +1,17 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:08<00:00, 64.23s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.16s/it] +generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 57.7kB/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:49<00:00, 367.87 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 11:31:57,082] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 11:32:05,240] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.680240869522095 seconds +wandb: WARNING Fatal error while uploading data. Some run data will not be synced, but it will still be written to disk. Use `wandb sync` at the end of the run to try uploading. diff --git a/wandb/run-20241030_112852-mfvd6tgw/files/wandb-metadata.json b/wandb/run-20241030_112852-mfvd6tgw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3edf6791a99cf2bdc102b41272fac42b6ae62ff7 --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:28:52.868095Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831611904" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_112852-mfvd6tgw/files/wandb-summary.json b/wandb/run-20241030_112852-mfvd6tgw/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d3aab1510450119f0c2bf41f933600948398d3c4 --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23503}} \ No newline at end of file diff --git a/wandb/run-20241030_112852-mfvd6tgw/logs/debug-internal.log b/wandb/run-20241030_112852-mfvd6tgw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..4e766c877158ad5aa5eb91bab0b069ecffa261f1 --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/logs/debug-internal.log @@ -0,0 +1,108 @@ +{"time":"2024-10-30T11:28:52.871579904-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T11:28:52.871600584-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-mfvd6tgw/logs/debug-core.log"} +{"time":"2024-10-30T11:28:52.978841939-04:00","level":"INFO","msg":"created new stream","id":"mfvd6tgw"} +{"time":"2024-10-30T11:28:52.978902739-04:00","level":"INFO","msg":"stream: started","id":"mfvd6tgw"} +{"time":"2024-10-30T11:28:52.978926819-04:00","level":"INFO","msg":"sender: started","stream_id":"mfvd6tgw"} +{"time":"2024-10-30T11:28:52.97893628-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"mfvd6tgw"}} +{"time":"2024-10-30T11:28:52.978909109-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"mfvd6tgw"}} +{"time":"2024-10-30T11:28:53.155627531-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T14:03:08.474581851-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/mfvd6tgw/file_stream"} +{"time":"2024-10-30T14:03:08.487108589-04:00","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 404 Not Found path=files/yaning1001-dartmouth-college/impossible_llm_reverse/mfvd6tgw/file_stream: {\"error\":\"run impossible_llm_reverse/mfvd6tgw not found while streaming file\"}"} +{"time":"2024-10-30T15:29:42.382518231-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-10-30T18:00:35.96830223-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T18:00:35.984486769-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T18:00:36.012170229-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:36.969090118-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":1.01417858,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:00:38.26382741-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:42.839562556-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:52.373625385-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:08.798689764-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:37.002153025-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":61.047236517,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:01:48.282746193-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:02:37.021865642-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":121.066947764,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:02:48.335092366-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:03:37.04344863-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":181.088531722,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:03:48.393221692-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:04:37.067486978-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":241.11257044,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:04:48.448890729-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:05:37.08852491-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":301.133606772,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:05:48.502838457-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:06:37.117047582-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":361.162133104,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:06:48.554665436-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:07:37.139331089-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":421.184416131,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:07:48.61386156-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:08:37.166847552-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":481.211928004,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:08:48.666190809-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:09:37.194996856-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":541.240081368,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:09:48.721077294-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:10:35.955456557-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000428857,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:35326\")"} +{"time":"2024-10-30T18:10:37.216651416-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":601.261725058,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:10:48.773966229-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:11:37.244172578-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":661.28925569,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:11:48.824895668-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:12:37.298658553-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":721.343740985,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:12:48.922946274-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:13:37.319508977-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":781.364593549,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:13:48.976226497-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:14:37.34947864-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":841.394560982,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:14:49.026538556-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:15:37.371409433-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":901.416490594,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:15:49.077219977-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:37.394261418-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":961.43934326,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:16:49.129011563-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:49.129142475-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:16:49.129310967-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":973.174329898,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:35326\")"} +{"time":"2024-10-30T18:16:49.185435119-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:49.231279214-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:49.231344485-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/mfvd6tgw not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:16:51.551546918-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:56.224687098-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:04.811694165-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:23.782790961-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:37.423013569-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":48.293350338,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:18:01.969943103-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:18:37.444499646-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":108.314835984,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:19:02.031162287-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:19:37.46831367-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":168.338651879,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:20:02.086803086-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:20:37.49484466-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":228.365184589,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:21:02.14808835-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:21:37.518777326-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":288.389114975,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:22:02.208668814-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:22:37.542083025-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":348.412418624,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:23:02.265589282-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:23:37.568249482-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":408.438587131,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:24:02.332185182-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:24:37.59016655-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":468.460504129,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:25:02.383191439-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:25:37.611919348-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":528.482255227,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:26:02.439712218-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:26:37.63751757-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":588.507844589,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:26:49.130687726-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.00065645,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:27:02.49576742-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:27:37.658073199-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":648.528411148,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:28:02.556603286-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:28:37.678804315-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":708.549129014,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:29:02.616108339-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:29:37.705580303-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":768.575917741,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:30:02.669248598-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:30:37.73008217-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":828.600419649,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:31:02.724511612-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:31:37.753990664-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":888.624324932,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:32:02.784891698-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:37.778640619-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":948.648979298,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:33:02.873762515-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:33:02.873850586-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:33:02.874229689-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":973.744352185,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:33:02.972769976-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:33:02.972826436-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/mfvd6tgw not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:33:03.110980965-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:33:03.111192917-04:00","level":"ERROR","msg":"sender: failed to save job artifact: ArtifactSaver.createManifest: returned error 404 Not Found: {\"errors\":[{\"message\":\"failed to find run impossible_llm_reverse/mfvd6tgw\",\"path\":[\"createArtifactManifest\"]}],\"data\":{\"createArtifactManifest\":null}}"} +{"time":"2024-10-30T18:33:03.162250522-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T18:33:03.212467488-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:33:03.212514949-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/mfvd6tgw not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:33:04.172896443-04:00","level":"INFO","msg":"stream: closing","id":"mfvd6tgw"} +{"time":"2024-10-30T18:33:04.172936993-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"mfvd6tgw"}} +{"time":"2024-10-30T18:33:04.172965223-04:00","level":"INFO","msg":"sender: closed","stream_id":"mfvd6tgw"} +{"time":"2024-10-30T18:33:04.172958023-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"mfvd6tgw"}} +{"time":"2024-10-30T18:33:04.173060174-04:00","level":"INFO","msg":"stream: closed","id":"mfvd6tgw"} diff --git a/wandb/run-20241030_112852-mfvd6tgw/logs/debug.log b/wandb/run-20241030_112852-mfvd6tgw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7edf7303ae810ee4e3529d327f14d60cd4554f93 --- /dev/null +++ b/wandb/run-20241030_112852-mfvd6tgw/logs/debug.log @@ -0,0 +1,33 @@ +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Configure stats pid to 367766 +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-mfvd6tgw/logs/debug.log +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-mfvd6tgw/logs/debug-internal.log +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:init():621] calling init triggers +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:init():671] starting backend +2024-10-30 11:28:52,866 INFO MainThread:367766 [wandb_init.py:init():675] sending inform_init request +2024-10-30 11:28:52,867 INFO MainThread:367766 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 11:28:52,867 INFO MainThread:367766 [wandb_init.py:init():688] backend started and connected +2024-10-30 11:28:52,871 INFO MainThread:367766 [wandb_init.py:init():783] updated telemetry +2024-10-30 11:28:52,896 INFO MainThread:367766 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 11:28:53,152 INFO MainThread:367766 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 11:28:53,239 INFO MainThread:367766 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 11:28:53,239 INFO MainThread:367766 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 11:28:53,239 INFO MainThread:367766 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 11:28:53,239 INFO MainThread:367766 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 11:28:53,240 INFO MainThread:367766 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 11:28:53,240 INFO MainThread:367766 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} +2024-10-30 18:00:35,945 INFO MainThread:367766 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/mfvd6tgw +2024-10-30 18:00:35,954 INFO MainThread:367766 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-30 18:00:35,955 INFO MainThread:367766 [wandb_run.py:_restore():2408] restore +2024-10-30 18:00:35,955 INFO MainThread:367766 [wandb_run.py:_restore():2414] restore done +2024-10-30 18:33:04,165 INFO MainThread:367766 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-30 18:33:04,166 INFO MainThread:367766 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-30 18:33:04,172 INFO MainThread:367766 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241030_233740-np98q8en/run-np98q8en.wandb b/wandb/run-20241030_233740-np98q8en/run-np98q8en.wandb new file mode 100644 index 0000000000000000000000000000000000000000..28923260ad7b745641f15e36024cbede94cd7dba --- /dev/null +++ b/wandb/run-20241030_233740-np98q8en/run-np98q8en.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85d468fd545c171fcd433a279c3cfc6210c775149911b0f456f66e43c4b7c64 +size 851968 diff --git a/wandb/run-20241031_000839-acpkxm8c/files/output.log b/wandb/run-20241031_000839-acpkxm8c/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5f71b0a2c102ae4b7a8db5cd425444030c901e71 --- /dev/null +++ b/wandb/run-20241031_000839-acpkxm8c/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.31s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 00:09:00,175] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 00:09:10,031] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.321483373641968 seconds diff --git a/wandb/run-20241031_000839-acpkxm8c/files/requirements.txt b/wandb/run-20241031_000839-acpkxm8c/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_000839-acpkxm8c/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_000839-acpkxm8c/files/wandb-metadata.json b/wandb/run-20241031_000839-acpkxm8c/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7ef921d695f7e806b17bce296a00c2324a084a1c --- /dev/null +++ b/wandb/run-20241031_000839-acpkxm8c/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T04:08:39.164895Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1727270539264" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_000839-acpkxm8c/logs/debug-internal.log b/wandb/run-20241031_000839-acpkxm8c/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..fc2d2172b1817a0b6d651c1dc63f8be796e5d5d7 --- /dev/null +++ b/wandb/run-20241031_000839-acpkxm8c/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-31T00:08:39.166868509-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T00:08:39.166882039-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_000839-acpkxm8c/logs/debug-core.log"} +{"time":"2024-10-31T00:08:39.272997415-04:00","level":"INFO","msg":"created new stream","id":"acpkxm8c"} +{"time":"2024-10-31T00:08:39.273045656-04:00","level":"INFO","msg":"stream: started","id":"acpkxm8c"} +{"time":"2024-10-31T00:08:39.273081016-04:00","level":"INFO","msg":"sender: started","stream_id":"acpkxm8c"} +{"time":"2024-10-31T00:08:39.273075736-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"acpkxm8c"}} +{"time":"2024-10-31T00:08:39.273091156-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"acpkxm8c"}} +{"time":"2024-10-31T00:08:39.475372241-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241031_000839-acpkxm8c/logs/debug.log b/wandb/run-20241031_000839-acpkxm8c/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4d8953de292a793d3816254846598f89b14a6fe2 --- /dev/null +++ b/wandb/run-20241031_000839-acpkxm8c/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-31 00:08:39,162 INFO MainThread:477299 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Configure stats pid to 477299 +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_000839-acpkxm8c/logs/debug.log +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_000839-acpkxm8c/logs/debug-internal.log +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:init():621] calling init triggers +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:init():671] starting backend +2024-10-31 00:08:39,163 INFO MainThread:477299 [wandb_init.py:init():675] sending inform_init request +2024-10-31 00:08:39,164 INFO MainThread:477299 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 00:08:39,164 INFO MainThread:477299 [wandb_init.py:init():688] backend started and connected +2024-10-31 00:08:39,167 INFO MainThread:477299 [wandb_init.py:init():783] updated telemetry +2024-10-31 00:08:39,194 INFO MainThread:477299 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 00:08:39,472 INFO MainThread:477299 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 00:08:39,571 INFO MainThread:477299 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 00:08:39,571 INFO MainThread:477299 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 00:08:39,571 INFO MainThread:477299 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 00:08:39,571 INFO MainThread:477299 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 00:08:39,572 INFO MainThread:477299 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 00:08:39,572 INFO MainThread:477299 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 1e-05} diff --git a/wandb/run-20241031_000839-acpkxm8c/run-acpkxm8c.wandb b/wandb/run-20241031_000839-acpkxm8c/run-acpkxm8c.wandb new file mode 100644 index 0000000000000000000000000000000000000000..151e867b7ea6eaf3f4d87e9ca3bb53b7802871c3 Binary files /dev/null and b/wandb/run-20241031_000839-acpkxm8c/run-acpkxm8c.wandb differ diff --git a/wandb/run-20241101_012733-3tsgnm2p/run-3tsgnm2p.wandb b/wandb/run-20241101_012733-3tsgnm2p/run-3tsgnm2p.wandb new file mode 100644 index 0000000000000000000000000000000000000000..315b629654dfdb6a5e5b96fd393792a415a9f5b4 --- /dev/null +++ b/wandb/run-20241101_012733-3tsgnm2p/run-3tsgnm2p.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157ac6fcdd37aa5de6b664b2b512542501c187eda74f9151bcca4bd636d73df0 +size 1015808 diff --git a/wandb/run-20241101_200502-7hem25r3/files/output.log b/wandb/run-20241101_200502-7hem25r3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..04ca77dae62c42467f2e3713ee4aa0dba0d01be1 --- /dev/null +++ b/wandb/run-20241101_200502-7hem25r3/files/output.log @@ -0,0 +1 @@ +Loading checkpoint shards: 0%| | 0/2 [00:00 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241101_200517-vzv5zg2q/files/requirements.txt b/wandb/run-20241101_200517-vzv5zg2q/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_200517-vzv5zg2q/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_200517-vzv5zg2q/files/wandb-metadata.json b/wandb/run-20241101_200517-vzv5zg2q/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6b61e81c7d42e03465e54e8a9dc00a6fa0695b3e --- /dev/null +++ b/wandb/run-20241101_200517-vzv5zg2q/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-02T00:05:17.600434Z", + "args": [ + "--perturbation", + "shuffle_nondeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754801557504" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_200517-vzv5zg2q/files/wandb-summary.json b/wandb/run-20241101_200517-vzv5zg2q/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c437ff1a48b0e53a8cdd36dcd584a8e6b22b4bc2 --- /dev/null +++ b/wandb/run-20241101_200517-vzv5zg2q/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":7}} \ No newline at end of file diff --git a/wandb/run-20241101_200517-vzv5zg2q/logs/debug-internal.log b/wandb/run-20241101_200517-vzv5zg2q/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3e5be2032d1c522dcda691a442a539e8d4282bac --- /dev/null +++ b/wandb/run-20241101_200517-vzv5zg2q/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-01T20:05:17.602749928-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:05:17.602763728-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-vzv5zg2q/logs/debug-core.log"} +{"time":"2024-11-01T20:05:17.709188149-04:00","level":"INFO","msg":"created new stream","id":"vzv5zg2q"} +{"time":"2024-11-01T20:05:17.70922197-04:00","level":"INFO","msg":"stream: started","id":"vzv5zg2q"} +{"time":"2024-11-01T20:05:17.70926242-04:00","level":"INFO","msg":"sender: started","stream_id":"vzv5zg2q"} +{"time":"2024-11-01T20:05:17.70924561-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"vzv5zg2q"}} +{"time":"2024-11-01T20:05:17.709359941-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"vzv5zg2q"}} +{"time":"2024-11-01T20:05:17.906758123-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T20:05:25.260555442-04:00","level":"INFO","msg":"stream: closing","id":"vzv5zg2q"} +{"time":"2024-11-01T20:05:25.260678473-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T20:05:25.261811351-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241101_200517-vzv5zg2q/logs/debug.log b/wandb/run-20241101_200517-vzv5zg2q/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..16a971703caaa576a1f065ac3efe9a4b18949eb3 --- /dev/null +++ b/wandb/run-20241101_200517-vzv5zg2q/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Configure stats pid to 870384 +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-vzv5zg2q/logs/debug.log +2024-11-01 20:05:17,597 INFO MainThread:870384 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-vzv5zg2q/logs/debug-internal.log +2024-11-01 20:05:17,598 INFO MainThread:870384 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:05:17,598 INFO MainThread:870384 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:05:17,598 INFO MainThread:870384 [wandb_init.py:init():671] starting backend +2024-11-01 20:05:17,598 INFO MainThread:870384 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:05:17,599 INFO MainThread:870384 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:05:17,600 INFO MainThread:870384 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:05:17,603 INFO MainThread:870384 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:05:17,631 INFO MainThread:870384 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:05:17,903 INFO MainThread:870384 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:05:17,993 INFO MainThread:870384 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:05:17,993 INFO MainThread:870384 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:05:17,993 INFO MainThread:870384 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:05:17,993 INFO MainThread:870384 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:05:17,994 INFO MainThread:870384 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:05:17,994 INFO MainThread:870384 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:05:25,260 WARNING MsgRouterThr:870384 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_200517-vzv5zg2q/run-vzv5zg2q.wandb b/wandb/run-20241101_200517-vzv5zg2q/run-vzv5zg2q.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_201630-e5gt2fir/files/config.yaml b/wandb/run-20241101_201630-e5gt2fir/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d25ee0e4092474360045faaf0959cfc1b3e91d --- /dev/null +++ b/wandb/run-20241101_201630-e5gt2fir/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: shuffle_nodeterministic +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241101_201630-e5gt2fir/files/output.log b/wandb/run-20241101_201630-e5gt2fir/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d --- /dev/null +++ b/wandb/run-20241101_201630-e5gt2fir/files/output.log @@ -0,0 +1,12 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder + builder_instance: DatasetBuilder = builder_cls( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__ + self.config, self.config_id = self._create_builder_config( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config + raise ValueError( +ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0'] diff --git a/wandb/run-20241101_201630-e5gt2fir/files/wandb-summary.json b/wandb/run-20241101_201630-e5gt2fir/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241101_201630-e5gt2fir/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241101_201630-e5gt2fir/logs/debug-internal.log b/wandb/run-20241101_201630-e5gt2fir/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..91b64cf92788795201d932a67744745e5e4e4baa --- /dev/null +++ b/wandb/run-20241101_201630-e5gt2fir/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-01T20:16:30.561699495-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T20:16:30.561714855-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201630-e5gt2fir/logs/debug-core.log"} +{"time":"2024-11-01T20:16:30.670004167-04:00","level":"INFO","msg":"created new stream","id":"e5gt2fir"} +{"time":"2024-11-01T20:16:30.670039398-04:00","level":"INFO","msg":"stream: started","id":"e5gt2fir"} +{"time":"2024-11-01T20:16:30.670068758-04:00","level":"INFO","msg":"sender: started","stream_id":"e5gt2fir"} +{"time":"2024-11-01T20:16:30.670067278-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"e5gt2fir"}} +{"time":"2024-11-01T20:16:30.670100128-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"e5gt2fir"}} +{"time":"2024-11-01T20:16:30.881477183-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T20:16:30.974886093-04:00","level":"INFO","msg":"stream: closing","id":"e5gt2fir"} +{"time":"2024-11-01T20:16:30.974923473-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T20:16:30.982511386-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-01T20:16:31.563553389-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-01T20:16:31.686184401-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"e5gt2fir"}} +{"time":"2024-11-01T20:16:31.686245181-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"e5gt2fir"}} +{"time":"2024-11-01T20:16:31.686264612-04:00","level":"INFO","msg":"sender: closed","stream_id":"e5gt2fir"} +{"time":"2024-11-01T20:16:31.686314312-04:00","level":"INFO","msg":"stream: closed","id":"e5gt2fir"} diff --git a/wandb/run-20241101_201630-e5gt2fir/logs/debug.log b/wandb/run-20241101_201630-e5gt2fir/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..411e21968809c9233cb9d6a90abc0c7ccf696e7c --- /dev/null +++ b/wandb/run-20241101_201630-e5gt2fir/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:16:30,556 INFO MainThread:874718 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Configure stats pid to 874718 +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201630-e5gt2fir/logs/debug.log +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201630-e5gt2fir/logs/debug-internal.log +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:init():671] starting backend +2024-11-01 20:16:30,557 INFO MainThread:874718 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:16:30,559 INFO MainThread:874718 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:16:30,559 INFO MainThread:874718 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:16:30,563 INFO MainThread:874718 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:16:30,592 INFO MainThread:874718 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:16:30,878 INFO MainThread:874718 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:16:30,965 INFO MainThread:874718 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:16:30,965 INFO MainThread:874718 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:16:30,965 INFO MainThread:874718 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:16:30,965 INFO MainThread:874718 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:16:30,967 INFO MainThread:874718 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:16:30,967 INFO MainThread:874718 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:16:30,974 WARNING MsgRouterThr:874718 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_202058-hjyig8so/run-hjyig8so.wandb b/wandb/run-20241101_202058-hjyig8so/run-hjyig8so.wandb new file mode 100644 index 0000000000000000000000000000000000000000..8d0b95462c333ae550e6ae3577ba293e6c8a259f --- /dev/null +++ b/wandb/run-20241101_202058-hjyig8so/run-hjyig8so.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb18c91a178f700bc9b2e5d6d0e4bea78a095f878d6c225bc65c1b29e8d0dd1 +size 13297708 diff --git a/wandb/run-20241105_160652-v9udw9ab/files/config.yaml b/wandb/run-20241105_160652-v9udw9ab/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba0e0eb5aa81d7186d91b3c8f342ad5574a4c100 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic21 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_160652-v9udw9ab/files/output.log b/wandb/run-20241105_160652-v9udw9ab/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5cf4e355b562d16f06ef7e980576314b71cea8f8 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/files/output.log @@ -0,0 +1,8 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 165, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2096, in load_dataset + builder_instance.download_and_prepare( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 875, in download_and_prepare + raise OSError( +OSError: Not enough disk space. Needed: Unknown size (download: Unknown size, generated: Unknown size, post-processed: Unknown size) diff --git a/wandb/run-20241105_160652-v9udw9ab/files/requirements.txt b/wandb/run-20241105_160652-v9udw9ab/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_160652-v9udw9ab/files/wandb-metadata.json b/wandb/run-20241105_160652-v9udw9ab/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f66e15d93b2849f853a3735a7e6e85d075e42795 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:06:52.157764Z", + "args": [ + "--perturbation", + "shuffle_deterministic21", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1792542826496" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_160652-v9udw9ab/files/wandb-summary.json b/wandb/run-20241105_160652-v9udw9ab/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a59211b910c7b68e6827eb6c887d30d98244727c --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":5}} \ No newline at end of file diff --git a/wandb/run-20241105_160652-v9udw9ab/logs/debug-internal.log b/wandb/run-20241105_160652-v9udw9ab/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..53ac88017c0ff6e3abe4fd7159b552bf31b900b4 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2024-11-05T16:06:52.159922257-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:06:52.159942657-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_160652-v9udw9ab/logs/debug-core.log"} +{"time":"2024-11-05T16:06:52.268171392-05:00","level":"INFO","msg":"created new stream","id":"v9udw9ab"} +{"time":"2024-11-05T16:06:52.268215922-05:00","level":"INFO","msg":"stream: started","id":"v9udw9ab"} +{"time":"2024-11-05T16:06:52.268269323-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"v9udw9ab"}} +{"time":"2024-11-05T16:06:52.268270183-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"v9udw9ab"}} +{"time":"2024-11-05T16:06:52.268491884-05:00","level":"INFO","msg":"sender: started","stream_id":"v9udw9ab"} +{"time":"2024-11-05T16:06:52.481746757-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T16:06:57.712126548-05:00","level":"INFO","msg":"stream: closing","id":"v9udw9ab"} +{"time":"2024-11-05T16:06:57.712223249-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T16:06:57.713709807-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-05T16:06:57.799517789-05:00","level":"ERROR","msg":"sender: sendDefer: failed to build job artifact","error":"failed to write data to file: write /tmp/tmpfile-3332236271: no space left on device"} +{"time":"2024-11-05T16:06:58.057468605-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-05T16:06:58.178861806-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"v9udw9ab"}} +{"time":"2024-11-05T16:06:58.178915807-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"v9udw9ab"}} +{"time":"2024-11-05T16:06:58.178935557-05:00","level":"INFO","msg":"sender: closed","stream_id":"v9udw9ab"} +{"time":"2024-11-05T16:06:58.178979897-05:00","level":"INFO","msg":"stream: closed","id":"v9udw9ab"} diff --git a/wandb/run-20241105_160652-v9udw9ab/logs/debug.log b/wandb/run-20241105_160652-v9udw9ab/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..cb1a24589240129f95d0758aea01bdc1909d9ef2 --- /dev/null +++ b/wandb/run-20241105_160652-v9udw9ab/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Configure stats pid to 1771273 +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:06:52,154 INFO MainThread:1771273 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_160652-v9udw9ab/logs/debug.log +2024-11-05 16:06:52,155 INFO MainThread:1771273 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_160652-v9udw9ab/logs/debug-internal.log +2024-11-05 16:06:52,155 INFO MainThread:1771273 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:06:52,155 INFO MainThread:1771273 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:06:52,155 INFO MainThread:1771273 [wandb_init.py:init():671] starting backend +2024-11-05 16:06:52,155 INFO MainThread:1771273 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:06:52,156 INFO MainThread:1771273 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:06:52,157 INFO MainThread:1771273 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:06:52,160 INFO MainThread:1771273 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:06:52,189 INFO MainThread:1771273 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:06:52,478 INFO MainThread:1771273 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:06:52,566 INFO MainThread:1771273 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:06:52,566 INFO MainThread:1771273 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:06:52,566 INFO MainThread:1771273 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:06:52,566 INFO MainThread:1771273 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:06:52,568 INFO MainThread:1771273 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:06:52,568 INFO MainThread:1771273 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:06:57,712 WARNING MsgRouterThr:1771273 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_160652-v9udw9ab/run-v9udw9ab.wandb b/wandb/run-20241105_160652-v9udw9ab/run-v9udw9ab.wandb new file mode 100644 index 0000000000000000000000000000000000000000..394f4b128920f33bc870443cebffb02328cd785c Binary files /dev/null and b/wandb/run-20241105_160652-v9udw9ab/run-v9udw9ab.wandb differ diff --git a/wandb/run-20241105_162858-hqnfirxi/files/config.yaml b/wandb/run-20241105_162858-hqnfirxi/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3771760e607bdb2dcb978de96a047ebe2475f806 --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic57 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_162858-hqnfirxi/files/output.log b/wandb/run-20241105_162858-hqnfirxi/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2a036e8d9dccd35813670795de7f557a820d1af5 --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/files/output.log @@ -0,0 +1,76 @@ +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:04<00:00, 265238.70it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:00<00:00, 3575903.74it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16426/16426 [00:00<00:00, 35435.28it/s] +Generating train split: 16425 examples [00:07, 2176.71 examples/s]█████████████████████████████████████████████████████████ | 14158/16426 [00:00<00:00, 38078.31it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:04<00:00, 252339.30it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:00<00:00, 3499163.97it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17014/17014 [00:00<00:00, 36530.93it/s] +Generating validation split: 17013 examples [00:07, 2154.84 examples/s]█████████▏ | 9267/17014 [00:00<00:00, 31305.81it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:03<00:00, 289970.89it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:00<00:00, 3543097.40it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15439/15439 [00:00<00:00, 33391.70it/s] +Generating test split: 15438 examples [00:06, 2262.34 examples/s]██████████████████████████████████████████████████████████████████████████▎ | 15138/15439 [00:00<00:00, 38178.32it/s] +config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 460kB/s] +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 18.7MB/s] +Downloading shards: 0%| | 0/2 [00:00 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1381, in _hf_hub_download_to_cache_dir + _download_to_tmp_and_move( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1915, in _download_to_tmp_and_move + http_get( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 558, in http_get + return http_get( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1140, in __exit__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close + self.display(pos=0) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display + self.sp(self.__str__() if msg is None else msg) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1151, in __str__ + return self.format_meter(**self.format_dict) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 593, in format_meter + format_dict = { +KeyboardInterrupt diff --git a/wandb/run-20241105_162858-hqnfirxi/files/requirements.txt b/wandb/run-20241105_162858-hqnfirxi/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_162858-hqnfirxi/files/wandb-metadata.json b/wandb/run-20241105_162858-hqnfirxi/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ca1f6c95c2035c994a9690a6bd68e1bf9e007933 --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:28:58.714715Z", + "args": [ + "--perturbation", + "shuffle_deterministic57", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785811775488" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_162858-hqnfirxi/files/wandb-summary.json b/wandb/run-20241105_162858-hqnfirxi/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..15f6b8e9049a55292dab131278b3f2fc1f52e50d --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23}} \ No newline at end of file diff --git a/wandb/run-20241105_162858-hqnfirxi/logs/debug-internal.log b/wandb/run-20241105_162858-hqnfirxi/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..166d5cb013a6a7013d451e3d5e191ac1c0edf6ab --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-05T16:28:58.716956394-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:28:58.716971335-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-hqnfirxi/logs/debug-core.log"} +{"time":"2024-11-05T16:28:58.822710212-05:00","level":"INFO","msg":"created new stream","id":"hqnfirxi"} +{"time":"2024-11-05T16:28:58.822746472-05:00","level":"INFO","msg":"stream: started","id":"hqnfirxi"} +{"time":"2024-11-05T16:28:58.822821592-05:00","level":"INFO","msg":"sender: started","stream_id":"hqnfirxi"} +{"time":"2024-11-05T16:28:58.822773912-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"hqnfirxi"}} +{"time":"2024-11-05T16:28:58.822800452-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"hqnfirxi"}} +{"time":"2024-11-05T16:28:59.013459403-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T16:29:22.459796488-05:00","level":"INFO","msg":"stream: closing","id":"hqnfirxi"} +{"time":"2024-11-05T16:29:22.459827448-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T16:29:22.46034017-05:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241105_162858-hqnfirxi/logs/debug.log b/wandb/run-20241105_162858-hqnfirxi/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..39a93f425a206df193d57b901e4327034f63cfc6 --- /dev/null +++ b/wandb/run-20241105_162858-hqnfirxi/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:28:58,712 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:28:58,712 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Configure stats pid to 1778373 +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-hqnfirxi/logs/debug.log +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162858-hqnfirxi/logs/debug-internal.log +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:init():671] starting backend +2024-11-05 16:28:58,713 INFO MainThread:1778373 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:28:58,714 INFO MainThread:1778373 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:28:58,714 INFO MainThread:1778373 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:28:58,717 INFO MainThread:1778373 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:28:58,759 INFO MainThread:1778373 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:28:59,010 INFO MainThread:1778373 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:28:59,097 INFO MainThread:1778373 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:28:59,098 INFO MainThread:1778373 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:28:59,098 INFO MainThread:1778373 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:28:59,098 INFO MainThread:1778373 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:28:59,099 INFO MainThread:1778373 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:28:59,099 INFO MainThread:1778373 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic57', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:29:22,459 WARNING MsgRouterThr:1778373 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_162858-hqnfirxi/run-hqnfirxi.wandb b/wandb/run-20241105_162858-hqnfirxi/run-hqnfirxi.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f177ddbf4b94f428cadb8c6ee3b65d83ec711384 Binary files /dev/null and b/wandb/run-20241105_162858-hqnfirxi/run-hqnfirxi.wandb differ diff --git a/wandb/run-20241105_163038-cniejveq/files/config.yaml b/wandb/run-20241105_163038-cniejveq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba0e0eb5aa81d7186d91b3c8f342ad5574a4c100 --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic21 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_163038-cniejveq/files/output.log b/wandb/run-20241105_163038-cniejveq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..750d5c04482a1f74a884f87171f9d7d9aaf9d034 --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/files/output.log @@ -0,0 +1,52 @@ +config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 242kB/s] +Downloading shards: 0%| | 0/2 [01:19 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1303, in close + fp_write('\n') + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1287, in fp_write + self.fp.write(str(s)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner + return func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write + cb(data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in + lambda data: self._console_raw_callback("stderr", data), + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 403, in wrapper_fn + return func(self, *args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1547, in _console_raw_callback + self._backend.interface.publish_output_raw(name, data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 721, in publish_output_raw + self._publish_output_raw(o) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 79, in _publish_output_raw + self._publish(rec) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 50, in _publish + self._assign(record) +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241105_163038-cniejveq/files/requirements.txt b/wandb/run-20241105_163038-cniejveq/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_163038-cniejveq/files/wandb-metadata.json b/wandb/run-20241105_163038-cniejveq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..419a7aa116e6eff4db14f8a6e617bc3d42a18200 --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:30:38.921383Z", + "args": [ + "--perturbation", + "shuffle_deterministic21", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785984999424" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_163038-cniejveq/files/wandb-summary.json b/wandb/run-20241105_163038-cniejveq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..51cc9288ee1f562b22e175ccb65f3cee4534742c --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":81}} \ No newline at end of file diff --git a/wandb/run-20241105_163038-cniejveq/logs/debug-internal.log b/wandb/run-20241105_163038-cniejveq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d8eac28619ac03580ab203d44a8b29339e280aec --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-05T16:30:38.924193033-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:30:38.924212963-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163038-cniejveq/logs/debug-core.log"} +{"time":"2024-11-05T16:30:39.031147386-05:00","level":"INFO","msg":"created new stream","id":"cniejveq"} +{"time":"2024-11-05T16:30:39.031179416-05:00","level":"INFO","msg":"stream: started","id":"cniejveq"} +{"time":"2024-11-05T16:30:39.031221646-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"cniejveq"}} +{"time":"2024-11-05T16:30:39.031303087-05:00","level":"INFO","msg":"sender: started","stream_id":"cniejveq"} +{"time":"2024-11-05T16:30:39.031277206-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"cniejveq"}} +{"time":"2024-11-05T16:30:40.34975767-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T16:32:00.308948142-05:00","level":"INFO","msg":"stream: closing","id":"cniejveq"} +{"time":"2024-11-05T16:32:00.308992683-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T16:32:00.309786176-05:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241105_163038-cniejveq/logs/debug.log b/wandb/run-20241105_163038-cniejveq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..29942bf80ed797d47d0d166d9b1ba62fa3a626ee --- /dev/null +++ b/wandb/run-20241105_163038-cniejveq/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Configure stats pid to 1780129 +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163038-cniejveq/logs/debug.log +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163038-cniejveq/logs/debug-internal.log +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:init():671] starting backend +2024-11-05 16:30:38,918 INFO MainThread:1780129 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:30:38,920 INFO MainThread:1780129 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:30:38,921 INFO MainThread:1780129 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:30:38,925 INFO MainThread:1780129 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:30:38,947 INFO MainThread:1780129 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:30:40,345 INFO MainThread:1780129 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:30:40,459 INFO MainThread:1780129 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:30:40,459 INFO MainThread:1780129 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:30:40,459 INFO MainThread:1780129 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:30:40,459 INFO MainThread:1780129 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:30:40,461 INFO MainThread:1780129 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:30:40,461 INFO MainThread:1780129 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:32:00,309 WARNING MsgRouterThr:1780129 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_163038-cniejveq/run-cniejveq.wandb b/wandb/run-20241105_163038-cniejveq/run-cniejveq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a7dccf9c71e1f8621ba81a341c8d56203a74f66c Binary files /dev/null and b/wandb/run-20241105_163038-cniejveq/run-cniejveq.wandb differ diff --git a/wandb/run-20241105_163039-q4e8d8hm/files/output.log b/wandb/run-20241105_163039-q4e8d8hm/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2b5fe625440481023ff1642584fca6a9524bbb5e --- /dev/null +++ b/wandb/run-20241105_163039-q4e8d8hm/files/output.log @@ -0,0 +1,53 @@ +Downloading shards: 0%| | 0/2 [00:00 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close + self.display(pos=0) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display + self.sp(self.__str__() if msg is None else msg) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 459, in print_status + fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0))) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 452, in fp_write + fp.write(str(s)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner + return func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write + cb(data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in + lambda data: self._console_raw_callback("stderr", data), + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 403, in wrapper_fn + return func(self, *args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1547, in _console_raw_callback + self._backend.interface.publish_output_raw(name, data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 715, in publish_output_raw + otype = pb.OutputRawRecord.OutputType.STDERR + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/google/protobuf/internal/enum_type_wrapper.py", line 94, in __getattr__ + try: +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241105_163039-q4e8d8hm/files/requirements.txt b/wandb/run-20241105_163039-q4e8d8hm/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_163039-q4e8d8hm/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_163039-q4e8d8hm/files/wandb-summary.json b/wandb/run-20241105_163039-q4e8d8hm/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..51cc9288ee1f562b22e175ccb65f3cee4534742c --- /dev/null +++ b/wandb/run-20241105_163039-q4e8d8hm/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":81}} \ No newline at end of file diff --git a/wandb/run-20241105_163039-q4e8d8hm/logs/debug-internal.log b/wandb/run-20241105_163039-q4e8d8hm/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..011cda856af29c3b3a41afa1bef33f8920602096 --- /dev/null +++ b/wandb/run-20241105_163039-q4e8d8hm/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-11-05T16:30:39.180032822-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:30:39.180048642-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163039-q4e8d8hm/logs/debug-core.log"} +{"time":"2024-11-05T16:30:39.28803681-05:00","level":"INFO","msg":"created new stream","id":"q4e8d8hm"} +{"time":"2024-11-05T16:30:39.28809489-05:00","level":"INFO","msg":"stream: started","id":"q4e8d8hm"} +{"time":"2024-11-05T16:30:39.288185601-05:00","level":"INFO","msg":"sender: started","stream_id":"q4e8d8hm"} +{"time":"2024-11-05T16:30:39.288135791-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q4e8d8hm"}} +{"time":"2024-11-05T16:30:39.288221261-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q4e8d8hm"}} +{"time":"2024-11-05T16:30:40.357515562-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T16:32:00.266339259-05:00","level":"INFO","msg":"stream: closing","id":"q4e8d8hm"} +{"time":"2024-11-05T16:32:00.2663737-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T16:32:00.266991342-05:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241105_163244-o1vw2gev/files/output.log b/wandb/run-20241105_163244-o1vw2gev/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b4d602041f8130483127171a2232717ce05a60a6 --- /dev/null +++ b/wandb/run-20241105_163244-o1vw2gev/files/output.log @@ -0,0 +1,16 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:59<00:00, 29.54s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.22s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:46<00:00, 351.83 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:46<00:00, 362.88 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-05 16:35:24,573] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-05 16:35:34,845] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.6926655769348145 seconds diff --git a/wandb/run-20241105_163244-o1vw2gev/files/requirements.txt b/wandb/run-20241105_163244-o1vw2gev/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_163244-o1vw2gev/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_163244-o1vw2gev/files/wandb-metadata.json b/wandb/run-20241105_163244-o1vw2gev/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6885d8ab756dd67ea4101b55f9c44e37e5d1e7a3 --- /dev/null +++ b/wandb/run-20241105_163244-o1vw2gev/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:32:44.913109Z", + "args": [ + "--perturbation", + "shuffle_deterministic57", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785985114112" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_163244-o1vw2gev/logs/debug.log b/wandb/run-20241105_163244-o1vw2gev/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ec759edd08650125489cb9db650e769d4aee7571 --- /dev/null +++ b/wandb/run-20241105_163244-o1vw2gev/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Configure stats pid to 1780850 +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:32:44,910 INFO MainThread:1780850 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163244-o1vw2gev/logs/debug.log +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163244-o1vw2gev/logs/debug-internal.log +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:init():671] starting backend +2024-11-05 16:32:44,911 INFO MainThread:1780850 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:32:44,912 INFO MainThread:1780850 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:32:44,912 INFO MainThread:1780850 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:32:44,916 INFO MainThread:1780850 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:32:44,953 INFO MainThread:1780850 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:32:45,189 INFO MainThread:1780850 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:32:45,280 INFO MainThread:1780850 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:32:45,280 INFO MainThread:1780850 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:32:45,280 INFO MainThread:1780850 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:32:45,280 INFO MainThread:1780850 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:32:45,281 INFO MainThread:1780850 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:32:45,282 INFO MainThread:1780850 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic57', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241106_224236-5aea7d25/files/config.yaml b/wandb/run-20241106_224236-5aea7d25/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..507bc548a43d1c7afb495d052831f04f348b9c17 --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic84 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_224236-5aea7d25/files/output.log b/wandb/run-20241106_224236-5aea7d25/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7cfe7ddbf8b31647cb490326c6419d6cefddc948 --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c372c-255ae2fc1630578625cc1a99;81714f15-e160-4e10-9266-d353d024e2ee) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c372c-255ae2fc1630578625cc1a99;81714f15-e160-4e10-9266-d353d024e2ee) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_224236-5aea7d25/files/wandb-metadata.json b/wandb/run-20241106_224236-5aea7d25/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d9a1c571ae442499afd1286c19168ef544e1abd3 --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T03:42:36.042058Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1774852591616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_224236-5aea7d25/files/wandb-summary.json b/wandb/run-20241106_224236-5aea7d25/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241106_224236-5aea7d25/logs/debug-internal.log b/wandb/run-20241106_224236-5aea7d25/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2d3a47609f533820a87e235b41404ca00d02eed6 --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T22:42:36.043933842-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T22:42:36.043945402-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-5aea7d25/logs/debug-core.log"} +{"time":"2024-11-06T22:42:36.149854419-05:00","level":"INFO","msg":"created new stream","id":"5aea7d25"} +{"time":"2024-11-06T22:42:36.149884609-05:00","level":"INFO","msg":"stream: started","id":"5aea7d25"} +{"time":"2024-11-06T22:42:36.149918709-05:00","level":"INFO","msg":"sender: started","stream_id":"5aea7d25"} +{"time":"2024-11-06T22:42:36.149899719-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"5aea7d25"}} +{"time":"2024-11-06T22:42:36.149949259-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"5aea7d25"}} +{"time":"2024-11-06T22:42:36.320318646-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T22:42:36.542113759-05:00","level":"INFO","msg":"stream: closing","id":"5aea7d25"} +{"time":"2024-11-06T22:42:36.542157509-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T22:42:36.542597962-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T22:42:36.933080969-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T22:42:37.056245272-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"5aea7d25"}} +{"time":"2024-11-06T22:42:37.056305772-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"5aea7d25"}} +{"time":"2024-11-06T22:42:37.056394493-05:00","level":"INFO","msg":"sender: closed","stream_id":"5aea7d25"} +{"time":"2024-11-06T22:42:37.056435963-05:00","level":"INFO","msg":"stream: closed","id":"5aea7d25"} diff --git a/wandb/run-20241106_224236-5aea7d25/logs/debug.log b/wandb/run-20241106_224236-5aea7d25/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..a84b9665b0bc2181ab06e069b9226d26928fee11 --- /dev/null +++ b/wandb/run-20241106_224236-5aea7d25/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Configure stats pid to 1982051 +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-5aea7d25/logs/debug.log +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-5aea7d25/logs/debug-internal.log +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:init():621] calling init triggers +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:init():671] starting backend +2024-11-06 22:42:36,040 INFO MainThread:1982051 [wandb_init.py:init():675] sending inform_init request +2024-11-06 22:42:36,041 INFO MainThread:1982051 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 22:42:36,041 INFO MainThread:1982051 [wandb_init.py:init():688] backend started and connected +2024-11-06 22:42:36,045 INFO MainThread:1982051 [wandb_init.py:init():783] updated telemetry +2024-11-06 22:42:36,086 INFO MainThread:1982051 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 22:42:36,316 INFO MainThread:1982051 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 22:42:36,409 INFO MainThread:1982051 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 22:42:36,409 INFO MainThread:1982051 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 22:42:36,409 INFO MainThread:1982051 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 22:42:36,409 INFO MainThread:1982051 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 22:42:36,410 INFO MainThread:1982051 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 22:42:36,410 INFO MainThread:1982051 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 22:42:36,542 WARNING MsgRouterThr:1982051 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_224236-5aea7d25/run-5aea7d25.wandb b/wandb/run-20241106_224236-5aea7d25/run-5aea7d25.wandb new file mode 100644 index 0000000000000000000000000000000000000000..ea49d5732c15e2a743eaea3be68db040670bc71e Binary files /dev/null and b/wandb/run-20241106_224236-5aea7d25/run-5aea7d25.wandb differ diff --git a/wandb/run-20241115_125218-rrve0rbk/run-rrve0rbk.wandb b/wandb/run-20241115_125218-rrve0rbk/run-rrve0rbk.wandb new file mode 100644 index 0000000000000000000000000000000000000000..8aa1a58f07d05ff3631ed05733aa6bd1d8324fd5 --- /dev/null +++ b/wandb/run-20241115_125218-rrve0rbk/run-rrve0rbk.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9733c49ae5bd5775213807f42d94c8555c8393451dcdd9587837930789ea85e +size 14024704