diff --git a/wandb/run-20241030_012617-0h15y3p4/files/config.yaml b/wandb/run-20241030_012617-0h15y3p4/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_012617-0h15y3p4/files/output.log b/wandb/run-20241030_012617-0h15y3p4/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c78613b68fdc40d0d678a230555c23d17949c02e --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/files/output.log @@ -0,0 +1,24 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00, 1.96s/it] +Map: 11%|██████████████ | 2000/18140 [00:09<01:18, 206.29 examples/s] +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 181, in + tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"]) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 560, in wrapper + out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3035, in map + for rank, done, content in Dataset._map_single(**dataset_kwargs): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3438, in _map_single + batch = apply_function_on_filtered_inputs( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3300, in apply_function_on_filtered_inputs + processed_inputs = function(*fn_args, *additional_args, **fn_kwargs) + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 179, in tokenize_function + return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3024, in __call__ + encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3112, in _call_one + return self.batch_encode_plus( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3314, in batch_encode_plus + return self._batch_encode_plus( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py", line 529, in _batch_encode_plus + encodings = self._tokenizer.encode_batch( +KeyboardInterrupt diff --git a/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt b/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json b/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..de14e93a5b60db7fa9a88b096e6ab0f55d17cbff --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:26:17.391721Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1709772775424" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json b/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1476d70fd36aa7b2a81bf4776ad653af3dd34436 --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":14}} \ No newline at end of file diff --git a/wandb/run-20241030_012617-0h15y3p4/logs/debug.log b/wandb/run-20241030_012617-0h15y3p4/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b771b9b8af92083fe681715e0944a8477a493ec7 --- /dev/null +++ b/wandb/run-20241030_012617-0h15y3p4/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:26:17,385 INFO MainThread:332626 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:26:17,385 INFO MainThread:332626 [wandb_setup.py:_flush():79] Configure stats pid to 332626 +2024-10-30 01:26:17,385 INFO MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-0h15y3p4/logs/debug.log +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-0h15y3p4/logs/debug-internal.log +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:init():671] starting backend +2024-10-30 01:26:17,386 INFO MainThread:332626 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:26:17,391 INFO MainThread:332626 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:26:17,391 INFO MainThread:332626 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:26:17,395 INFO MainThread:332626 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:26:17,456 INFO MainThread:332626 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:26:17,693 INFO MainThread:332626 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:26:17,829 INFO MainThread:332626 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:26:17,829 INFO MainThread:332626 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:26:17,829 INFO MainThread:332626 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:26:17,829 INFO MainThread:332626 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:26:17,831 INFO MainThread:332626 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:26:17,831 INFO MainThread:332626 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:26:32,054 WARNING MsgRouterThr:332626 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_112700-j5l8vh9z/files/output.log b/wandb/run-20241030_112700-j5l8vh9z/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b2f6de40700ad07202d2c98a78053598b93b2a49 --- /dev/null +++ b/wandb/run-20241030_112700-j5l8vh9z/files/output.log @@ -0,0 +1,43 @@ +Downloading shards: 0%| | 0/2 [01:32 +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__ + self.close() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1303, in close + fp_write('\n') + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1287, in fp_write + self.fp.write(str(s)) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner + return func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write + cb(data) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in + lambda data: self._console_raw_callback("stderr", data), + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 400, in wrapper_fn + @functools.wraps(func) +KeyboardInterrupt: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 172, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json b/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2368b469a7622ce8a9fd6763085f395c896aa6a4 --- /dev/null +++ b/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:27:00.728207Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831083520" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log b/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..61d86e3ac061b1c203b67badd2555b9c9aee5e58 --- /dev/null +++ b/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-10-30T11:27:00.732261133-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T11:27:00.732288613-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug-core.log"} +{"time":"2024-10-30T11:27:00.843951357-04:00","level":"INFO","msg":"created new stream","id":"j5l8vh9z"} +{"time":"2024-10-30T11:27:00.844003227-04:00","level":"INFO","msg":"stream: started","id":"j5l8vh9z"} +{"time":"2024-10-30T11:27:00.844048658-04:00","level":"INFO","msg":"sender: started","stream_id":"j5l8vh9z"} +{"time":"2024-10-30T11:27:00.844032848-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"j5l8vh9z"}} +{"time":"2024-10-30T11:27:00.844096438-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"j5l8vh9z"}} +{"time":"2024-10-30T11:27:01.06475121-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T11:28:34.163673047-04:00","level":"INFO","msg":"stream: closing","id":"j5l8vh9z"} +{"time":"2024-10-30T11:28:34.163727308-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T11:28:34.164260991-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log b/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..70fa6af18e3ddda9ce755d4c5568b446e21235b4 --- /dev/null +++ b/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Configure stats pid to 366801 +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 11:27:00,724 INFO MainThread:366801 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log +2024-10-30 11:27:00,725 INFO MainThread:366801 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log +2024-10-30 11:27:00,725 INFO MainThread:366801 [wandb_init.py:init():621] calling init triggers +2024-10-30 11:27:00,725 INFO MainThread:366801 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 11:27:00,725 INFO MainThread:366801 [wandb_init.py:init():671] starting backend +2024-10-30 11:27:00,725 INFO MainThread:366801 [wandb_init.py:init():675] sending inform_init request +2024-10-30 11:27:00,727 INFO MainThread:366801 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 11:27:00,727 INFO MainThread:366801 [wandb_init.py:init():688] backend started and connected +2024-10-30 11:27:00,734 INFO MainThread:366801 [wandb_init.py:init():783] updated telemetry +2024-10-30 11:27:00,767 INFO MainThread:366801 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 11:27:01,061 INFO MainThread:366801 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 11:27:01,152 INFO MainThread:366801 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 11:27:01,152 INFO MainThread:366801 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 11:27:01,152 INFO MainThread:366801 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 11:27:01,152 INFO MainThread:366801 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 11:27:01,153 INFO MainThread:366801 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 11:27:01,153 INFO MainThread:366801 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} +2024-10-30 11:28:34,163 WARNING MsgRouterThr:366801 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb b/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9272ba814e28998a29a5ccaa03e341c983bd7cdc Binary files /dev/null and b/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb differ diff --git a/wandb/run-20241030_225833-frh96rd1/files/output.log b/wandb/run-20241030_225833-frh96rd1/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..32fc2696987cc845b2d6bb810e450ed1bebb1433 --- /dev/null +++ b/wandb/run-20241030_225833-frh96rd1/files/output.log @@ -0,0 +1,47 @@ +model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 7.08MB/s] +model-00001-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.1MB/s] +model-00002-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.29s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.44s/it] +generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 112kB/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:57<00:00, 317.00 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 23:02:10,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 23:02:19,960] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.80836296081543 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 195, in backward + self.engine.step() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2217, in step + self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/timer.py", line 256, in stop + get_accelerator().synchronize() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 79, in synchronize + return torch.cuda.synchronize(device_index) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/cuda/__init__.py", line 688, in synchronize + return torch._C._cuda_synchronize() +KeyboardInterrupt +Error in atexit._run_exitfuncs: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 27, in is_nfs_path diff --git a/wandb/run-20241030_225833-giupspdj/files/output.log b/wandb/run-20241030_225833-giupspdj/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fc8996c4f6d6f3b055b93f5a6c6c673f8962b7f2 --- /dev/null +++ b/wandb/run-20241030_225833-giupspdj/files/output.log @@ -0,0 +1,37 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.29s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.56s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:55<00:00, 325.18 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 23:02:09,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 23:02:20,085] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.662875652313232 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 195, in backward + self.engine.step() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2217, in step + self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/timer.py", line 256, in stop + get_accelerator().synchronize() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 79, in synchronize + return torch.cuda.synchronize(device_index) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/cuda/__init__.py", line 688, in synchronize + return torch._C._cuda_synchronize() +KeyboardInterrupt diff --git a/wandb/run-20241030_225833-giupspdj/files/requirements.txt b/wandb/run-20241030_225833-giupspdj/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_225833-giupspdj/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json b/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..737cd9119ccc1dbf8e6c47dd60dc03514c886d6f --- /dev/null +++ b/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T02:58:33.522570Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710970511360" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_225833-giupspdj/logs/debug.log b/wandb/run-20241030_225833-giupspdj/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..774a295762a125dc603fcaf2ccd0d7ab13b7e0a1 --- /dev/null +++ b/wandb/run-20241030_225833-giupspdj/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 22:58:33,520 INFO MainThread:451914 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 22:58:33,520 INFO MainThread:451914 [wandb_setup.py:_flush():79] Configure stats pid to 451914 +2024-10-30 22:58:33,520 INFO MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 22:58:33,520 INFO MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 22:58:33,520 INFO MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug.log +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:init():621] calling init triggers +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:init():671] starting backend +2024-10-30 22:58:33,521 INFO MainThread:451914 [wandb_init.py:init():675] sending inform_init request +2024-10-30 22:58:33,522 INFO MainThread:451914 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 22:58:33,522 INFO MainThread:451914 [wandb_init.py:init():688] backend started and connected +2024-10-30 22:58:33,525 INFO MainThread:451914 [wandb_init.py:init():783] updated telemetry +2024-10-30 22:58:33,563 INFO MainThread:451914 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 22:58:33,828 INFO MainThread:451914 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 22:58:33,940 INFO MainThread:451914 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 22:58:33,940 INFO MainThread:451914 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 22:58:33,940 INFO MainThread:451914 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 22:58:33,940 INFO MainThread:451914 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 22:58:33,942 INFO MainThread:451914 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 22:58:33,942 INFO MainThread:451914 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} diff --git a/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log b/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..02fd60fc3b50a5bbd53fd5b95611efab4510664e --- /dev/null +++ b/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2024-10-31T00:20:20.452039426-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T00:20:20.452050226-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-q6ot1vz6/logs/debug-core.log"} +{"time":"2024-10-31T00:20:20.559317297-04:00","level":"INFO","msg":"created new stream","id":"q6ot1vz6"} +{"time":"2024-10-31T00:20:20.559364417-04:00","level":"INFO","msg":"stream: started","id":"q6ot1vz6"} +{"time":"2024-10-31T00:20:20.559412138-04:00","level":"INFO","msg":"sender: started","stream_id":"q6ot1vz6"} +{"time":"2024-10-31T00:20:20.559390118-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q6ot1vz6"}} +{"time":"2024-10-31T00:20:20.559429398-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q6ot1vz6"}} +{"time":"2024-10-31T00:20:21.383758901-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T09:17:19.826891967-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T09:17:19.904241229-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-31T09:17:20.764796755-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.618214398,"subtasks":[{"desc":"wandb-job.json","runtime_seconds":0.005705472,"progress":"563B/563B"}]}],"total_operations":1}} +{"time":"2024-10-31T09:17:22.016866722-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-31T09:17:23.132121347-04:00","level":"INFO","msg":"stream: closing","id":"q6ot1vz6"} +{"time":"2024-10-31T09:17:23.132161377-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"q6ot1vz6"}} +{"time":"2024-10-31T09:17:23.132197758-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"q6ot1vz6"}} +{"time":"2024-10-31T09:17:23.132240638-04:00","level":"INFO","msg":"sender: closed","stream_id":"q6ot1vz6"} +{"time":"2024-10-31T09:17:23.132261438-04:00","level":"INFO","msg":"stream: closed","id":"q6ot1vz6"} diff --git a/wandb/run-20241031_002020-u516mysu/files/config.yaml b/wandb/run-20241031_002020-u516mysu/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0d8a2ed7dfb36d28050f2045c26df3a861ac3cb --- /dev/null +++ b/wandb/run-20241031_002020-u516mysu/files/config.yaml @@ -0,0 +1,531 @@ +_name_or_path: + value: meta-llama/Llama-3.2-3B +_wandb: + value: + cli_version: 0.18.5 + m: + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 7 + - 13 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.5 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +batch_size: + value: 3 +begin_suppress_tokens: + value: null +bf16: + value: false +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: deepspeed_config/train_dp_config.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +epoch: + value: 6 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: steps +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 2 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +half_precision_backend: + value: auto +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 3072 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 1e-05 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: ./logs +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr: + value: 1e-05 +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 3212749824 +model_type: + value: llama +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 24 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 28 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 6 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 3 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +perturbation: + value: reverse_full +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: ./checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 150 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 0 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: null +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +train_set: + value: 10M +transformers_version: + value: 4.45.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: true +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log b/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..68b903deeeb62d45d492756eced75ade0a58fa90 --- /dev/null +++ b/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-10-31T00:20:20.449414915-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T00:20:20.449431546-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug-core.log"} +{"time":"2024-10-31T00:20:20.559027395-04:00","level":"INFO","msg":"created new stream","id":"u516mysu"} +{"time":"2024-10-31T00:20:20.559094905-04:00","level":"INFO","msg":"stream: started","id":"u516mysu"} +{"time":"2024-10-31T00:20:20.559173576-04:00","level":"INFO","msg":"sender: started","stream_id":"u516mysu"} +{"time":"2024-10-31T00:20:20.559167146-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"u516mysu"}} +{"time":"2024-10-31T00:20:20.559122936-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"u516mysu"}} +{"time":"2024-10-31T00:20:21.390564264-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T06:03:54.968293904-04:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/u516mysu/file_stream"} +{"time":"2024-10-31T09:17:19.827018178-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T09:17:19.904181869-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-31T09:17:20.764779975-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.617401592}],"total_operations":1}} +{"time":"2024-10-31T09:17:22.149262664-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-31T09:17:23.303472881-04:00","level":"INFO","msg":"stream: closing","id":"u516mysu"} +{"time":"2024-10-31T09:17:23.303503961-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"u516mysu"}} +{"time":"2024-10-31T09:17:23.303534061-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"u516mysu"}} +{"time":"2024-10-31T09:17:23.303602892-04:00","level":"INFO","msg":"sender: closed","stream_id":"u516mysu"} +{"time":"2024-10-31T09:17:23.303611352-04:00","level":"INFO","msg":"stream: closed","id":"u516mysu"} diff --git a/wandb/run-20241031_002020-u516mysu/logs/debug.log b/wandb/run-20241031_002020-u516mysu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1afa6b7ef9c09c53c0777549acfff35de17b1d71 --- /dev/null +++ b/wandb/run-20241031_002020-u516mysu/logs/debug.log @@ -0,0 +1,36 @@ +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Configure stats pid to 484455 +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug.log +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:init():621] calling init triggers +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:init():671] starting backend +2024-10-31 00:20:20,445 INFO MainThread:484455 [wandb_init.py:init():675] sending inform_init request +2024-10-31 00:20:20,446 INFO MainThread:484455 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 00:20:20,446 INFO MainThread:484455 [wandb_init.py:init():688] backend started and connected +2024-10-31 00:20:20,449 INFO MainThread:484455 [wandb_init.py:init():783] updated telemetry +2024-10-31 00:20:20,478 INFO MainThread:484455 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 00:20:21,385 INFO MainThread:484455 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 00:20:21,499 INFO MainThread:484455 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 00:20:21,499 INFO MainThread:484455 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 00:20:21,499 INFO MainThread:484455 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 00:20:21,499 INFO MainThread:484455 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 00:20:21,501 INFO MainThread:484455 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 00:20:21,502 INFO MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 1e-05} +2024-10-31 00:23:47,389 INFO MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 150, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-10-31 00:23:47,392 INFO MainThread:484455 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - > +2024-10-31 00:23:47,392 INFO MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None +2024-10-31 09:17:19,686 INFO MainThread:484455 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/u516mysu +2024-10-31 09:17:19,700 INFO MainThread:484455 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-31 09:17:19,748 INFO MainThread:484455 [wandb_run.py:_restore():2408] restore +2024-10-31 09:17:19,749 INFO MainThread:484455 [wandb_run.py:_restore():2414] restore done +2024-10-31 09:17:23,296 INFO MainThread:484455 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-31 09:17:23,297 INFO MainThread:484455 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-31 09:17:23,302 INFO MainThread:484455 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log b/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..449e7db43958d1811f55c5624677b32ca276915c --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-10-31T12:20:05.848495974-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T12:20:05.848507284-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug-core.log"} +{"time":"2024-10-31T12:20:05.957242925-04:00","level":"INFO","msg":"created new stream","id":"nip14lm6"} +{"time":"2024-10-31T12:20:05.957288975-04:00","level":"INFO","msg":"stream: started","id":"nip14lm6"} +{"time":"2024-10-31T12:20:05.957371865-04:00","level":"INFO","msg":"sender: started","stream_id":"nip14lm6"} +{"time":"2024-10-31T12:20:05.957351625-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"nip14lm6"}} +{"time":"2024-10-31T12:20:05.957317185-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"nip14lm6"}} +{"time":"2024-10-31T12:20:06.183349635-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T12:20:29.345034007-04:00","level":"INFO","msg":"stream: closing","id":"nip14lm6"} +{"time":"2024-10-31T12:20:29.345114157-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T12:20:29.345967457-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241031_122005-nip14lm6/logs/debug.log b/wandb/run-20241031_122005-nip14lm6/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4448025aeae355f0d8cccc7100ff8fd3dda627af --- /dev/null +++ b/wandb/run-20241031_122005-nip14lm6/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-31 12:20:05,843 INFO MainThread:557184 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 12:20:05,843 INFO MainThread:557184 [wandb_setup.py:_flush():79] Configure stats pid to 557184 +2024-10-31 12:20:05,843 INFO MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 12:20:05,843 INFO MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 12:20:05,843 INFO MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug.log +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:init():621] calling init triggers +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:init():671] starting backend +2024-10-31 12:20:05,844 INFO MainThread:557184 [wandb_init.py:init():675] sending inform_init request +2024-10-31 12:20:05,845 INFO MainThread:557184 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 12:20:05,846 INFO MainThread:557184 [wandb_init.py:init():688] backend started and connected +2024-10-31 12:20:05,848 INFO MainThread:557184 [wandb_init.py:init():783] updated telemetry +2024-10-31 12:20:05,876 INFO MainThread:557184 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 12:20:06,180 INFO MainThread:557184 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 12:20:06,275 INFO MainThread:557184 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 12:20:06,275 INFO MainThread:557184 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 12:20:06,275 INFO MainThread:557184 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 12:20:06,275 INFO MainThread:557184 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 12:20:06,277 INFO MainThread:557184 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 12:20:06,278 INFO MainThread:557184 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-10-31 12:20:29,345 WARNING MsgRouterThr:557184 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log b/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c39014a4072f99cd273db3609f2747a7e4996e57 --- /dev/null +++ b/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-01T01:24:38.163791445-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:24:38.163804515-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug-core.log"} +{"time":"2024-11-01T01:24:38.271328113-04:00","level":"INFO","msg":"created new stream","id":"61w48leq"} +{"time":"2024-11-01T01:24:38.271368763-04:00","level":"INFO","msg":"stream: started","id":"61w48leq"} +{"time":"2024-11-01T01:24:38.271434224-04:00","level":"INFO","msg":"sender: started","stream_id":"61w48leq"} +{"time":"2024-11-01T01:24:38.271418254-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"61w48leq"}} +{"time":"2024-11-01T01:24:38.271399604-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"61w48leq"}} +{"time":"2024-11-01T01:24:38.451315095-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T01:24:38.692246251-04:00","level":"INFO","msg":"stream: closing","id":"61w48leq"} +{"time":"2024-11-01T01:24:38.692349652-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T01:24:38.693222148-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-01T01:24:39.149304751-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-01T01:24:39.271906984-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"61w48leq"}} +{"time":"2024-11-01T01:24:39.271942114-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"61w48leq"}} +{"time":"2024-11-01T01:24:39.271971665-04:00","level":"INFO","msg":"sender: closed","stream_id":"61w48leq"} +{"time":"2024-11-01T01:24:39.272008615-04:00","level":"INFO","msg":"stream: closed","id":"61w48leq"} diff --git a/wandb/run-20241101_012438-61w48leq/logs/debug.log b/wandb/run-20241101_012438-61w48leq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..68de204bf74879370b4bc4117222ad3bf0a8b263 --- /dev/null +++ b/wandb/run-20241101_012438-61w48leq/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Configure stats pid to 676353 +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug.log +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:init():671] starting backend +2024-11-01 01:24:38,159 INFO MainThread:676353 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:24:38,160 INFO MainThread:676353 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:24:38,161 INFO MainThread:676353 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:24:38,164 INFO MainThread:676353 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:24:38,193 INFO MainThread:676353 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:24:38,446 INFO MainThread:676353 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:24:38,577 INFO MainThread:676353 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:24:38,577 INFO MainThread:676353 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:24:38,577 INFO MainThread:676353 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:24:38,577 INFO MainThread:676353 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:24:38,579 INFO MainThread:676353 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:24:38,579 INFO MainThread:676353 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 01:24:38,692 WARNING MsgRouterThr:676353 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_012612-q08jbqqf/files/config.yaml b/wandb/run-20241101_012612-q08jbqqf/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16d25ee0e4092474360045faaf0959cfc1b3e91d --- /dev/null +++ b/wandb/run-20241101_012612-q08jbqqf/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: shuffle_nodeterministic +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json b/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8ca844f44cfc77ae610b2b54bf343f13cb912b30 --- /dev/null +++ b/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T05:26:12.824647Z", + "args": [ + "--perturbation", + "shuffle_nodeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753992228864" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json b/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log b/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..fb44c1f3d5d613485f01b7bc8632a31a11de550a --- /dev/null +++ b/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-01T01:26:12.826583242-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T01:26:12.826594312-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug-core.log"} +{"time":"2024-11-01T01:26:12.933324283-04:00","level":"INFO","msg":"created new stream","id":"q08jbqqf"} +{"time":"2024-11-01T01:26:12.933362294-04:00","level":"INFO","msg":"stream: started","id":"q08jbqqf"} +{"time":"2024-11-01T01:26:12.933399464-04:00","level":"INFO","msg":"sender: started","stream_id":"q08jbqqf"} +{"time":"2024-11-01T01:26:12.933367414-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q08jbqqf"}} +{"time":"2024-11-01T01:26:12.933399204-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q08jbqqf"}} +{"time":"2024-11-01T01:26:13.132185735-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-01T01:26:13.242452211-04:00","level":"INFO","msg":"stream: closing","id":"q08jbqqf"} +{"time":"2024-11-01T01:26:13.242504401-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-01T01:26:13.242989514-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-01T01:26:13.986483698-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-01T01:26:14.10924044-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"q08jbqqf"}} +{"time":"2024-11-01T01:26:14.109339001-04:00","level":"INFO","msg":"sender: closed","stream_id":"q08jbqqf"} +{"time":"2024-11-01T01:26:14.109321291-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"q08jbqqf"}} +{"time":"2024-11-01T01:26:14.109424031-04:00","level":"INFO","msg":"stream: closed","id":"q08jbqqf"} diff --git a/wandb/run-20241101_012612-q08jbqqf/logs/debug.log b/wandb/run-20241101_012612-q08jbqqf/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..0ff3c089ad4b805d3b13ab890c328411db76ec11 --- /dev/null +++ b/wandb/run-20241101_012612-q08jbqqf/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 01:26:12,822 INFO MainThread:677636 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 01:26:12,822 INFO MainThread:677636 [wandb_setup.py:_flush():79] Configure stats pid to 677636 +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug.log +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:init():621] calling init triggers +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:init():671] starting backend +2024-11-01 01:26:12,823 INFO MainThread:677636 [wandb_init.py:init():675] sending inform_init request +2024-11-01 01:26:12,824 INFO MainThread:677636 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 01:26:12,824 INFO MainThread:677636 [wandb_init.py:init():688] backend started and connected +2024-11-01 01:26:12,827 INFO MainThread:677636 [wandb_init.py:init():783] updated telemetry +2024-11-01 01:26:12,846 INFO MainThread:677636 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 01:26:13,128 INFO MainThread:677636 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 01:26:13,232 INFO MainThread:677636 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 01:26:13,232 INFO MainThread:677636 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 01:26:13,232 INFO MainThread:677636 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 01:26:13,232 INFO MainThread:677636 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 01:26:13,234 INFO MainThread:677636 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 01:26:13,234 INFO MainThread:677636 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-11-01 01:26:13,242 WARNING MsgRouterThr:677636 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/output.log b/wandb/run-20241101_094656-1l5gkwzn/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..13fabfc08270592f775992de99610ab2e814da93 --- /dev/null +++ b/wandb/run-20241101_094656-1l5gkwzn/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.60s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 09:47:03,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 09:47:12,454] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.764644622802734 seconds diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt b/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json b/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..452b0524d4317efa0dbb2f9a90d1a4d7fe1e83eb --- /dev/null +++ b/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T13:46:56.156739Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754695659520" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log b/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d829038c9d5b9554d19a65234536a15a272d2620 --- /dev/null +++ b/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T09:46:56.158847121-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T09:46:56.158862091-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug-core.log"} +{"time":"2024-11-01T09:46:56.26479573-04:00","level":"INFO","msg":"created new stream","id":"1l5gkwzn"} +{"time":"2024-11-01T09:46:56.264831431-04:00","level":"INFO","msg":"stream: started","id":"1l5gkwzn"} +{"time":"2024-11-01T09:46:56.264866161-04:00","level":"INFO","msg":"sender: started","stream_id":"1l5gkwzn"} +{"time":"2024-11-01T09:46:56.264860761-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"1l5gkwzn"}} +{"time":"2024-11-01T09:46:56.264880331-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"1l5gkwzn"}} +{"time":"2024-11-01T09:46:56.464351323-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log b/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5de6a2c78b672edc0b540519172bf6aca0bc2149 --- /dev/null +++ b/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Configure stats pid to 786691 +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 09:46:56,154 INFO MainThread:786691 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log +2024-11-01 09:46:56,155 INFO MainThread:786691 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log +2024-11-01 09:46:56,155 INFO MainThread:786691 [wandb_init.py:init():621] calling init triggers +2024-11-01 09:46:56,155 INFO MainThread:786691 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 09:46:56,155 INFO MainThread:786691 [wandb_init.py:init():671] starting backend +2024-11-01 09:46:56,155 INFO MainThread:786691 [wandb_init.py:init():675] sending inform_init request +2024-11-01 09:46:56,156 INFO MainThread:786691 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 09:46:56,156 INFO MainThread:786691 [wandb_init.py:init():688] backend started and connected +2024-11-01 09:46:56,159 INFO MainThread:786691 [wandb_init.py:init():783] updated telemetry +2024-11-01 09:46:56,188 INFO MainThread:786691 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 09:46:56,460 INFO MainThread:786691 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 09:46:56,562 INFO MainThread:786691 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 09:46:56,562 INFO MainThread:786691 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 09:46:56,562 INFO MainThread:786691 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 09:46:56,562 INFO MainThread:786691 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 09:46:56,563 INFO MainThread:786691 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 09:46:56,563 INFO MainThread:786691 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_094656-1l5gkwzn/run-1l5gkwzn.wandb b/wandb/run-20241101_094656-1l5gkwzn/run-1l5gkwzn.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log b/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ede4f7b5feabb3a899069ab919316ac6ad8850f8 --- /dev/null +++ b/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T09:46:56.382128939-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T09:46:56.382144359-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug-core.log"} +{"time":"2024-11-01T09:46:56.488019778-04:00","level":"INFO","msg":"created new stream","id":"ae4hctp0"} +{"time":"2024-11-01T09:46:56.488068848-04:00","level":"INFO","msg":"stream: started","id":"ae4hctp0"} +{"time":"2024-11-01T09:46:56.488137609-04:00","level":"INFO","msg":"sender: started","stream_id":"ae4hctp0"} +{"time":"2024-11-01T09:46:56.488116569-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ae4hctp0"}} +{"time":"2024-11-01T09:46:56.488165299-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ae4hctp0"}} +{"time":"2024-11-01T09:46:56.721095514-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_200517-iopieyi0/logs/debug.log b/wandb/run-20241101_200517-iopieyi0/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..bd086c650cc4e3715cb37ddfc82c8428eb23a42c --- /dev/null +++ b/wandb/run-20241101_200517-iopieyi0/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Configure stats pid to 870382 +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-iopieyi0/logs/debug.log +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-iopieyi0/logs/debug-internal.log +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:05:17,138 INFO MainThread:870382 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:05:17,139 INFO MainThread:870382 [wandb_init.py:init():671] starting backend +2024-11-01 20:05:17,139 INFO MainThread:870382 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:05:17,140 INFO MainThread:870382 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:05:17,140 INFO MainThread:870382 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:05:17,144 INFO MainThread:870382 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:05:17,174 INFO MainThread:870382 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:05:17,485 INFO MainThread:870382 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:05:17,574 INFO MainThread:870382 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:05:17,574 INFO MainThread:870382 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:05:17,574 INFO MainThread:870382 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:05:17,574 INFO MainThread:870382 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:05:17,575 INFO MainThread:870382 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:05:17,575 INFO MainThread:870382 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:05:25,263 WARNING MsgRouterThr:870382 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml b/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e34fec43368c51fb4bd10a24a21dd490ecdba44 --- /dev/null +++ b/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_nondeterministic +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/output.log b/wandb/run-20241101_201927-8tmqrwpx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8f5d7632122e3fe769565f1f46c07735f03f81d0 --- /dev/null +++ b/wandb/run-20241101_201927-8tmqrwpx/files/output.log @@ -0,0 +1,49 @@ +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.57s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 20:19:34,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 20:19:43,897] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.528482675552368 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 220, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3485, in training_step + loss = self.compute_loss(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3532, in compute_loss + outputs = model(**inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1899, in forward + loss = self.module(*inputs, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1189, in forward + outputs = self.model( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1000, in forward + layer_outputs = decoder_layer( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 729, in forward + hidden_states, self_attn_weights, present_key_value = self.self_attn( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl + return forward_call(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 426, in forward + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/functional.py", line 1845, in softmax + ret = input.softmax(dim, dtype=dtype) +torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 2; 47.54 GiB total capacity; 11.61 GiB already allocated; 228.56 MiB free; 11.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..941e95de56bd4bf8bcbb80b8ce46b7475cf41903 --- /dev/null +++ b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-02T00:19:27.013147Z", + "args": [ + "--perturbation", + "shuffle_nondeterministic", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754803679232" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..e676be4c6d48d550ccacb029fbc772ebc6173ea8 --- /dev/null +++ b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":58}} \ No newline at end of file diff --git a/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log b/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5ca8a8fa7f4c232e41f9332954c80c234f523ad8 --- /dev/null +++ b/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-01 20:19:27,009 INFO MainThread:878463 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Configure stats pid to 878463 +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201927-8tmqrwpx/logs/debug-internal.log +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:init():621] calling init triggers +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:init():671] starting backend +2024-11-01 20:19:27,010 INFO MainThread:878463 [wandb_init.py:init():675] sending inform_init request +2024-11-01 20:19:27,012 INFO MainThread:878463 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 20:19:27,012 INFO MainThread:878463 [wandb_init.py:init():688] backend started and connected +2024-11-01 20:19:27,015 INFO MainThread:878463 [wandb_init.py:init():783] updated telemetry +2024-11-01 20:19:27,055 INFO MainThread:878463 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 20:19:27,353 INFO MainThread:878463 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 20:19:27,444 INFO MainThread:878463 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 20:19:27,444 INFO MainThread:878463 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 20:19:27,444 INFO MainThread:878463 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 20:19:27,444 INFO MainThread:878463 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 20:19:27,446 INFO MainThread:878463 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 20:19:27,446 INFO MainThread:878463 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-01 20:20:25,213 WARNING MsgRouterThr:878463 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb b/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6c33c8d31fbc4c042628ce6336ba26bc8ebe9380 Binary files /dev/null and b/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb differ diff --git a/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log b/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..12ec1e4850e033e139424f3816b844015877ffd5 --- /dev/null +++ b/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-05T16:28:24.428721708-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T16:28:24.428748458-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug-core.log"} +{"time":"2024-11-05T16:28:24.539378288-05:00","level":"INFO","msg":"created new stream","id":"fa9ep6qh"} +{"time":"2024-11-05T16:28:24.539423788-05:00","level":"INFO","msg":"stream: started","id":"fa9ep6qh"} +{"time":"2024-11-05T16:28:24.539487428-05:00","level":"INFO","msg":"sender: started","stream_id":"fa9ep6qh"} +{"time":"2024-11-05T16:28:24.539483048-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"fa9ep6qh"}} +{"time":"2024-11-05T16:28:24.539488928-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"fa9ep6qh"}} +{"time":"2024-11-05T16:28:24.757184161-05:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241105_223842-16dt45ha/files/output.log b/wandb/run-20241105_223842-16dt45ha/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..bbd8e1b0b103d08d71a90dac9d33fbbce0e4dc71 --- /dev/null +++ b/wandb/run-20241105_223842-16dt45ha/files/output.log @@ -0,0 +1,23 @@ +config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 337kB/s] +model.safetensors.index.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 19.0MB/s] +model-00001-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.0MB/s] +model-00002-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.1MB/s] +Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.48s/it] +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.35s/it] +generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 74.9kB/s] +Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:48<00:00, 349.55 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-05 22:42:10,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-05 22:42:18,255] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.65723180770874 seconds diff --git a/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json b/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e359928bdc06be0020ae82591e31d49883ba162e --- /dev/null +++ b/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-06T03:38:42.422757Z", + "args": [ + "--perturbation", + "shuffle_deterministic57", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1785078448128" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log b/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..917126591d74988aa62006d8a8850f844897946b --- /dev/null +++ b/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log @@ -0,0 +1,20 @@ +{"time":"2024-11-05T22:38:42.4258052-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T22:38:42.42582231-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug-core.log"} +{"time":"2024-11-05T22:38:42.533615549-05:00","level":"INFO","msg":"created new stream","id":"16dt45ha"} +{"time":"2024-11-05T22:38:42.533662479-05:00","level":"INFO","msg":"stream: started","id":"16dt45ha"} +{"time":"2024-11-05T22:38:42.53368456-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"16dt45ha"}} +{"time":"2024-11-05T22:38:42.53379328-05:00","level":"INFO","msg":"sender: started","stream_id":"16dt45ha"} +{"time":"2024-11-05T22:38:42.53375215-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"16dt45ha"}} +{"time":"2024-11-05T22:38:42.702252167-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T13:13:43.051966319-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha/file_stream"} +{"time":"2024-11-06T13:17:11.729410268-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-11-06T13:59:59.091926096-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha/file_stream"} +{"time":"2024-11-06T20:18:24.44115812-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T20:18:24.467757337-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T20:18:24.881758243-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T20:18:25.015661128-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-06T20:18:26.027043225-05:00","level":"INFO","msg":"stream: closing","id":"16dt45ha"} +{"time":"2024-11-06T20:18:26.027121786-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"16dt45ha"}} +{"time":"2024-11-06T20:18:26.027163536-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"16dt45ha"}} +{"time":"2024-11-06T20:18:26.027225756-05:00","level":"INFO","msg":"sender: closed","stream_id":"16dt45ha"} +{"time":"2024-11-06T20:18:26.027289227-05:00","level":"INFO","msg":"stream: closed","id":"16dt45ha"} diff --git a/wandb/run-20241105_223842-16dt45ha/logs/debug.log b/wandb/run-20241105_223842-16dt45ha/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2faa777ded1e5752843579ddac954fa3f6522010 --- /dev/null +++ b/wandb/run-20241105_223842-16dt45ha/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Configure stats pid to 1803214 +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 22:38:42,419 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug.log +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:init():621] calling init triggers +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:init():671] starting backend +2024-11-05 22:38:42,420 INFO MainThread:1803214 [wandb_init.py:init():675] sending inform_init request +2024-11-05 22:38:42,422 INFO MainThread:1803214 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 22:38:42,422 INFO MainThread:1803214 [wandb_init.py:init():688] backend started and connected +2024-11-05 22:38:42,426 INFO MainThread:1803214 [wandb_init.py:init():783] updated telemetry +2024-11-05 22:38:42,465 INFO MainThread:1803214 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 22:38:42,698 INFO MainThread:1803214 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 22:38:42,788 INFO MainThread:1803214 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 22:38:42,788 INFO MainThread:1803214 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 22:38:42,788 INFO MainThread:1803214 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 22:38:42,788 INFO MainThread:1803214 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 22:38:42,790 INFO MainThread:1803214 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 22:38:42,790 INFO MainThread:1803214 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic57', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 20:18:24,430 INFO MainThread:1803214 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha +2024-11-06 20:18:24,439 INFO MainThread:1803214 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-06 20:18:24,440 INFO MainThread:1803214 [wandb_run.py:_restore():2408] restore +2024-11-06 20:18:24,440 INFO MainThread:1803214 [wandb_run.py:_restore():2414] restore done +2024-11-06 20:18:26,018 INFO MainThread:1803214 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-06 20:18:26,018 INFO MainThread:1803214 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-06 20:18:26,026 INFO MainThread:1803214 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241106_224236-lcylopmq/files/config.yaml b/wandb/run-20241106_224236-lcylopmq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..507bc548a43d1c7afb495d052831f04f348b9c17 --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic84 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_224236-lcylopmq/files/output.log b/wandb/run-20241106_224236-lcylopmq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..16507ca71b603951c4fc600d8f40d9a32af49231 --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c372c-7865a00f7bd975bd3318bf53;0f1d0061-5d18-4db3-bc77-04ef93856428) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c372c-7865a00f7bd975bd3318bf53;0f1d0061-5d18-4db3-bc77-04ef93856428) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json b/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2a85a7b990920f54d8240bf89ecd28bd11efd74c --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T03:42:36.158647Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1774852591616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json b/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log b/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d9a15cc25de3c0b6d83a401177ddded8076d147f --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T22:42:36.160514293-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T22:42:36.160525653-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug-core.log"} +{"time":"2024-11-06T22:42:36.266257449-05:00","level":"INFO","msg":"created new stream","id":"lcylopmq"} +{"time":"2024-11-06T22:42:36.266285829-05:00","level":"INFO","msg":"stream: started","id":"lcylopmq"} +{"time":"2024-11-06T22:42:36.266359779-05:00","level":"INFO","msg":"sender: started","stream_id":"lcylopmq"} +{"time":"2024-11-06T22:42:36.266358289-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"lcylopmq"}} +{"time":"2024-11-06T22:42:36.266325909-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"lcylopmq"}} +{"time":"2024-11-06T22:42:36.431323953-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T22:42:36.726762468-05:00","level":"INFO","msg":"stream: closing","id":"lcylopmq"} +{"time":"2024-11-06T22:42:36.726788588-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T22:42:36.727316972-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T22:42:37.048793203-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T22:42:37.163097395-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"lcylopmq"}} +{"time":"2024-11-06T22:42:37.163124896-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"lcylopmq"}} +{"time":"2024-11-06T22:42:37.163158956-05:00","level":"INFO","msg":"sender: closed","stream_id":"lcylopmq"} +{"time":"2024-11-06T22:42:37.163174736-05:00","level":"INFO","msg":"stream: closed","id":"lcylopmq"} diff --git a/wandb/run-20241106_224236-lcylopmq/logs/debug.log b/wandb/run-20241106_224236-lcylopmq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2bf5eaa78c8405ca2ab279ed85d234f4b6f0ed3f --- /dev/null +++ b/wandb/run-20241106_224236-lcylopmq/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 22:42:36,156 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Configure stats pid to 1982053 +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug.log +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:init():621] calling init triggers +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:init():671] starting backend +2024-11-06 22:42:36,157 INFO MainThread:1982053 [wandb_init.py:init():675] sending inform_init request +2024-11-06 22:42:36,158 INFO MainThread:1982053 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 22:42:36,158 INFO MainThread:1982053 [wandb_init.py:init():688] backend started and connected +2024-11-06 22:42:36,161 INFO MainThread:1982053 [wandb_init.py:init():783] updated telemetry +2024-11-06 22:42:36,183 INFO MainThread:1982053 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 22:42:36,427 INFO MainThread:1982053 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 22:42:36,525 INFO MainThread:1982053 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 22:42:36,525 INFO MainThread:1982053 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 22:42:36,525 INFO MainThread:1982053 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 22:42:36,525 INFO MainThread:1982053 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 22:42:36,527 INFO MainThread:1982053 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 22:42:36,527 INFO MainThread:1982053 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 22:42:36,726 WARNING MsgRouterThr:1982053 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb b/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9301f449e750273f5e3ea7b83966928498540c52 Binary files /dev/null and b/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb differ diff --git a/wandb/run-20241114_090201-6a5c399u/files/config.yaml b/wandb/run-20241114_090201-6a5c399u/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5feb2a98b43e2e1c45b273f0fd6ee4e8b6deba5d --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241114_090201-6a5c399u/files/output.log b/wandb/run-20241114_090201-6a5c399u/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..70e7e2006d8e04a5a709325c3593e4406af7261a --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/files/output.log @@ -0,0 +1,14 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.17s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:48<00:00, 373.53 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-14 09:03:10,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-14 09:03:18,323] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 6.166570425033569 seconds diff --git a/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json b/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2f322f8cf1f65f386cf648753131cf6686b755c9 --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-14T14:02:01.488483Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py", + "codePath": "train/train_ftp.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_ftp.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1745683763200" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json b/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0853048d5f5f5f9c30a84b53b581e5ac26c7f5b3 --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":46317}} \ No newline at end of file diff --git a/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log b/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8b8629587b752e76907b56bc91a413d044bd8bba --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2024-11-14T09:02:01.491584174-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-14T09:02:01.491603844-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug-core.log"} +{"time":"2024-11-14T09:02:01.604859038-05:00","level":"INFO","msg":"created new stream","id":"6a5c399u"} +{"time":"2024-11-14T09:02:01.604928089-05:00","level":"INFO","msg":"stream: started","id":"6a5c399u"} +{"time":"2024-11-14T09:02:01.60499237-05:00","level":"INFO","msg":"sender: started","stream_id":"6a5c399u"} +{"time":"2024-11-14T09:02:01.60496991-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"6a5c399u"}} +{"time":"2024-11-14T09:02:01.60499665-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"6a5c399u"}} +{"time":"2024-11-14T09:02:01.846552564-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-14T12:42:47.253040258-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/6a5c399u/file_stream"} +{"time":"2024-11-14T19:05:46.838454977-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-11-14T21:53:58.50116399-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-14T21:53:58.654316298-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-14T21:53:59.303756025-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-14T21:53:59.431028349-05:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading summary","runtime_seconds":0.127079923}],"total_operations":1}} +{"time":"2024-11-14T21:54:00.531902324-05:00","level":"INFO","msg":"stream: closing","id":"6a5c399u"} +{"time":"2024-11-14T21:54:00.532112326-05:00","level":"INFO","msg":"sender: closed","stream_id":"6a5c399u"} +{"time":"2024-11-14T21:54:00.532030445-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"6a5c399u"}} +{"time":"2024-11-14T21:54:00.531933455-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"6a5c399u"}} +{"time":"2024-11-14T21:54:00.545435925-05:00","level":"INFO","msg":"stream: closed","id":"6a5c399u"} diff --git a/wandb/run-20241114_090201-6a5c399u/logs/debug.log b/wandb/run-20241114_090201-6a5c399u/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..29a27c6f7b3e2acb721f46c139b225ec508026f6 --- /dev/null +++ b/wandb/run-20241114_090201-6a5c399u/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Configure stats pid to 2573813 +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'} +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug.log +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:init():621] calling init triggers +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:init():671] starting backend +2024-11-14 09:02:01,486 INFO MainThread:2573813 [wandb_init.py:init():675] sending inform_init request +2024-11-14 09:02:01,487 INFO MainThread:2573813 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-14 09:02:01,488 INFO MainThread:2573813 [wandb_init.py:init():688] backend started and connected +2024-11-14 09:02:01,492 INFO MainThread:2573813 [wandb_init.py:init():783] updated telemetry +2024-11-14 09:02:01,509 INFO MainThread:2573813 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-14 09:02:01,842 INFO MainThread:2573813 [wandb_init.py:init():867] starting run threads in backend +2024-11-14 09:02:01,965 INFO MainThread:2573813 [wandb_run.py:_console_start():2463] atexit reg +2024-11-14 09:02:01,966 INFO MainThread:2573813 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-14 09:02:01,966 INFO MainThread:2573813 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-14 09:02:01,966 INFO MainThread:2573813 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-14 09:02:01,968 INFO MainThread:2573813 [wandb_init.py:init():911] run started, returning control to user process +2024-11-14 09:02:01,968 INFO MainThread:2573813 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-14 21:53:58,405 INFO MainThread:2573813 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/6a5c399u +2024-11-14 21:53:58,429 INFO MainThread:2573813 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-14 21:53:58,429 INFO MainThread:2573813 [wandb_run.py:_restore():2408] restore +2024-11-14 21:53:58,429 INFO MainThread:2573813 [wandb_run.py:_restore():2414] restore done +2024-11-14 21:54:00,466 INFO MainThread:2573813 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-14 21:54:00,466 INFO MainThread:2573813 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-14 21:54:00,529 INFO MainThread:2573813 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241116_005740-xzrt3mur/files/config.yaml b/wandb/run-20241116_005740-xzrt3mur/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..148d339cca30f16130a38470cf8f83ec34286da5 --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic84 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241116_005740-xzrt3mur/files/output.log b/wandb/run-20241116_005740-xzrt3mur/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..bbbdce9efc36511659213433eeda5a62c2d8ca8f --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/files/output.log @@ -0,0 +1,21 @@ +model.safetensors.index.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 6.15MB/s] +model-00001-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:57<00:00, 42.2MB/s] +model-00002-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s] +Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.13s/it] +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.08s/it] +Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:46<00:00, 362.58 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-16 01:01:06,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-16 01:01:12,683] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 6.444216966629028 seconds diff --git a/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json b/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..475ade252b36f3a7600db203f506ffaaf4bd3396 --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-16T05:57:40.644226Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py", + "codePath": "train/train_ftp.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_ftp.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1787098271744" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json b/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1039a6654b28cb6be3fbe374ec28aed4bec8bde7 --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":42763}} \ No newline at end of file diff --git a/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log b/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..dd9c789da92cbc791aad18b3afaffed125ac5e50 --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-11-16T00:57:40.647062341-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-16T00:57:40.647076591-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug-core.log"} +{"time":"2024-11-16T00:57:40.759097257-05:00","level":"INFO","msg":"created new stream","id":"xzrt3mur"} +{"time":"2024-11-16T00:57:40.759169428-05:00","level":"INFO","msg":"stream: started","id":"xzrt3mur"} +{"time":"2024-11-16T00:57:40.759194328-05:00","level":"INFO","msg":"sender: started","stream_id":"xzrt3mur"} +{"time":"2024-11-16T00:57:40.759168828-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"xzrt3mur"}} +{"time":"2024-11-16T00:57:40.759191958-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"xzrt3mur"}} +{"time":"2024-11-16T00:57:40.947337481-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-16T11:20:49.097984756-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-11-16T12:50:24.495304118-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-16T12:50:24.691016764-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-16T12:50:25.291907192-05:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.434485007}],"total_operations":1}} +{"time":"2024-11-16T12:50:27.435115904-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-16T12:50:28.779835771-05:00","level":"INFO","msg":"stream: closing","id":"xzrt3mur"} +{"time":"2024-11-16T12:50:28.779862421-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"xzrt3mur"}} +{"time":"2024-11-16T12:50:28.779884771-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"xzrt3mur"}} +{"time":"2024-11-16T12:50:28.779924172-05:00","level":"INFO","msg":"sender: closed","stream_id":"xzrt3mur"} +{"time":"2024-11-16T12:50:28.779995232-05:00","level":"INFO","msg":"stream: closed","id":"xzrt3mur"} diff --git a/wandb/run-20241116_005740-xzrt3mur/logs/debug.log b/wandb/run-20241116_005740-xzrt3mur/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8179713f1f89794a44b4895f4fce5a68c0b6c1d3 --- /dev/null +++ b/wandb/run-20241116_005740-xzrt3mur/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-16 00:57:40,640 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-16 00:57:40,640 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Configure stats pid to 2657484 +2024-11-16 00:57:40,640 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-16 00:57:40,640 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'} +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug.log +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:init():621] calling init triggers +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:init():671] starting backend +2024-11-16 00:57:40,641 INFO MainThread:2657484 [wandb_init.py:init():675] sending inform_init request +2024-11-16 00:57:40,643 INFO MainThread:2657484 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-16 00:57:40,643 INFO MainThread:2657484 [wandb_init.py:init():688] backend started and connected +2024-11-16 00:57:40,646 INFO MainThread:2657484 [wandb_init.py:init():783] updated telemetry +2024-11-16 00:57:40,665 INFO MainThread:2657484 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-16 00:57:40,943 INFO MainThread:2657484 [wandb_init.py:init():867] starting run threads in backend +2024-11-16 00:57:41,079 INFO MainThread:2657484 [wandb_run.py:_console_start():2463] atexit reg +2024-11-16 00:57:41,079 INFO MainThread:2657484 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-16 00:57:41,079 INFO MainThread:2657484 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-16 00:57:41,079 INFO MainThread:2657484 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-16 00:57:41,081 INFO MainThread:2657484 [wandb_init.py:init():911] run started, returning control to user process +2024-11-16 00:57:41,082 INFO MainThread:2657484 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-16 12:50:24,091 INFO MainThread:2657484 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/xzrt3mur +2024-11-16 12:50:24,199 INFO MainThread:2657484 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-16 12:50:24,289 INFO MainThread:2657484 [wandb_run.py:_restore():2408] restore +2024-11-16 12:50:24,290 INFO MainThread:2657484 [wandb_run.py:_restore():2414] restore done +2024-11-16 12:50:28,579 INFO MainThread:2657484 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-16 12:50:28,579 INFO MainThread:2657484 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-16 12:50:28,736 INFO MainThread:2657484 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241119_135256-i48f8k8i/files/config.yaml b/wandb/run-20241119_135256-i48f8k8i/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2697b22f1922754efc3a49feb62c57c7fea4c220 --- /dev/null +++ b/wandb/run-20241119_135256-i48f8k8i/files/config.yaml @@ -0,0 +1,531 @@ +_name_or_path: + value: meta-llama/Llama-3.2-3B +_wandb: + value: + cli_version: 0.18.5 + m: + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 7 + - 13 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.5 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +batch_size: + value: 3 +begin_suppress_tokens: + value: null +bf16: + value: false +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: deepspeed_config/train_dp_config.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +epoch: + value: 3 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: steps +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 2 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +half_precision_backend: + value: auto +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 3072 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 5e-06 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: ./logs +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr: + value: 5e-06 +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 3212749824 +model_type: + value: llama +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 24 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 28 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 3 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +perturbation: + value: shuffle_local10 +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: ./checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 0 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: null +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +train_set: + value: 10M +transformers_version: + value: 4.45.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: true +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.1 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json b/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0c61b3a89be0969169461b29e05eebf1bb0a0a96 --- /dev/null +++ b/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":42875},"_step":3009,"train/global_step":2736,"eval/samples_per_second":26.579,"train_steps_per_second":0.064,"train/learning_rate":3.6555645816409427e-08,"train_runtime":42556.4051,"train/epoch":2.9983561643835617,"eval/runtime":37.6238,"train/grad_norm":2.390791416168213,"eval/loss":1.7726256847381592,"eval/steps_per_second":1.116,"total_flos":8.528958544014213e+17,"train_samples_per_second":1.158,"_timestamp":1.732085251819263e+09,"train/loss":1.5547,"train_loss":1.673318871542027,"_runtime":42874.926636984} \ No newline at end of file diff --git a/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log b/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3156e143d1bd2b974261fcc2ee7c568cd27bdb60 --- /dev/null +++ b/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-11-19T13:52:56.896325342-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-19T13:52:56.896346472-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug-core.log"} +{"time":"2024-11-19T13:52:57.008353535-05:00","level":"INFO","msg":"created new stream","id":"i48f8k8i"} +{"time":"2024-11-19T13:52:57.008423955-05:00","level":"INFO","msg":"stream: started","id":"i48f8k8i"} +{"time":"2024-11-19T13:52:57.008468935-05:00","level":"INFO","msg":"sender: started","stream_id":"i48f8k8i"} +{"time":"2024-11-19T13:52:57.008462395-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"i48f8k8i"}} +{"time":"2024-11-19T13:52:57.008440745-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i48f8k8i"}} +{"time":"2024-11-19T13:52:57.256723715-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-19T14:26:42.599960136-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/i48f8k8i/file_stream"} +{"time":"2024-11-20T01:47:31.968427857-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-20T01:47:31.978326214-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-20T01:47:32.609111028-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-20T01:47:32.771175491-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-20T01:47:33.880918495-05:00","level":"INFO","msg":"stream: closing","id":"i48f8k8i"} +{"time":"2024-11-20T01:47:33.880953885-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"i48f8k8i"}} +{"time":"2024-11-20T01:47:33.880983775-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"i48f8k8i"}} +{"time":"2024-11-20T01:47:33.881025656-05:00","level":"INFO","msg":"sender: closed","stream_id":"i48f8k8i"} +{"time":"2024-11-20T01:47:33.881098396-05:00","level":"INFO","msg":"stream: closed","id":"i48f8k8i"} diff --git a/wandb/run-20241119_135256-i48f8k8i/logs/debug.log b/wandb/run-20241119_135256-i48f8k8i/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..55a55a989b74da64647f1b55754e0f060dd3f069 --- /dev/null +++ b/wandb/run-20241119_135256-i48f8k8i/logs/debug.log @@ -0,0 +1,36 @@ +2024-11-19 13:52:56,889 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-19 13:52:56,889 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Configure stats pid to 2719620 +2024-11-19 13:52:56,889 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'} +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug.log +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:init():621] calling init triggers +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:init():671] starting backend +2024-11-19 13:52:56,890 INFO MainThread:2719620 [wandb_init.py:init():675] sending inform_init request +2024-11-19 13:52:56,892 INFO MainThread:2719620 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-19 13:52:56,892 INFO MainThread:2719620 [wandb_init.py:init():688] backend started and connected +2024-11-19 13:52:56,896 INFO MainThread:2719620 [wandb_init.py:init():783] updated telemetry +2024-11-19 13:52:56,926 INFO MainThread:2719620 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-19 13:52:57,253 INFO MainThread:2719620 [wandb_init.py:init():867] starting run threads in backend +2024-11-19 13:52:57,358 INFO MainThread:2719620 [wandb_run.py:_console_start():2463] atexit reg +2024-11-19 13:52:57,358 INFO MainThread:2719620 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-19 13:52:57,358 INFO MainThread:2719620 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-19 13:52:57,358 INFO MainThread:2719620 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-19 13:52:57,359 INFO MainThread:2719620 [wandb_init.py:init():911] run started, returning control to user process +2024-11-19 13:52:57,360 INFO MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_local10', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-19 13:58:15,416 INFO MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-19 13:58:15,418 INFO MainThread:2719620 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - > +2024-11-19 13:58:15,418 INFO MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None +2024-11-20 01:47:31,947 INFO MainThread:2719620 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/i48f8k8i +2024-11-20 01:47:31,967 INFO MainThread:2719620 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-20 01:47:31,967 INFO MainThread:2719620 [wandb_run.py:_restore():2408] restore +2024-11-20 01:47:31,968 INFO MainThread:2719620 [wandb_run.py:_restore():2414] restore done +2024-11-20 01:47:33,869 INFO MainThread:2719620 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-20 01:47:33,870 INFO MainThread:2719620 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-20 01:47:33,880 INFO MainThread:2719620 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241128_161554-907lsb28/files/config.yaml b/wandb/run-20241128_161554-907lsb28/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d07c4054eb6cea6e4e93d3fb630156b4b6a6798 --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241128_161554-907lsb28/files/output.log b/wandb/run-20241128_161554-907lsb28/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..c1baedaa5b22a4efcb2d4e2039878d4396e90496 --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/files/output.log @@ -0,0 +1,14 @@ +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:48<00:00, 362.25 examples/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:49<00:00, 364.14 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-28 16:18:35,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-28 16:18:41,080] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 28.010013341903687 seconds diff --git a/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json b/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9d4ed4c529877634500143f7e6f6bc93590246ad --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-28T21:15:54.211208Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py", + "codePath": "train/train_llama_1B.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_llama_1B.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1723122159616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json b/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6aab003b4b8541a72d07e8d6bea8280b1dfec1 --- /dev/null +++ b/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":53001}} \ No newline at end of file