diff --git a/wandb/run-20241030_010641-sozqwfy9/files/config.yaml b/wandb/run-20241030_010641-sozqwfy9/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_010641-sozqwfy9/files/output.log b/wandb/run-20241030_010641-sozqwfy9/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1777f063b107f651dddc063d1d3a3fa80015bf9e --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/files/output.log @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 162, in + dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" +AttributeError: 'Namespace' object has no attribute 'train_zset' diff --git a/wandb/run-20241030_010641-sozqwfy9/files/wandb-metadata.json b/wandb/run-20241030_010641-sozqwfy9/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7647b3a409f7c9984a836b4cff4f34b61834b4e4 --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/files/wandb-metadata.json @@ -0,0 +1,29 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:06:41.543355Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py" +} \ No newline at end of file diff --git a/wandb/run-20241030_010641-sozqwfy9/files/wandb-summary.json b/wandb/run-20241030_010641-sozqwfy9/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..4e355fc8e9915c58fba97556eba40fd65c826d6a --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":1}} \ No newline at end of file diff --git a/wandb/run-20241030_010641-sozqwfy9/logs/debug-internal.log b/wandb/run-20241030_010641-sozqwfy9/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6c068618bdd3861b5a3124821952bfbbe9b09f40 --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:06:41.544820605-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:06:41.544828795-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010641-sozqwfy9/logs/debug-core.log"} +{"time":"2024-10-30T01:06:41.651808474-04:00","level":"INFO","msg":"created new stream","id":"sozqwfy9"} +{"time":"2024-10-30T01:06:41.651840385-04:00","level":"INFO","msg":"stream: started","id":"sozqwfy9"} +{"time":"2024-10-30T01:06:41.651863335-04:00","level":"INFO","msg":"sender: started","stream_id":"sozqwfy9"} +{"time":"2024-10-30T01:06:41.651850035-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sozqwfy9"}} +{"time":"2024-10-30T01:06:41.651862905-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sozqwfy9"}} +{"time":"2024-10-30T01:06:43.032746197-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:06:43.127763565-04:00","level":"INFO","msg":"stream: closing","id":"sozqwfy9"} +{"time":"2024-10-30T01:06:43.127845395-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:06:43.226914121-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:06:43.693295593-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:06:43.805986822-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"sozqwfy9"}} +{"time":"2024-10-30T01:06:43.806037683-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"sozqwfy9"}} +{"time":"2024-10-30T01:06:43.806051923-04:00","level":"INFO","msg":"sender: closed","stream_id":"sozqwfy9"} +{"time":"2024-10-30T01:06:43.806121243-04:00","level":"INFO","msg":"stream: closed","id":"sozqwfy9"} diff --git a/wandb/run-20241030_010641-sozqwfy9/logs/debug.log b/wandb/run-20241030_010641-sozqwfy9/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..ac24d45b7eaa2ef559cc48fb4ec277553029a947 --- /dev/null +++ b/wandb/run-20241030_010641-sozqwfy9/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Configure stats pid to 321598 +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:06:41,541 INFO MainThread:321598 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010641-sozqwfy9/logs/debug.log +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010641-sozqwfy9/logs/debug-internal.log +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:init():671] starting backend +2024-10-30 01:06:41,542 INFO MainThread:321598 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:06:41,542 INFO MainThread:321598 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:06:41,543 INFO MainThread:321598 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:06:41,545 INFO MainThread:321598 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:06:41,569 INFO MainThread:321598 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:06:43,029 INFO MainThread:321598 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:06:43,124 INFO MainThread:321598 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:06:43,124 INFO MainThread:321598 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:06:43,124 INFO MainThread:321598 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:06:43,124 INFO MainThread:321598 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:06:43,126 INFO MainThread:321598 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:06:43,126 INFO MainThread:321598 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:06:43,127 WARNING MsgRouterThr:321598 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_010641-sozqwfy9/run-sozqwfy9.wandb b/wandb/run-20241030_010641-sozqwfy9/run-sozqwfy9.wandb new file mode 100644 index 0000000000000000000000000000000000000000..82177640ecfc7781175535f126f25821a6910767 Binary files /dev/null and b/wandb/run-20241030_010641-sozqwfy9/run-sozqwfy9.wandb differ diff --git a/wandb/run-20241030_010759-0a5n2onp/files/config.yaml b/wandb/run-20241030_010759-0a5n2onp/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_010759-0a5n2onp/files/output.log b/wandb/run-20241030_010759-0a5n2onp/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1777f063b107f651dddc063d1d3a3fa80015bf9e --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/files/output.log @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 162, in + dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" +AttributeError: 'Namespace' object has no attribute 'train_zset' diff --git a/wandb/run-20241030_010759-0a5n2onp/files/wandb-metadata.json b/wandb/run-20241030_010759-0a5n2onp/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..46d3f216d39508bb433dd4b0b125b7e772b0e061 --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:07:59.024551Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719200268288" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_010759-0a5n2onp/files/wandb-summary.json b/wandb/run-20241030_010759-0a5n2onp/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241030_010759-0a5n2onp/logs/debug-internal.log b/wandb/run-20241030_010759-0a5n2onp/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d26d09d948c930e8a36acebcf8dcbf2f38b154d5 --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:07:59.026431301-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:07:59.026442171-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-0a5n2onp/logs/debug-core.log"} +{"time":"2024-10-30T01:07:59.133417319-04:00","level":"INFO","msg":"created new stream","id":"0a5n2onp"} +{"time":"2024-10-30T01:07:59.13345488-04:00","level":"INFO","msg":"stream: started","id":"0a5n2onp"} +{"time":"2024-10-30T01:07:59.13347872-04:00","level":"INFO","msg":"sender: started","stream_id":"0a5n2onp"} +{"time":"2024-10-30T01:07:59.13348111-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"0a5n2onp"}} +{"time":"2024-10-30T01:07:59.13347098-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"0a5n2onp"}} +{"time":"2024-10-30T01:07:59.301083731-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:07:59.400210746-04:00","level":"INFO","msg":"stream: closing","id":"0a5n2onp"} +{"time":"2024-10-30T01:07:59.400243016-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:07:59.400545198-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:07:59.953692364-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:08:00.087291614-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"0a5n2onp"}} +{"time":"2024-10-30T01:08:00.087360055-04:00","level":"INFO","msg":"sender: closed","stream_id":"0a5n2onp"} +{"time":"2024-10-30T01:08:00.087352894-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"0a5n2onp"}} +{"time":"2024-10-30T01:08:00.087440995-04:00","level":"INFO","msg":"stream: closed","id":"0a5n2onp"} diff --git a/wandb/run-20241030_010759-0a5n2onp/logs/debug.log b/wandb/run-20241030_010759-0a5n2onp/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4bc1ce94b4301368cf6136b3848bed1efbb7f6af --- /dev/null +++ b/wandb/run-20241030_010759-0a5n2onp/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Configure stats pid to 322458 +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:07:59,022 INFO MainThread:322458 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-0a5n2onp/logs/debug.log +2024-10-30 01:07:59,023 INFO MainThread:322458 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-0a5n2onp/logs/debug-internal.log +2024-10-30 01:07:59,023 INFO MainThread:322458 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:07:59,023 INFO MainThread:322458 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:07:59,023 INFO MainThread:322458 [wandb_init.py:init():671] starting backend +2024-10-30 01:07:59,023 INFO MainThread:322458 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:07:59,024 INFO MainThread:322458 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:07:59,024 INFO MainThread:322458 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:07:59,027 INFO MainThread:322458 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:07:59,064 INFO MainThread:322458 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:07:59,297 INFO MainThread:322458 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:07:59,396 INFO MainThread:322458 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:07:59,396 INFO MainThread:322458 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:07:59,396 INFO MainThread:322458 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:07:59,396 INFO MainThread:322458 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:07:59,398 INFO MainThread:322458 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:07:59,399 INFO MainThread:322458 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:07:59,400 WARNING MsgRouterThr:322458 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_010759-0a5n2onp/run-0a5n2onp.wandb b/wandb/run-20241030_010759-0a5n2onp/run-0a5n2onp.wandb new file mode 100644 index 0000000000000000000000000000000000000000..f96dfca2c3b2bb2347f968230473ec642e4b42f9 Binary files /dev/null and b/wandb/run-20241030_010759-0a5n2onp/run-0a5n2onp.wandb differ diff --git a/wandb/run-20241030_010759-imp8g625/files/wandb-metadata.json b/wandb/run-20241030_010759-imp8g625/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..67d847844310900f68a7981cdeaa7939ae29d081 --- /dev/null +++ b/wandb/run-20241030_010759-imp8g625/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:07:59.128982Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719200272384" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_010759-imp8g625/files/wandb-summary.json b/wandb/run-20241030_010759-imp8g625/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241030_010759-imp8g625/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241030_010759-imp8g625/logs/debug-internal.log b/wandb/run-20241030_010759-imp8g625/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2652d718f1ef04e117ffca12cad05c9cc5b099ed --- /dev/null +++ b/wandb/run-20241030_010759-imp8g625/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:07:59.130957192-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:07:59.130976003-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-imp8g625/logs/debug-core.log"} +{"time":"2024-10-30T01:07:59.237319546-04:00","level":"INFO","msg":"created new stream","id":"imp8g625"} +{"time":"2024-10-30T01:07:59.237366547-04:00","level":"INFO","msg":"stream: started","id":"imp8g625"} +{"time":"2024-10-30T01:07:59.237426827-04:00","level":"INFO","msg":"sender: started","stream_id":"imp8g625"} +{"time":"2024-10-30T01:07:59.237392927-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"imp8g625"}} +{"time":"2024-10-30T01:07:59.237409567-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"imp8g625"}} +{"time":"2024-10-30T01:07:59.460104113-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:07:59.567682276-04:00","level":"INFO","msg":"stream: closing","id":"imp8g625"} +{"time":"2024-10-30T01:07:59.567714696-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:07:59.568157119-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:08:00.146034424-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:08:00.269377913-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"imp8g625"}} +{"time":"2024-10-30T01:08:00.269400854-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"imp8g625"}} +{"time":"2024-10-30T01:08:00.269409974-04:00","level":"INFO","msg":"sender: closed","stream_id":"imp8g625"} +{"time":"2024-10-30T01:08:00.269448294-04:00","level":"INFO","msg":"stream: closed","id":"imp8g625"} diff --git a/wandb/run-20241030_010759-imp8g625/logs/debug.log b/wandb/run-20241030_010759-imp8g625/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b345eff34ba469988f18eb2283b49f30d73c8014 --- /dev/null +++ b/wandb/run-20241030_010759-imp8g625/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Configure stats pid to 322460 +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-imp8g625/logs/debug.log +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-imp8g625/logs/debug-internal.log +2024-10-30 01:07:59,126 INFO MainThread:322460 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:07:59,127 INFO MainThread:322460 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:07:59,127 INFO MainThread:322460 [wandb_init.py:init():671] starting backend +2024-10-30 01:07:59,127 INFO MainThread:322460 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:07:59,128 INFO MainThread:322460 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:07:59,128 INFO MainThread:322460 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:07:59,132 INFO MainThread:322460 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:07:59,161 INFO MainThread:322460 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:07:59,457 INFO MainThread:322460 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:07:59,563 INFO MainThread:322460 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:07:59,563 INFO MainThread:322460 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:07:59,563 INFO MainThread:322460 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:07:59,564 INFO MainThread:322460 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:07:59,566 INFO MainThread:322460 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:07:59,566 INFO MainThread:322460 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:07:59,567 WARNING MsgRouterThr:322460 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_011014-zmts6m10/files/config.yaml b/wandb/run-20241030_011014-zmts6m10/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_011014-zmts6m10/files/output.log b/wandb/run-20241030_011014-zmts6m10/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..b5fe28d6cd97d8e4b051a7648e260c2aeeb1fa31 --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/files/output.log @@ -0,0 +1,6 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 165, in + valid_dataset = dataset['validation'] + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/dataset_dict.py", line 72, in __getitem__ + return super().__getitem__(k) +KeyError: 'validation' diff --git a/wandb/run-20241030_011014-zmts6m10/files/wandb-metadata.json b/wandb/run-20241030_011014-zmts6m10/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2b76ba866035ff115ee786eba268f7bab2900acd --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:10:14.058527Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1719200362496" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_011014-zmts6m10/files/wandb-summary.json b/wandb/run-20241030_011014-zmts6m10/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c437ff1a48b0e53a8cdd36dcd584a8e6b22b4bc2 --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":7}} \ No newline at end of file diff --git a/wandb/run-20241030_011014-zmts6m10/logs/debug-internal.log b/wandb/run-20241030_011014-zmts6m10/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2d0f70a582ea111b02b6420462bd8f7e73aa6cc9 --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:10:14.060931522-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:10:14.060945603-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011014-zmts6m10/logs/debug-core.log"} +{"time":"2024-10-30T01:10:14.16965664-04:00","level":"INFO","msg":"created new stream","id":"zmts6m10"} +{"time":"2024-10-30T01:10:14.169719361-04:00","level":"INFO","msg":"stream: started","id":"zmts6m10"} +{"time":"2024-10-30T01:10:14.169812672-04:00","level":"INFO","msg":"sender: started","stream_id":"zmts6m10"} +{"time":"2024-10-30T01:10:14.169748791-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"zmts6m10"}} +{"time":"2024-10-30T01:10:14.169800151-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"zmts6m10"}} +{"time":"2024-10-30T01:10:14.564909463-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:10:22.006513638-04:00","level":"INFO","msg":"stream: closing","id":"zmts6m10"} +{"time":"2024-10-30T01:10:22.006567608-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:10:22.007193673-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:10:22.414493317-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:10:22.536976149-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"zmts6m10"}} +{"time":"2024-10-30T01:10:22.537005739-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"zmts6m10"}} +{"time":"2024-10-30T01:10:22.537036749-04:00","level":"INFO","msg":"sender: closed","stream_id":"zmts6m10"} +{"time":"2024-10-30T01:10:22.53708566-04:00","level":"INFO","msg":"stream: closed","id":"zmts6m10"} diff --git a/wandb/run-20241030_011014-zmts6m10/logs/debug.log b/wandb/run-20241030_011014-zmts6m10/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..eeaeac0bf7508857b8f2ebe1f006d3e3cff51b3f --- /dev/null +++ b/wandb/run-20241030_011014-zmts6m10/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:10:14,056 INFO MainThread:323567 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:10:14,056 INFO MainThread:323567 [wandb_setup.py:_flush():79] Configure stats pid to 323567 +2024-10-30 01:10:14,056 INFO MainThread:323567 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:10:14,056 INFO MainThread:323567 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011014-zmts6m10/logs/debug.log +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_011014-zmts6m10/logs/debug-internal.log +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:init():671] starting backend +2024-10-30 01:10:14,057 INFO MainThread:323567 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:10:14,058 INFO MainThread:323567 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:10:14,058 INFO MainThread:323567 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:10:14,061 INFO MainThread:323567 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:10:14,093 INFO MainThread:323567 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:10:14,561 INFO MainThread:323567 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:10:14,660 INFO MainThread:323567 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:10:14,660 INFO MainThread:323567 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:10:14,660 INFO MainThread:323567 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:10:14,660 INFO MainThread:323567 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:10:14,661 INFO MainThread:323567 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:10:14,662 INFO MainThread:323567 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:10:22,006 WARNING MsgRouterThr:323567 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_011014-zmts6m10/run-zmts6m10.wandb b/wandb/run-20241030_011014-zmts6m10/run-zmts6m10.wandb new file mode 100644 index 0000000000000000000000000000000000000000..6091318a647386d42c5f462c267858307855650b Binary files /dev/null and b/wandb/run-20241030_011014-zmts6m10/run-zmts6m10.wandb differ diff --git a/wandb/run-20241030_013339-dgadwxty/files/output.log b/wandb/run-20241030_013339-dgadwxty/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2619c9bb4e2b524197da89912c75bb8a231d85e2 --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/files/output.log @@ -0,0 +1,38 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.19s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.52s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:58<00:00, 301.89 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:56<00:00, 322.79 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 01:38:13,521] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 01:38:21,110] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.589682579040527 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2467, in _inner_training_loop + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate + self._save_checkpoint(model, trial, metrics=metrics) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint + self.save_model(output_dir, _internal_call=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3608, in save_model + state_dict = self.accelerator.get_state_dict(self.deepspeed) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 3382, in get_state_dict + state_dict = clone_tensors_for_torch_save(self.unwrap_model(model).state_dict()) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 60, in clone_tensors_for_torch_save + return type(item)({k: clone_tensors_for_torch_save(v, device) for k, v in item.items()}) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 60, in + return type(item)({k: clone_tensors_for_torch_save(v, device) for k, v in item.items()}) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 54, in clone_tensors_for_torch_save + return item.detach().clone().to(device) +KeyboardInterrupt diff --git a/wandb/run-20241030_013339-dgadwxty/files/requirements.txt b/wandb/run-20241030_013339-dgadwxty/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_013339-dgadwxty/files/wandb-metadata.json b/wandb/run-20241030_013339-dgadwxty/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5ecb9a78ecb8ba88faf25c23c3d942ed24ac705c --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:33:39.878858Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710081835008" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_013339-dgadwxty/logs/debug-internal.log b/wandb/run-20241030_013339-dgadwxty/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..b8ab82a5e9691de685fc6dc075c5a626521653d0 --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:33:39.881038903-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:33:39.881052453-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-dgadwxty/logs/debug-core.log"} +{"time":"2024-10-30T01:33:39.990088507-04:00","level":"INFO","msg":"created new stream","id":"dgadwxty"} +{"time":"2024-10-30T01:33:39.990129677-04:00","level":"INFO","msg":"stream: started","id":"dgadwxty"} +{"time":"2024-10-30T01:33:39.990203308-04:00","level":"INFO","msg":"sender: started","stream_id":"dgadwxty"} +{"time":"2024-10-30T01:33:39.990163337-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"dgadwxty"}} +{"time":"2024-10-30T01:33:39.990200188-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"dgadwxty"}} +{"time":"2024-10-30T01:33:40.18682614-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_013339-dgadwxty/logs/debug.log b/wandb/run-20241030_013339-dgadwxty/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5102737cef160b85161fa265716f80b8414e7da8 --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 01:33:39,874 INFO MainThread:337257 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:33:39,874 INFO MainThread:337257 [wandb_setup.py:_flush():79] Configure stats pid to 337257 +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-dgadwxty/logs/debug.log +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-dgadwxty/logs/debug-internal.log +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:init():671] starting backend +2024-10-30 01:33:39,875 INFO MainThread:337257 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:33:39,878 INFO MainThread:337257 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:33:39,878 INFO MainThread:337257 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:33:39,882 INFO MainThread:337257 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:33:39,911 INFO MainThread:337257 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:33:40,183 INFO MainThread:337257 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:33:40,280 INFO MainThread:337257 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:33:40,281 INFO MainThread:337257 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:33:40,281 INFO MainThread:337257 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:33:40,281 INFO MainThread:337257 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:33:40,282 INFO MainThread:337257 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:33:40,282 INFO MainThread:337257 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} diff --git a/wandb/run-20241030_013339-s77qk5li/files/output.log b/wandb/run-20241030_013339-s77qk5li/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7d365dde17ea295bde50478ab7b765050b3999b7 --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/files/output.log @@ -0,0 +1,41 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.21s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.25s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:56<00:00, 308.15 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:57<00:00, 313.18 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 01:38:13,602] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 01:38:21,321] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.4626922607421875 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2467, in _inner_training_loop + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2918, in _maybe_log_save_evaluate + self._save_checkpoint(model, trial, metrics=metrics) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3008, in _save_checkpoint + self.save_model(output_dir, _internal_call=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3608, in save_model + state_dict = self.accelerator.get_state_dict(self.deepspeed) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 3382, in get_state_dict + state_dict = clone_tensors_for_torch_save(self.unwrap_model(model).state_dict()) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 60, in clone_tensors_for_torch_save + return type(item)({k: clone_tensors_for_torch_save(v, device) for k, v in item.items()}) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 60, in + return type(item)({k: clone_tensors_for_torch_save(v, device) for k, v in item.items()}) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/checkpoint/utils.py", line 54, in clone_tensors_for_torch_save + return item.detach().clone().to(device) +KeyboardInterrupt diff --git a/wandb/run-20241030_013339-s77qk5li/files/requirements.txt b/wandb/run-20241030_013339-s77qk5li/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_013339-s77qk5li/files/wandb-metadata.json b/wandb/run-20241030_013339-s77qk5li/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..915395a9460949518958b673cce2b40896ba6491 --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:33:39.943616Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710081839104" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_013339-s77qk5li/logs/debug-internal.log b/wandb/run-20241030_013339-s77qk5li/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3cc62ccda0758da44994e80aaae76ee2c80247ce --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T01:33:39.945853369-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:33:39.945865199-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-s77qk5li/logs/debug-core.log"} +{"time":"2024-10-30T01:33:40.053085091-04:00","level":"INFO","msg":"created new stream","id":"s77qk5li"} +{"time":"2024-10-30T01:33:40.053121041-04:00","level":"INFO","msg":"stream: started","id":"s77qk5li"} +{"time":"2024-10-30T01:33:40.053172051-04:00","level":"INFO","msg":"sender: started","stream_id":"s77qk5li"} +{"time":"2024-10-30T01:33:40.053132691-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"s77qk5li"}} +{"time":"2024-10-30T01:33:40.053160001-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"s77qk5li"}} +{"time":"2024-10-30T01:33:40.209477562-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_013339-s77qk5li/logs/debug.log b/wandb/run-20241030_013339-s77qk5li/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d9313ba79b6506449193398be1f4e544b683235e --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 01:33:39,941 INFO MainThread:337256 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:33:39,941 INFO MainThread:337256 [wandb_setup.py:_flush():79] Configure stats pid to 337256 +2024-10-30 01:33:39,941 INFO MainThread:337256 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:33:39,941 INFO MainThread:337256 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-s77qk5li/logs/debug.log +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013339-s77qk5li/logs/debug-internal.log +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:init():671] starting backend +2024-10-30 01:33:39,942 INFO MainThread:337256 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:33:39,943 INFO MainThread:337256 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:33:39,943 INFO MainThread:337256 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:33:39,946 INFO MainThread:337256 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:33:39,977 INFO MainThread:337256 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:33:40,206 INFO MainThread:337256 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:33:40,298 INFO MainThread:337256 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:33:40,298 INFO MainThread:337256 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:33:40,298 INFO MainThread:337256 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:33:40,298 INFO MainThread:337256 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:33:40,300 INFO MainThread:337256 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:33:40,300 INFO MainThread:337256 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} diff --git a/wandb/run-20241030_112700-jhzkfwvw/files/config.yaml b/wandb/run-20241030_112700-jhzkfwvw/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9eb308beefafe67d93cdcbb581ca11ae60246ed2 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_112700-jhzkfwvw/files/output.log b/wandb/run-20241030_112700-jhzkfwvw/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..ff7c27c1e482b44a9a283121f1e6022de92f1588 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/files/output.log @@ -0,0 +1,34 @@ +Downloading shards: 0%| | 0/2 [01:32 + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained + return model_class.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files + cached_filename = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir + with WeakFileLock(lock_path): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__ + return next(self.gen) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock + lock.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241030_112700-jhzkfwvw/files/requirements.txt b/wandb/run-20241030_112700-jhzkfwvw/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_112700-jhzkfwvw/files/wandb-metadata.json b/wandb/run-20241030_112700-jhzkfwvw/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..955b48ca24b57b532f3c4a8b7ee00a0db9bc12a6 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:27:00.613781Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831087616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_112700-jhzkfwvw/files/wandb-summary.json b/wandb/run-20241030_112700-jhzkfwvw/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..04b2e8f6eb4447e0a30c904e74100cacb4dc8a48 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":93}} \ No newline at end of file diff --git a/wandb/run-20241030_112700-jhzkfwvw/logs/debug-internal.log b/wandb/run-20241030_112700-jhzkfwvw/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..22b60b10f3778cb2f8ba590634c6e9daa4d81844 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/logs/debug-internal.log @@ -0,0 +1,11 @@ +{"time":"2024-10-30T11:27:00.617213638-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T11:27:00.617233858-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-jhzkfwvw/logs/debug-core.log"} +{"time":"2024-10-30T11:27:00.727987118-04:00","level":"INFO","msg":"created new stream","id":"jhzkfwvw"} +{"time":"2024-10-30T11:27:00.728057478-04:00","level":"INFO","msg":"stream: started","id":"jhzkfwvw"} +{"time":"2024-10-30T11:27:00.728115208-04:00","level":"INFO","msg":"sender: started","stream_id":"jhzkfwvw"} +{"time":"2024-10-30T11:27:00.728102078-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jhzkfwvw"}} +{"time":"2024-10-30T11:27:00.728090568-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jhzkfwvw"}} +{"time":"2024-10-30T11:27:00.93145688-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T11:28:34.16243391-04:00","level":"INFO","msg":"stream: closing","id":"jhzkfwvw"} +{"time":"2024-10-30T11:28:34.16248596-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T11:28:34.163735668-04:00","level":"INFO","msg":"Stopped system monitor"} diff --git a/wandb/run-20241030_112700-jhzkfwvw/logs/debug.log b/wandb/run-20241030_112700-jhzkfwvw/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..f0d6c0227ddd2e3096b65b79a67d3fc98dda5a13 --- /dev/null +++ b/wandb/run-20241030_112700-jhzkfwvw/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Configure stats pid to 366800 +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-jhzkfwvw/logs/debug.log +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-jhzkfwvw/logs/debug-internal.log +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:init():621] calling init triggers +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:init():671] starting backend +2024-10-30 11:27:00,611 INFO MainThread:366800 [wandb_init.py:init():675] sending inform_init request +2024-10-30 11:27:00,613 INFO MainThread:366800 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 11:27:00,613 INFO MainThread:366800 [wandb_init.py:init():688] backend started and connected +2024-10-30 11:27:00,616 INFO MainThread:366800 [wandb_init.py:init():783] updated telemetry +2024-10-30 11:27:00,651 INFO MainThread:366800 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 11:27:00,928 INFO MainThread:366800 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 11:27:01,055 INFO MainThread:366800 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 11:27:01,056 INFO MainThread:366800 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 11:27:01,056 INFO MainThread:366800 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 11:27:01,056 INFO MainThread:366800 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 11:27:01,057 INFO MainThread:366800 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 11:27:01,058 INFO MainThread:366800 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} +2024-10-30 11:28:34,162 WARNING MsgRouterThr:366800 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_112700-jhzkfwvw/run-jhzkfwvw.wandb b/wandb/run-20241030_112700-jhzkfwvw/run-jhzkfwvw.wandb new file mode 100644 index 0000000000000000000000000000000000000000..2c7992a72b7724b6217b58dd3a9aae42111e5808 Binary files /dev/null and b/wandb/run-20241030_112700-jhzkfwvw/run-jhzkfwvw.wandb differ diff --git a/wandb/run-20241031_122113-8ldget07/files/config.yaml b/wandb/run-20241031_122113-8ldget07/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edfa239fe9670ab026a3223bac576877fd80297e --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 6 +lr: + value: 5e-06 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241031_122113-8ldget07/files/output.log b/wandb/run-20241031_122113-8ldget07/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..f41ef402a44447085f345efab4b188ffbb27da81 --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/files/output.log @@ -0,0 +1,14 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:09<00:00, 64.61s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.61s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 12:23:30,565] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 12:23:39,013] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 5.0257627964019775 seconds diff --git a/wandb/run-20241031_122113-8ldget07/files/wandb-metadata.json b/wandb/run-20241031_122113-8ldget07/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb6ef0b27746cc16ec76013f0bbfac29cf0493d --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T16:21:13.909771Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753159962624" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_122113-8ldget07/files/wandb-summary.json b/wandb/run-20241031_122113-8ldget07/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..f78ea47926dfc1c734e02453aa9e6ce3692f2be4 --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":32015}} \ No newline at end of file diff --git a/wandb/run-20241031_122113-8ldget07/logs/debug-internal.log b/wandb/run-20241031_122113-8ldget07/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..19d73e58705f7af14ee7c784ef4dd814ac1ecd00 --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-10-31T12:21:13.912869182-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T12:21:13.912891512-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122113-8ldget07/logs/debug-core.log"} +{"time":"2024-10-31T12:21:14.021742616-04:00","level":"INFO","msg":"created new stream","id":"8ldget07"} +{"time":"2024-10-31T12:21:14.021795536-04:00","level":"INFO","msg":"stream: started","id":"8ldget07"} +{"time":"2024-10-31T12:21:14.021865226-04:00","level":"INFO","msg":"sender: started","stream_id":"8ldget07"} +{"time":"2024-10-31T12:21:14.021867856-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"8ldget07"}} +{"time":"2024-10-31T12:21:14.021843416-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"8ldget07"}} +{"time":"2024-10-31T12:21:14.317036864-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-31T17:51:23.39906822-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-10-31T21:14:49.439325218-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-31T21:14:49.51060772-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-31T21:14:50.239504017-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-31T21:14:50.358731013-04:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-10-31T21:14:51.422549145-04:00","level":"INFO","msg":"stream: closing","id":"8ldget07"} +{"time":"2024-10-31T21:14:51.422578535-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"8ldget07"}} +{"time":"2024-10-31T21:14:51.422600615-04:00","level":"INFO","msg":"sender: closed","stream_id":"8ldget07"} +{"time":"2024-10-31T21:14:51.422595475-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"8ldget07"}} +{"time":"2024-10-31T21:14:51.422696086-04:00","level":"INFO","msg":"stream: closed","id":"8ldget07"} diff --git a/wandb/run-20241031_122113-8ldget07/logs/debug.log b/wandb/run-20241031_122113-8ldget07/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..3bbd660e0a78ef12860d0148600c0e83d6a3a575 --- /dev/null +++ b/wandb/run-20241031_122113-8ldget07/logs/debug.log @@ -0,0 +1,33 @@ +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Configure stats pid to 558432 +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122113-8ldget07/logs/debug.log +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122113-8ldget07/logs/debug-internal.log +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:init():621] calling init triggers +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:init():671] starting backend +2024-10-31 12:21:13,907 INFO MainThread:558432 [wandb_init.py:init():675] sending inform_init request +2024-10-31 12:21:13,909 INFO MainThread:558432 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 12:21:13,909 INFO MainThread:558432 [wandb_init.py:init():688] backend started and connected +2024-10-31 12:21:13,913 INFO MainThread:558432 [wandb_init.py:init():783] updated telemetry +2024-10-31 12:21:13,951 INFO MainThread:558432 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 12:21:14,314 INFO MainThread:558432 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 12:21:14,400 INFO MainThread:558432 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 12:21:14,400 INFO MainThread:558432 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 12:21:14,400 INFO MainThread:558432 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 12:21:14,400 INFO MainThread:558432 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 12:21:14,402 INFO MainThread:558432 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 12:21:14,402 INFO MainThread:558432 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06} +2024-10-31 21:14:49,288 INFO MainThread:558432 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/8ldget07 +2024-10-31 21:14:49,400 INFO MainThread:558432 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-31 21:14:49,400 INFO MainThread:558432 [wandb_run.py:_restore():2408] restore +2024-10-31 21:14:49,400 INFO MainThread:558432 [wandb_run.py:_restore():2414] restore done +2024-10-31 21:14:51,361 INFO MainThread:558432 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-31 21:14:51,361 INFO MainThread:558432 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-31 21:14:51,421 INFO MainThread:558432 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241101_094656-b81aanqd/files/output.log b/wandb/run-20241101_094656-b81aanqd/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fba5bbd94d07636d0dcdb3e0b04e9daa435669fa --- /dev/null +++ b/wandb/run-20241101_094656-b81aanqd/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.54s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 09:47:03,559] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 09:47:12,684] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.639373540878296 seconds diff --git a/wandb/run-20241101_094656-b81aanqd/files/requirements.txt b/wandb/run-20241101_094656-b81aanqd/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_094656-b81aanqd/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_094656-b81aanqd/files/wandb-metadata.json b/wandb/run-20241101_094656-b81aanqd/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6963ecd6c7b5906e735d2cb2f49f819fb9385ee9 --- /dev/null +++ b/wandb/run-20241101_094656-b81aanqd/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-01T13:46:56.161131Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1754695659520" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241101_094656-b81aanqd/logs/debug-internal.log b/wandb/run-20241101_094656-b81aanqd/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..57bc54f7769183fb5e6a903b52db1ca260274ed9 --- /dev/null +++ b/wandb/run-20241101_094656-b81aanqd/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T09:46:56.163429183-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T09:46:56.163444503-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-b81aanqd/logs/debug-core.log"} +{"time":"2024-11-01T09:46:56.270429849-04:00","level":"INFO","msg":"created new stream","id":"b81aanqd"} +{"time":"2024-11-01T09:46:56.270462529-04:00","level":"INFO","msg":"stream: started","id":"b81aanqd"} +{"time":"2024-11-01T09:46:56.27052033-04:00","level":"INFO","msg":"sender: started","stream_id":"b81aanqd"} +{"time":"2024-11-01T09:46:56.27050816-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"b81aanqd"}} +{"time":"2024-11-01T09:46:56.27050657-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"b81aanqd"}} +{"time":"2024-11-01T09:46:56.681085765-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_094656-b81aanqd/logs/debug.log b/wandb/run-20241101_094656-b81aanqd/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..51b52c8596bd9e25df60b575fbf09520559b3e11 --- /dev/null +++ b/wandb/run-20241101_094656-b81aanqd/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Configure stats pid to 786688 +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-b81aanqd/logs/debug.log +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-b81aanqd/logs/debug-internal.log +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:init():621] calling init triggers +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:init():671] starting backend +2024-11-01 09:46:56,159 INFO MainThread:786688 [wandb_init.py:init():675] sending inform_init request +2024-11-01 09:46:56,160 INFO MainThread:786688 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 09:46:56,160 INFO MainThread:786688 [wandb_init.py:init():688] backend started and connected +2024-11-01 09:46:56,164 INFO MainThread:786688 [wandb_init.py:init():783] updated telemetry +2024-11-01 09:46:56,210 INFO MainThread:786688 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 09:46:56,678 INFO MainThread:786688 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 09:46:56,779 INFO MainThread:786688 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 09:46:56,779 INFO MainThread:786688 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 09:46:56,779 INFO MainThread:786688 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 09:46:56,779 INFO MainThread:786688 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 09:46:56,780 INFO MainThread:786688 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 09:46:56,781 INFO MainThread:786688 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_094656-b81aanqd/run-b81aanqd.wandb b/wandb/run-20241101_094656-b81aanqd/run-b81aanqd.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/wandb/run-20241105_163039-457mt3e1/logs/debug.log b/wandb/run-20241105_163039-457mt3e1/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..399f2b3db42488979fb169d2ff23e9479f171af0 --- /dev/null +++ b/wandb/run-20241105_163039-457mt3e1/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 16:30:39,139 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Configure stats pid to 1780127 +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163039-457mt3e1/logs/debug.log +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_163039-457mt3e1/logs/debug-internal.log +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:init():621] calling init triggers +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:init():671] starting backend +2024-11-05 16:30:39,140 INFO MainThread:1780127 [wandb_init.py:init():675] sending inform_init request +2024-11-05 16:30:39,141 INFO MainThread:1780127 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 16:30:39,141 INFO MainThread:1780127 [wandb_init.py:init():688] backend started and connected +2024-11-05 16:30:39,145 INFO MainThread:1780127 [wandb_init.py:init():783] updated telemetry +2024-11-05 16:30:39,181 INFO MainThread:1780127 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 16:30:40,393 INFO MainThread:1780127 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 16:30:40,482 INFO MainThread:1780127 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 16:30:40,482 INFO MainThread:1780127 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 16:30:40,482 INFO MainThread:1780127 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 16:30:40,482 INFO MainThread:1780127 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 16:30:40,484 INFO MainThread:1780127 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 16:30:40,484 INFO MainThread:1780127 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 16:32:00,264 WARNING MsgRouterThr:1780127 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_224020-gk9fw9zk/files/config.yaml b/wandb/run-20241106_224020-gk9fw9zk/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..507bc548a43d1c7afb495d052831f04f348b9c17 --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic84 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_224020-gk9fw9zk/files/output.log b/wandb/run-20241106_224020-gk9fw9zk/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0e7e2f44b7701d6573b42446182c33ca7fd3ec38 --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c36a5-1b350e044efcc22740f22eaf;5323d0f5-3a86-44b6-b0b6-22d4d4bd4fd0) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c36a5-1b350e044efcc22740f22eaf;5323d0f5-3a86-44b6-b0b6-22d4d4bd4fd0) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_224020-gk9fw9zk/files/wandb-metadata.json b/wandb/run-20241106_224020-gk9fw9zk/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..dc8d5c262fade06a977ac6ae4e10344833434025 --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T03:40:20.473473Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1774852558848" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_224020-gk9fw9zk/files/wandb-summary.json b/wandb/run-20241106_224020-gk9fw9zk/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241106_224020-gk9fw9zk/logs/debug-internal.log b/wandb/run-20241106_224020-gk9fw9zk/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d4379611650de1cc7dda83636ab11d84863dbcde --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T22:40:20.476629774-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T22:40:20.476650184-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224020-gk9fw9zk/logs/debug-core.log"} +{"time":"2024-11-06T22:40:20.589677329-05:00","level":"INFO","msg":"created new stream","id":"gk9fw9zk"} +{"time":"2024-11-06T22:40:20.589732189-05:00","level":"INFO","msg":"stream: started","id":"gk9fw9zk"} +{"time":"2024-11-06T22:40:20.58978637-05:00","level":"INFO","msg":"sender: started","stream_id":"gk9fw9zk"} +{"time":"2024-11-06T22:40:20.58977959-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"gk9fw9zk"}} +{"time":"2024-11-06T22:40:20.58977781-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"gk9fw9zk"}} +{"time":"2024-11-06T22:40:20.798900777-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T22:40:21.039365679-05:00","level":"INFO","msg":"stream: closing","id":"gk9fw9zk"} +{"time":"2024-11-06T22:40:21.03942302-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T22:40:21.040149716-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T22:40:21.495020517-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T22:40:21.632309927-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"gk9fw9zk"}} +{"time":"2024-11-06T22:40:21.632352158-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"gk9fw9zk"}} +{"time":"2024-11-06T22:40:21.632368338-05:00","level":"INFO","msg":"sender: closed","stream_id":"gk9fw9zk"} +{"time":"2024-11-06T22:40:21.632408018-05:00","level":"INFO","msg":"stream: closed","id":"gk9fw9zk"} diff --git a/wandb/run-20241106_224020-gk9fw9zk/logs/debug.log b/wandb/run-20241106_224020-gk9fw9zk/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8f468af69765ca3ba49fe31ce0ca5896938c26e3 --- /dev/null +++ b/wandb/run-20241106_224020-gk9fw9zk/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Configure stats pid to 1980804 +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224020-gk9fw9zk/logs/debug.log +2024-11-06 22:40:20,471 INFO MainThread:1980804 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224020-gk9fw9zk/logs/debug-internal.log +2024-11-06 22:40:20,472 INFO MainThread:1980804 [wandb_init.py:init():621] calling init triggers +2024-11-06 22:40:20,472 INFO MainThread:1980804 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 22:40:20,472 INFO MainThread:1980804 [wandb_init.py:init():671] starting backend +2024-11-06 22:40:20,472 INFO MainThread:1980804 [wandb_init.py:init():675] sending inform_init request +2024-11-06 22:40:20,472 INFO MainThread:1980804 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 22:40:20,473 INFO MainThread:1980804 [wandb_init.py:init():688] backend started and connected +2024-11-06 22:40:20,476 INFO MainThread:1980804 [wandb_init.py:init():783] updated telemetry +2024-11-06 22:40:20,503 INFO MainThread:1980804 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 22:40:20,796 INFO MainThread:1980804 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 22:40:20,895 INFO MainThread:1980804 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 22:40:20,895 INFO MainThread:1980804 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 22:40:20,895 INFO MainThread:1980804 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 22:40:20,895 INFO MainThread:1980804 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 22:40:20,897 INFO MainThread:1980804 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 22:40:20,897 INFO MainThread:1980804 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 22:40:21,039 WARNING MsgRouterThr:1980804 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_224020-gk9fw9zk/run-gk9fw9zk.wandb b/wandb/run-20241106_224020-gk9fw9zk/run-gk9fw9zk.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e153237f0e1110b59f1f85b3b449c87688b4950f Binary files /dev/null and b/wandb/run-20241106_224020-gk9fw9zk/run-gk9fw9zk.wandb differ diff --git a/wandb/run-20241106_224236-qpjf6t0w/files/config.yaml b/wandb/run-20241106_224236-qpjf6t0w/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..507bc548a43d1c7afb495d052831f04f348b9c17 --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic84 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241106_224236-qpjf6t0w/files/output.log b/wandb/run-20241106_224236-qpjf6t0w/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fc0e1c51d0e4f19a4d3e77fc85380257574ab8f9 --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/files/output.log @@ -0,0 +1,60 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status + response.raise_for_status() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status + raise HTTPError(http_error_msg, response=self) +requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file + resolved_file = hf_hub_download( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f + return f(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download + return _hf_hub_download_to_cache_dir( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir + _raise_on_head_call_error(head_call_error, force_download, local_files_only) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error + raise head_call_error + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error + metadata = get_hf_file_metadata( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn + return fn(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata + r = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper + response = _request_wrapper( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper + hf_raise_for_status(response) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status + raise _format(GatedRepoError, message, response) from e +huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c372c-3357a49e4853ee8730e60b8f;fa6bbdbe-7861-4e08-bd0c-b32b7abc2f3e) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in + model = AutoModelForCausalLM.from_pretrained(model_name, + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained + config, kwargs = AutoConfig.from_pretrained( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict + resolved_config_file = cached_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file + raise EnvironmentError( +OSError: You are trying to access a gated repo. +Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B. +401 Client Error. (Request ID: Root=1-672c372c-3357a49e4853ee8730e60b8f;fa6bbdbe-7861-4e08-bd0c-b32b7abc2f3e) + +Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json. +Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in. diff --git a/wandb/run-20241106_224236-qpjf6t0w/files/wandb-metadata.json b/wandb/run-20241106_224236-qpjf6t0w/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..6588da50f3372537489a2ca7e8ad287d61311507 --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T03:42:36.127603Z", + "args": [ + "--perturbation", + "shuffle_deterministic84", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1774852591616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241106_224236-qpjf6t0w/files/wandb-summary.json b/wandb/run-20241106_224236-qpjf6t0w/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241106_224236-qpjf6t0w/logs/debug-internal.log b/wandb/run-20241106_224236-qpjf6t0w/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3dafcebb6c69aef412112523a90786cf80478926 --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-06T22:42:36.130765588-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-06T22:42:36.130786588-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-qpjf6t0w/logs/debug-core.log"} +{"time":"2024-11-06T22:42:36.238018425-05:00","level":"INFO","msg":"created new stream","id":"qpjf6t0w"} +{"time":"2024-11-06T22:42:36.238048465-05:00","level":"INFO","msg":"stream: started","id":"qpjf6t0w"} +{"time":"2024-11-06T22:42:36.238101746-05:00","level":"INFO","msg":"sender: started","stream_id":"qpjf6t0w"} +{"time":"2024-11-06T22:42:36.238085016-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qpjf6t0w"}} +{"time":"2024-11-06T22:42:36.238149556-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"qpjf6t0w"}} +{"time":"2024-11-06T22:42:36.407959148-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-06T22:42:36.645926169-05:00","level":"INFO","msg":"stream: closing","id":"qpjf6t0w"} +{"time":"2024-11-06T22:42:36.646446943-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-06T22:42:36.646928967-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-06T22:42:36.995233659-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-06T22:42:37.136882888-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"qpjf6t0w"}} +{"time":"2024-11-06T22:42:37.136944259-05:00","level":"INFO","msg":"sender: closed","stream_id":"qpjf6t0w"} +{"time":"2024-11-06T22:42:37.136934499-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qpjf6t0w"}} +{"time":"2024-11-06T22:42:37.13702102-05:00","level":"INFO","msg":"stream: closed","id":"qpjf6t0w"} diff --git a/wandb/run-20241106_224236-qpjf6t0w/logs/debug.log b/wandb/run-20241106_224236-qpjf6t0w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d2895bd728a0d24ad7e90abfae79e4dea6f547ec --- /dev/null +++ b/wandb/run-20241106_224236-qpjf6t0w/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-06 22:42:36,125 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Configure stats pid to 1982052 +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-qpjf6t0w/logs/debug.log +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-qpjf6t0w/logs/debug-internal.log +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:init():621] calling init triggers +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:init():671] starting backend +2024-11-06 22:42:36,126 INFO MainThread:1982052 [wandb_init.py:init():675] sending inform_init request +2024-11-06 22:42:36,127 INFO MainThread:1982052 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-06 22:42:36,127 INFO MainThread:1982052 [wandb_init.py:init():688] backend started and connected +2024-11-06 22:42:36,130 INFO MainThread:1982052 [wandb_init.py:init():783] updated telemetry +2024-11-06 22:42:36,150 INFO MainThread:1982052 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-06 22:42:36,405 INFO MainThread:1982052 [wandb_init.py:init():867] starting run threads in backend +2024-11-06 22:42:36,508 INFO MainThread:1982052 [wandb_run.py:_console_start():2463] atexit reg +2024-11-06 22:42:36,508 INFO MainThread:1982052 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-06 22:42:36,508 INFO MainThread:1982052 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-06 22:42:36,508 INFO MainThread:1982052 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-06 22:42:36,510 INFO MainThread:1982052 [wandb_init.py:init():911] run started, returning control to user process +2024-11-06 22:42:36,510 INFO MainThread:1982052 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-06 22:42:36,646 WARNING MsgRouterThr:1982052 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241106_224236-qpjf6t0w/run-qpjf6t0w.wandb b/wandb/run-20241106_224236-qpjf6t0w/run-qpjf6t0w.wandb new file mode 100644 index 0000000000000000000000000000000000000000..3b7838d06b1da2c0cc0958e7e6af04473a5cadd7 Binary files /dev/null and b/wandb/run-20241106_224236-qpjf6t0w/run-qpjf6t0w.wandb differ diff --git a/wandb/run-20241107_160909-8q592a50/files/config.yaml b/wandb/run-20241107_160909-8q592a50/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96b12b207e28eaec5c0b633b6174448a8532a549 --- /dev/null +++ b/wandb/run-20241107_160909-8q592a50/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_even_odd +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241107_160909-8q592a50/files/output.log b/wandb/run-20241107_160909-8q592a50/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..cd4ee15a601ed90d88373e56b879d8a01321a233 --- /dev/null +++ b/wandb/run-20241107_160909-8q592a50/files/output.log @@ -0,0 +1,19 @@ +config.json: 100%|█████████████████████████████████████████████| 844/844 [00:00<00:00, 355kB/s] +Downloading shards: 100%|████████████████████████████████████████| 2/2 [02:32<00:00, 76.12s/it] +Loading checkpoint shards: 100%|█████████████████████████████████| 2/2 [00:05<00:00, 2.88s/it] +Map: 100%|███████████████████████████████████████| 17013/17013 [00:51<00:00, 328.81 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-07 16:12:42,436] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-07 16:12:52,873] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 6.338869571685791 seconds diff --git a/wandb/run-20241107_160909-8q592a50/files/wandb-metadata.json b/wandb/run-20241107_160909-8q592a50/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7b614109cbe10bf148a8b7521e769f0c4273c226 --- /dev/null +++ b/wandb/run-20241107_160909-8q592a50/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T21:09:09.625183Z", + "args": [ + "--perturbation", + "shuffle_even_odd", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1742796308480" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161638-uo9cs4db/files/wandb-summary.json b/wandb/run-20241128_161638-uo9cs4db/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b47b0ad94e10d6a9a6ab27e63a37a56e1daa465e --- /dev/null +++ b/wandb/run-20241128_161638-uo9cs4db/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":48559}} \ No newline at end of file diff --git a/wandb/run-20241128_161638-uo9cs4db/logs/debug.log b/wandb/run-20241128_161638-uo9cs4db/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..daf9a9492858f1409e53c0cbe4aea8b38aa7feac --- /dev/null +++ b/wandb/run-20241128_161638-uo9cs4db/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Configure stats pid to 3102257 +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:16:38,580 INFO MainThread:3102257 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-uo9cs4db/logs/debug.log +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-uo9cs4db/logs/debug-internal.log +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:init():671] starting backend +2024-11-28 16:16:38,581 INFO MainThread:3102257 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:16:38,583 INFO MainThread:3102257 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:16:38,583 INFO MainThread:3102257 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:16:38,587 INFO MainThread:3102257 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:16:38,614 INFO MainThread:3102257 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:16:38,851 INFO MainThread:3102257 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:16:38,941 INFO MainThread:3102257 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:16:38,942 INFO MainThread:3102257 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:16:38,942 INFO MainThread:3102257 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:16:38,942 INFO MainThread:3102257 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:16:38,943 INFO MainThread:3102257 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:16:38,944 INFO MainThread:3102257 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-29 05:45:58,292 INFO MainThread:3102257 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/uo9cs4db +2024-11-29 05:45:58,360 INFO MainThread:3102257 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 05:45:58,380 INFO MainThread:3102257 [wandb_run.py:_restore():2408] restore +2024-11-29 05:45:58,380 INFO MainThread:3102257 [wandb_run.py:_restore():2414] restore done +2024-11-29 05:46:01,164 INFO MainThread:3102257 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 05:46:01,164 INFO MainThread:3102257 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 05:46:01,173 INFO MainThread:3102257 [wandb_run.py:_footer_sync_info():3934] logging synced files