diff --git a/.gitattributes b/.gitattributes index 20119c20db2815e6e83ce38839d4835bac6e58ae..6539ded51ee24796405b3aabb36ed8c302b4b76e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -110,3 +110,8 @@ wandb/run-20241105_163248-thalxhcd/run-thalxhcd.wandb filter=lfs diff=lfs merge= wandb/run-20241031_002020-q6ot1vz6/run-q6ot1vz6.wandb filter=lfs diff=lfs merge=lfs -text wandb/run-20241106_234348-l3eig11b/run-l3eig11b.wandb filter=lfs diff=lfs merge=lfs -text wandb/run-20241031_122114-2k9672ya/run-2k9672ya.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_013339-dgadwxty/run-dgadwxty.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_200535-hnfjoqai/run-hnfjoqai.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_013339-s77qk5li/run-s77qk5li.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241101_200535-6xsf0vem/run-6xsf0vem.wandb filter=lfs diff=lfs merge=lfs -text +wandb/run-20241030_011509-zmlu7388/run-zmlu7388.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/wandb/run-20241030_010305-4ki7693c/files/config.yaml b/wandb/run-20241030_010305-4ki7693c/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9 --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/files/config.yaml @@ -0,0 +1,47 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 7 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_010305-4ki7693c/files/output.log b/wandb/run-20241030_010305-4ki7693c/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1777f063b107f651dddc063d1d3a3fa80015bf9e --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/files/output.log @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 162, in + dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" +AttributeError: 'Namespace' object has no attribute 'train_zset' diff --git a/wandb/run-20241030_010305-4ki7693c/files/wandb-metadata.json b/wandb/run-20241030_010305-4ki7693c/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..075ef336085bd945d08ee5a4b5cb18fa14df423c --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/files/wandb-metadata.json @@ -0,0 +1,29 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T05:03:05.639656Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "7", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py" +} \ No newline at end of file diff --git a/wandb/run-20241030_010305-4ki7693c/files/wandb-summary.json b/wandb/run-20241030_010305-4ki7693c/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241030_010305-4ki7693c/logs/debug-internal.log b/wandb/run-20241030_010305-4ki7693c/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f62a4e5ff244a78f49021533922024ba8d3695ac --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-10-30T01:03:05.642979784-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T01:03:05.643001845-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010305-4ki7693c/logs/debug-core.log"} +{"time":"2024-10-30T01:03:05.751540951-04:00","level":"INFO","msg":"created new stream","id":"4ki7693c"} +{"time":"2024-10-30T01:03:05.751579781-04:00","level":"INFO","msg":"stream: started","id":"4ki7693c"} +{"time":"2024-10-30T01:03:05.751623931-04:00","level":"INFO","msg":"sender: started","stream_id":"4ki7693c"} +{"time":"2024-10-30T01:03:05.751594441-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"4ki7693c"}} +{"time":"2024-10-30T01:03:05.751612271-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"4ki7693c"}} +{"time":"2024-10-30T01:03:05.943856433-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T01:03:06.036999384-04:00","level":"INFO","msg":"stream: closing","id":"4ki7693c"} +{"time":"2024-10-30T01:03:06.037043194-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T01:03:06.051196751-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T01:03:07.432977083-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T01:03:07.547907803-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"4ki7693c"}} +{"time":"2024-10-30T01:03:07.547982493-04:00","level":"INFO","msg":"sender: closed","stream_id":"4ki7693c"} +{"time":"2024-10-30T01:03:07.547981023-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"4ki7693c"}} +{"time":"2024-10-30T01:03:07.548091724-04:00","level":"INFO","msg":"stream: closed","id":"4ki7693c"} diff --git a/wandb/run-20241030_010305-4ki7693c/logs/debug.log b/wandb/run-20241030_010305-4ki7693c/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9a5a82dbf9f4d53fab040cdc3f63688953248278 --- /dev/null +++ b/wandb/run-20241030_010305-4ki7693c/logs/debug.log @@ -0,0 +1,27 @@ +2024-10-30 01:03:05,637 INFO MainThread:320659 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Configure stats pid to 320659 +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010305-4ki7693c/logs/debug.log +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010305-4ki7693c/logs/debug-internal.log +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:init():621] calling init triggers +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:init():671] starting backend +2024-10-30 01:03:05,638 INFO MainThread:320659 [wandb_init.py:init():675] sending inform_init request +2024-10-30 01:03:05,639 INFO MainThread:320659 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 01:03:05,639 INFO MainThread:320659 [wandb_init.py:init():688] backend started and connected +2024-10-30 01:03:05,641 INFO MainThread:320659 [wandb_init.py:init():783] updated telemetry +2024-10-30 01:03:05,664 INFO MainThread:320659 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 01:03:05,940 INFO MainThread:320659 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 01:03:06,034 INFO MainThread:320659 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 01:03:06,034 INFO MainThread:320659 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 01:03:06,034 INFO MainThread:320659 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 01:03:06,034 INFO MainThread:320659 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 01:03:06,035 INFO MainThread:320659 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 01:03:06,036 INFO MainThread:320659 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0} +2024-10-30 01:03:06,037 WARNING MsgRouterThr:320659 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241030_010305-4ki7693c/run-4ki7693c.wandb b/wandb/run-20241030_010305-4ki7693c/run-4ki7693c.wandb new file mode 100644 index 0000000000000000000000000000000000000000..ae2c7bae0f60b9be4c2f447491537040031d851e Binary files /dev/null and b/wandb/run-20241030_010305-4ki7693c/run-4ki7693c.wandb differ diff --git a/wandb/run-20241030_011509-zmlu7388/run-zmlu7388.wandb b/wandb/run-20241030_011509-zmlu7388/run-zmlu7388.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a23f2d4c857cf928266398a164cc49d666c0d75a --- /dev/null +++ b/wandb/run-20241030_011509-zmlu7388/run-zmlu7388.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:040fe78d8ec4b781ed5ef6142b4febcf6226f4d06df169e2ee75ef4a7b3149d1 +size 163840 diff --git a/wandb/run-20241030_013339-dgadwxty/run-dgadwxty.wandb b/wandb/run-20241030_013339-dgadwxty/run-dgadwxty.wandb new file mode 100644 index 0000000000000000000000000000000000000000..9de3f0033d23286014f4b313568ab746e78dfc1a --- /dev/null +++ b/wandb/run-20241030_013339-dgadwxty/run-dgadwxty.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589f69288e34d7adbf24b1cbd3ee3d6189d8cd39e1c0aa92d97593459f8c1f11 +size 20119552 diff --git a/wandb/run-20241030_013339-s77qk5li/run-s77qk5li.wandb b/wandb/run-20241030_013339-s77qk5li/run-s77qk5li.wandb new file mode 100644 index 0000000000000000000000000000000000000000..79802a5602d5a20a397d088817655a03ba08e491 --- /dev/null +++ b/wandb/run-20241030_013339-s77qk5li/run-s77qk5li.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c67abff0d802c9d8d917c03111d5b98030ac30597afe446dadeb16459c0339b +size 20119552 diff --git a/wandb/run-20241030_112852-qf0srieq/files/config.yaml b/wandb/run-20241030_112852-qf0srieq/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f639ffb58b2b21d07229ba50bd66c486bf92404a --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/files/config.yaml @@ -0,0 +1,48 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_112852-qf0srieq/files/output.log b/wandb/run-20241030_112852-qf0srieq/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..10600b8fb0ca8b908463e6c0a1ff17c906c3db82 --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/files/output.log @@ -0,0 +1,17 @@ +model-00001-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:33<00:00, 41.3MB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:08<00:00, 64.25s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.30s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:48<00:00, 372.82 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 11:31:56,623] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 11:32:04,855] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.642748594284058 seconds +wandb: WARNING Fatal error while uploading data. Some run data will not be synced, but it will still be written to disk. Use `wandb sync` at the end of the run to try uploading. diff --git a/wandb/run-20241030_112852-qf0srieq/files/wandb-metadata.json b/wandb/run-20241030_112852-qf0srieq/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..075794e19482f0d99619b91edc418018c4b68fc2 --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:28:52.808778Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831611904" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_112852-qf0srieq/files/wandb-summary.json b/wandb/run-20241030_112852-qf0srieq/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..d3aab1510450119f0c2bf41f933600948398d3c4 --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23503}} \ No newline at end of file diff --git a/wandb/run-20241030_112852-qf0srieq/logs/debug-internal.log b/wandb/run-20241030_112852-qf0srieq/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0db613fa890402e477b9f378a378cd9e7a61207a --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/logs/debug-internal.log @@ -0,0 +1,107 @@ +{"time":"2024-10-30T11:28:52.811130791-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T11:28:52.811141081-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-qf0srieq/logs/debug-core.log"} +{"time":"2024-10-30T11:28:52.917206018-04:00","level":"INFO","msg":"created new stream","id":"qf0srieq"} +{"time":"2024-10-30T11:28:52.917234588-04:00","level":"INFO","msg":"stream: started","id":"qf0srieq"} +{"time":"2024-10-30T11:28:52.917337019-04:00","level":"INFO","msg":"sender: started","stream_id":"qf0srieq"} +{"time":"2024-10-30T11:28:52.917290668-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"qf0srieq"}} +{"time":"2024-10-30T11:28:52.917283768-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qf0srieq"}} +{"time":"2024-10-30T11:28:53.127590893-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T14:03:23.466556608-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/qf0srieq/file_stream"} +{"time":"2024-10-30T14:03:23.470904408-04:00","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 404 Not Found path=files/yaning1001-dartmouth-college/impossible_llm_reverse/qf0srieq/file_stream: {\"error\":\"run impossible_llm_reverse/qf0srieq not found while streaming file\"}"} +{"time":"2024-10-30T18:00:35.968510571-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T18:00:35.984835172-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T18:00:36.01388325-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:36.968916927-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":1.010176656,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:00:38.18070286-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:43.234030085-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:51.514450735-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:08.632732116-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:36.992239115-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":61.033489373,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:01:43.980932764-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:02:37.012879837-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":121.054135236,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:02:44.036787297-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:03:37.032386632-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":181.07363706,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:03:44.087114894-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:04:37.058210791-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":241.09946743,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:04:44.146732173-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:05:37.08031714-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":301.121564228,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:05:44.202578641-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:06:37.103155122-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":361.14441734,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:06:44.254811138-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:07:37.129370298-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":421.170622677,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:07:44.304687106-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:08:37.187762317-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":481.229015945,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:08:44.353692443-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:09:37.205365399-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":541.246616438,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:09:44.410062874-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:10:35.95923303-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000334216,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:54966\")"} +{"time":"2024-10-30T18:10:37.228433012-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":601.269685091,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:10:44.460819633-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:11:37.252924346-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":661.294174825,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:11:44.510745999-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:12:37.279579159-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":721.320828287,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:12:44.561934485-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:13:37.30132052-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":781.342568848,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:13:44.612933316-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:14:37.326717713-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":841.367977471,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:14:44.662885212-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:15:37.352013001-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":901.39327482,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:15:44.719534816-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:37.377043784-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":961.418290923,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:16:44.771072869-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:44.771190161-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:16:44.771395633-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":968.812598021,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:54966\")"} +{"time":"2024-10-30T18:16:44.827636949-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:44.870800532-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:44.870865293-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/qf0srieq not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:16:47.272026705-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:52.216994148-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:01.180898323-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:17.315501289-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:37.396983698-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":52.62529182,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:17:53.545827597-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:18:37.418983081-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":112.647293623,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:18:53.600763399-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:19:37.437849459-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":172.666171092,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:19:53.656455438-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:20:37.467570843-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":232.695889516,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:20:53.722959862-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:21:37.493223385-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":292.721535378,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:21:53.77677964-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:22:37.516133857-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":352.744444299,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:22:53.831863788-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:23:37.542829635-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":412.771142798,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:23:53.88651383-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:24:37.566591857-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":472.7949116,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:24:53.943522118-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:25:37.588022475-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":532.816332248,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:25:54.011175643-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:26:37.616856963-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":592.845168906,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:26:44.77275561-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000793211,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:26:54.082149884-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:27:37.638631965-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":652.866944558,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:27:54.135251151-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:28:37.658947741-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":712.887259204,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:28:54.198855109-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:29:37.681597392-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":772.909908875,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:29:54.26606069-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:30:37.704016353-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":832.932328576,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:30:54.329584432-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:31:37.730566652-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":892.958878965,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:31:54.417597707-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:37.760964501-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":952.989275424,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:32:54.47204599-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.472190042-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:32:54.472742607-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":969.700851498,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:32:54.572982549-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.5730265-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/qf0srieq not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:32:54.659812312-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.660095935-04:00","level":"ERROR","msg":"sender: failed to save job artifact: ArtifactSaver.createManifest: returned error 404 Not Found: {\"errors\":[{\"message\":\"failed to find run impossible_llm_reverse/qf0srieq\",\"path\":[\"createArtifactManifest\"]}],\"data\":{\"createArtifactManifest\":null}}"} +{"time":"2024-10-30T18:32:54.711516723-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T18:32:54.759969425-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.760011335-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/qf0srieq not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:32:55.719242378-04:00","level":"INFO","msg":"stream: closing","id":"qf0srieq"} +{"time":"2024-10-30T18:32:55.719270409-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"qf0srieq"}} +{"time":"2024-10-30T18:32:55.719310959-04:00","level":"INFO","msg":"sender: closed","stream_id":"qf0srieq"} +{"time":"2024-10-30T18:32:55.719298099-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qf0srieq"}} +{"time":"2024-10-30T18:32:55.71947517-04:00","level":"INFO","msg":"stream: closed","id":"qf0srieq"} diff --git a/wandb/run-20241030_112852-qf0srieq/logs/debug.log b/wandb/run-20241030_112852-qf0srieq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..32de7a8710b85baa84d16df1e89afc2a14f8f79e --- /dev/null +++ b/wandb/run-20241030_112852-qf0srieq/logs/debug.log @@ -0,0 +1,33 @@ +2024-10-30 11:28:52,806 INFO MainThread:367767 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Configure stats pid to 367767 +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-qf0srieq/logs/debug.log +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112852-qf0srieq/logs/debug-internal.log +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:init():621] calling init triggers +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:init():671] starting backend +2024-10-30 11:28:52,807 INFO MainThread:367767 [wandb_init.py:init():675] sending inform_init request +2024-10-30 11:28:52,808 INFO MainThread:367767 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 11:28:52,808 INFO MainThread:367767 [wandb_init.py:init():688] backend started and connected +2024-10-30 11:28:52,811 INFO MainThread:367767 [wandb_init.py:init():783] updated telemetry +2024-10-30 11:28:52,828 INFO MainThread:367767 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 11:28:53,124 INFO MainThread:367767 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 11:28:53,211 INFO MainThread:367767 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 11:28:53,211 INFO MainThread:367767 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 11:28:53,211 INFO MainThread:367767 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 11:28:53,211 INFO MainThread:367767 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 11:28:53,213 INFO MainThread:367767 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 11:28:53,213 INFO MainThread:367767 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} +2024-10-30 18:00:35,944 INFO MainThread:367767 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/qf0srieq +2024-10-30 18:00:35,951 INFO MainThread:367767 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-30 18:00:35,951 INFO MainThread:367767 [wandb_run.py:_restore():2408] restore +2024-10-30 18:00:35,968 INFO MainThread:367767 [wandb_run.py:_restore():2414] restore done +2024-10-30 18:32:55,714 INFO MainThread:367767 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-30 18:32:55,714 INFO MainThread:367767 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-30 18:32:55,718 INFO MainThread:367767 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241030_112853-ognjedxv/files/config.yaml b/wandb/run-20241030_112853-ognjedxv/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f639ffb58b2b21d07229ba50bd66c486bf92404a --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/files/config.yaml @@ -0,0 +1,48 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241030_112853-ognjedxv/files/output.log b/wandb/run-20241030_112853-ognjedxv/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..0c593556e1f102aaec9ce547001c7be325ab554d --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/files/output.log @@ -0,0 +1,19 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:08<00:00, 64.05s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.47s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:48<00:00, 372.44 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 11:31:56,739] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 11:32:04,178] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.685936212539673 seconds +wandb: WARNING Fatal error while uploading data. Some run data will not be synced, but it will still be written to disk. Use `wandb sync` at the end of the run to try uploading. diff --git a/wandb/run-20241030_112853-ognjedxv/files/wandb-metadata.json b/wandb/run-20241030_112853-ognjedxv/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..f57b663dce9df13fd03f04dc022b17e5b11eac44 --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-30T15:28:53.133377Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1710831611904" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_112853-ognjedxv/files/wandb-summary.json b/wandb/run-20241030_112853-ognjedxv/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2d7c4df53cc4d5850ad130ca1c832c0d2dc9d0 --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":23502}} \ No newline at end of file diff --git a/wandb/run-20241030_112853-ognjedxv/logs/debug-internal.log b/wandb/run-20241030_112853-ognjedxv/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..a1622f12830473f83c885849b34c8931c3933345 --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/logs/debug-internal.log @@ -0,0 +1,108 @@ +{"time":"2024-10-30T11:28:53.136675238-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T11:28:53.136701068-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112853-ognjedxv/logs/debug-core.log"} +{"time":"2024-10-30T11:28:53.347633316-04:00","level":"INFO","msg":"created new stream","id":"ognjedxv"} +{"time":"2024-10-30T11:28:53.347690336-04:00","level":"INFO","msg":"stream: started","id":"ognjedxv"} +{"time":"2024-10-30T11:28:53.347800017-04:00","level":"INFO","msg":"sender: started","stream_id":"ognjedxv"} +{"time":"2024-10-30T11:28:53.347704066-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ognjedxv"}} +{"time":"2024-10-30T11:28:53.347733556-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ognjedxv"}} +{"time":"2024-10-30T11:28:53.523190791-04:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-10-30T14:02:53.936256906-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/ognjedxv/file_stream"} +{"time":"2024-10-30T14:02:54.037928715-04:00","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 404 Not Found path=files/yaning1001-dartmouth-college/impossible_llm_reverse/ognjedxv/file_stream: {\"error\":\"run impossible_llm_reverse/ognjedxv not found while streaming file\"}"} +{"time":"2024-10-30T15:32:59.30659372-04:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"} +{"time":"2024-10-30T18:00:35.968921574-04:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-10-30T18:00:35.984890772-04:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-10-30T18:00:36.012002318-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:36.969753173-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":1.014362711,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:00:38.325921691-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:42.497060545-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:00:51.61297613-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:09.600899944-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:01:36.999896141-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":61.04451063,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:01:45.006064613-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:02:37.029240457-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":121.073848476,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:02:45.056317872-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:03:37.053072349-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":181.097685018,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:03:45.107280763-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:04:37.077388538-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":241.121996887,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:04:45.166717481-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:05:37.099223886-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":301.143834105,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:05:45.21766182-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:06:37.12482984-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":361.169437169,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:06:45.278269332-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:07:37.142898291-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":421.187507639,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:07:45.329370113-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:08:37.165565448-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":481.210170617,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:08:45.38725225-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:09:37.189610381-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":541.23421759,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:09:45.4390075-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:10:35.955447357-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000042295,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:47734\")"} +{"time":"2024-10-30T18:10:37.221958924-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":601.266562773,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:10:45.492534001-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:11:37.252893706-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":661.297499824,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:11:45.544448469-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:12:37.363875954-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":721.408484843,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:12:45.6051509-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:13:37.383426633-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":781.428030622,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:13:45.658741606-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:14:37.40420376-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":841.448812198,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:14:45.714819585-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:15:37.426200511-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":901.47080191,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:15:45.782516393-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:37.446643168-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":961.491247447,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:16:45.83431873-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:45.834418971-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:16:45.834565243-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":969.879223162,"work":"WorkRecord(*service_go_proto.Record_Telemetry); Control(connection_id:\"127.0.0.1:47734\")"} +{"time":"2024-10-30T18:16:45.893720374-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:45.937388384-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:45.937441754-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/ognjedxv not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:16:48.137783109-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:16:52.444026331-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:01.754112994-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:19.575906609-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:17:37.465079512-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":51.630157695,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:17:53.783058228-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:18:37.485068435-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":111.650150518,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:18:53.833207383-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:19:37.503426143-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":171.668504316,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:19:53.883403259-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:20:37.529190853-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":231.694274566,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:20:53.939024555-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:21:37.552685095-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":291.717760967,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:21:53.995972735-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:22:37.573860393-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":351.738938456,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:22:54.048193661-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:23:37.59771863-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":411.762794442,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:23:54.10633799-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:24:37.618320459-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":471.783396942,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:24:54.157197254-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:25:37.637366385-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":531.802443028,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:25:54.214564623-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:26:37.657934536-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":591.823008329,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:26:45.835568653-04:00","level":"WARN","msg":"sender: taking a long time","seconds":600.000301462,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:26:54.269308808-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:27:37.67735044-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":651.842428013,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:27:54.325073744-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:28:37.695600522-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":711.860675855,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:28:54.377526276-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:29:37.71622571-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":771.881304503,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:29:54.433826357-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:30:37.735499119-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":831.900579192,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:30:54.489546782-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:31:37.758295272-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":891.923373515,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:31:54.55033287-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:37.784133118-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"updating run config","runtime_seconds":951.949222191,"error_status":"retrying HTTP 409 Conflict"}],"total_operations":1}} +{"time":"2024-10-30T18:32:54.601982687-04:00","level":"INFO","msg":"api: retrying HTTP error","status":409,"url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.602027358-04:00","level":"ERROR","msg":"sender: sendConfig:","error":"api: failed sending: POST https://api.wandb.ai/graphql giving up after 21 attempt(s)"} +{"time":"2024-10-30T18:32:54.60225771-04:00","level":"INFO","msg":"sender: succeeded after taking longer than expected","seconds":968.76710798,"work":"WorkRecord(*service_go_proto.Request_Defer); Control(local:true always_send:true)"} +{"time":"2024-10-30T18:32:54.704245558-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.704274299-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/ognjedxv not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:32:54.828523016-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.828680797-04:00","level":"ERROR","msg":"sender: failed to save job artifact: ArtifactSaver.createManifest: returned error 404 Not Found: {\"errors\":[{\"message\":\"failed to find run impossible_llm_reverse/ognjedxv\",\"path\":[\"createArtifactManifest\"]}],\"data\":{\"createArtifactManifest\":null}}"} +{"time":"2024-10-30T18:32:54.879693521-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-10-30T18:32:54.926933592-04:00","level":"ERROR","msg":"HTTP error","status":404,"method":"POST","url":"https://api.wandb.ai/graphql"} +{"time":"2024-10-30T18:32:54.926962902-04:00","level":"ERROR","msg":"runfiles: CreateRunFiles returned error: returned error 404 Not Found: {\"errors\":[{\"message\":\"run impossible_llm_reverse/ognjedxv not found during createRunFiles\",\"path\":[\"createRunFiles\"]}],\"data\":{\"createRunFiles\":null}}"} +{"time":"2024-10-30T18:32:55.890003709-04:00","level":"INFO","msg":"stream: closing","id":"ognjedxv"} +{"time":"2024-10-30T18:32:55.89003529-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"ognjedxv"}} +{"time":"2024-10-30T18:32:55.89005759-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"ognjedxv"}} +{"time":"2024-10-30T18:32:55.89010566-04:00","level":"INFO","msg":"sender: closed","stream_id":"ognjedxv"} +{"time":"2024-10-30T18:32:55.8901257-04:00","level":"INFO","msg":"stream: closed","id":"ognjedxv"} diff --git a/wandb/run-20241030_112853-ognjedxv/logs/debug.log b/wandb/run-20241030_112853-ognjedxv/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..60e3cf74ebb77593ae27c3149d19a6ff3dd10bfd --- /dev/null +++ b/wandb/run-20241030_112853-ognjedxv/logs/debug.log @@ -0,0 +1,33 @@ +2024-10-30 11:28:53,129 INFO MainThread:367765 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Configure stats pid to 367765 +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112853-ognjedxv/logs/debug.log +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112853-ognjedxv/logs/debug-internal.log +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:init():621] calling init triggers +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:init():671] starting backend +2024-10-30 11:28:53,130 INFO MainThread:367765 [wandb_init.py:init():675] sending inform_init request +2024-10-30 11:28:53,132 INFO MainThread:367765 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 11:28:53,133 INFO MainThread:367765 [wandb_init.py:init():688] backend started and connected +2024-10-30 11:28:53,136 INFO MainThread:367765 [wandb_init.py:init():783] updated telemetry +2024-10-30 11:28:53,181 INFO MainThread:367765 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 11:28:53,520 INFO MainThread:367765 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 11:28:53,609 INFO MainThread:367765 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 11:28:53,609 INFO MainThread:367765 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 11:28:53,609 INFO MainThread:367765 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 11:28:53,609 INFO MainThread:367765 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 11:28:53,610 INFO MainThread:367765 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 11:28:53,611 INFO MainThread:367765 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} +2024-10-30 18:00:35,945 INFO MainThread:367765 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/ognjedxv +2024-10-30 18:00:35,955 INFO MainThread:367765 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-10-30 18:00:35,955 INFO MainThread:367765 [wandb_run.py:_restore():2408] restore +2024-10-30 18:00:35,968 INFO MainThread:367765 [wandb_run.py:_restore():2414] restore done +2024-10-30 18:32:55,882 INFO MainThread:367765 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-10-30 18:32:55,883 INFO MainThread:367765 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-10-30 18:32:55,889 INFO MainThread:367765 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241030_233740-anh3ext7/files/output.log b/wandb/run-20241030_233740-anh3ext7/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..41d09861fd2d7e81acd2a83f2150f94ad1972775 --- /dev/null +++ b/wandb/run-20241030_233740-anh3ext7/files/output.log @@ -0,0 +1,16 @@ +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00, 3.01s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-30 23:37:50,877] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-30 23:37:58,570] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.623389482498169 seconds diff --git a/wandb/run-20241030_233740-anh3ext7/files/requirements.txt b/wandb/run-20241030_233740-anh3ext7/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241030_233740-anh3ext7/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241030_233740-anh3ext7/files/wandb-metadata.json b/wandb/run-20241030_233740-anh3ext7/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..77ce5d8f7aaff6ecfed01e7215f4d58ceadcfff7 --- /dev/null +++ b/wandb/run-20241030_233740-anh3ext7/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T03:37:40.846508Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1711065919488" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241030_233740-anh3ext7/logs/debug-internal.log b/wandb/run-20241030_233740-anh3ext7/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..0c613e5e235d8a74cb568b16168e4970535cb3c3 --- /dev/null +++ b/wandb/run-20241030_233740-anh3ext7/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-30T23:37:40.850081823-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-30T23:37:40.850115643-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_233740-anh3ext7/logs/debug-core.log"} +{"time":"2024-10-30T23:37:40.962577204-04:00","level":"INFO","msg":"created new stream","id":"anh3ext7"} +{"time":"2024-10-30T23:37:40.962609945-04:00","level":"INFO","msg":"stream: started","id":"anh3ext7"} +{"time":"2024-10-30T23:37:40.962627565-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"anh3ext7"}} +{"time":"2024-10-30T23:37:40.962685265-04:00","level":"INFO","msg":"sender: started","stream_id":"anh3ext7"} +{"time":"2024-10-30T23:37:40.962685215-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"anh3ext7"}} +{"time":"2024-10-30T23:37:41.412681383-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241030_233740-anh3ext7/logs/debug.log b/wandb/run-20241030_233740-anh3ext7/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9b88d8e5831e912e320ba044cca80b3783b319d7 --- /dev/null +++ b/wandb/run-20241030_233740-anh3ext7/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Configure stats pid to 464535 +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-30 23:37:40,844 INFO MainThread:464535 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_233740-anh3ext7/logs/debug.log +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_233740-anh3ext7/logs/debug-internal.log +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:init():621] calling init triggers +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:init():671] starting backend +2024-10-30 23:37:40,845 INFO MainThread:464535 [wandb_init.py:init():675] sending inform_init request +2024-10-30 23:37:40,846 INFO MainThread:464535 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-30 23:37:40,846 INFO MainThread:464535 [wandb_init.py:init():688] backend started and connected +2024-10-30 23:37:40,849 INFO MainThread:464535 [wandb_init.py:init():783] updated telemetry +2024-10-30 23:37:40,879 INFO MainThread:464535 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-30 23:37:41,409 INFO MainThread:464535 [wandb_init.py:init():867] starting run threads in backend +2024-10-30 23:37:42,836 INFO MainThread:464535 [wandb_run.py:_console_start():2463] atexit reg +2024-10-30 23:37:42,837 INFO MainThread:464535 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-30 23:37:42,837 INFO MainThread:464535 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-30 23:37:42,837 INFO MainThread:464535 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-30 23:37:42,860 INFO MainThread:464535 [wandb_init.py:init():911] run started, returning control to user process +2024-10-30 23:37:42,861 INFO MainThread:464535 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0} diff --git a/wandb/run-20241031_114700-78zg7gu4/files/output.log b/wandb/run-20241031_114700-78zg7gu4/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..01abb828cc839d561ed422dff44cf582b4578e08 --- /dev/null +++ b/wandb/run-20241031_114700-78zg7gu4/files/output.log @@ -0,0 +1,48 @@ +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.42s/it] +Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.42s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-10-31 11:49:40,267] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-10-31 11:49:48,657] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.85258412361145 seconds +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 220, in + trainer.train() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train + return inner_training_loop( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop + tr_loss_step = self.training_step(model, inputs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step + self.accelerator.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward + self.deepspeed_engine_wrapped.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 186, in backward + self.engine.backward(loss, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn + ret_val = func(*args, **kwargs) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2020, in backward + self.optimizer.backward(loss, retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2063, in backward + self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward + scaled_loss.backward(retain_graph=retain_graph) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward + torch.autograd.backward( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/autograd/__init__.py", line 200, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +Error in atexit._run_exitfuncs: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 27, in is_nfs_path + output = subprocess.check_output(['df', '-T', path], encoding='utf-8') + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/subprocess.py", line 424, in check_output diff --git a/wandb/run-20241031_114700-78zg7gu4/files/requirements.txt b/wandb/run-20241031_114700-78zg7gu4/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241031_114700-78zg7gu4/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241031_114700-78zg7gu4/files/wandb-metadata.json b/wandb/run-20241031_114700-78zg7gu4/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..732cc43efe5209c0ef994b74b40982f601d41db1 --- /dev/null +++ b/wandb/run-20241031_114700-78zg7gu4/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-10-31T15:47:00.200293Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "6", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1753158594560" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241031_114700-78zg7gu4/logs/debug-internal.log b/wandb/run-20241031_114700-78zg7gu4/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..5f69b2aef817f73e9ea7a7ece587c66c4898d9fa --- /dev/null +++ b/wandb/run-20241031_114700-78zg7gu4/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-10-31T11:47:00.202347389-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-10-31T11:47:00.202360349-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-78zg7gu4/logs/debug-core.log"} +{"time":"2024-10-31T11:47:00.312950526-04:00","level":"INFO","msg":"created new stream","id":"78zg7gu4"} +{"time":"2024-10-31T11:47:00.312997696-04:00","level":"INFO","msg":"stream: started","id":"78zg7gu4"} +{"time":"2024-10-31T11:47:00.313037447-04:00","level":"INFO","msg":"sender: started","stream_id":"78zg7gu4"} +{"time":"2024-10-31T11:47:00.313009646-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"78zg7gu4"}} +{"time":"2024-10-31T11:47:00.313032707-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"78zg7gu4"}} +{"time":"2024-10-31T11:47:00.524489984-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241031_114700-78zg7gu4/logs/debug.log b/wandb/run-20241031_114700-78zg7gu4/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d213cfc7b3d7e9a11fa5d6b173dcda3138405092 --- /dev/null +++ b/wandb/run-20241031_114700-78zg7gu4/logs/debug.log @@ -0,0 +1,26 @@ +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Configure stats pid to 554148 +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-78zg7gu4/logs/debug.log +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-78zg7gu4/logs/debug-internal.log +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:init():621] calling init triggers +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:init():671] starting backend +2024-10-31 11:47:00,198 INFO MainThread:554148 [wandb_init.py:init():675] sending inform_init request +2024-10-31 11:47:00,199 INFO MainThread:554148 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-10-31 11:47:00,200 INFO MainThread:554148 [wandb_init.py:init():688] backend started and connected +2024-10-31 11:47:00,203 INFO MainThread:554148 [wandb_init.py:init():783] updated telemetry +2024-10-31 11:47:00,240 INFO MainThread:554148 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-10-31 11:47:00,520 INFO MainThread:554148 [wandb_init.py:init():867] starting run threads in backend +2024-10-31 11:47:00,638 INFO MainThread:554148 [wandb_run.py:_console_start():2463] atexit reg +2024-10-31 11:47:00,638 INFO MainThread:554148 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-10-31 11:47:00,638 INFO MainThread:554148 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-10-31 11:47:00,638 INFO MainThread:554148 [wandb_run.py:_redirect():2401] Redirects installed. +2024-10-31 11:47:00,640 INFO MainThread:554148 [wandb_init.py:init():911] run started, returning control to user process +2024-10-31 11:47:00,640 INFO MainThread:554148 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 0.0001} diff --git a/wandb/run-20241101_094656-v2rxhny6/files/output.log b/wandb/run-20241101_094656-v2rxhny6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7dfb5c3fe001e62ee88e779d7b756bf5a39ee7fb --- /dev/null +++ b/wandb/run-20241101_094656-v2rxhny6/files/output.log @@ -0,0 +1,13 @@ +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.73s/it] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 600 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-01 09:47:04,115] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-01 09:47:12,997] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 4.672824144363403 seconds diff --git a/wandb/run-20241101_094656-v2rxhny6/files/requirements.txt b/wandb/run-20241101_094656-v2rxhny6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241101_094656-v2rxhny6/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241101_094656-v2rxhny6/logs/debug-internal.log b/wandb/run-20241101_094656-v2rxhny6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..dc13abbc2fa857dd21675c2807d733839f445fc4 --- /dev/null +++ b/wandb/run-20241101_094656-v2rxhny6/logs/debug-internal.log @@ -0,0 +1,8 @@ +{"time":"2024-11-01T09:46:56.281693958-04:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-01T09:46:56.281708498-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-v2rxhny6/logs/debug-core.log"} +{"time":"2024-11-01T09:46:56.389353319-04:00","level":"INFO","msg":"created new stream","id":"v2rxhny6"} +{"time":"2024-11-01T09:46:56.3894075-04:00","level":"INFO","msg":"stream: started","id":"v2rxhny6"} +{"time":"2024-11-01T09:46:56.38944488-04:00","level":"INFO","msg":"sender: started","stream_id":"v2rxhny6"} +{"time":"2024-11-01T09:46:56.38948828-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"v2rxhny6"}} +{"time":"2024-11-01T09:46:56.38944628-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"v2rxhny6"}} +{"time":"2024-11-01T09:46:56.711129595-04:00","level":"INFO","msg":"Starting system monitor"} diff --git a/wandb/run-20241101_094656-v2rxhny6/logs/debug.log b/wandb/run-20241101_094656-v2rxhny6/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..726b87bc9540b929593847408a51d442eedc7baa --- /dev/null +++ b/wandb/run-20241101_094656-v2rxhny6/logs/debug.log @@ -0,0 +1,26 @@ +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Configure stats pid to 786687 +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-v2rxhny6/logs/debug.log +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-v2rxhny6/logs/debug-internal.log +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_init.py:init():621] calling init triggers +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-01 09:46:56,277 INFO MainThread:786687 [wandb_init.py:init():671] starting backend +2024-11-01 09:46:56,278 INFO MainThread:786687 [wandb_init.py:init():675] sending inform_init request +2024-11-01 09:46:56,279 INFO MainThread:786687 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-01 09:46:56,279 INFO MainThread:786687 [wandb_init.py:init():688] backend started and connected +2024-11-01 09:46:56,282 INFO MainThread:786687 [wandb_init.py:init():783] updated telemetry +2024-11-01 09:46:56,330 INFO MainThread:786687 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-01 09:46:56,707 INFO MainThread:786687 [wandb_init.py:init():867] starting run threads in backend +2024-11-01 09:46:56,815 INFO MainThread:786687 [wandb_run.py:_console_start():2463] atexit reg +2024-11-01 09:46:56,815 INFO MainThread:786687 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-01 09:46:56,815 INFO MainThread:786687 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-01 09:46:56,815 INFO MainThread:786687 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-01 09:46:56,817 INFO MainThread:786687 [wandb_init.py:init():911] run started, returning control to user process +2024-11-01 09:46:56,818 INFO MainThread:786687 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06} diff --git a/wandb/run-20241101_200535-6xsf0vem/run-6xsf0vem.wandb b/wandb/run-20241101_200535-6xsf0vem/run-6xsf0vem.wandb new file mode 100644 index 0000000000000000000000000000000000000000..a9a2b463a805625fa5156cfe70bb96ae8955db58 --- /dev/null +++ b/wandb/run-20241101_200535-6xsf0vem/run-6xsf0vem.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0abd0bb8dcd60a9d99825361276dc45d95ba9a07cabf7f789f5a802d5f115d +size 131072 diff --git a/wandb/run-20241101_200535-hnfjoqai/run-hnfjoqai.wandb b/wandb/run-20241101_200535-hnfjoqai/run-hnfjoqai.wandb new file mode 100644 index 0000000000000000000000000000000000000000..2e4f03cdc622227d0704a161b3e9941e8948abd2 --- /dev/null +++ b/wandb/run-20241101_200535-hnfjoqai/run-hnfjoqai.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d23a55d4b39da70237bf1611118dc5cdd7f3cefeb2d71e7f0a34e03d8a4c7a +size 131072 diff --git a/wandb/run-20241105_155954-wehwcr47/files/config.yaml b/wandb/run-20241105_155954-wehwcr47/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba0e0eb5aa81d7186d91b3c8f342ad5574a4c100 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic21 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_155954-wehwcr47/files/output.log b/wandb/run-20241105_155954-wehwcr47/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4889313e9210ddc0bf0e53a0b8deb84b34f76597 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/files/output.log @@ -0,0 +1,19 @@ +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1323, in mkdir + self._accessor.mkdir(self, mode) +FileNotFoundError: [Errno 2] No such file or directory: '/home/chunhui/.cache/huggingface/datasets/babylm_dataset_test/babylm_shuffle_deterministic21_10M_seed0/0.0.0' + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 165, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2096, in load_dataset + builder_instance.download_and_prepare( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 855, in download_and_prepare + Path(self._output_dir).parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1327, in mkdir + self.parent.mkdir(parents=True, exist_ok=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/pathlib.py", line 1323, in mkdir + self._accessor.mkdir(self, mode) +OSError: [Errno 28] No space left on device: '/home/chunhui/.cache/huggingface/datasets/babylm_dataset_test/babylm_shuffle_deterministic21_10M_seed0' diff --git a/wandb/run-20241105_155954-wehwcr47/files/requirements.txt b/wandb/run-20241105_155954-wehwcr47/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/files/requirements.txt @@ -0,0 +1,147 @@ +funcsigs==1.0.2 +sentry-sdk==2.17.0 +multiprocess==0.70.16 +numpy==1.26.2 +pluralizer==1.2.0 +debugpy==1.6.7 +nvidia-cudnn-cu11==8.5.0.96 +deepspeed==0.15.2 +data==0.4 +pandas==2.1.3 +tomli==2.0.1 +charset-normalizer==3.3.2 +attrs==24.2.0 +aiosignal==1.3.1 +fsspec==2023.10.0 +nvidia-cusparse-cu11==11.7.4.91 +zipp==3.12.0 +mypy-extensions==1.0.0 +datasets==3.0.1 +joblib==1.3.2 +hjson==3.1.0 +traitlets==5.7.1 +stack-data==0.6.0 +transformers==4.45.1 +sympy==1.11.1 +Pygments==2.15.0 +docker-pycreds==0.4.0 +dill==0.3.8 +wheel==0.44.0 +prompt-toolkit==3.0.30 +parso==0.8.3 +ipykernel==6.23.1 +pyarrow==17.0.0 +certifi==2023.11.17 +nvidia-cufft-cu11==10.9.0.58 +six==1.16.0 +pydantic==2.9.2 +click==8.1.7 +nest-asyncio==1.5.6 +gmpy2==2.1.0 +matplotlib==3.8.2 +scipy==1.11.4 +typing_extensions==4.12.2 +statsmodels==0.14.0 +huggingface-hub==0.25.0 +frozenlist==1.4.1 +gpustat==1.1.1 +nvidia-nvtx-cu11==11.7.91 +safetensors==0.4.5 +stanza==1.9.2 +decorator==5.1.1 +seaborn==0.13.0 +sentencepiece==0.2.0 +PyYAML==6.0.1 +black==24.8.0 +protobuf==4.25.1 +pickleshare==0.7.5 +peft==0.13.0 +triton==2.0.0 +nvidia-cuda-runtime-cu11==11.7.99 +Jinja2==3.1.2 +nvidia-cusolver-cu11==11.4.0.1 +executing==1.2.0 +jupyter_client==8.1.0 +pluggy==1.3.0 +cmake==3.30.3 +pytz==2023.3.post1 +aiohappyeyeballs==2.4.2 +kiwisolver==1.4.5 +py-cpuinfo==9.0.0 +Pillow==10.1.0 +ptyprocess==0.7.0 +importlib_resources==6.4.5 +GitPython==3.1.43 +importlib-metadata==6.0.0 +iniconfig==2.0.0 +scikit-learn==1.3.2 +exceptiongroup==1.1.0 +networkx==2.8.6 +accelerate==1.0.0 +nltk==3.8.1 +shutilwhich==1.1.0 +fonttools==4.45.1 +future==0.18.3 +aiohttp==3.10.6 +wcwidth==0.2.5 +idna==3.6 +filelock==3.12.2 +pathspec==0.12.1 +jupyter_core==5.1.0 +lit==18.1.8 +nvidia-curand-cu11==10.2.10.91 +nvidia-cublas-cu11==11.10.3.66 +nvidia-ml-py==12.560.30 +msgpack==1.1.0 +python-dateutil==2.8.2 +blessed==1.20.0 +packaging==23.0 +gitdb==4.0.11 +yarl==1.13.0 +emoji==2.8.0 +tzdata==2023.3 +cycler==0.12.1 +tornado==6.2 +backcall==0.2.0 +plotnine==0.12.4 +ninja==1.11.1.1 +latex==0.7.0 +wandb==0.18.5 +setproctitle==1.3.3 +threadpoolctl==3.2.0 +requests==2.32.3 +pyparsing==3.1.1 +smmap==5.0.1 +pyzmq==23.0.0 +async-timeout==4.0.3 +annotated-types==0.7.0 +matplotlib-inline==0.1.6 +latexcodec==1.0.0 +ipython==8.0.0 +patsy==0.5.3 +contourpy==1.2.0 +multidict==6.1.0 +mizani==0.9.3 +urllib3==2.1.0 +tokenizers==0.20.0 +MarkupSafe==2.1.2 +pip==24.2 +pexpect==4.8.0 +tqdm==4.66.5 +jedi==0.18.2 +pydantic_core==2.23.4 +tempdir==0.7.1 +mpmath==1.2.1 +setuptools==72.1.0 +pytest==7.4.3 +pure-eval==0.2.2 +psutil==5.9.1 +comm==0.1.2 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +regex==2023.10.3 +platformdirs==2.5.2 +asttokens==2.2.1 +torch==2.0.0 +nvidia-nccl-cu11==2.14.3 +xxhash==3.5.0 diff --git a/wandb/run-20241105_155954-wehwcr47/files/wandb-metadata.json b/wandb/run-20241105_155954-wehwcr47/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..948dbf30bd623ee2c795858dbda0441c4b512e34 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/files/wandb-metadata.json @@ -0,0 +1,44 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T20:59:54.612366Z", + "args": [ + "--perturbation", + "shuffle_deterministic21", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "disk": { + "/": { + "total": "1888559353856", + "used": "1792550322176" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + } +} \ No newline at end of file diff --git a/wandb/run-20241105_155954-wehwcr47/files/wandb-summary.json b/wandb/run-20241105_155954-wehwcr47/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a59211b910c7b68e6827eb6c887d30d98244727c --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":5}} \ No newline at end of file diff --git a/wandb/run-20241105_155954-wehwcr47/logs/debug-internal.log b/wandb/run-20241105_155954-wehwcr47/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..6cf3aa3d62befa2ab1596a35d18c26081e258bf5 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2024-11-05T15:59:54.614675371-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-05T15:59:54.614700731-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_155954-wehwcr47/logs/debug-core.log"} +{"time":"2024-11-05T15:59:59.638514173-05:00","level":"INFO","msg":"created new stream","id":"wehwcr47"} +{"time":"2024-11-05T15:59:59.638580933-05:00","level":"INFO","msg":"stream: started","id":"wehwcr47"} +{"time":"2024-11-05T15:59:59.638616623-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"wehwcr47"}} +{"time":"2024-11-05T15:59:59.638821764-05:00","level":"INFO","msg":"sender: started","stream_id":"wehwcr47"} +{"time":"2024-11-05T15:59:59.638739974-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"wehwcr47"}} +{"time":"2024-11-05T15:59:59.858951434-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-05T15:59:59.971734094-05:00","level":"INFO","msg":"stream: closing","id":"wehwcr47"} +{"time":"2024-11-05T15:59:59.971766184-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-05T15:59:59.971842264-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-05T16:00:00.249254784-05:00","level":"ERROR","msg":"sender: sendDefer: failed to build job artifact","error":"failed to write data to file: write /tmp/tmpfile-1264752096: no space left on device"} +{"time":"2024-11-05T16:00:00.589895964-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-05T16:00:00.706195627-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"wehwcr47"}} +{"time":"2024-11-05T16:00:00.706225787-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"wehwcr47"}} +{"time":"2024-11-05T16:00:00.706276908-05:00","level":"INFO","msg":"sender: closed","stream_id":"wehwcr47"} +{"time":"2024-11-05T16:00:00.706283998-05:00","level":"INFO","msg":"stream: closed","id":"wehwcr47"} diff --git a/wandb/run-20241105_155954-wehwcr47/logs/debug.log b/wandb/run-20241105_155954-wehwcr47/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..8cc71c8dfb296c8c1636ae0939b0ce067c7e2531 --- /dev/null +++ b/wandb/run-20241105_155954-wehwcr47/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Configure stats pid to 1769193 +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_155954-wehwcr47/logs/debug.log +2024-11-05 15:59:54,610 INFO MainThread:1769193 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_155954-wehwcr47/logs/debug-internal.log +2024-11-05 15:59:54,611 INFO MainThread:1769193 [wandb_init.py:init():621] calling init triggers +2024-11-05 15:59:54,611 INFO MainThread:1769193 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-05 15:59:54,611 INFO MainThread:1769193 [wandb_init.py:init():671] starting backend +2024-11-05 15:59:54,611 INFO MainThread:1769193 [wandb_init.py:init():675] sending inform_init request +2024-11-05 15:59:54,612 INFO MainThread:1769193 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-05 15:59:54,612 INFO MainThread:1769193 [wandb_init.py:init():688] backend started and connected +2024-11-05 15:59:54,614 INFO MainThread:1769193 [wandb_init.py:init():783] updated telemetry +2024-11-05 15:59:54,634 INFO MainThread:1769193 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-05 15:59:59,856 INFO MainThread:1769193 [wandb_init.py:init():867] starting run threads in backend +2024-11-05 15:59:59,945 INFO MainThread:1769193 [wandb_run.py:_console_start():2463] atexit reg +2024-11-05 15:59:59,945 INFO MainThread:1769193 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-05 15:59:59,946 INFO MainThread:1769193 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-05 15:59:59,946 INFO MainThread:1769193 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-05 15:59:59,947 INFO MainThread:1769193 [wandb_init.py:init():911] run started, returning control to user process +2024-11-05 15:59:59,948 INFO MainThread:1769193 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-05 15:59:59,971 WARNING MsgRouterThr:1769193 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241105_155954-wehwcr47/run-wehwcr47.wandb b/wandb/run-20241105_155954-wehwcr47/run-wehwcr47.wandb new file mode 100644 index 0000000000000000000000000000000000000000..0c61f07bceec5daa06e187f91bea1e0710ab2419 Binary files /dev/null and b/wandb/run-20241105_155954-wehwcr47/run-wehwcr47.wandb differ diff --git a/wandb/run-20241105_161113-xd1fe9ua/files/config.yaml b/wandb/run-20241105_161113-xd1fe9ua/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba0e0eb5aa81d7186d91b3c8f342ad5574a4c100 --- /dev/null +++ b/wandb/run-20241105_161113-xd1fe9ua/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_deterministic21 +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241105_161113-xd1fe9ua/files/wandb-metadata.json b/wandb/run-20241105_161113-xd1fe9ua/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b499de49943239004915d0fb08539bbd1fd58b --- /dev/null +++ b/wandb/run-20241105_161113-xd1fe9ua/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-05T21:11:13.777933Z", + "args": [ + "--perturbation", + "shuffle_deterministic21", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1792542838784" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241105_161113-xd1fe9ua/run-xd1fe9ua.wandb b/wandb/run-20241105_161113-xd1fe9ua/run-xd1fe9ua.wandb new file mode 100644 index 0000000000000000000000000000000000000000..c8fc12d74f3a54a31bf54e1248aad2a269602e60 Binary files /dev/null and b/wandb/run-20241105_161113-xd1fe9ua/run-xd1fe9ua.wandb differ diff --git a/wandb/run-20241105_223736-qwsmv2c2/files/config.yaml b/wandb/run-20241105_223736-qwsmv2c2/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..469522e2a3072e744649d9e1f08dc0e55b7e497f --- /dev/null +++ b/wandb/run-20241105_223736-qwsmv2c2/files/config.yaml @@ -0,0 +1,531 @@ +_name_or_path: + value: meta-llama/Llama-3.2-3B +_wandb: + value: + cli_version: 0.18.5 + m: + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": train/loss + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 1 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 1 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 7 + - 13 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.5 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +batch_size: + value: 3 +begin_suppress_tokens: + value: null +bf16: + value: false +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: deepspeed_config/train_dp_config.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +epoch: + value: 3 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: steps +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 2 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +half_precision_backend: + value: auto +head_dim: + value: 128 +hidden_act: + value: silu +hidden_size: + value: 3072 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 5e-06 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: ./logs +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr: + value: 5e-06 +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 3212749824 +model_type: + value: llama +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 24 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 28 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 3 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +perturbation: + value: shuffle_deterministic21 +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: ./checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 0 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: null +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +train_set: + value: 10M +transformers_version: + value: 4.45.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: true +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.1 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20241105_223736-qwsmv2c2/files/output.log b/wandb/run-20241105_223736-qwsmv2c2/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..292dcbb3b4adee324dd4fadaaa83e66e1148db65 --- /dev/null +++ b/wandb/run-20241105_223736-qwsmv2c2/files/output.log @@ -0,0 +1,3030 @@ +Downloading shards: 0%| | 0/2 [00:00', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-05 22:41:13,699 INFO MainThread:1802404 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - > +2024-11-05 22:41:13,699 INFO MainThread:1802404 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None +2024-11-06 20:14:13,216 INFO MainThread:1802404 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/qwsmv2c2 +2024-11-06 20:14:13,226 INFO MainThread:1802404 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-06 20:14:13,227 INFO MainThread:1802404 [wandb_run.py:_restore():2408] restore +2024-11-06 20:14:13,246 INFO MainThread:1802404 [wandb_run.py:_restore():2414] restore done +2024-11-06 20:14:15,849 INFO MainThread:1802404 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-06 20:14:15,850 INFO MainThread:1802404 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-06 20:14:15,904 INFO MainThread:1802404 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241107_160818-f9akdcdz/files/config.yaml b/wandb/run-20241107_160818-f9akdcdz/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c9885a80c6c6551af0939982b4765da8541cf4 --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/files/config.yaml @@ -0,0 +1,49 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: shuffle_even_odd +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241107_160818-f9akdcdz/files/output.log b/wandb/run-20241107_160818-f9akdcdz/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8419bf5a98f8e4ad29176680ab3f4c8dd3c582c8 --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/files/output.log @@ -0,0 +1,29 @@ +Error in sys.excepthook: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/linecache.py", line 46, in getlines + return updatecache(filename, module_globals) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/linecache.py", line 136, in updatecache + with tokenize.open(fullname) as fp: + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/tokenize.py", line 394, in open + encoding, lines = detect_encoding(buffer.readline) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/tokenize.py", line 363, in detect_encoding + first = read_or_stop() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/tokenize.py", line 321, in read_or_stop + return readline() +KeyboardInterrupt + +Original exception was: +Traceback (most recent call last): + File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 165, in + dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset + builder_instance = load_dataset_builder( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1830, in load_dataset_builder + builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name) + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 238, in get_dataset_builder_class + with lock_importable_file( + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 255, in __enter__ + self.acquire() + File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire + time.sleep(poll_interval) +KeyboardInterrupt diff --git a/wandb/run-20241107_160818-f9akdcdz/files/wandb-metadata.json b/wandb/run-20241107_160818-f9akdcdz/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..e829f3ea64501f1ccfdc636f379fc15443141ee7 --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-07T21:08:18.504321Z", + "args": [ + "--perturbation", + "shuffle_even_odd", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", + "codePath": "train/train_deep_wandb.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_deep_wandb.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1742796255232" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241107_160818-f9akdcdz/files/wandb-summary.json b/wandb/run-20241107_160818-f9akdcdz/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":0}} \ No newline at end of file diff --git a/wandb/run-20241107_160818-f9akdcdz/logs/debug-internal.log b/wandb/run-20241107_160818-f9akdcdz/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f9b65ed15b50d035cdc4fba387e61e5dcd4a9f0f --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/logs/debug-internal.log @@ -0,0 +1,16 @@ +{"time":"2024-11-07T16:08:18.507288106-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-07T16:08:18.507308026-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241107_160818-f9akdcdz/logs/debug-core.log"} +{"time":"2024-11-07T16:08:18.613771778-05:00","level":"INFO","msg":"created new stream","id":"f9akdcdz"} +{"time":"2024-11-07T16:08:18.613800948-05:00","level":"INFO","msg":"stream: started","id":"f9akdcdz"} +{"time":"2024-11-07T16:08:18.613875638-05:00","level":"INFO","msg":"sender: started","stream_id":"f9akdcdz"} +{"time":"2024-11-07T16:08:18.613869768-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"f9akdcdz"}} +{"time":"2024-11-07T16:08:18.613831718-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"f9akdcdz"}} +{"time":"2024-11-07T16:08:18.784811316-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-07T16:08:18.9425024-05:00","level":"INFO","msg":"stream: closing","id":"f9akdcdz"} +{"time":"2024-11-07T16:08:18.94253656-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-07T16:08:18.942918363-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-07T16:08:19.600274898-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-07T16:08:19.723280773-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"f9akdcdz"}} +{"time":"2024-11-07T16:08:19.723362204-05:00","level":"INFO","msg":"sender: closed","stream_id":"f9akdcdz"} +{"time":"2024-11-07T16:08:19.723339764-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"f9akdcdz"}} +{"time":"2024-11-07T16:08:19.723455904-05:00","level":"INFO","msg":"stream: closed","id":"f9akdcdz"} diff --git a/wandb/run-20241107_160818-f9akdcdz/logs/debug.log b/wandb/run-20241107_160818-f9akdcdz/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..767dfa23d09bad4394f57b484281b5d69ea513de --- /dev/null +++ b/wandb/run-20241107_160818-f9akdcdz/logs/debug.log @@ -0,0 +1,27 @@ +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Configure stats pid to 2097270 +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'} +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241107_160818-f9akdcdz/logs/debug.log +2024-11-07 16:08:18,502 INFO MainThread:2097270 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241107_160818-f9akdcdz/logs/debug-internal.log +2024-11-07 16:08:18,503 INFO MainThread:2097270 [wandb_init.py:init():621] calling init triggers +2024-11-07 16:08:18,503 INFO MainThread:2097270 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-07 16:08:18,503 INFO MainThread:2097270 [wandb_init.py:init():671] starting backend +2024-11-07 16:08:18,503 INFO MainThread:2097270 [wandb_init.py:init():675] sending inform_init request +2024-11-07 16:08:18,503 INFO MainThread:2097270 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-07 16:08:18,504 INFO MainThread:2097270 [wandb_init.py:init():688] backend started and connected +2024-11-07 16:08:18,506 INFO MainThread:2097270 [wandb_init.py:init():783] updated telemetry +2024-11-07 16:08:18,532 INFO MainThread:2097270 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-07 16:08:18,781 INFO MainThread:2097270 [wandb_init.py:init():867] starting run threads in backend +2024-11-07 16:08:18,917 INFO MainThread:2097270 [wandb_run.py:_console_start():2463] atexit reg +2024-11-07 16:08:18,917 INFO MainThread:2097270 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-07 16:08:18,917 INFO MainThread:2097270 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-07 16:08:18,917 INFO MainThread:2097270 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-07 16:08:18,919 INFO MainThread:2097270 [wandb_init.py:init():911] run started, returning control to user process +2024-11-07 16:08:18,919 INFO MainThread:2097270 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_even_odd', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-07 16:08:18,942 WARNING MsgRouterThr:2097270 [router.py:message_loop():77] message_loop has been closed diff --git a/wandb/run-20241107_160818-f9akdcdz/run-f9akdcdz.wandb b/wandb/run-20241107_160818-f9akdcdz/run-f9akdcdz.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b29938fd87463da84c3ef218dbd751510d3d9836 Binary files /dev/null and b/wandb/run-20241107_160818-f9akdcdz/run-f9akdcdz.wandb differ diff --git a/wandb/run-20241114_215552-jvyhzech/files/config.yaml b/wandb/run-20241114_215552-jvyhzech/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d07c4054eb6cea6e4e93d3fb630156b4b6a6798 --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: reverse_control +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241114_215552-jvyhzech/files/output.log b/wandb/run-20241114_215552-jvyhzech/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..e33c8a2186f3cd5b1a8d504d5161de384b23bb9c --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/files/output.log @@ -0,0 +1,29 @@ +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:04<00:00, 262057.63it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:00<00:00, 3359828.51it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17520/17520 [00:00<00:00, 36662.69it/s] +Generating train split: 17519 examples [00:07, 2239.43 examples/s]██████████████████████████████████████████████████████████████████████████▋ | 13436/17520 [00:00<00:00, 36902.73it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:04<00:00, 242217.27it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:00<00:00, 3182554.79it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18141/18141 [00:00<00:00, 35851.00it/s] +Generating validation split: 18140 examples [00:08, 2204.82 examples/s]████████████████████████████████████████████████████████████████████████████████████▏ | 15397/18141 [00:00<00:00, 37848.30it/s] +100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:03<00:00, 280907.77it/s] +100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:00<00:00, 3338998.20it/s] +100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16483/16483 [00:00<00:00, 37865.64it/s] +Generating test split: 16482 examples [00:07, 2317.75 examples/s]███████████████████████████████████████████████████████████████▉ | 11543/16483 [00:00<00:00, 37097.32it/s] +config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 613kB/s] +Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:36<00:00, 78.36s/it] +Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00, 2.06s/it] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:47<00:00, 372.49 examples/s] +Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:48<00:00, 372.76 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-14 22:00:35,892] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-14 22:00:43,002] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 6.291985988616943 seconds diff --git a/wandb/run-20241114_215552-jvyhzech/files/wandb-metadata.json b/wandb/run-20241114_215552-jvyhzech/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..05990835a93d930c334b5e5d6d32f4c2ff27564a --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-15T02:55:52.854273Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py", + "codePath": "train/train_ftp.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_ftp.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1745777426432" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241114_215552-jvyhzech/files/wandb-summary.json b/wandb/run-20241114_215552-jvyhzech/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7c2e4e5e6400d631494f6c54be0cc9043bc8c89b --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":46111}} \ No newline at end of file diff --git a/wandb/run-20241114_215552-jvyhzech/logs/debug-internal.log b/wandb/run-20241114_215552-jvyhzech/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..8d1dd39b4c9a6d1a5bb88181ca21b23539d1a1ac --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2024-11-14T21:55:52.857825249-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-14T21:55:52.857851439-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_215552-jvyhzech/logs/debug-core.log"} +{"time":"2024-11-14T21:55:52.969064907-05:00","level":"INFO","msg":"created new stream","id":"jvyhzech"} +{"time":"2024-11-14T21:55:52.969121807-05:00","level":"INFO","msg":"stream: started","id":"jvyhzech"} +{"time":"2024-11-14T21:55:52.969239048-05:00","level":"INFO","msg":"sender: started","stream_id":"jvyhzech"} +{"time":"2024-11-14T21:55:52.969185397-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"jvyhzech"}} +{"time":"2024-11-14T21:55:52.969233518-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"jvyhzech"}} +{"time":"2024-11-14T21:55:53.28076955-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-15T10:44:24.010096874-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-15T10:44:24.127839713-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-15T10:44:24.795419194-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-15T10:44:24.910165238-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-15T10:44:25.996790127-05:00","level":"INFO","msg":"stream: closing","id":"jvyhzech"} +{"time":"2024-11-15T10:44:25.996825787-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"jvyhzech"}} +{"time":"2024-11-15T10:44:25.997004768-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"jvyhzech"}} +{"time":"2024-11-15T10:44:25.997045508-05:00","level":"INFO","msg":"sender: closed","stream_id":"jvyhzech"} +{"time":"2024-11-15T10:44:25.9973193-05:00","level":"INFO","msg":"stream: closed","id":"jvyhzech"} diff --git a/wandb/run-20241114_215552-jvyhzech/logs/debug.log b/wandb/run-20241114_215552-jvyhzech/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..cbac020e967fd7e01c2892105aceef29057da8e4 --- /dev/null +++ b/wandb/run-20241114_215552-jvyhzech/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-14 21:55:52,851 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Configure stats pid to 2578626 +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'} +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_215552-jvyhzech/logs/debug.log +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_215552-jvyhzech/logs/debug-internal.log +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:init():621] calling init triggers +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:init():671] starting backend +2024-11-14 21:55:52,852 INFO MainThread:2578626 [wandb_init.py:init():675] sending inform_init request +2024-11-14 21:55:52,853 INFO MainThread:2578626 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-14 21:55:52,854 INFO MainThread:2578626 [wandb_init.py:init():688] backend started and connected +2024-11-14 21:55:52,857 INFO MainThread:2578626 [wandb_init.py:init():783] updated telemetry +2024-11-14 21:55:52,887 INFO MainThread:2578626 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-14 21:55:53,277 INFO MainThread:2578626 [wandb_init.py:init():867] starting run threads in backend +2024-11-14 21:55:53,403 INFO MainThread:2578626 [wandb_run.py:_console_start():2463] atexit reg +2024-11-14 21:55:53,403 INFO MainThread:2578626 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-14 21:55:53,403 INFO MainThread:2578626 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-14 21:55:53,403 INFO MainThread:2578626 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-14 21:55:53,405 INFO MainThread:2578626 [wandb_init.py:init():911] run started, returning control to user process +2024-11-14 21:55:53,406 INFO MainThread:2578626 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-15 10:44:23,683 INFO MainThread:2578626 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/jvyhzech +2024-11-15 10:44:23,881 INFO MainThread:2578626 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-15 10:44:23,927 INFO MainThread:2578626 [wandb_run.py:_restore():2408] restore +2024-11-15 10:44:23,927 INFO MainThread:2578626 [wandb_run.py:_restore():2414] restore done +2024-11-15 10:44:25,914 INFO MainThread:2578626 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-15 10:44:25,915 INFO MainThread:2578626 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-15 10:44:25,995 INFO MainThread:2578626 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241128_161554-9wf9o0ou/files/config.yaml b/wandb/run-20241128_161554-9wf9o0ou/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..804aafd96faca51be3aba452a8e248fe2cbd48d6 --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/files/config.yaml @@ -0,0 +1,531 @@ +_name_or_path: + value: meta-llama/Llama-3.2-1B +_wandb: + value: + cli_version: 0.18.5 + m: + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 7 + - 13 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.5 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +batch_size: + value: 3 +begin_suppress_tokens: + value: null +bf16: + value: false +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: deepspeed_config/train_dp_config.json +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: 128001 +epoch: + value: 3 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: steps +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: true +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 2 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: false +half_precision_backend: + value: auto +head_dim: + value: 64 +hidden_act: + value: silu +hidden_size: + value: 2048 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: false +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 5e-06 +length_column_name: + value: length +length_penalty: + value: 1 +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging_dir: + value: ./logs +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lr: + value: 5e-06 +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 1235814400 +model_type: + value: llama +mp_parameters: + value: "" +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 16 +num_key_value_heads: + value: 8 +num_return_sequences: + value: 1 +num_train_epochs: + value: 3 +optim: + value: adamw_torch +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: ./checkpoints/Llama-3.2-1B/babylm_reverse_control_10M_seed0/runs +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +pad_token_id: + value: null +past_index: + value: -1 +per_device_eval_batch_size: + value: 8 +per_device_train_batch_size: + value: 3 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +perturbation: + value: reverse_control +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: ./checkpoints/Llama-3.2-1B/babylm_reverse_control_10M_seed0/runs +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 100 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 0 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: null +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +train_set: + value: 10M +transformers_version: + value: 4.45.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: true +use_cpu: + value: false +use_ipex: + value: false +use_legacy_prediction_loop: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0.1 +warmup_steps: + value: 0 +weight_decay: + value: 0 diff --git a/wandb/run-20241128_161554-9wf9o0ou/files/output.log b/wandb/run-20241128_161554-9wf9o0ou/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..91469b247fe3813563311c0fddc883a8ff181f18 --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/files/output.log @@ -0,0 +1,3233 @@ +config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 843/843 [00:00<00:00, 266kB/s] +model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.47G/2.47G [00:58<00:00, 42.0MB/s] +generation_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 41.5kB/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:48<00:00, 361.40 examples/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:50<00:00, 361.57 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-28 16:18:35,811] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-28 16:18:41,357] [INFO] [comm.py:652:init_distributed] cdb=None +[2024-11-28 16:18:41,357] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Loading extension module cpu_adam... +Time to load cpu_adam op: 28.037892818450928 seconds +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + +{'loss': 2.7949, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.0} +{'loss': 2.794, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.0} +{'loss': 2.849, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.0} +{'loss': 2.8447, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.0} +{'loss': 2.8176, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8343, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8089, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8487, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8257, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8376, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} + +{'eval_loss': 2.8435168266296387, 'eval_runtime': 24.1524, 'eval_samples_per_second': 41.404, 'eval_steps_per_second': 1.739, 'epoch': 0.01} +{'loss': 2.7898, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.7829, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8467, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8104, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.01} +{'loss': 2.8158, 'grad_norm': 0.0, 'learning_rate': 0.0, 'epoch': 0.02} +{'loss': 2.7963, 'grad_norm': 8.208587646484375, 'learning_rate': 1.7123287671232876e-08, 'epoch': 0.02} +{'loss': 2.7631, 'grad_norm': 7.8689494132995605, 'learning_rate': 3.424657534246575e-08, 'epoch': 0.02} +{'loss': 2.843, 'grad_norm': 7.920777797698975, 'learning_rate': 5.136986301369863e-08, 'epoch': 0.02} +{'loss': 2.7625, 'grad_norm': 8.07182788848877, 'learning_rate': 6.84931506849315e-08, 'epoch': 0.02} +{'loss': 2.8307, 'grad_norm': 7.792869567871094, 'learning_rate': 8.561643835616439e-08, 'epoch': 0.02} +{'eval_loss': 2.8431649208068848, 'eval_runtime': 24.3436, 'eval_samples_per_second': 41.079, 'eval_steps_per_second': 1.725, 'epoch': 0.02} +{'loss': 2.8443, 'grad_norm': 7.987356185913086, 'learning_rate': 1.0273972602739726e-07, 'epoch': 0.02} +{'loss': 2.8316, 'grad_norm': 7.717030048370361, 'learning_rate': 1.1986301369863014e-07, 'epoch': 0.02} +{'loss': 2.7948, 'grad_norm': 7.734523773193359, 'learning_rate': 1.36986301369863e-07, 'epoch': 0.02} +{'loss': 2.8119, 'grad_norm': 7.9570698738098145, 'learning_rate': 1.541095890410959e-07, 'epoch': 0.02} +{'loss': 2.801, 'grad_norm': 7.1653242111206055, 'learning_rate': 1.7123287671232878e-07, 'epoch': 0.03} +{'loss': 2.828, 'grad_norm': 7.6044087409973145, 'learning_rate': 1.8835616438356165e-07, 'epoch': 0.03} +{'loss': 2.7904, 'grad_norm': 7.117634296417236, 'learning_rate': 2.0547945205479452e-07, 'epoch': 0.03} +{'loss': 2.8134, 'grad_norm': 6.440175533294678, 'learning_rate': 2.226027397260274e-07, 'epoch': 0.03} +{'loss': 2.8129, 'grad_norm': 6.633477687835693, 'learning_rate': 2.397260273972603e-07, 'epoch': 0.03} +{'loss': 2.8084, 'grad_norm': 6.134422779083252, 'learning_rate': 2.568493150684932e-07, 'epoch': 0.03} +{'eval_loss': 2.8217151165008545, 'eval_runtime': 24.1079, 'eval_samples_per_second': 41.48, 'eval_steps_per_second': 1.742, 'epoch': 0.03} +{'loss': 2.7846, 'grad_norm': 6.14453649520874, 'learning_rate': 2.73972602739726e-07, 'epoch': 0.03} +{'loss': 2.8163, 'grad_norm': 4.827593803405762, 'learning_rate': 2.910958904109589e-07, 'epoch': 0.03} +{'loss': 2.7796, 'grad_norm': 4.798914909362793, 'learning_rate': 3.082191780821918e-07, 'epoch': 0.03} +{'loss': 2.7677, 'grad_norm': 4.513079643249512, 'learning_rate': 3.2534246575342466e-07, 'epoch': 0.03} +{'loss': 2.751, 'grad_norm': 4.523262977600098, 'learning_rate': 3.4246575342465755e-07, 'epoch': 0.04} +{'loss': 2.671, 'grad_norm': 4.647594451904297, 'learning_rate': 3.595890410958904e-07, 'epoch': 0.04} +{'loss': 2.7872, 'grad_norm': 4.41637659072876, 'learning_rate': 3.767123287671233e-07, 'epoch': 0.04} +{'loss': 2.7096, 'grad_norm': 4.175353527069092, 'learning_rate': 3.938356164383562e-07, 'epoch': 0.04} +{'loss': 2.7014, 'grad_norm': 4.172060012817383, 'learning_rate': 4.1095890410958903e-07, 'epoch': 0.04} +{'loss': 2.7485, 'grad_norm': 4.146830081939697, 'learning_rate': 4.2808219178082193e-07, 'epoch': 0.04} +{'eval_loss': 2.7425625324249268, 'eval_runtime': 24.1359, 'eval_samples_per_second': 41.432, 'eval_steps_per_second': 1.74, 'epoch': 0.04} +{'loss': 2.7056, 'grad_norm': 4.258517265319824, 'learning_rate': 4.452054794520548e-07, 'epoch': 0.04} +{'loss': 2.6478, 'grad_norm': 4.350872993469238, 'learning_rate': 4.6232876712328767e-07, 'epoch': 0.04} +{'loss': 2.7047, 'grad_norm': 4.057697772979736, 'learning_rate': 4.794520547945206e-07, 'epoch': 0.04} +{'loss': 2.6638, 'grad_norm': 3.800611734390259, 'learning_rate': 4.965753424657534e-07, 'epoch': 0.05} +{'loss': 2.6942, 'grad_norm': 3.6547670364379883, 'learning_rate': 5.136986301369864e-07, 'epoch': 0.05} +{'loss': 2.6794, 'grad_norm': 3.3958444595336914, 'learning_rate': 5.308219178082192e-07, 'epoch': 0.05} +{'loss': 2.7227, 'grad_norm': 3.4671475887298584, 'learning_rate': 5.47945205479452e-07, 'epoch': 0.05} +{'loss': 2.643, 'grad_norm': 3.4694061279296875, 'learning_rate': 5.65068493150685e-07, 'epoch': 0.05} +{'loss': 2.6573, 'grad_norm': 3.286062240600586, 'learning_rate': 5.821917808219178e-07, 'epoch': 0.05} +{'loss': 2.5999, 'grad_norm': 3.399857521057129, 'learning_rate': 5.993150684931507e-07, 'epoch': 0.05} +{'eval_loss': 2.667592763900757, 'eval_runtime': 24.1409, 'eval_samples_per_second': 41.424, 'eval_steps_per_second': 1.74, 'epoch': 0.05} +{'loss': 2.5836, 'grad_norm': 3.09999680519104, 'learning_rate': 6.164383561643836e-07, 'epoch': 0.05} +{'loss': 2.6383, 'grad_norm': 2.916637420654297, 'learning_rate': 6.335616438356165e-07, 'epoch': 0.05} +{'loss': 2.5648, 'grad_norm': 3.0454094409942627, 'learning_rate': 6.506849315068493e-07, 'epoch': 0.05} +{'loss': 2.6149, 'grad_norm': 2.8374462127685547, 'learning_rate': 6.678082191780823e-07, 'epoch': 0.06} +{'loss': 2.5845, 'grad_norm': 2.893265724182129, 'learning_rate': 6.849315068493151e-07, 'epoch': 0.06} +{'loss': 2.5536, 'grad_norm': 2.6646595001220703, 'learning_rate': 7.020547945205481e-07, 'epoch': 0.06} +{'loss': 2.581, 'grad_norm': 2.5691609382629395, 'learning_rate': 7.191780821917808e-07, 'epoch': 0.06} +{'loss': 2.5095, 'grad_norm': 2.675809144973755, 'learning_rate': 7.363013698630137e-07, 'epoch': 0.06} +{'loss': 2.5718, 'grad_norm': 2.448173999786377, 'learning_rate': 7.534246575342466e-07, 'epoch': 0.06} +{'loss': 2.5766, 'grad_norm': 2.6517527103424072, 'learning_rate': 7.705479452054795e-07, 'epoch': 0.06} +{'eval_loss': 2.6060078144073486, 'eval_runtime': 24.3949, 'eval_samples_per_second': 40.992, 'eval_steps_per_second': 1.722, 'epoch': 0.06} +{'loss': 2.5455, 'grad_norm': 2.60750412940979, 'learning_rate': 7.876712328767124e-07, 'epoch': 0.06} +{'loss': 2.5254, 'grad_norm': 2.389585494995117, 'learning_rate': 8.047945205479453e-07, 'epoch': 0.06} +{'loss': 2.5751, 'grad_norm': 2.701202154159546, 'learning_rate': 8.219178082191781e-07, 'epoch': 0.06} +{'loss': 2.5606, 'grad_norm': 2.5011820793151855, 'learning_rate': 8.39041095890411e-07, 'epoch': 0.07} +{'loss': 2.43, 'grad_norm': 2.517143964767456, 'learning_rate': 8.561643835616439e-07, 'epoch': 0.07} +{'loss': 2.5262, 'grad_norm': 2.296834945678711, 'learning_rate': 8.732876712328768e-07, 'epoch': 0.07} +{'loss': 2.4702, 'grad_norm': 2.21740460395813, 'learning_rate': 8.904109589041097e-07, 'epoch': 0.07} +{'loss': 2.5263, 'grad_norm': 2.5176782608032227, 'learning_rate': 9.075342465753426e-07, 'epoch': 0.07} +{'loss': 2.4791, 'grad_norm': 2.376972198486328, 'learning_rate': 9.246575342465753e-07, 'epoch': 0.07} +{'loss': 2.5813, 'grad_norm': 2.264622926712036, 'learning_rate': 9.417808219178083e-07, 'epoch': 0.07} +{'eval_loss': 2.5454611778259277, 'eval_runtime': 24.3769, 'eval_samples_per_second': 41.022, 'eval_steps_per_second': 1.723, 'epoch': 0.07} +{'loss': 2.4762, 'grad_norm': 2.2051570415496826, 'learning_rate': 9.589041095890411e-07, 'epoch': 0.07} +{'loss': 2.4346, 'grad_norm': 2.209958076477051, 'learning_rate': 9.76027397260274e-07, 'epoch': 0.07} +{'loss': 2.4929, 'grad_norm': 2.59735369682312, 'learning_rate': 9.931506849315068e-07, 'epoch': 0.07} +{'loss': 2.4637, 'grad_norm': 2.3152217864990234, 'learning_rate': 1.0102739726027399e-06, 'epoch': 0.08} +{'loss': 2.4602, 'grad_norm': 2.1851887702941895, 'learning_rate': 1.0273972602739727e-06, 'epoch': 0.08} +{'loss': 2.4479, 'grad_norm': 2.234846353530884, 'learning_rate': 1.0445205479452056e-06, 'epoch': 0.08} +{'loss': 2.4569, 'grad_norm': 2.203435182571411, 'learning_rate': 1.0616438356164384e-06, 'epoch': 0.08} +{'loss': 2.4679, 'grad_norm': 2.365788698196411, 'learning_rate': 1.0787671232876712e-06, 'epoch': 0.08} +{'loss': 2.4044, 'grad_norm': 2.2453958988189697, 'learning_rate': 1.095890410958904e-06, 'epoch': 0.08} +{'loss': 2.4457, 'grad_norm': 2.108588933944702, 'learning_rate': 1.1130136986301371e-06, 'epoch': 0.08} +{'eval_loss': 2.483847141265869, 'eval_runtime': 24.5919, 'eval_samples_per_second': 40.664, 'eval_steps_per_second': 1.708, 'epoch': 0.08} +{'loss': 2.3926, 'grad_norm': 2.12548565864563, 'learning_rate': 1.13013698630137e-06, 'epoch': 0.08} +{'loss': 2.4181, 'grad_norm': 2.2272450923919678, 'learning_rate': 1.1472602739726028e-06, 'epoch': 0.08} +{'loss': 2.4233, 'grad_norm': 2.2669615745544434, 'learning_rate': 1.1643835616438357e-06, 'epoch': 0.09} +{'loss': 2.3906, 'grad_norm': 2.1540491580963135, 'learning_rate': 1.1815068493150685e-06, 'epoch': 0.09} +{'loss': 2.3578, 'grad_norm': 2.2635693550109863, 'learning_rate': 1.1986301369863014e-06, 'epoch': 0.09} +{'loss': 2.4202, 'grad_norm': 2.8080427646636963, 'learning_rate': 1.2157534246575344e-06, 'epoch': 0.09} +{'loss': 2.3723, 'grad_norm': 2.4804835319519043, 'learning_rate': 1.2328767123287673e-06, 'epoch': 0.09} +{'loss': 2.3514, 'grad_norm': 2.43143630027771, 'learning_rate': 1.25e-06, 'epoch': 0.09} +{'loss': 2.3653, 'grad_norm': 2.3041858673095703, 'learning_rate': 1.267123287671233e-06, 'epoch': 0.09} +{'loss': 2.333, 'grad_norm': 2.603179454803467, 'learning_rate': 1.284246575342466e-06, 'epoch': 0.09} +{'eval_loss': 2.4336342811584473, 'eval_runtime': 24.8009, 'eval_samples_per_second': 40.321, 'eval_steps_per_second': 1.693, 'epoch': 0.09} +{'loss': 2.3422, 'grad_norm': 2.500437021255493, 'learning_rate': 1.3013698630136986e-06, 'epoch': 0.09} +{'loss': 2.3605, 'grad_norm': 2.5312438011169434, 'learning_rate': 1.3184931506849317e-06, 'epoch': 0.09} +{'loss': 2.339, 'grad_norm': 2.693019151687622, 'learning_rate': 1.3356164383561645e-06, 'epoch': 0.1} +{'loss': 2.3238, 'grad_norm': 2.756967544555664, 'learning_rate': 1.3527397260273976e-06, 'epoch': 0.1} +{'loss': 2.3837, 'grad_norm': 3.118143081665039, 'learning_rate': 1.3698630136986302e-06, 'epoch': 0.1} +{'loss': 2.2833, 'grad_norm': 2.717676877975464, 'learning_rate': 1.386986301369863e-06, 'epoch': 0.1} +{'loss': 2.3305, 'grad_norm': 2.60052752494812, 'learning_rate': 1.4041095890410961e-06, 'epoch': 0.1} +{'loss': 2.2287, 'grad_norm': 3.225191116333008, 'learning_rate': 1.421232876712329e-06, 'epoch': 0.1} +{'loss': 2.2596, 'grad_norm': 2.520937204360962, 'learning_rate': 1.4383561643835616e-06, 'epoch': 0.1} +{'loss': 2.2857, 'grad_norm': 3.3900930881500244, 'learning_rate': 1.4554794520547946e-06, 'epoch': 0.1} +{'eval_loss': 2.386864185333252, 'eval_runtime': 25.0939, 'eval_samples_per_second': 39.85, 'eval_steps_per_second': 1.674, 'epoch': 0.1} +{'loss': 2.2772, 'grad_norm': 2.6671180725097656, 'learning_rate': 1.4726027397260275e-06, 'epoch': 0.1} +{'loss': 2.2605, 'grad_norm': 2.609212875366211, 'learning_rate': 1.4897260273972605e-06, 'epoch': 0.1} +{'loss': 2.2182, 'grad_norm': 2.80885910987854, 'learning_rate': 1.5068493150684932e-06, 'epoch': 0.11} +{'loss': 2.2448, 'grad_norm': 2.74750018119812, 'learning_rate': 1.5239726027397262e-06, 'epoch': 0.11} +{'loss': 2.2939, 'grad_norm': 3.8605799674987793, 'learning_rate': 1.541095890410959e-06, 'epoch': 0.11} +{'loss': 2.2631, 'grad_norm': 3.5136616230010986, 'learning_rate': 1.5582191780821921e-06, 'epoch': 0.11} +{'loss': 2.2944, 'grad_norm': 4.318981647491455, 'learning_rate': 1.5753424657534248e-06, 'epoch': 0.11} +{'loss': 2.2668, 'grad_norm': 4.236979007720947, 'learning_rate': 1.5924657534246576e-06, 'epoch': 0.11} +{'loss': 2.1353, 'grad_norm': 4.898462772369385, 'learning_rate': 1.6095890410958907e-06, 'epoch': 0.11} +{'loss': 2.2097, 'grad_norm': 2.9506707191467285, 'learning_rate': 1.6267123287671235e-06, 'epoch': 0.11} +{'eval_loss': 2.3383545875549316, 'eval_runtime': 24.9453, 'eval_samples_per_second': 40.088, 'eval_steps_per_second': 1.684, 'epoch': 0.11} +{'loss': 2.1319, 'grad_norm': 4.7352495193481445, 'learning_rate': 1.6438356164383561e-06, 'epoch': 0.11} +{'loss': 2.223, 'grad_norm': 3.111501455307007, 'learning_rate': 1.6609589041095892e-06, 'epoch': 0.12} +{'loss': 2.1579, 'grad_norm': 5.210466384887695, 'learning_rate': 1.678082191780822e-06, 'epoch': 0.12} +{'loss': 2.2071, 'grad_norm': 4.084750175476074, 'learning_rate': 1.695205479452055e-06, 'epoch': 0.12} +{'loss': 2.1446, 'grad_norm': 4.3796491622924805, 'learning_rate': 1.7123287671232877e-06, 'epoch': 0.12} +{'loss': 2.2324, 'grad_norm': 3.9907236099243164, 'learning_rate': 1.7294520547945206e-06, 'epoch': 0.12} +{'loss': 2.1836, 'grad_norm': 3.408911943435669, 'learning_rate': 1.7465753424657536e-06, 'epoch': 0.12} +{'loss': 2.1864, 'grad_norm': 4.777821063995361, 'learning_rate': 1.7636986301369865e-06, 'epoch': 0.12} +{'loss': 2.1599, 'grad_norm': 3.0228383541107178, 'learning_rate': 1.7808219178082193e-06, 'epoch': 0.12} +{'loss': 2.2547, 'grad_norm': 3.9669029712677, 'learning_rate': 1.7979452054794521e-06, 'epoch': 0.12} +{'eval_loss': 2.29109525680542, 'eval_runtime': 24.43, 'eval_samples_per_second': 40.933, 'eval_steps_per_second': 1.719, 'epoch': 0.12} +{'loss': 2.182, 'grad_norm': 3.1892316341400146, 'learning_rate': 1.8150684931506852e-06, 'epoch': 0.12} +{'loss': 2.144, 'grad_norm': 4.223985195159912, 'learning_rate': 1.832191780821918e-06, 'epoch': 0.13} +{'loss': 2.0973, 'grad_norm': 2.7451999187469482, 'learning_rate': 1.8493150684931507e-06, 'epoch': 0.13} +{'loss': 2.1177, 'grad_norm': 4.296680450439453, 'learning_rate': 1.8664383561643837e-06, 'epoch': 0.13} +{'loss': 2.0853, 'grad_norm': 3.5394911766052246, 'learning_rate': 1.8835616438356166e-06, 'epoch': 0.13} +{'loss': 2.1535, 'grad_norm': 3.9397950172424316, 'learning_rate': 1.9006849315068496e-06, 'epoch': 0.13} +{'loss': 2.0591, 'grad_norm': 3.6749508380889893, 'learning_rate': 1.9178082191780823e-06, 'epoch': 0.13} +{'loss': 2.1321, 'grad_norm': 4.914153099060059, 'learning_rate': 1.9349315068493153e-06, 'epoch': 0.13} +{'loss': 2.1647, 'grad_norm': 3.588837146759033, 'learning_rate': 1.952054794520548e-06, 'epoch': 0.13} +{'loss': 2.1609, 'grad_norm': 4.999088764190674, 'learning_rate': 1.969178082191781e-06, 'epoch': 0.13} +{'eval_loss': 2.2510268688201904, 'eval_runtime': 24.2672, 'eval_samples_per_second': 41.208, 'eval_steps_per_second': 1.731, 'epoch': 0.13} +{'loss': 2.0749, 'grad_norm': 3.8223671913146973, 'learning_rate': 1.9863013698630136e-06, 'epoch': 0.13} +{'loss': 2.1483, 'grad_norm': 5.629230976104736, 'learning_rate': 2.0034246575342467e-06, 'epoch': 0.14} +{'loss': 2.134, 'grad_norm': 3.7592365741729736, 'learning_rate': 2.0205479452054797e-06, 'epoch': 0.14} +{'loss': 2.0772, 'grad_norm': 5.7058539390563965, 'learning_rate': 2.037671232876713e-06, 'epoch': 0.14} +{'loss': 2.1081, 'grad_norm': 4.703887462615967, 'learning_rate': 2.0547945205479454e-06, 'epoch': 0.14} +{'loss': 2.1209, 'grad_norm': 4.447408199310303, 'learning_rate': 2.071917808219178e-06, 'epoch': 0.14} +{'loss': 2.051, 'grad_norm': 6.202982425689697, 'learning_rate': 2.089041095890411e-06, 'epoch': 0.14} +{'loss': 2.1258, 'grad_norm': 3.526312828063965, 'learning_rate': 2.106164383561644e-06, 'epoch': 0.14} +{'loss': 2.0753, 'grad_norm': 5.085482120513916, 'learning_rate': 2.123287671232877e-06, 'epoch': 0.14} +{'loss': 2.0878, 'grad_norm': 4.764073848724365, 'learning_rate': 2.14041095890411e-06, 'epoch': 0.14} +{'eval_loss': 2.2408313751220703, 'eval_runtime': 24.6972, 'eval_samples_per_second': 40.49, 'eval_steps_per_second': 1.701, 'epoch': 0.14} +{'loss': 2.0856, 'grad_norm': 4.156815528869629, 'learning_rate': 2.1575342465753425e-06, 'epoch': 0.14} +{'loss': 2.0428, 'grad_norm': 5.981069087982178, 'learning_rate': 2.1746575342465755e-06, 'epoch': 0.15} +{'loss': 2.1689, 'grad_norm': 3.9684646129608154, 'learning_rate': 2.191780821917808e-06, 'epoch': 0.15} +{'loss': 2.0539, 'grad_norm': 4.107537269592285, 'learning_rate': 2.2089041095890412e-06, 'epoch': 0.15} +{'loss': 2.0323, 'grad_norm': 3.402860164642334, 'learning_rate': 2.2260273972602743e-06, 'epoch': 0.15} +{'loss': 2.0407, 'grad_norm': 3.392444610595703, 'learning_rate': 2.243150684931507e-06, 'epoch': 0.15} +{'loss': 2.0392, 'grad_norm': 4.484562397003174, 'learning_rate': 2.26027397260274e-06, 'epoch': 0.15} +{'loss': 2.0697, 'grad_norm': 3.4777073860168457, 'learning_rate': 2.2773972602739726e-06, 'epoch': 0.15} +{'loss': 2.154, 'grad_norm': 3.495891571044922, 'learning_rate': 2.2945205479452057e-06, 'epoch': 0.15} +{'loss': 2.0704, 'grad_norm': 4.008520603179932, 'learning_rate': 2.3116438356164387e-06, 'epoch': 0.15} +{'eval_loss': 2.2247276306152344, 'eval_runtime': 24.8699, 'eval_samples_per_second': 40.209, 'eval_steps_per_second': 1.689, 'epoch': 0.15} +{'loss': 1.9749, 'grad_norm': 6.638280391693115, 'learning_rate': 2.3287671232876713e-06, 'epoch': 0.16} +{'loss': 2.0321, 'grad_norm': 4.62777853012085, 'learning_rate': 2.3458904109589044e-06, 'epoch': 0.16} +{'loss': 2.0777, 'grad_norm': 4.346943378448486, 'learning_rate': 2.363013698630137e-06, 'epoch': 0.16} +{'loss': 2.0629, 'grad_norm': 3.7880513668060303, 'learning_rate': 2.38013698630137e-06, 'epoch': 0.16} +{'loss': 2.0229, 'grad_norm': 5.840289115905762, 'learning_rate': 2.3972602739726027e-06, 'epoch': 0.16} +{'loss': 1.9757, 'grad_norm': 5.146554470062256, 'learning_rate': 2.4143835616438358e-06, 'epoch': 0.16} +{'loss': 2.044, 'grad_norm': 4.507009983062744, 'learning_rate': 2.431506849315069e-06, 'epoch': 0.16} +{'loss': 2.058, 'grad_norm': 5.486617088317871, 'learning_rate': 2.4486301369863015e-06, 'epoch': 0.16} +{'loss': 2.0126, 'grad_norm': 3.572096824645996, 'learning_rate': 2.4657534246575345e-06, 'epoch': 0.16} +{'loss': 2.0505, 'grad_norm': 5.710933208465576, 'learning_rate': 2.482876712328767e-06, 'epoch': 0.16} +{'eval_loss': 2.1910157203674316, 'eval_runtime': 25.0103, 'eval_samples_per_second': 39.984, 'eval_steps_per_second': 1.679, 'epoch': 0.16} +{'loss': 2.0301, 'grad_norm': 3.9634411334991455, 'learning_rate': 2.5e-06, 'epoch': 0.17} +{'loss': 2.0099, 'grad_norm': 5.026034832000732, 'learning_rate': 2.5171232876712333e-06, 'epoch': 0.17} +{'loss': 1.9731, 'grad_norm': 4.158727645874023, 'learning_rate': 2.534246575342466e-06, 'epoch': 0.17} +{'loss': 2.0306, 'grad_norm': 7.127386569976807, 'learning_rate': 2.551369863013699e-06, 'epoch': 0.17} +{'loss': 2.0547, 'grad_norm': 4.547935962677002, 'learning_rate': 2.568493150684932e-06, 'epoch': 0.17} +{'loss': 2.0326, 'grad_norm': 6.289664268493652, 'learning_rate': 2.585616438356164e-06, 'epoch': 0.17} +{'loss': 1.9741, 'grad_norm': 5.794668674468994, 'learning_rate': 2.6027397260273973e-06, 'epoch': 0.17} +{'loss': 2.0319, 'grad_norm': 5.8975958824157715, 'learning_rate': 2.6198630136986303e-06, 'epoch': 0.17} +{'loss': 2.0563, 'grad_norm': 4.834619522094727, 'learning_rate': 2.6369863013698634e-06, 'epoch': 0.17} +{'loss': 2.0455, 'grad_norm': 5.357684135437012, 'learning_rate': 2.654109589041096e-06, 'epoch': 0.17} +{'eval_loss': 2.1876258850097656, 'eval_runtime': 25.2667, 'eval_samples_per_second': 39.578, 'eval_steps_per_second': 1.662, 'epoch': 0.17} +{'loss': 1.9795, 'grad_norm': 5.238529682159424, 'learning_rate': 2.671232876712329e-06, 'epoch': 0.18} +{'loss': 1.9811, 'grad_norm': 3.971308708190918, 'learning_rate': 2.688356164383562e-06, 'epoch': 0.18} +{'loss': 1.9665, 'grad_norm': 4.34420919418335, 'learning_rate': 2.705479452054795e-06, 'epoch': 0.18} +{'loss': 1.9195, 'grad_norm': 3.1406633853912354, 'learning_rate': 2.7226027397260274e-06, 'epoch': 0.18} +{'loss': 2.0396, 'grad_norm': 4.136984348297119, 'learning_rate': 2.7397260273972604e-06, 'epoch': 0.18} +{'loss': 2.0181, 'grad_norm': 3.515148162841797, 'learning_rate': 2.7568493150684935e-06, 'epoch': 0.18} +{'loss': 1.9259, 'grad_norm': 4.206666469573975, 'learning_rate': 2.773972602739726e-06, 'epoch': 0.18} +{'loss': 2.0168, 'grad_norm': 4.697192192077637, 'learning_rate': 2.791095890410959e-06, 'epoch': 0.18} +{'loss': 1.8965, 'grad_norm': 4.333245277404785, 'learning_rate': 2.8082191780821922e-06, 'epoch': 0.18} +{'loss': 2.0626, 'grad_norm': 4.301321029663086, 'learning_rate': 2.8253424657534253e-06, 'epoch': 0.18} +{'eval_loss': 2.167128086090088, 'eval_runtime': 20.8221, 'eval_samples_per_second': 48.026, 'eval_steps_per_second': 2.017, 'epoch': 0.18} +{'loss': 2.0196, 'grad_norm': 3.429021120071411, 'learning_rate': 2.842465753424658e-06, 'epoch': 0.19} +{'loss': 2.0247, 'grad_norm': 5.104475498199463, 'learning_rate': 2.8595890410958905e-06, 'epoch': 0.19} +{'loss': 2.0341, 'grad_norm': 4.321488857269287, 'learning_rate': 2.876712328767123e-06, 'epoch': 0.19} +{'loss': 1.8973, 'grad_norm': 4.264700412750244, 'learning_rate': 2.8938356164383562e-06, 'epoch': 0.19} +{'loss': 1.9188, 'grad_norm': 5.179917812347412, 'learning_rate': 2.9109589041095893e-06, 'epoch': 0.19} +{'loss': 1.9092, 'grad_norm': 4.154581546783447, 'learning_rate': 2.9280821917808223e-06, 'epoch': 0.19} +{'loss': 1.9265, 'grad_norm': 4.831721782684326, 'learning_rate': 2.945205479452055e-06, 'epoch': 0.19} +{'loss': 1.987, 'grad_norm': 4.270281791687012, 'learning_rate': 2.962328767123288e-06, 'epoch': 0.19} +{'loss': 1.9625, 'grad_norm': 4.06156063079834, 'learning_rate': 2.979452054794521e-06, 'epoch': 0.19} +{'loss': 1.9175, 'grad_norm': 4.434120178222656, 'learning_rate': 2.9965753424657533e-06, 'epoch': 0.2} +{'eval_loss': 2.1517720222473145, 'eval_runtime': 19.5094, 'eval_samples_per_second': 51.257, 'eval_steps_per_second': 2.153, 'epoch': 0.2} +{'loss': 1.9705, 'grad_norm': 4.340418338775635, 'learning_rate': 3.0136986301369864e-06, 'epoch': 0.2} +{'loss': 1.9581, 'grad_norm': 4.136503219604492, 'learning_rate': 3.0308219178082194e-06, 'epoch': 0.2} +{'loss': 1.905, 'grad_norm': 4.594971179962158, 'learning_rate': 3.0479452054794525e-06, 'epoch': 0.2} +{'loss': 1.9456, 'grad_norm': 4.170224666595459, 'learning_rate': 3.065068493150685e-06, 'epoch': 0.2} +{'loss': 1.9014, 'grad_norm': 4.011481761932373, 'learning_rate': 3.082191780821918e-06, 'epoch': 0.2} +{'loss': 1.9255, 'grad_norm': 5.140495777130127, 'learning_rate': 3.099315068493151e-06, 'epoch': 0.2} +{'loss': 1.9157, 'grad_norm': 3.6050641536712646, 'learning_rate': 3.1164383561643843e-06, 'epoch': 0.2} +{'loss': 1.8729, 'grad_norm': 4.116490364074707, 'learning_rate': 3.1335616438356165e-06, 'epoch': 0.2} +{'loss': 2.0275, 'grad_norm': 4.74360990524292, 'learning_rate': 3.1506849315068495e-06, 'epoch': 0.2} +{'loss': 1.8788, 'grad_norm': 4.481844425201416, 'learning_rate': 3.167808219178082e-06, 'epoch': 0.21} +{'eval_loss': 2.140434503555298, 'eval_runtime': 19.4896, 'eval_samples_per_second': 51.309, 'eval_steps_per_second': 2.155, 'epoch': 0.21} +{'loss': 1.8827, 'grad_norm': 4.030834197998047, 'learning_rate': 3.184931506849315e-06, 'epoch': 0.21} +{'loss': 1.8617, 'grad_norm': 5.015613079071045, 'learning_rate': 3.2020547945205483e-06, 'epoch': 0.21} +{'loss': 1.8483, 'grad_norm': 4.214653491973877, 'learning_rate': 3.2191780821917813e-06, 'epoch': 0.21} +{'loss': 1.8403, 'grad_norm': 4.806234359741211, 'learning_rate': 3.236301369863014e-06, 'epoch': 0.21} +{'loss': 1.8219, 'grad_norm': 5.836562633514404, 'learning_rate': 3.253424657534247e-06, 'epoch': 0.21} +{'loss': 1.914, 'grad_norm': 4.185085296630859, 'learning_rate': 3.2705479452054796e-06, 'epoch': 0.21} +{'loss': 1.8774, 'grad_norm': 4.9068989753723145, 'learning_rate': 3.2876712328767123e-06, 'epoch': 0.21} +{'loss': 1.852, 'grad_norm': 4.142357349395752, 'learning_rate': 3.3047945205479453e-06, 'epoch': 0.21} +{'loss': 1.8588, 'grad_norm': 4.465056419372559, 'learning_rate': 3.3219178082191784e-06, 'epoch': 0.21} +{'loss': 1.8957, 'grad_norm': 4.0861687660217285, 'learning_rate': 3.3390410958904114e-06, 'epoch': 0.22} +{'eval_loss': 2.1248621940612793, 'eval_runtime': 19.5068, 'eval_samples_per_second': 51.264, 'eval_steps_per_second': 2.153, 'epoch': 0.22} +{'loss': 1.9059, 'grad_norm': 4.966719627380371, 'learning_rate': 3.356164383561644e-06, 'epoch': 0.22} +{'loss': 1.9623, 'grad_norm': 5.384037494659424, 'learning_rate': 3.373287671232877e-06, 'epoch': 0.22} +{'loss': 1.9096, 'grad_norm': 4.472937107086182, 'learning_rate': 3.39041095890411e-06, 'epoch': 0.22} +{'loss': 1.9357, 'grad_norm': 4.410476207733154, 'learning_rate': 3.4075342465753424e-06, 'epoch': 0.22} +{'loss': 1.832, 'grad_norm': 4.773514270782471, 'learning_rate': 3.4246575342465754e-06, 'epoch': 0.22} +{'loss': 1.8869, 'grad_norm': 5.351006507873535, 'learning_rate': 3.4417808219178085e-06, 'epoch': 0.22} +{'loss': 1.8821, 'grad_norm': 4.096711158752441, 'learning_rate': 3.458904109589041e-06, 'epoch': 0.22} +{'loss': 1.8967, 'grad_norm': 4.471275806427002, 'learning_rate': 3.476027397260274e-06, 'epoch': 0.22} +{'loss': 1.883, 'grad_norm': 5.158395767211914, 'learning_rate': 3.4931506849315072e-06, 'epoch': 0.22} +{'loss': 1.8933, 'grad_norm': 5.52556037902832, 'learning_rate': 3.5102739726027403e-06, 'epoch': 0.23} +{'eval_loss': 2.123084783554077, 'eval_runtime': 19.4901, 'eval_samples_per_second': 51.308, 'eval_steps_per_second': 2.155, 'epoch': 0.23} +{'loss': 1.8848, 'grad_norm': 4.790727138519287, 'learning_rate': 3.527397260273973e-06, 'epoch': 0.23} +{'loss': 1.8611, 'grad_norm': 5.05641508102417, 'learning_rate': 3.5445205479452056e-06, 'epoch': 0.23} +{'loss': 1.8707, 'grad_norm': 3.449965476989746, 'learning_rate': 3.5616438356164386e-06, 'epoch': 0.23} +{'loss': 1.9363, 'grad_norm': 5.298236846923828, 'learning_rate': 3.5787671232876712e-06, 'epoch': 0.23} +{'loss': 1.899, 'grad_norm': 3.540912628173828, 'learning_rate': 3.5958904109589043e-06, 'epoch': 0.23} +{'loss': 1.8447, 'grad_norm': 4.450790882110596, 'learning_rate': 3.6130136986301373e-06, 'epoch': 0.23} +{'loss': 1.8608, 'grad_norm': 3.6753990650177, 'learning_rate': 3.6301369863013704e-06, 'epoch': 0.23} +{'loss': 1.9021, 'grad_norm': 4.823665618896484, 'learning_rate': 3.647260273972603e-06, 'epoch': 0.23} +{'loss': 1.9035, 'grad_norm': 4.010214805603027, 'learning_rate': 3.664383561643836e-06, 'epoch': 0.24} +{'loss': 1.8354, 'grad_norm': 4.114319324493408, 'learning_rate': 3.6815068493150687e-06, 'epoch': 0.24} +{'eval_loss': 2.1166093349456787, 'eval_runtime': 19.5106, 'eval_samples_per_second': 51.254, 'eval_steps_per_second': 2.153, 'epoch': 0.24} +{'loss': 1.8652, 'grad_norm': 3.9963600635528564, 'learning_rate': 3.6986301369863014e-06, 'epoch': 0.24} +{'loss': 1.8168, 'grad_norm': 3.886843681335449, 'learning_rate': 3.7157534246575344e-06, 'epoch': 0.24} +{'loss': 1.9244, 'grad_norm': 4.360881805419922, 'learning_rate': 3.7328767123287675e-06, 'epoch': 0.24} +{'loss': 1.8435, 'grad_norm': 5.23706579208374, 'learning_rate': 3.7500000000000005e-06, 'epoch': 0.24} +{'loss': 1.8769, 'grad_norm': 4.846019744873047, 'learning_rate': 3.767123287671233e-06, 'epoch': 0.24} +{'loss': 1.9074, 'grad_norm': 4.204360008239746, 'learning_rate': 3.784246575342466e-06, 'epoch': 0.24} +{'loss': 1.8329, 'grad_norm': 5.047863483428955, 'learning_rate': 3.8013698630136993e-06, 'epoch': 0.24} +{'loss': 1.8621, 'grad_norm': 3.358213186264038, 'learning_rate': 3.818493150684932e-06, 'epoch': 0.24} +{'loss': 1.7994, 'grad_norm': 5.258899211883545, 'learning_rate': 3.8356164383561645e-06, 'epoch': 0.25} +{'loss': 1.9193, 'grad_norm': 4.4145355224609375, 'learning_rate': 3.852739726027397e-06, 'epoch': 0.25} +{'eval_loss': 2.089660167694092, 'eval_runtime': 19.5228, 'eval_samples_per_second': 51.222, 'eval_steps_per_second': 2.151, 'epoch': 0.25} +{'loss': 1.8957, 'grad_norm': 3.3204946517944336, 'learning_rate': 3.869863013698631e-06, 'epoch': 0.25} +{'loss': 1.8339, 'grad_norm': 4.67922306060791, 'learning_rate': 3.886986301369863e-06, 'epoch': 0.25} +{'loss': 1.8432, 'grad_norm': 4.187448501586914, 'learning_rate': 3.904109589041096e-06, 'epoch': 0.25} +{'loss': 1.8022, 'grad_norm': 4.468316555023193, 'learning_rate': 3.921232876712329e-06, 'epoch': 0.25} +{'loss': 1.8194, 'grad_norm': 3.7287988662719727, 'learning_rate': 3.938356164383562e-06, 'epoch': 0.25} +{'loss': 1.7836, 'grad_norm': 4.8201704025268555, 'learning_rate': 3.9554794520547955e-06, 'epoch': 0.25} +{'loss': 1.8668, 'grad_norm': 3.69752836227417, 'learning_rate': 3.972602739726027e-06, 'epoch': 0.25} +{'loss': 1.792, 'grad_norm': 4.192770481109619, 'learning_rate': 3.989726027397261e-06, 'epoch': 0.25} +{'loss': 1.8577, 'grad_norm': 3.6263973712921143, 'learning_rate': 4.006849315068493e-06, 'epoch': 0.26} +{'loss': 1.864, 'grad_norm': 4.122676849365234, 'learning_rate': 4.023972602739726e-06, 'epoch': 0.26} +{'eval_loss': 2.0818262100219727, 'eval_runtime': 19.5256, 'eval_samples_per_second': 51.215, 'eval_steps_per_second': 2.151, 'epoch': 0.26} +{'loss': 1.8325, 'grad_norm': 3.9878997802734375, 'learning_rate': 4.0410958904109595e-06, 'epoch': 0.26} +{'loss': 1.8315, 'grad_norm': 3.5833492279052734, 'learning_rate': 4.058219178082192e-06, 'epoch': 0.26} +{'loss': 1.8534, 'grad_norm': 4.091355800628662, 'learning_rate': 4.075342465753426e-06, 'epoch': 0.26} +{'loss': 1.8168, 'grad_norm': 3.4195785522460938, 'learning_rate': 4.092465753424658e-06, 'epoch': 0.26} +{'loss': 1.8095, 'grad_norm': 3.8302130699157715, 'learning_rate': 4.109589041095891e-06, 'epoch': 0.26} +{'loss': 1.8832, 'grad_norm': 4.214291095733643, 'learning_rate': 4.1267123287671235e-06, 'epoch': 0.26} +{'loss': 1.8274, 'grad_norm': 4.5247015953063965, 'learning_rate': 4.143835616438356e-06, 'epoch': 0.26} +{'loss': 1.9014, 'grad_norm': 4.118529796600342, 'learning_rate': 4.16095890410959e-06, 'epoch': 0.27} +{'loss': 1.8724, 'grad_norm': 4.841413974761963, 'learning_rate': 4.178082191780822e-06, 'epoch': 0.27} +{'loss': 1.8059, 'grad_norm': 4.034916400909424, 'learning_rate': 4.195205479452055e-06, 'epoch': 0.27} +{'eval_loss': 2.0584115982055664, 'eval_runtime': 19.5321, 'eval_samples_per_second': 51.198, 'eval_steps_per_second': 2.15, 'epoch': 0.27} +{'loss': 1.8169, 'grad_norm': 4.246167182922363, 'learning_rate': 4.212328767123288e-06, 'epoch': 0.27} +{'loss': 1.816, 'grad_norm': 4.718570709228516, 'learning_rate': 4.229452054794521e-06, 'epoch': 0.27} +{'loss': 1.8197, 'grad_norm': 3.6808481216430664, 'learning_rate': 4.246575342465754e-06, 'epoch': 0.27} +{'loss': 1.8052, 'grad_norm': 5.385921001434326, 'learning_rate': 4.263698630136986e-06, 'epoch': 0.27} +{'loss': 1.7695, 'grad_norm': 4.946651458740234, 'learning_rate': 4.28082191780822e-06, 'epoch': 0.27} +{'loss': 1.9272, 'grad_norm': 4.222400665283203, 'learning_rate': 4.297945205479452e-06, 'epoch': 0.27} +{'loss': 1.8212, 'grad_norm': 4.6753950119018555, 'learning_rate': 4.315068493150685e-06, 'epoch': 0.27} +{'loss': 1.8055, 'grad_norm': 5.108915328979492, 'learning_rate': 4.3321917808219185e-06, 'epoch': 0.28} +{'loss': 1.8206, 'grad_norm': 3.7991561889648438, 'learning_rate': 4.349315068493151e-06, 'epoch': 0.28} +{'loss': 1.7956, 'grad_norm': 4.0541510581970215, 'learning_rate': 4.3664383561643846e-06, 'epoch': 0.28} +{'eval_loss': 2.050187826156616, 'eval_runtime': 19.5474, 'eval_samples_per_second': 51.158, 'eval_steps_per_second': 2.149, 'epoch': 0.28} +{'loss': 1.7936, 'grad_norm': 4.167999267578125, 'learning_rate': 4.383561643835616e-06, 'epoch': 0.28} +{'loss': 1.8782, 'grad_norm': 3.854851722717285, 'learning_rate': 4.40068493150685e-06, 'epoch': 0.28} +{'loss': 1.8294, 'grad_norm': 4.948131561279297, 'learning_rate': 4.4178082191780825e-06, 'epoch': 0.28} +{'loss': 1.802, 'grad_norm': 3.7915711402893066, 'learning_rate': 4.434931506849315e-06, 'epoch': 0.28} +{'loss': 1.7662, 'grad_norm': 4.456705570220947, 'learning_rate': 4.4520547945205486e-06, 'epoch': 0.28} +{'loss': 1.8213, 'grad_norm': 3.5915284156799316, 'learning_rate': 4.469178082191781e-06, 'epoch': 0.28} +{'loss': 1.8095, 'grad_norm': 5.5341596603393555, 'learning_rate': 4.486301369863014e-06, 'epoch': 0.28} +{'loss': 1.8524, 'grad_norm': 4.536434650421143, 'learning_rate': 4.503424657534247e-06, 'epoch': 0.29} +{'loss': 1.7485, 'grad_norm': 4.295739650726318, 'learning_rate': 4.52054794520548e-06, 'epoch': 0.29} +{'loss': 1.7839, 'grad_norm': 3.646023988723755, 'learning_rate': 4.537671232876713e-06, 'epoch': 0.29} +{'eval_loss': 2.042029857635498, 'eval_runtime': 19.5869, 'eval_samples_per_second': 51.055, 'eval_steps_per_second': 2.144, 'epoch': 0.29} +{'loss': 1.8394, 'grad_norm': 4.040236949920654, 'learning_rate': 4.554794520547945e-06, 'epoch': 0.29} +{'loss': 1.8368, 'grad_norm': 3.6592211723327637, 'learning_rate': 4.571917808219179e-06, 'epoch': 0.29} +{'loss': 1.8212, 'grad_norm': 4.579453468322754, 'learning_rate': 4.589041095890411e-06, 'epoch': 0.29} +{'loss': 1.7838, 'grad_norm': 4.745927810668945, 'learning_rate': 4.606164383561644e-06, 'epoch': 0.29} +{'loss': 1.7926, 'grad_norm': 4.577824115753174, 'learning_rate': 4.6232876712328774e-06, 'epoch': 0.29} +{'loss': 1.8073, 'grad_norm': 4.244920253753662, 'learning_rate': 4.64041095890411e-06, 'epoch': 0.29} +{'loss': 1.7752, 'grad_norm': 5.21291971206665, 'learning_rate': 4.657534246575343e-06, 'epoch': 0.29} +{'loss': 1.8187, 'grad_norm': 4.451918601989746, 'learning_rate': 4.674657534246575e-06, 'epoch': 0.3} +{'loss': 1.7942, 'grad_norm': 4.4631876945495605, 'learning_rate': 4.691780821917809e-06, 'epoch': 0.3} +{'loss': 1.8457, 'grad_norm': 4.721055507659912, 'learning_rate': 4.7089041095890414e-06, 'epoch': 0.3} +{'eval_loss': 2.062126398086548, 'eval_runtime': 19.5829, 'eval_samples_per_second': 51.065, 'eval_steps_per_second': 2.145, 'epoch': 0.3} +{'loss': 1.8013, 'grad_norm': 4.626242637634277, 'learning_rate': 4.726027397260274e-06, 'epoch': 0.3} +{'loss': 1.8022, 'grad_norm': 3.8861474990844727, 'learning_rate': 4.7431506849315075e-06, 'epoch': 0.3} +{'loss': 1.8125, 'grad_norm': 4.291984558105469, 'learning_rate': 4.76027397260274e-06, 'epoch': 0.3} +{'loss': 1.7567, 'grad_norm': 3.466405153274536, 'learning_rate': 4.777397260273973e-06, 'epoch': 0.3} +{'loss': 1.7877, 'grad_norm': 5.833539962768555, 'learning_rate': 4.7945205479452054e-06, 'epoch': 0.3} +{'loss': 1.7814, 'grad_norm': 4.091037750244141, 'learning_rate': 4.811643835616439e-06, 'epoch': 0.3} +{'loss': 1.7255, 'grad_norm': 4.735819339752197, 'learning_rate': 4.8287671232876716e-06, 'epoch': 0.31} +{'loss': 1.7625, 'grad_norm': 3.6650075912475586, 'learning_rate': 4.845890410958904e-06, 'epoch': 0.31} +{'loss': 1.7703, 'grad_norm': 4.763848304748535, 'learning_rate': 4.863013698630138e-06, 'epoch': 0.31} +{'loss': 1.7614, 'grad_norm': 4.726076126098633, 'learning_rate': 4.88013698630137e-06, 'epoch': 0.31} +{'eval_loss': 2.049801826477051, 'eval_runtime': 19.6068, 'eval_samples_per_second': 51.003, 'eval_steps_per_second': 2.142, 'epoch': 0.31} +{'loss': 1.7426, 'grad_norm': 4.360836505889893, 'learning_rate': 4.897260273972603e-06, 'epoch': 0.31} +{'loss': 1.8619, 'grad_norm': 4.374937534332275, 'learning_rate': 4.914383561643836e-06, 'epoch': 0.31} +{'loss': 1.787, 'grad_norm': 4.182437419891357, 'learning_rate': 4.931506849315069e-06, 'epoch': 0.31} +{'loss': 1.8177, 'grad_norm': 4.201266765594482, 'learning_rate': 4.948630136986302e-06, 'epoch': 0.31} +{'loss': 1.8603, 'grad_norm': 4.670989036560059, 'learning_rate': 4.965753424657534e-06, 'epoch': 0.31} +{'loss': 1.7316, 'grad_norm': 4.937220573425293, 'learning_rate': 4.982876712328768e-06, 'epoch': 0.31} +{'loss': 1.7626, 'grad_norm': 4.001572608947754, 'learning_rate': 5e-06, 'epoch': 0.32} +{'loss': 1.716, 'grad_norm': 4.248878479003906, 'learning_rate': 4.998096688237534e-06, 'epoch': 0.32} +{'loss': 1.8201, 'grad_norm': 3.746990919113159, 'learning_rate': 4.996193376475067e-06, 'epoch': 0.32} +{'loss': 1.695, 'grad_norm': 4.13502836227417, 'learning_rate': 4.9942900647126005e-06, 'epoch': 0.32} +{'eval_loss': 2.0357320308685303, 'eval_runtime': 19.5845, 'eval_samples_per_second': 51.061, 'eval_steps_per_second': 2.145, 'epoch': 0.32} +{'loss': 1.708, 'grad_norm': 4.243040084838867, 'learning_rate': 4.992386752950134e-06, 'epoch': 0.32} +{'loss': 1.7478, 'grad_norm': 4.323862075805664, 'learning_rate': 4.990483441187667e-06, 'epoch': 0.32} +{'loss': 1.77, 'grad_norm': 3.3732717037200928, 'learning_rate': 4.9885801294252e-06, 'epoch': 0.32} +{'loss': 1.7743, 'grad_norm': 4.087417125701904, 'learning_rate': 4.986676817662733e-06, 'epoch': 0.32} +{'loss': 1.7339, 'grad_norm': 3.719357490539551, 'learning_rate': 4.9847735059002665e-06, 'epoch': 0.32} +{'loss': 1.8099, 'grad_norm': 4.961339473724365, 'learning_rate': 4.9828701941378e-06, 'epoch': 0.32} +{'loss': 1.7439, 'grad_norm': 3.550788402557373, 'learning_rate': 4.980966882375333e-06, 'epoch': 0.33} +{'loss': 1.7191, 'grad_norm': 4.407485008239746, 'learning_rate': 4.979063570612867e-06, 'epoch': 0.33} +{'loss': 1.6868, 'grad_norm': 3.875699758529663, 'learning_rate': 4.9771602588504e-06, 'epoch': 0.33} +{'loss': 1.796, 'grad_norm': 4.398037433624268, 'learning_rate': 4.975256947087933e-06, 'epoch': 0.33} +{'eval_loss': 2.0058517456054688, 'eval_runtime': 19.6, 'eval_samples_per_second': 51.02, 'eval_steps_per_second': 2.143, 'epoch': 0.33} +{'loss': 1.6913, 'grad_norm': 4.079494476318359, 'learning_rate': 4.973353635325467e-06, 'epoch': 0.33} +{'loss': 1.747, 'grad_norm': 3.6496222019195557, 'learning_rate': 4.971450323563e-06, 'epoch': 0.33} +{'loss': 1.689, 'grad_norm': 3.799955368041992, 'learning_rate': 4.969547011800533e-06, 'epoch': 0.33} +{'loss': 1.7502, 'grad_norm': 3.666980028152466, 'learning_rate': 4.967643700038067e-06, 'epoch': 0.33} +{'loss': 1.755, 'grad_norm': 4.003580570220947, 'learning_rate': 4.9657403882756e-06, 'epoch': 0.33} +{'loss': 1.7086, 'grad_norm': 3.4668309688568115, 'learning_rate': 4.9638370765131335e-06, 'epoch': 0.33} +{'loss': 1.7084, 'grad_norm': 3.980245590209961, 'learning_rate': 4.961933764750667e-06, 'epoch': 0.34} +{'loss': 1.7716, 'grad_norm': 3.8204221725463867, 'learning_rate': 4.9600304529882e-06, 'epoch': 0.34} +{'loss': 1.7815, 'grad_norm': 4.053361892700195, 'learning_rate': 4.9581271412257335e-06, 'epoch': 0.34} +{'loss': 1.7502, 'grad_norm': 3.60760498046875, 'learning_rate': 4.956223829463266e-06, 'epoch': 0.34} +{'eval_loss': 2.0204944610595703, 'eval_runtime': 19.6161, 'eval_samples_per_second': 50.978, 'eval_steps_per_second': 2.141, 'epoch': 0.34} +{'loss': 1.6703, 'grad_norm': 3.953108549118042, 'learning_rate': 4.954320517700799e-06, 'epoch': 0.34} +{'loss': 1.7684, 'grad_norm': 3.4926702976226807, 'learning_rate': 4.952417205938333e-06, 'epoch': 0.34} +{'loss': 1.7492, 'grad_norm': 3.803178548812866, 'learning_rate': 4.950513894175866e-06, 'epoch': 0.34} +{'loss': 1.7311, 'grad_norm': 3.836808443069458, 'learning_rate': 4.9486105824133995e-06, 'epoch': 0.34} +{'loss': 1.7477, 'grad_norm': 3.7862977981567383, 'learning_rate': 4.946707270650933e-06, 'epoch': 0.34} +{'loss': 1.7585, 'grad_norm': 4.039956569671631, 'learning_rate': 4.944803958888466e-06, 'epoch': 0.35} +{'loss': 1.7875, 'grad_norm': 4.111494541168213, 'learning_rate': 4.942900647126e-06, 'epoch': 0.35} +{'loss': 1.7157, 'grad_norm': 3.891266107559204, 'learning_rate': 4.940997335363533e-06, 'epoch': 0.35} +{'loss': 1.6722, 'grad_norm': 3.8507604598999023, 'learning_rate': 4.939094023601066e-06, 'epoch': 0.35} +{'loss': 1.7374, 'grad_norm': 4.05128812789917, 'learning_rate': 4.9371907118386e-06, 'epoch': 0.35} +{'eval_loss': 2.032823085784912, 'eval_runtime': 19.6092, 'eval_samples_per_second': 50.997, 'eval_steps_per_second': 2.142, 'epoch': 0.35} +{'loss': 1.7302, 'grad_norm': 3.767080307006836, 'learning_rate': 4.935287400076133e-06, 'epoch': 0.35} +{'loss': 1.7443, 'grad_norm': 3.615769147872925, 'learning_rate': 4.933384088313666e-06, 'epoch': 0.35} +{'loss': 1.6936, 'grad_norm': 3.6855123043060303, 'learning_rate': 4.9314807765512e-06, 'epoch': 0.35} +{'loss': 1.6413, 'grad_norm': 3.6388423442840576, 'learning_rate': 4.929577464788733e-06, 'epoch': 0.35} +{'loss': 1.7186, 'grad_norm': 3.9919347763061523, 'learning_rate': 4.9276741530262665e-06, 'epoch': 0.35} +{'loss': 1.7079, 'grad_norm': 3.669332981109619, 'learning_rate': 4.9257708412638e-06, 'epoch': 0.36} +{'loss': 1.7155, 'grad_norm': 3.540557861328125, 'learning_rate': 4.923867529501332e-06, 'epoch': 0.36} +{'loss': 1.7041, 'grad_norm': 4.1520256996154785, 'learning_rate': 4.921964217738866e-06, 'epoch': 0.36} +{'loss': 1.7767, 'grad_norm': 4.042150020599365, 'learning_rate': 4.920060905976399e-06, 'epoch': 0.36} +{'loss': 1.7587, 'grad_norm': 3.605260133743286, 'learning_rate': 4.9181575942139325e-06, 'epoch': 0.36} +{'eval_loss': 2.0224974155426025, 'eval_runtime': 19.5712, 'eval_samples_per_second': 51.096, 'eval_steps_per_second': 2.146, 'epoch': 0.36} +{'loss': 1.7523, 'grad_norm': 4.034849643707275, 'learning_rate': 4.916254282451466e-06, 'epoch': 0.36} +{'loss': 1.8006, 'grad_norm': 4.090887069702148, 'learning_rate': 4.914350970688999e-06, 'epoch': 0.36} +{'loss': 1.6905, 'grad_norm': 3.54056978225708, 'learning_rate': 4.9124476589265325e-06, 'epoch': 0.36} +{'loss': 1.6973, 'grad_norm': 4.1786932945251465, 'learning_rate': 4.910544347164066e-06, 'epoch': 0.36} +{'loss': 1.772, 'grad_norm': 3.7698609828948975, 'learning_rate': 4.908641035401599e-06, 'epoch': 0.36} +{'loss': 1.8367, 'grad_norm': 4.738850116729736, 'learning_rate': 4.906737723639133e-06, 'epoch': 0.37} +{'loss': 1.7022, 'grad_norm': 3.5439541339874268, 'learning_rate': 4.904834411876665e-06, 'epoch': 0.37} +{'loss': 1.742, 'grad_norm': 4.029539108276367, 'learning_rate': 4.902931100114199e-06, 'epoch': 0.37} +{'loss': 1.6664, 'grad_norm': 4.081323623657227, 'learning_rate': 4.901027788351733e-06, 'epoch': 0.37} +{'loss': 1.7052, 'grad_norm': 3.4758691787719727, 'learning_rate': 4.899124476589266e-06, 'epoch': 0.37} +{'eval_loss': 2.0131168365478516, 'eval_runtime': 19.5792, 'eval_samples_per_second': 51.075, 'eval_steps_per_second': 2.145, 'epoch': 0.37} +{'loss': 1.6777, 'grad_norm': 3.7604763507843018, 'learning_rate': 4.8972211648267994e-06, 'epoch': 0.37} +{'loss': 1.7283, 'grad_norm': 4.209819793701172, 'learning_rate': 4.895317853064333e-06, 'epoch': 0.37} +{'loss': 1.7037, 'grad_norm': 3.6587347984313965, 'learning_rate': 4.893414541301866e-06, 'epoch': 0.37} +{'loss': 1.709, 'grad_norm': 4.005814075469971, 'learning_rate': 4.891511229539399e-06, 'epoch': 0.37} +{'loss': 1.7208, 'grad_norm': 4.135828018188477, 'learning_rate': 4.889607917776932e-06, 'epoch': 0.37} +{'loss': 1.7158, 'grad_norm': 3.7931952476501465, 'learning_rate': 4.887704606014465e-06, 'epoch': 0.38} +{'loss': 1.7325, 'grad_norm': 3.8655636310577393, 'learning_rate': 4.885801294251999e-06, 'epoch': 0.38} +{'loss': 1.6587, 'grad_norm': 4.344298839569092, 'learning_rate': 4.883897982489532e-06, 'epoch': 0.38} +{'loss': 1.7552, 'grad_norm': 3.9738237857818604, 'learning_rate': 4.8819946707270655e-06, 'epoch': 0.38} +{'loss': 1.7336, 'grad_norm': 3.6629505157470703, 'learning_rate': 4.880091358964599e-06, 'epoch': 0.38} +{'eval_loss': 1.9830520153045654, 'eval_runtime': 19.5679, 'eval_samples_per_second': 51.104, 'eval_steps_per_second': 2.146, 'epoch': 0.38} +{'loss': 1.6509, 'grad_norm': 4.0084075927734375, 'learning_rate': 4.878188047202132e-06, 'epoch': 0.38} +{'loss': 1.8228, 'grad_norm': 3.541398286819458, 'learning_rate': 4.876284735439666e-06, 'epoch': 0.38} +{'loss': 1.7863, 'grad_norm': 3.9965767860412598, 'learning_rate': 4.874381423677199e-06, 'epoch': 0.38} +{'loss': 1.71, 'grad_norm': 3.960212469100952, 'learning_rate': 4.8724781119147315e-06, 'epoch': 0.38} +{'loss': 1.7379, 'grad_norm': 3.7382888793945312, 'learning_rate': 4.870574800152265e-06, 'epoch': 0.39} +{'loss': 1.6635, 'grad_norm': 4.147199630737305, 'learning_rate': 4.868671488389798e-06, 'epoch': 0.39} +{'loss': 1.675, 'grad_norm': 3.8812103271484375, 'learning_rate': 4.8667681766273316e-06, 'epoch': 0.39} +{'loss': 1.6479, 'grad_norm': 3.824472188949585, 'learning_rate': 4.864864864864866e-06, 'epoch': 0.39} +{'loss': 1.7091, 'grad_norm': 3.8692915439605713, 'learning_rate': 4.862961553102399e-06, 'epoch': 0.39} +{'loss': 1.7334, 'grad_norm': 4.112631797790527, 'learning_rate': 4.8610582413399325e-06, 'epoch': 0.39} +{'eval_loss': 1.9880326986312866, 'eval_runtime': 19.552, 'eval_samples_per_second': 51.146, 'eval_steps_per_second': 2.148, 'epoch': 0.39} +{'loss': 1.6918, 'grad_norm': 3.366814374923706, 'learning_rate': 4.859154929577465e-06, 'epoch': 0.39} +{'loss': 1.7045, 'grad_norm': 3.7745437622070312, 'learning_rate': 4.857251617814998e-06, 'epoch': 0.39} +{'loss': 1.7393, 'grad_norm': 4.30635929107666, 'learning_rate': 4.855348306052532e-06, 'epoch': 0.39} +{'loss': 1.7277, 'grad_norm': 3.6196701526641846, 'learning_rate': 4.853444994290065e-06, 'epoch': 0.39} +{'loss': 1.6899, 'grad_norm': 3.875918388366699, 'learning_rate': 4.8515416825275985e-06, 'epoch': 0.4} +{'loss': 1.7596, 'grad_norm': 3.626540184020996, 'learning_rate': 4.849638370765132e-06, 'epoch': 0.4} +{'loss': 1.6807, 'grad_norm': 3.8511526584625244, 'learning_rate': 4.847735059002665e-06, 'epoch': 0.4} +{'loss': 1.6336, 'grad_norm': 3.7382988929748535, 'learning_rate': 4.8458317472401985e-06, 'epoch': 0.4} +{'loss': 1.7274, 'grad_norm': 3.7565016746520996, 'learning_rate': 4.843928435477732e-06, 'epoch': 0.4} +{'loss': 1.6793, 'grad_norm': 3.397143602371216, 'learning_rate': 4.842025123715265e-06, 'epoch': 0.4} +{'eval_loss': 1.9822829961776733, 'eval_runtime': 19.4965, 'eval_samples_per_second': 51.291, 'eval_steps_per_second': 2.154, 'epoch': 0.4} +{'loss': 1.7253, 'grad_norm': 3.7681241035461426, 'learning_rate': 4.840121811952798e-06, 'epoch': 0.4} +{'loss': 1.7079, 'grad_norm': 3.7063958644866943, 'learning_rate': 4.838218500190331e-06, 'epoch': 0.4} +{'loss': 1.7274, 'grad_norm': 4.270074367523193, 'learning_rate': 4.8363151884278645e-06, 'epoch': 0.4} +{'loss': 1.7007, 'grad_norm': 3.434239625930786, 'learning_rate': 4.834411876665398e-06, 'epoch': 0.4} +{'loss': 1.7143, 'grad_norm': 3.623870372772217, 'learning_rate': 4.832508564902931e-06, 'epoch': 0.41} +{'loss': 1.6521, 'grad_norm': 4.064011096954346, 'learning_rate': 4.830605253140465e-06, 'epoch': 0.41} +{'loss': 1.6929, 'grad_norm': 3.8437061309814453, 'learning_rate': 4.828701941377998e-06, 'epoch': 0.41} +{'loss': 1.6475, 'grad_norm': 4.1681060791015625, 'learning_rate': 4.826798629615531e-06, 'epoch': 0.41} +{'loss': 1.673, 'grad_norm': 4.051055431365967, 'learning_rate': 4.824895317853065e-06, 'epoch': 0.41} +{'loss': 1.6831, 'grad_norm': 3.900688886642456, 'learning_rate': 4.822992006090598e-06, 'epoch': 0.41} +{'eval_loss': 2.003829002380371, 'eval_runtime': 19.4971, 'eval_samples_per_second': 51.29, 'eval_steps_per_second': 2.154, 'epoch': 0.41} +{'loss': 1.6454, 'grad_norm': 3.6002299785614014, 'learning_rate': 4.821088694328131e-06, 'epoch': 0.41} +{'loss': 1.6665, 'grad_norm': 4.1019392013549805, 'learning_rate': 4.819185382565665e-06, 'epoch': 0.41} +{'loss': 1.7307, 'grad_norm': 3.568685531616211, 'learning_rate': 4.817282070803198e-06, 'epoch': 0.41} +{'loss': 1.7178, 'grad_norm': 3.7157418727874756, 'learning_rate': 4.8153787590407315e-06, 'epoch': 0.41} +{'loss': 1.6433, 'grad_norm': 3.7581965923309326, 'learning_rate': 4.813475447278265e-06, 'epoch': 0.42} +{'loss': 1.7326, 'grad_norm': 3.9906630516052246, 'learning_rate': 4.811572135515798e-06, 'epoch': 0.42} +{'loss': 1.6647, 'grad_norm': 3.944003105163574, 'learning_rate': 4.809668823753332e-06, 'epoch': 0.42} +{'loss': 1.6567, 'grad_norm': 3.7976725101470947, 'learning_rate': 4.807765511990864e-06, 'epoch': 0.42} +{'loss': 1.6849, 'grad_norm': 3.7561235427856445, 'learning_rate': 4.8058622002283975e-06, 'epoch': 0.42} +{'loss': 1.7396, 'grad_norm': 3.9234089851379395, 'learning_rate': 4.803958888465931e-06, 'epoch': 0.42} +{'eval_loss': 1.9910541772842407, 'eval_runtime': 19.5434, 'eval_samples_per_second': 51.168, 'eval_steps_per_second': 2.149, 'epoch': 0.42} +{'loss': 1.6852, 'grad_norm': 3.5238916873931885, 'learning_rate': 4.802055576703464e-06, 'epoch': 0.42} +{'loss': 1.7756, 'grad_norm': 3.654860258102417, 'learning_rate': 4.8001522649409976e-06, 'epoch': 0.42} +{'loss': 1.6534, 'grad_norm': 3.745997667312622, 'learning_rate': 4.798248953178531e-06, 'epoch': 0.42} +{'loss': 1.6239, 'grad_norm': 3.648606300354004, 'learning_rate': 4.796345641416064e-06, 'epoch': 0.43} +{'loss': 1.6806, 'grad_norm': 3.689387798309326, 'learning_rate': 4.794442329653598e-06, 'epoch': 0.43} +{'loss': 1.7372, 'grad_norm': 4.290556907653809, 'learning_rate': 4.792539017891131e-06, 'epoch': 0.43} +{'loss': 1.6611, 'grad_norm': 3.628199338912964, 'learning_rate': 4.790635706128664e-06, 'epoch': 0.43} +{'loss': 1.6948, 'grad_norm': 3.5407426357269287, 'learning_rate': 4.788732394366197e-06, 'epoch': 0.43} +{'loss': 1.6193, 'grad_norm': 3.582146406173706, 'learning_rate': 4.786829082603731e-06, 'epoch': 0.43} +{'loss': 1.6237, 'grad_norm': 3.4039249420166016, 'learning_rate': 4.7849257708412645e-06, 'epoch': 0.43} +{'eval_loss': 1.9884523153305054, 'eval_runtime': 19.5505, 'eval_samples_per_second': 51.15, 'eval_steps_per_second': 2.148, 'epoch': 0.43} +{'loss': 1.7051, 'grad_norm': 3.2031264305114746, 'learning_rate': 4.783022459078798e-06, 'epoch': 0.43} +{'loss': 1.64, 'grad_norm': 3.499633550643921, 'learning_rate': 4.781119147316331e-06, 'epoch': 0.43} +{'loss': 1.6624, 'grad_norm': 3.480463981628418, 'learning_rate': 4.7792158355538645e-06, 'epoch': 0.43} +{'loss': 1.6714, 'grad_norm': 3.8559625148773193, 'learning_rate': 4.777312523791398e-06, 'epoch': 0.44} +{'loss': 1.6622, 'grad_norm': 3.9810192584991455, 'learning_rate': 4.77540921202893e-06, 'epoch': 0.44} +{'loss': 1.6861, 'grad_norm': 4.068185329437256, 'learning_rate': 4.773505900266464e-06, 'epoch': 0.44} +{'loss': 1.6736, 'grad_norm': 3.6008434295654297, 'learning_rate': 4.771602588503997e-06, 'epoch': 0.44} +{'loss': 1.6777, 'grad_norm': 4.177882194519043, 'learning_rate': 4.7696992767415305e-06, 'epoch': 0.44} +{'loss': 1.7092, 'grad_norm': 3.747661590576172, 'learning_rate': 4.767795964979064e-06, 'epoch': 0.44} +{'loss': 1.7193, 'grad_norm': 3.547806978225708, 'learning_rate': 4.765892653216597e-06, 'epoch': 0.44} +{'eval_loss': 1.9860888719558716, 'eval_runtime': 19.5272, 'eval_samples_per_second': 51.211, 'eval_steps_per_second': 2.151, 'epoch': 0.44} +{'loss': 1.7106, 'grad_norm': 3.6006948947906494, 'learning_rate': 4.763989341454131e-06, 'epoch': 0.44} +{'loss': 1.6364, 'grad_norm': 3.7138659954071045, 'learning_rate': 4.762086029691664e-06, 'epoch': 0.44} +{'loss': 1.6473, 'grad_norm': 3.4081013202667236, 'learning_rate': 4.760182717929197e-06, 'epoch': 0.44} +{'loss': 1.6748, 'grad_norm': 4.099686145782471, 'learning_rate': 4.758279406166731e-06, 'epoch': 0.45} +{'loss': 1.5891, 'grad_norm': 3.7577171325683594, 'learning_rate': 4.756376094404263e-06, 'epoch': 0.45} +{'loss': 1.6786, 'grad_norm': 3.4139294624328613, 'learning_rate': 4.7544727826417966e-06, 'epoch': 0.45} +{'loss': 1.729, 'grad_norm': 3.847095489501953, 'learning_rate': 4.75256947087933e-06, 'epoch': 0.45} +{'loss': 1.6782, 'grad_norm': 3.6970272064208984, 'learning_rate': 4.750666159116863e-06, 'epoch': 0.45} +{'loss': 1.6499, 'grad_norm': 3.7617740631103516, 'learning_rate': 4.7487628473543975e-06, 'epoch': 0.45} +{'loss': 1.6035, 'grad_norm': 3.706563949584961, 'learning_rate': 4.746859535591931e-06, 'epoch': 0.45} +{'eval_loss': 1.9925755262374878, 'eval_runtime': 19.5287, 'eval_samples_per_second': 51.207, 'eval_steps_per_second': 2.151, 'epoch': 0.45} +{'loss': 1.6596, 'grad_norm': 3.5655503273010254, 'learning_rate': 4.744956223829464e-06, 'epoch': 0.45} +{'loss': 1.6405, 'grad_norm': 3.6816487312316895, 'learning_rate': 4.743052912066997e-06, 'epoch': 0.45} +{'loss': 1.6788, 'grad_norm': 3.4327733516693115, 'learning_rate': 4.74114960030453e-06, 'epoch': 0.46} +{'loss': 1.691, 'grad_norm': 3.587916374206543, 'learning_rate': 4.7392462885420635e-06, 'epoch': 0.46} +{'loss': 1.5875, 'grad_norm': 3.5958914756774902, 'learning_rate': 4.737342976779597e-06, 'epoch': 0.46} +{'loss': 1.6789, 'grad_norm': 3.811089515686035, 'learning_rate': 4.73543966501713e-06, 'epoch': 0.46} +{'loss': 1.6479, 'grad_norm': 3.4276814460754395, 'learning_rate': 4.7335363532546636e-06, 'epoch': 0.46} +{'loss': 1.6621, 'grad_norm': 3.5380873680114746, 'learning_rate': 4.731633041492197e-06, 'epoch': 0.46} +{'loss': 1.6565, 'grad_norm': 3.3950912952423096, 'learning_rate': 4.72972972972973e-06, 'epoch': 0.46} +{'loss': 1.6993, 'grad_norm': 3.8493549823760986, 'learning_rate': 4.727826417967264e-06, 'epoch': 0.46} +{'eval_loss': 1.9799402952194214, 'eval_runtime': 19.4878, 'eval_samples_per_second': 51.314, 'eval_steps_per_second': 2.155, 'epoch': 0.46} +{'loss': 1.6733, 'grad_norm': 3.646838903427124, 'learning_rate': 4.725923106204797e-06, 'epoch': 0.46} +{'loss': 1.6395, 'grad_norm': 3.3769218921661377, 'learning_rate': 4.72401979444233e-06, 'epoch': 0.46} +{'loss': 1.6849, 'grad_norm': 4.25264310836792, 'learning_rate': 4.722116482679863e-06, 'epoch': 0.47} +{'loss': 1.6728, 'grad_norm': 3.6438825130462646, 'learning_rate': 4.720213170917396e-06, 'epoch': 0.47} +{'loss': 1.6921, 'grad_norm': 4.4174323081970215, 'learning_rate': 4.71830985915493e-06, 'epoch': 0.47} +{'loss': 1.6387, 'grad_norm': 3.802617073059082, 'learning_rate': 4.716406547392463e-06, 'epoch': 0.47} +{'loss': 1.7024, 'grad_norm': 4.614408493041992, 'learning_rate': 4.714503235629996e-06, 'epoch': 0.47} +{'loss': 1.6279, 'grad_norm': 3.579514503479004, 'learning_rate': 4.71259992386753e-06, 'epoch': 0.47} +{'loss': 1.6651, 'grad_norm': 4.164543628692627, 'learning_rate': 4.710696612105063e-06, 'epoch': 0.47} +{'loss': 1.6601, 'grad_norm': 4.034106731414795, 'learning_rate': 4.708793300342596e-06, 'epoch': 0.47} +{'eval_loss': 1.9592889547348022, 'eval_runtime': 19.5146, 'eval_samples_per_second': 51.244, 'eval_steps_per_second': 2.152, 'epoch': 0.47} +{'loss': 1.6467, 'grad_norm': 4.239640235900879, 'learning_rate': 4.70688998858013e-06, 'epoch': 0.47} +{'loss': 1.6443, 'grad_norm': 4.195849418640137, 'learning_rate': 4.704986676817663e-06, 'epoch': 0.47} +{'loss': 1.6785, 'grad_norm': 4.0914530754089355, 'learning_rate': 4.7030833650551965e-06, 'epoch': 0.48} +{'loss': 1.6346, 'grad_norm': 4.312870025634766, 'learning_rate': 4.70118005329273e-06, 'epoch': 0.48} +{'loss': 1.7108, 'grad_norm': 3.6855762004852295, 'learning_rate': 4.699276741530263e-06, 'epoch': 0.48} +{'loss': 1.6833, 'grad_norm': 4.027989387512207, 'learning_rate': 4.697373429767797e-06, 'epoch': 0.48} +{'loss': 1.6699, 'grad_norm': 3.717198610305786, 'learning_rate': 4.69547011800533e-06, 'epoch': 0.48} +{'loss': 1.6614, 'grad_norm': 3.948587656021118, 'learning_rate': 4.693566806242863e-06, 'epoch': 0.48} +{'loss': 1.6583, 'grad_norm': 3.826885938644409, 'learning_rate': 4.691663494480397e-06, 'epoch': 0.48} +{'loss': 1.6842, 'grad_norm': 3.8523266315460205, 'learning_rate': 4.689760182717929e-06, 'epoch': 0.48} +{'eval_loss': 1.9510598182678223, 'eval_runtime': 19.5171, 'eval_samples_per_second': 51.237, 'eval_steps_per_second': 2.152, 'epoch': 0.48} +{'loss': 1.5791, 'grad_norm': 3.691249370574951, 'learning_rate': 4.6878568709554626e-06, 'epoch': 0.48} +{'loss': 1.6433, 'grad_norm': 3.6697733402252197, 'learning_rate': 4.685953559192996e-06, 'epoch': 0.48} +{'loss': 1.6677, 'grad_norm': 3.586216449737549, 'learning_rate': 4.684050247430529e-06, 'epoch': 0.49} +{'loss': 1.5992, 'grad_norm': 3.500448703765869, 'learning_rate': 4.682146935668063e-06, 'epoch': 0.49} +{'loss': 1.6481, 'grad_norm': 3.984527587890625, 'learning_rate': 4.680243623905596e-06, 'epoch': 0.49} +{'loss': 1.6621, 'grad_norm': 3.8697307109832764, 'learning_rate': 4.678340312143129e-06, 'epoch': 0.49} +{'loss': 1.6325, 'grad_norm': 3.587400197982788, 'learning_rate': 4.676437000380663e-06, 'epoch': 0.49} +{'loss': 1.7451, 'grad_norm': 3.86324143409729, 'learning_rate': 4.674533688618196e-06, 'epoch': 0.49} +{'loss': 1.6779, 'grad_norm': 3.588721513748169, 'learning_rate': 4.6726303768557295e-06, 'epoch': 0.49} +{'loss': 1.5883, 'grad_norm': 3.6514649391174316, 'learning_rate': 4.670727065093263e-06, 'epoch': 0.49} +{'eval_loss': 1.9579463005065918, 'eval_runtime': 19.5191, 'eval_samples_per_second': 51.232, 'eval_steps_per_second': 2.152, 'epoch': 0.49} +{'loss': 1.6177, 'grad_norm': 4.134105205535889, 'learning_rate': 4.668823753330796e-06, 'epoch': 0.49} +{'loss': 1.6318, 'grad_norm': 3.8660967350006104, 'learning_rate': 4.6669204415683295e-06, 'epoch': 0.5} +{'loss': 1.7086, 'grad_norm': 4.177504062652588, 'learning_rate': 4.665017129805863e-06, 'epoch': 0.5} +{'loss': 1.6857, 'grad_norm': 3.699950695037842, 'learning_rate': 4.663113818043396e-06, 'epoch': 0.5} +{'loss': 1.6465, 'grad_norm': 3.7795238494873047, 'learning_rate': 4.66121050628093e-06, 'epoch': 0.5} +{'loss': 1.6833, 'grad_norm': 3.6508758068084717, 'learning_rate': 4.659307194518463e-06, 'epoch': 0.5} +{'loss': 1.6473, 'grad_norm': 3.8457183837890625, 'learning_rate': 4.6574038827559955e-06, 'epoch': 0.5} +{'loss': 1.6647, 'grad_norm': 3.9396474361419678, 'learning_rate': 4.655500570993529e-06, 'epoch': 0.5} +{'loss': 1.6991, 'grad_norm': 4.000178813934326, 'learning_rate': 4.653597259231062e-06, 'epoch': 0.5} +{'loss': 1.5837, 'grad_norm': 3.970252513885498, 'learning_rate': 4.651693947468596e-06, 'epoch': 0.5} +{'eval_loss': 1.9684062004089355, 'eval_runtime': 19.5247, 'eval_samples_per_second': 51.217, 'eval_steps_per_second': 2.151, 'epoch': 0.5} +{'loss': 1.7311, 'grad_norm': 3.992440938949585, 'learning_rate': 4.649790635706129e-06, 'epoch': 0.5} +{'loss': 1.6264, 'grad_norm': 4.1443400382995605, 'learning_rate': 4.647887323943662e-06, 'epoch': 0.51} +{'loss': 1.6365, 'grad_norm': 3.8363723754882812, 'learning_rate': 4.645984012181196e-06, 'epoch': 0.51} +{'loss': 1.5583, 'grad_norm': 4.493776798248291, 'learning_rate': 4.644080700418729e-06, 'epoch': 0.51} +{'loss': 1.6755, 'grad_norm': 4.2898759841918945, 'learning_rate': 4.642177388656262e-06, 'epoch': 0.51} +{'loss': 1.639, 'grad_norm': 3.658322811126709, 'learning_rate': 4.640274076893796e-06, 'epoch': 0.51} +{'loss': 1.7018, 'grad_norm': 4.569284915924072, 'learning_rate': 4.638370765131328e-06, 'epoch': 0.51} +{'loss': 1.6214, 'grad_norm': 3.767124652862549, 'learning_rate': 4.636467453368862e-06, 'epoch': 0.51} +{'loss': 1.6051, 'grad_norm': 3.9240939617156982, 'learning_rate': 4.634564141606396e-06, 'epoch': 0.51} +{'loss': 1.6671, 'grad_norm': 4.106863975524902, 'learning_rate': 4.632660829843929e-06, 'epoch': 0.51} +{'eval_loss': 1.959475040435791, 'eval_runtime': 19.4982, 'eval_samples_per_second': 51.287, 'eval_steps_per_second': 2.154, 'epoch': 0.51} +{'loss': 1.6025, 'grad_norm': 3.582895517349243, 'learning_rate': 4.630757518081463e-06, 'epoch': 0.51} +{'loss': 1.7261, 'grad_norm': 3.9942805767059326, 'learning_rate': 4.628854206318996e-06, 'epoch': 0.52} +{'loss': 1.6356, 'grad_norm': 4.071309566497803, 'learning_rate': 4.626950894556529e-06, 'epoch': 0.52} +{'loss': 1.6608, 'grad_norm': 3.5222649574279785, 'learning_rate': 4.625047582794062e-06, 'epoch': 0.52} +{'loss': 1.5916, 'grad_norm': 4.291646957397461, 'learning_rate': 4.623144271031595e-06, 'epoch': 0.52} +{'loss': 1.5977, 'grad_norm': 3.4287211894989014, 'learning_rate': 4.6212409592691286e-06, 'epoch': 0.52} +{'loss': 1.6933, 'grad_norm': 3.6409990787506104, 'learning_rate': 4.619337647506662e-06, 'epoch': 0.52} +{'loss': 1.6531, 'grad_norm': 4.201468467712402, 'learning_rate': 4.617434335744195e-06, 'epoch': 0.52} +{'loss': 1.6184, 'grad_norm': 3.678266763687134, 'learning_rate': 4.615531023981729e-06, 'epoch': 0.52} +{'loss': 1.6347, 'grad_norm': 3.6852829456329346, 'learning_rate': 4.613627712219262e-06, 'epoch': 0.52} +{'eval_loss': 1.9549599885940552, 'eval_runtime': 19.5229, 'eval_samples_per_second': 51.222, 'eval_steps_per_second': 2.151, 'epoch': 0.52} +{'loss': 1.6431, 'grad_norm': 4.129430294036865, 'learning_rate': 4.611724400456795e-06, 'epoch': 0.52} +{'loss': 1.6427, 'grad_norm': 3.638345718383789, 'learning_rate': 4.609821088694329e-06, 'epoch': 0.53} +{'loss': 1.6214, 'grad_norm': 3.722020387649536, 'learning_rate': 4.607917776931862e-06, 'epoch': 0.53} +{'loss': 1.6713, 'grad_norm': 4.026375770568848, 'learning_rate': 4.606014465169395e-06, 'epoch': 0.53} +{'loss': 1.6796, 'grad_norm': 3.5974960327148438, 'learning_rate': 4.604111153406928e-06, 'epoch': 0.53} +{'loss': 1.5883, 'grad_norm': 3.648608446121216, 'learning_rate': 4.602207841644461e-06, 'epoch': 0.53} +{'loss': 1.6884, 'grad_norm': 4.179163455963135, 'learning_rate': 4.600304529881995e-06, 'epoch': 0.53} +{'loss': 1.6633, 'grad_norm': 3.915071487426758, 'learning_rate': 4.598401218119528e-06, 'epoch': 0.53} +{'loss': 1.6764, 'grad_norm': 3.411623954772949, 'learning_rate': 4.5964979063570614e-06, 'epoch': 0.53} +{'loss': 1.6137, 'grad_norm': 3.896123170852661, 'learning_rate': 4.594594594594596e-06, 'epoch': 0.53} +{'eval_loss': 1.9388415813446045, 'eval_runtime': 19.5271, 'eval_samples_per_second': 51.211, 'eval_steps_per_second': 2.151, 'epoch': 0.53} +{'loss': 1.6493, 'grad_norm': 3.224864959716797, 'learning_rate': 4.592691282832128e-06, 'epoch': 0.54} +{'loss': 1.6662, 'grad_norm': 3.431166648864746, 'learning_rate': 4.5907879710696615e-06, 'epoch': 0.54} +{'loss': 1.5793, 'grad_norm': 3.3794140815734863, 'learning_rate': 4.588884659307195e-06, 'epoch': 0.54} +{'loss': 1.6576, 'grad_norm': 3.3896186351776123, 'learning_rate': 4.586981347544728e-06, 'epoch': 0.54} +{'loss': 1.6092, 'grad_norm': 3.613070249557495, 'learning_rate': 4.585078035782262e-06, 'epoch': 0.54} +{'loss': 1.6393, 'grad_norm': 3.6575820446014404, 'learning_rate': 4.583174724019795e-06, 'epoch': 0.54} +{'loss': 1.5895, 'grad_norm': 3.2222516536712646, 'learning_rate': 4.581271412257328e-06, 'epoch': 0.54} +{'loss': 1.5718, 'grad_norm': 3.790112257003784, 'learning_rate': 4.579368100494862e-06, 'epoch': 0.54} +{'loss': 1.6748, 'grad_norm': 3.977811813354492, 'learning_rate': 4.577464788732395e-06, 'epoch': 0.54} +{'loss': 1.6466, 'grad_norm': 3.753455400466919, 'learning_rate': 4.575561476969928e-06, 'epoch': 0.54} +{'eval_loss': 1.9631779193878174, 'eval_runtime': 19.4985, 'eval_samples_per_second': 51.286, 'eval_steps_per_second': 2.154, 'epoch': 0.54} +{'loss': 1.6897, 'grad_norm': 3.9566311836242676, 'learning_rate': 4.573658165207461e-06, 'epoch': 0.55} +{'loss': 1.6527, 'grad_norm': 3.3900554180145264, 'learning_rate': 4.571754853444994e-06, 'epoch': 0.55} +{'loss': 1.6333, 'grad_norm': 3.8283536434173584, 'learning_rate': 4.569851541682528e-06, 'epoch': 0.55} +{'loss': 1.6074, 'grad_norm': 3.906646251678467, 'learning_rate': 4.567948229920061e-06, 'epoch': 0.55} +{'loss': 1.5747, 'grad_norm': 3.6340959072113037, 'learning_rate': 4.566044918157594e-06, 'epoch': 0.55} +{'loss': 1.6113, 'grad_norm': 3.6226003170013428, 'learning_rate': 4.564141606395128e-06, 'epoch': 0.55} +{'loss': 1.6674, 'grad_norm': 4.215226173400879, 'learning_rate': 4.562238294632661e-06, 'epoch': 0.55} +{'loss': 1.6226, 'grad_norm': 3.6712610721588135, 'learning_rate': 4.5603349828701945e-06, 'epoch': 0.55} +{'loss': 1.6506, 'grad_norm': 3.447861671447754, 'learning_rate': 4.558431671107728e-06, 'epoch': 0.55} +{'loss': 1.6887, 'grad_norm': 4.191435813903809, 'learning_rate': 4.556528359345261e-06, 'epoch': 0.55} +{'eval_loss': 1.9421565532684326, 'eval_runtime': 19.5404, 'eval_samples_per_second': 51.176, 'eval_steps_per_second': 2.149, 'epoch': 0.55} +{'loss': 1.621, 'grad_norm': 3.4830660820007324, 'learning_rate': 4.5546250475827946e-06, 'epoch': 0.56} +{'loss': 1.5816, 'grad_norm': 3.912907600402832, 'learning_rate': 4.552721735820328e-06, 'epoch': 0.56} +{'loss': 1.6181, 'grad_norm': 3.6781387329101562, 'learning_rate': 4.550818424057861e-06, 'epoch': 0.56} +{'loss': 1.6237, 'grad_norm': 3.6922621726989746, 'learning_rate': 4.548915112295395e-06, 'epoch': 0.56} +{'loss': 1.6352, 'grad_norm': 3.7651724815368652, 'learning_rate': 4.547011800532928e-06, 'epoch': 0.56} +{'loss': 1.5799, 'grad_norm': 3.9535162448883057, 'learning_rate': 4.545108488770461e-06, 'epoch': 0.56} +{'loss': 1.6868, 'grad_norm': 3.3779971599578857, 'learning_rate': 4.543205177007995e-06, 'epoch': 0.56} +{'loss': 1.625, 'grad_norm': 4.282001972198486, 'learning_rate': 4.541301865245527e-06, 'epoch': 0.56} +{'loss': 1.645, 'grad_norm': 3.400374412536621, 'learning_rate': 4.539398553483061e-06, 'epoch': 0.56} +{'loss': 1.6159, 'grad_norm': 3.8115952014923096, 'learning_rate': 4.537495241720594e-06, 'epoch': 0.56} +{'eval_loss': 1.950903296470642, 'eval_runtime': 19.5488, 'eval_samples_per_second': 51.154, 'eval_steps_per_second': 2.148, 'epoch': 0.56} +{'loss': 1.658, 'grad_norm': 3.9501523971557617, 'learning_rate': 4.535591929958127e-06, 'epoch': 0.57} +{'loss': 1.6409, 'grad_norm': 3.649984836578369, 'learning_rate': 4.533688618195661e-06, 'epoch': 0.57} +{'loss': 1.6251, 'grad_norm': 3.3394558429718018, 'learning_rate': 4.531785306433194e-06, 'epoch': 0.57} +{'loss': 1.6031, 'grad_norm': 3.6418933868408203, 'learning_rate': 4.529881994670727e-06, 'epoch': 0.57} +{'loss': 1.6023, 'grad_norm': 3.493224859237671, 'learning_rate': 4.527978682908261e-06, 'epoch': 0.57} +{'loss': 1.6194, 'grad_norm': 3.2951719760894775, 'learning_rate': 4.526075371145794e-06, 'epoch': 0.57} +{'loss': 1.6421, 'grad_norm': 3.6181528568267822, 'learning_rate': 4.5241720593833275e-06, 'epoch': 0.57} +{'loss': 1.6265, 'grad_norm': 3.409442901611328, 'learning_rate': 4.52226874762086e-06, 'epoch': 0.57} +{'loss': 1.6621, 'grad_norm': 3.5513176918029785, 'learning_rate': 4.520365435858393e-06, 'epoch': 0.57} +{'loss': 1.6771, 'grad_norm': 3.725229263305664, 'learning_rate': 4.518462124095928e-06, 'epoch': 0.58} +{'eval_loss': 1.9359264373779297, 'eval_runtime': 19.5682, 'eval_samples_per_second': 51.103, 'eval_steps_per_second': 2.146, 'epoch': 0.58} +{'loss': 1.6335, 'grad_norm': 3.3752098083496094, 'learning_rate': 4.516558812333461e-06, 'epoch': 0.58} +{'loss': 1.6198, 'grad_norm': 3.3705244064331055, 'learning_rate': 4.514655500570994e-06, 'epoch': 0.58} +{'loss': 1.6895, 'grad_norm': 3.4303088188171387, 'learning_rate': 4.512752188808528e-06, 'epoch': 0.58} +{'loss': 1.6862, 'grad_norm': 3.367138624191284, 'learning_rate': 4.510848877046061e-06, 'epoch': 0.58} +{'loss': 1.6464, 'grad_norm': 3.858853578567505, 'learning_rate': 4.5089455652835936e-06, 'epoch': 0.58} +{'loss': 1.6158, 'grad_norm': 3.7724015712738037, 'learning_rate': 4.507042253521127e-06, 'epoch': 0.58} +{'loss': 1.579, 'grad_norm': 3.8024699687957764, 'learning_rate': 4.50513894175866e-06, 'epoch': 0.58} +{'loss': 1.649, 'grad_norm': 3.5875134468078613, 'learning_rate': 4.503235629996194e-06, 'epoch': 0.58} +{'loss': 1.6561, 'grad_norm': 3.6846938133239746, 'learning_rate': 4.501332318233727e-06, 'epoch': 0.58} +{'loss': 1.6154, 'grad_norm': 3.797062635421753, 'learning_rate': 4.49942900647126e-06, 'epoch': 0.59} +{'eval_loss': 1.9322106838226318, 'eval_runtime': 19.5749, 'eval_samples_per_second': 51.086, 'eval_steps_per_second': 2.146, 'epoch': 0.59} +{'loss': 1.6548, 'grad_norm': 3.328934907913208, 'learning_rate': 4.497525694708794e-06, 'epoch': 0.59} +{'loss': 1.6531, 'grad_norm': 3.902885675430298, 'learning_rate': 4.495622382946327e-06, 'epoch': 0.59} +{'loss': 1.5963, 'grad_norm': 3.6463584899902344, 'learning_rate': 4.4937190711838605e-06, 'epoch': 0.59} +{'loss': 1.6815, 'grad_norm': 3.8405065536499023, 'learning_rate': 4.491815759421394e-06, 'epoch': 0.59} +{'loss': 1.6133, 'grad_norm': 4.140473365783691, 'learning_rate': 4.489912447658926e-06, 'epoch': 0.59} +{'loss': 1.5992, 'grad_norm': 3.6940884590148926, 'learning_rate': 4.48800913589646e-06, 'epoch': 0.59} +{'loss': 1.6399, 'grad_norm': 3.924219846725464, 'learning_rate': 4.486105824133993e-06, 'epoch': 0.59} +{'loss': 1.638, 'grad_norm': 3.540297031402588, 'learning_rate': 4.4842025123715264e-06, 'epoch': 0.59} +{'loss': 1.6073, 'grad_norm': 3.7058982849121094, 'learning_rate': 4.48229920060906e-06, 'epoch': 0.59} +{'loss': 1.579, 'grad_norm': 3.5080950260162354, 'learning_rate': 4.480395888846594e-06, 'epoch': 0.6} +{'eval_loss': 1.934954047203064, 'eval_runtime': 19.5845, 'eval_samples_per_second': 51.061, 'eval_steps_per_second': 2.145, 'epoch': 0.6} +{'loss': 1.6988, 'grad_norm': 3.894596576690674, 'learning_rate': 4.478492577084127e-06, 'epoch': 0.6} +{'loss': 1.5884, 'grad_norm': 3.211979866027832, 'learning_rate': 4.47658926532166e-06, 'epoch': 0.6} +{'loss': 1.6343, 'grad_norm': 3.606484889984131, 'learning_rate': 4.474685953559193e-06, 'epoch': 0.6} +{'loss': 1.7606, 'grad_norm': 3.8994548320770264, 'learning_rate': 4.472782641796727e-06, 'epoch': 0.6} +{'loss': 1.5743, 'grad_norm': 3.6121206283569336, 'learning_rate': 4.47087933003426e-06, 'epoch': 0.6} +{'loss': 1.5928, 'grad_norm': 4.073150157928467, 'learning_rate': 4.468976018271793e-06, 'epoch': 0.6} +{'loss': 1.6034, 'grad_norm': 3.3716487884521484, 'learning_rate': 4.467072706509327e-06, 'epoch': 0.6} +{'loss': 1.6319, 'grad_norm': 3.693453788757324, 'learning_rate': 4.46516939474686e-06, 'epoch': 0.6} +{'loss': 1.582, 'grad_norm': 3.4559779167175293, 'learning_rate': 4.463266082984393e-06, 'epoch': 0.61} +{'loss': 1.7058, 'grad_norm': 3.56341290473938, 'learning_rate': 4.461362771221927e-06, 'epoch': 0.61} +{'eval_loss': 1.9354392290115356, 'eval_runtime': 19.567, 'eval_samples_per_second': 51.106, 'eval_steps_per_second': 2.146, 'epoch': 0.61} +{'loss': 1.5971, 'grad_norm': 3.5636279582977295, 'learning_rate': 4.45945945945946e-06, 'epoch': 0.61} +{'loss': 1.6144, 'grad_norm': 3.4851160049438477, 'learning_rate': 4.457556147696993e-06, 'epoch': 0.61} +{'loss': 1.6809, 'grad_norm': 3.626793146133423, 'learning_rate': 4.455652835934526e-06, 'epoch': 0.61} +{'loss': 1.6801, 'grad_norm': 3.7558059692382812, 'learning_rate': 4.453749524172059e-06, 'epoch': 0.61} +{'loss': 1.6389, 'grad_norm': 3.3991072177886963, 'learning_rate': 4.451846212409593e-06, 'epoch': 0.61} +{'loss': 1.6322, 'grad_norm': 3.7863316535949707, 'learning_rate': 4.449942900647126e-06, 'epoch': 0.61} +{'loss': 1.5634, 'grad_norm': 3.273711681365967, 'learning_rate': 4.4480395888846595e-06, 'epoch': 0.61} +{'loss': 1.6366, 'grad_norm': 3.84954833984375, 'learning_rate': 4.446136277122193e-06, 'epoch': 0.61} +{'loss': 1.5672, 'grad_norm': 3.2423393726348877, 'learning_rate': 4.444232965359726e-06, 'epoch': 0.62} +{'loss': 1.6301, 'grad_norm': 3.6933250427246094, 'learning_rate': 4.4423296535972596e-06, 'epoch': 0.62} +{'eval_loss': 1.94279146194458, 'eval_runtime': 19.5788, 'eval_samples_per_second': 51.076, 'eval_steps_per_second': 2.145, 'epoch': 0.62} +{'loss': 1.5726, 'grad_norm': 4.006799221038818, 'learning_rate': 4.440426341834793e-06, 'epoch': 0.62} +{'loss': 1.6276, 'grad_norm': 3.6503140926361084, 'learning_rate': 4.438523030072326e-06, 'epoch': 0.62} +{'loss': 1.6478, 'grad_norm': 3.849588394165039, 'learning_rate': 4.43661971830986e-06, 'epoch': 0.62} +{'loss': 1.6001, 'grad_norm': 3.5333118438720703, 'learning_rate': 4.434716406547393e-06, 'epoch': 0.62} +{'loss': 1.5998, 'grad_norm': 3.5144429206848145, 'learning_rate': 4.432813094784926e-06, 'epoch': 0.62} +{'loss': 1.6533, 'grad_norm': 3.4345717430114746, 'learning_rate': 4.43090978302246e-06, 'epoch': 0.62} +{'loss': 1.6148, 'grad_norm': 3.7380740642547607, 'learning_rate': 4.429006471259993e-06, 'epoch': 0.62} +{'loss': 1.6056, 'grad_norm': 3.782078981399536, 'learning_rate': 4.4271031594975265e-06, 'epoch': 0.62} +{'loss': 1.5756, 'grad_norm': 3.328213930130005, 'learning_rate': 4.425199847735059e-06, 'epoch': 0.63} +{'loss': 1.5763, 'grad_norm': 3.3279001712799072, 'learning_rate': 4.423296535972592e-06, 'epoch': 0.63} +{'eval_loss': 1.9433379173278809, 'eval_runtime': 19.5704, 'eval_samples_per_second': 51.098, 'eval_steps_per_second': 2.146, 'epoch': 0.63} +{'loss': 1.6068, 'grad_norm': 3.653395891189575, 'learning_rate': 4.421393224210126e-06, 'epoch': 0.63} +{'loss': 1.5731, 'grad_norm': 3.353541851043701, 'learning_rate': 4.419489912447659e-06, 'epoch': 0.63} +{'loss': 1.658, 'grad_norm': 3.5061612129211426, 'learning_rate': 4.4175866006851924e-06, 'epoch': 0.63} +{'loss': 1.6205, 'grad_norm': 3.140570878982544, 'learning_rate': 4.415683288922726e-06, 'epoch': 0.63} +{'loss': 1.5993, 'grad_norm': 3.5273332595825195, 'learning_rate': 4.413779977160259e-06, 'epoch': 0.63} +{'loss': 1.6063, 'grad_norm': 3.416485548019409, 'learning_rate': 4.4118766653977925e-06, 'epoch': 0.63} +{'loss': 1.6723, 'grad_norm': 3.6776890754699707, 'learning_rate': 4.409973353635326e-06, 'epoch': 0.63} +{'loss': 1.6211, 'grad_norm': 3.5991504192352295, 'learning_rate': 4.408070041872859e-06, 'epoch': 0.63} +{'loss': 1.5766, 'grad_norm': 3.399278163909912, 'learning_rate': 4.406166730110393e-06, 'epoch': 0.64} +{'loss': 1.5633, 'grad_norm': 3.2386038303375244, 'learning_rate': 4.404263418347925e-06, 'epoch': 0.64} +{'eval_loss': 1.9623126983642578, 'eval_runtime': 19.5848, 'eval_samples_per_second': 51.06, 'eval_steps_per_second': 2.145, 'epoch': 0.64} +{'loss': 1.5837, 'grad_norm': 3.5039336681365967, 'learning_rate': 4.402360106585459e-06, 'epoch': 0.64} +{'loss': 1.5791, 'grad_norm': 3.502223491668701, 'learning_rate': 4.400456794822993e-06, 'epoch': 0.64} +{'loss': 1.6046, 'grad_norm': 3.664015293121338, 'learning_rate': 4.398553483060526e-06, 'epoch': 0.64} +{'loss': 1.5753, 'grad_norm': 4.016345024108887, 'learning_rate': 4.396650171298059e-06, 'epoch': 0.64} +{'loss': 1.5864, 'grad_norm': 3.4624314308166504, 'learning_rate': 4.394746859535593e-06, 'epoch': 0.64} +{'loss': 1.5606, 'grad_norm': 3.274582624435425, 'learning_rate': 4.392843547773125e-06, 'epoch': 0.64} +{'loss': 1.5838, 'grad_norm': 3.3756613731384277, 'learning_rate': 4.390940236010659e-06, 'epoch': 0.64} +{'loss': 1.5958, 'grad_norm': 3.608527898788452, 'learning_rate': 4.389036924248192e-06, 'epoch': 0.65} +{'loss': 1.6357, 'grad_norm': 3.5510783195495605, 'learning_rate': 4.387133612485725e-06, 'epoch': 0.65} +{'loss': 1.5957, 'grad_norm': 3.308255672454834, 'learning_rate': 4.385230300723259e-06, 'epoch': 0.65} +{'eval_loss': 1.9575309753417969, 'eval_runtime': 19.5709, 'eval_samples_per_second': 51.096, 'eval_steps_per_second': 2.146, 'epoch': 0.65} +{'loss': 1.5979, 'grad_norm': 3.7094995975494385, 'learning_rate': 4.383326988960792e-06, 'epoch': 0.65} +{'loss': 1.5684, 'grad_norm': 3.5525643825531006, 'learning_rate': 4.3814236771983255e-06, 'epoch': 0.65} +{'loss': 1.6383, 'grad_norm': 3.4564590454101562, 'learning_rate': 4.379520365435859e-06, 'epoch': 0.65} +{'loss': 1.6154, 'grad_norm': 3.7914085388183594, 'learning_rate': 4.377617053673392e-06, 'epoch': 0.65} +{'loss': 1.6051, 'grad_norm': 3.4316725730895996, 'learning_rate': 4.3757137419109256e-06, 'epoch': 0.65} +{'loss': 1.6734, 'grad_norm': 3.5904269218444824, 'learning_rate': 4.373810430148459e-06, 'epoch': 0.65} +{'loss': 1.5725, 'grad_norm': 3.408034563064575, 'learning_rate': 4.3719071183859914e-06, 'epoch': 0.65} +{'loss': 1.6312, 'grad_norm': 3.8338141441345215, 'learning_rate': 4.370003806623525e-06, 'epoch': 0.66} +{'loss': 1.5669, 'grad_norm': 3.398602247238159, 'learning_rate': 4.368100494861058e-06, 'epoch': 0.66} +{'loss': 1.6158, 'grad_norm': 3.2189407348632812, 'learning_rate': 4.3661971830985915e-06, 'epoch': 0.66} +{'eval_loss': 1.94804048538208, 'eval_runtime': 19.6317, 'eval_samples_per_second': 50.938, 'eval_steps_per_second': 2.139, 'epoch': 0.66} +{'loss': 1.5719, 'grad_norm': 3.8089427947998047, 'learning_rate': 4.364293871336126e-06, 'epoch': 0.66} +{'loss': 1.6079, 'grad_norm': 3.644151449203491, 'learning_rate': 4.362390559573659e-06, 'epoch': 0.66} +{'loss': 1.5639, 'grad_norm': 3.2669711112976074, 'learning_rate': 4.3604872478111925e-06, 'epoch': 0.66} +{'loss': 1.5059, 'grad_norm': 3.441220283508301, 'learning_rate': 4.358583936048725e-06, 'epoch': 0.66} +{'loss': 1.6368, 'grad_norm': 4.15876579284668, 'learning_rate': 4.356680624286258e-06, 'epoch': 0.66} +{'loss': 1.5965, 'grad_norm': 3.5194272994995117, 'learning_rate': 4.354777312523792e-06, 'epoch': 0.66} +{'loss': 1.6382, 'grad_norm': 3.7779362201690674, 'learning_rate': 4.352874000761325e-06, 'epoch': 0.66} +{'loss': 1.5924, 'grad_norm': 4.217499256134033, 'learning_rate': 4.3509706889988584e-06, 'epoch': 0.67} +{'loss': 1.6587, 'grad_norm': 3.541980504989624, 'learning_rate': 4.349067377236392e-06, 'epoch': 0.67} +{'loss': 1.6472, 'grad_norm': 3.880202293395996, 'learning_rate': 4.347164065473925e-06, 'epoch': 0.67} +{'eval_loss': 1.9581282138824463, 'eval_runtime': 19.6179, 'eval_samples_per_second': 50.974, 'eval_steps_per_second': 2.141, 'epoch': 0.67} +{'loss': 1.6605, 'grad_norm': 3.676569700241089, 'learning_rate': 4.3452607537114585e-06, 'epoch': 0.67} +{'loss': 1.5806, 'grad_norm': 3.6324715614318848, 'learning_rate': 4.343357441948992e-06, 'epoch': 0.67} +{'loss': 1.6195, 'grad_norm': 3.4637959003448486, 'learning_rate': 4.341454130186525e-06, 'epoch': 0.67} +{'loss': 1.5803, 'grad_norm': 3.9219613075256348, 'learning_rate': 4.339550818424058e-06, 'epoch': 0.67} +{'loss': 1.5591, 'grad_norm': 3.328799247741699, 'learning_rate': 4.337647506661591e-06, 'epoch': 0.67} +{'loss': 1.5894, 'grad_norm': 3.9095053672790527, 'learning_rate': 4.3357441948991245e-06, 'epoch': 0.67} +{'loss': 1.634, 'grad_norm': 3.7123284339904785, 'learning_rate': 4.333840883136658e-06, 'epoch': 0.67} +{'loss': 1.5707, 'grad_norm': 3.65523099899292, 'learning_rate': 4.331937571374191e-06, 'epoch': 0.68} +{'loss': 1.5808, 'grad_norm': 3.5030550956726074, 'learning_rate': 4.3300342596117246e-06, 'epoch': 0.68} +{'loss': 1.5883, 'grad_norm': 3.82133412361145, 'learning_rate': 4.328130947849258e-06, 'epoch': 0.68} +{'eval_loss': 1.9302200078964233, 'eval_runtime': 19.5626, 'eval_samples_per_second': 51.118, 'eval_steps_per_second': 2.147, 'epoch': 0.68} +{'loss': 1.5401, 'grad_norm': 3.487764596939087, 'learning_rate': 4.326227636086791e-06, 'epoch': 0.68} +{'loss': 1.5979, 'grad_norm': 3.6610918045043945, 'learning_rate': 4.324324324324325e-06, 'epoch': 0.68} +{'loss': 1.5961, 'grad_norm': 3.30855131149292, 'learning_rate': 4.322421012561858e-06, 'epoch': 0.68} +{'loss': 1.5852, 'grad_norm': 3.4520370960235596, 'learning_rate': 4.320517700799391e-06, 'epoch': 0.68} +{'loss': 1.6818, 'grad_norm': 3.952258586883545, 'learning_rate': 4.318614389036925e-06, 'epoch': 0.68} +{'loss': 1.6402, 'grad_norm': 3.534264326095581, 'learning_rate': 4.316711077274458e-06, 'epoch': 0.68} +{'loss': 1.549, 'grad_norm': 3.5959432125091553, 'learning_rate': 4.3148077655119915e-06, 'epoch': 0.69} +{'loss': 1.6559, 'grad_norm': 3.435608148574829, 'learning_rate': 4.312904453749525e-06, 'epoch': 0.69} +{'loss': 1.6019, 'grad_norm': 3.4857850074768066, 'learning_rate': 4.311001141987058e-06, 'epoch': 0.69} +{'loss': 1.6047, 'grad_norm': 3.574498176574707, 'learning_rate': 4.3090978302245916e-06, 'epoch': 0.69} +{'eval_loss': 1.9390149116516113, 'eval_runtime': 19.5992, 'eval_samples_per_second': 51.023, 'eval_steps_per_second': 2.143, 'epoch': 0.69} +{'loss': 1.6005, 'grad_norm': 3.4558212757110596, 'learning_rate': 4.307194518462124e-06, 'epoch': 0.69} +{'loss': 1.6736, 'grad_norm': 3.555696487426758, 'learning_rate': 4.3052912066996574e-06, 'epoch': 0.69} +{'loss': 1.6468, 'grad_norm': 3.451206922531128, 'learning_rate': 4.303387894937191e-06, 'epoch': 0.69} +{'loss': 1.5679, 'grad_norm': 3.3780364990234375, 'learning_rate': 4.301484583174724e-06, 'epoch': 0.69} +{'loss': 1.5478, 'grad_norm': 3.461742639541626, 'learning_rate': 4.2995812714122575e-06, 'epoch': 0.69} +{'loss': 1.5747, 'grad_norm': 3.4164371490478516, 'learning_rate': 4.297677959649791e-06, 'epoch': 0.69} +{'loss': 1.6118, 'grad_norm': 3.550280809402466, 'learning_rate': 4.295774647887324e-06, 'epoch': 0.7} +{'loss': 1.6257, 'grad_norm': 3.402688503265381, 'learning_rate': 4.293871336124858e-06, 'epoch': 0.7} +{'loss': 1.5994, 'grad_norm': 3.5133583545684814, 'learning_rate': 4.291968024362391e-06, 'epoch': 0.7} +{'loss': 1.6307, 'grad_norm': 3.3304834365844727, 'learning_rate': 4.290064712599924e-06, 'epoch': 0.7} +{'eval_loss': 1.9346461296081543, 'eval_runtime': 19.5492, 'eval_samples_per_second': 51.153, 'eval_steps_per_second': 2.148, 'epoch': 0.7} +{'loss': 1.6205, 'grad_norm': 3.4878392219543457, 'learning_rate': 4.288161400837458e-06, 'epoch': 0.7} +{'loss': 1.5404, 'grad_norm': 3.3085005283355713, 'learning_rate': 4.286258089074991e-06, 'epoch': 0.7} +{'loss': 1.6138, 'grad_norm': 3.1597721576690674, 'learning_rate': 4.284354777312524e-06, 'epoch': 0.7} +{'loss': 1.5716, 'grad_norm': 3.411226511001587, 'learning_rate': 4.282451465550058e-06, 'epoch': 0.7} +{'loss': 1.5886, 'grad_norm': 3.4839115142822266, 'learning_rate': 4.280548153787591e-06, 'epoch': 0.7} +{'loss': 1.5855, 'grad_norm': 3.2873077392578125, 'learning_rate': 4.2786448420251245e-06, 'epoch': 0.7} +{'loss': 1.5552, 'grad_norm': 3.3072986602783203, 'learning_rate': 4.276741530262658e-06, 'epoch': 0.71} +{'loss': 1.5727, 'grad_norm': 3.412501573562622, 'learning_rate': 4.27483821850019e-06, 'epoch': 0.71} +{'loss': 1.6506, 'grad_norm': 3.3539018630981445, 'learning_rate': 4.272934906737724e-06, 'epoch': 0.71} +{'loss': 1.634, 'grad_norm': 3.4645464420318604, 'learning_rate': 4.271031594975257e-06, 'epoch': 0.71} +{'eval_loss': 1.946365237236023, 'eval_runtime': 19.5545, 'eval_samples_per_second': 51.139, 'eval_steps_per_second': 2.148, 'epoch': 0.71} +{'loss': 1.6257, 'grad_norm': 3.4149861335754395, 'learning_rate': 4.2691282832127905e-06, 'epoch': 0.71} +{'loss': 1.5856, 'grad_norm': 3.55535626411438, 'learning_rate': 4.267224971450324e-06, 'epoch': 0.71} +{'loss': 1.5303, 'grad_norm': 3.504443883895874, 'learning_rate': 4.265321659687857e-06, 'epoch': 0.71} +{'loss': 1.5716, 'grad_norm': 3.848287343978882, 'learning_rate': 4.2634183479253906e-06, 'epoch': 0.71} +{'loss': 1.5533, 'grad_norm': 3.653019905090332, 'learning_rate': 4.261515036162924e-06, 'epoch': 0.71} +{'loss': 1.5877, 'grad_norm': 3.5746395587921143, 'learning_rate': 4.259611724400457e-06, 'epoch': 0.71} +{'loss': 1.5629, 'grad_norm': 3.5542399883270264, 'learning_rate': 4.257708412637991e-06, 'epoch': 0.72} +{'loss': 1.5121, 'grad_norm': 3.3521649837493896, 'learning_rate': 4.255805100875523e-06, 'epoch': 0.72} +{'loss': 1.541, 'grad_norm': 3.580216407775879, 'learning_rate': 4.2539017891130565e-06, 'epoch': 0.72} +{'loss': 1.6338, 'grad_norm': 3.9466676712036133, 'learning_rate': 4.25199847735059e-06, 'epoch': 0.72} +{'eval_loss': 1.9395411014556885, 'eval_runtime': 19.5656, 'eval_samples_per_second': 51.11, 'eval_steps_per_second': 2.147, 'epoch': 0.72} +{'loss': 1.5366, 'grad_norm': 3.7393381595611572, 'learning_rate': 4.250095165588123e-06, 'epoch': 0.72} +{'loss': 1.6053, 'grad_norm': 3.377337694168091, 'learning_rate': 4.2481918538256575e-06, 'epoch': 0.72} +{'loss': 1.5717, 'grad_norm': 4.053131580352783, 'learning_rate': 4.246288542063191e-06, 'epoch': 0.72} +{'loss': 1.603, 'grad_norm': 3.5684690475463867, 'learning_rate': 4.244385230300724e-06, 'epoch': 0.72} +{'loss': 1.5516, 'grad_norm': 3.3873133659362793, 'learning_rate': 4.242481918538257e-06, 'epoch': 0.72} +{'loss': 1.576, 'grad_norm': 3.5022854804992676, 'learning_rate': 4.24057860677579e-06, 'epoch': 0.73} +{'loss': 1.6259, 'grad_norm': 3.346998453140259, 'learning_rate': 4.2386752950133234e-06, 'epoch': 0.73} +{'loss': 1.5665, 'grad_norm': 3.3268625736236572, 'learning_rate': 4.236771983250857e-06, 'epoch': 0.73} +{'loss': 1.555, 'grad_norm': 3.785653829574585, 'learning_rate': 4.23486867148839e-06, 'epoch': 0.73} +{'loss': 1.5863, 'grad_norm': 3.7190122604370117, 'learning_rate': 4.2329653597259235e-06, 'epoch': 0.73} +{'eval_loss': 1.9305882453918457, 'eval_runtime': 19.5469, 'eval_samples_per_second': 51.159, 'eval_steps_per_second': 2.149, 'epoch': 0.73} +{'loss': 1.6074, 'grad_norm': 3.5142900943756104, 'learning_rate': 4.231062047963457e-06, 'epoch': 0.73} +{'loss': 1.6369, 'grad_norm': 3.5274715423583984, 'learning_rate': 4.22915873620099e-06, 'epoch': 0.73} +{'loss': 1.5946, 'grad_norm': 3.6084325313568115, 'learning_rate': 4.227255424438524e-06, 'epoch': 0.73} +{'loss': 1.6349, 'grad_norm': 3.5243382453918457, 'learning_rate': 4.225352112676057e-06, 'epoch': 0.73} +{'loss': 1.5631, 'grad_norm': 3.4704911708831787, 'learning_rate': 4.2234488009135895e-06, 'epoch': 0.73} +{'loss': 1.5175, 'grad_norm': 3.2883381843566895, 'learning_rate': 4.221545489151123e-06, 'epoch': 0.74} +{'loss': 1.5565, 'grad_norm': 3.3891568183898926, 'learning_rate': 4.219642177388656e-06, 'epoch': 0.74} +{'loss': 1.591, 'grad_norm': 3.9369943141937256, 'learning_rate': 4.2177388656261896e-06, 'epoch': 0.74} +{'loss': 1.5464, 'grad_norm': 3.455124855041504, 'learning_rate': 4.215835553863723e-06, 'epoch': 0.74} +{'loss': 1.5062, 'grad_norm': 3.562852144241333, 'learning_rate': 4.213932242101256e-06, 'epoch': 0.74} +{'eval_loss': 1.9313923120498657, 'eval_runtime': 19.5459, 'eval_samples_per_second': 51.162, 'eval_steps_per_second': 2.149, 'epoch': 0.74} +{'loss': 1.5699, 'grad_norm': 3.658515453338623, 'learning_rate': 4.21202893033879e-06, 'epoch': 0.74} +{'loss': 1.5891, 'grad_norm': 3.2583394050598145, 'learning_rate': 4.210125618576323e-06, 'epoch': 0.74} +{'loss': 1.5815, 'grad_norm': 3.6828389167785645, 'learning_rate': 4.208222306813856e-06, 'epoch': 0.74} +{'loss': 1.6394, 'grad_norm': 3.575493335723877, 'learning_rate': 4.20631899505139e-06, 'epoch': 0.74} +{'loss': 1.4972, 'grad_norm': 3.502178430557251, 'learning_rate': 4.204415683288923e-06, 'epoch': 0.74} +{'loss': 1.6009, 'grad_norm': 3.8077073097229004, 'learning_rate': 4.2025123715264565e-06, 'epoch': 0.75} +{'loss': 1.5502, 'grad_norm': 3.504354953765869, 'learning_rate': 4.20060905976399e-06, 'epoch': 0.75} +{'loss': 1.5094, 'grad_norm': 3.1941378116607666, 'learning_rate': 4.198705748001523e-06, 'epoch': 0.75} +{'loss': 1.6216, 'grad_norm': 3.4790117740631104, 'learning_rate': 4.1968024362390566e-06, 'epoch': 0.75} +{'loss': 1.5787, 'grad_norm': 3.38461971282959, 'learning_rate': 4.19489912447659e-06, 'epoch': 0.75} +{'eval_loss': 1.9350517988204956, 'eval_runtime': 19.5418, 'eval_samples_per_second': 51.172, 'eval_steps_per_second': 2.149, 'epoch': 0.75} +{'loss': 1.6373, 'grad_norm': 3.4787306785583496, 'learning_rate': 4.192995812714123e-06, 'epoch': 0.75} +{'loss': 1.5814, 'grad_norm': 3.502950429916382, 'learning_rate': 4.191092500951656e-06, 'epoch': 0.75} +{'loss': 1.5549, 'grad_norm': 3.4033455848693848, 'learning_rate': 4.189189189189189e-06, 'epoch': 0.75} +{'loss': 1.5501, 'grad_norm': 3.6077418327331543, 'learning_rate': 4.1872858774267225e-06, 'epoch': 0.75} +{'loss': 1.623, 'grad_norm': 3.633230686187744, 'learning_rate': 4.185382565664256e-06, 'epoch': 0.76} +{'loss': 1.5476, 'grad_norm': 3.238649368286133, 'learning_rate': 4.183479253901789e-06, 'epoch': 0.76} +{'loss': 1.5226, 'grad_norm': 3.547560214996338, 'learning_rate': 4.181575942139323e-06, 'epoch': 0.76} +{'loss': 1.5718, 'grad_norm': 3.488426923751831, 'learning_rate': 4.179672630376856e-06, 'epoch': 0.76} +{'loss': 1.5462, 'grad_norm': 3.272162437438965, 'learning_rate': 4.177769318614389e-06, 'epoch': 0.76} +{'loss': 1.6342, 'grad_norm': 3.4034695625305176, 'learning_rate': 4.175866006851923e-06, 'epoch': 0.76} +{'eval_loss': 1.9211115837097168, 'eval_runtime': 19.5565, 'eval_samples_per_second': 51.134, 'eval_steps_per_second': 2.148, 'epoch': 0.76} +{'loss': 1.6015, 'grad_norm': 3.5311338901519775, 'learning_rate': 4.173962695089456e-06, 'epoch': 0.76} +{'loss': 1.5817, 'grad_norm': 3.6152451038360596, 'learning_rate': 4.1720593833269894e-06, 'epoch': 0.76} +{'loss': 1.5915, 'grad_norm': 3.327263832092285, 'learning_rate': 4.170156071564523e-06, 'epoch': 0.76} +{'loss': 1.6647, 'grad_norm': 3.3626327514648438, 'learning_rate': 4.168252759802056e-06, 'epoch': 0.76} +{'loss': 1.5693, 'grad_norm': 3.3139383792877197, 'learning_rate': 4.1663494480395895e-06, 'epoch': 0.77} +{'loss': 1.6097, 'grad_norm': 3.538400888442993, 'learning_rate': 4.164446136277123e-06, 'epoch': 0.77} +{'loss': 1.6043, 'grad_norm': 3.1531734466552734, 'learning_rate': 4.162542824514656e-06, 'epoch': 0.77} +{'loss': 1.5733, 'grad_norm': 3.2067229747772217, 'learning_rate': 4.16063951275219e-06, 'epoch': 0.77} +{'loss': 1.5308, 'grad_norm': 3.212096691131592, 'learning_rate': 4.158736200989722e-06, 'epoch': 0.77} +{'loss': 1.5744, 'grad_norm': 3.7210311889648438, 'learning_rate': 4.1568328892272555e-06, 'epoch': 0.77} +{'eval_loss': 1.932050347328186, 'eval_runtime': 19.54, 'eval_samples_per_second': 51.177, 'eval_steps_per_second': 2.149, 'epoch': 0.77} +{'loss': 1.5886, 'grad_norm': 3.5126585960388184, 'learning_rate': 4.154929577464789e-06, 'epoch': 0.77} +{'loss': 1.5798, 'grad_norm': 3.452091932296753, 'learning_rate': 4.153026265702322e-06, 'epoch': 0.77} +{'loss': 1.5213, 'grad_norm': 3.7233076095581055, 'learning_rate': 4.1511229539398556e-06, 'epoch': 0.77} +{'loss': 1.5632, 'grad_norm': 3.3714747428894043, 'learning_rate': 4.149219642177389e-06, 'epoch': 0.77} +{'loss': 1.5822, 'grad_norm': 3.421003580093384, 'learning_rate': 4.147316330414922e-06, 'epoch': 0.78} +{'loss': 1.4943, 'grad_norm': 3.476848840713501, 'learning_rate': 4.145413018652456e-06, 'epoch': 0.78} +{'loss': 1.5946, 'grad_norm': 3.837141752243042, 'learning_rate': 4.143509706889989e-06, 'epoch': 0.78} +{'loss': 1.5725, 'grad_norm': 3.490628719329834, 'learning_rate': 4.141606395127522e-06, 'epoch': 0.78} +{'loss': 1.5645, 'grad_norm': 3.7314906120300293, 'learning_rate': 4.139703083365056e-06, 'epoch': 0.78} +{'loss': 1.6142, 'grad_norm': 3.7992820739746094, 'learning_rate': 4.137799771602588e-06, 'epoch': 0.78} +{'eval_loss': 1.9260114431381226, 'eval_runtime': 19.5225, 'eval_samples_per_second': 51.223, 'eval_steps_per_second': 2.151, 'epoch': 0.78} +{'loss': 1.5835, 'grad_norm': 3.5042412281036377, 'learning_rate': 4.135896459840122e-06, 'epoch': 0.78} +{'loss': 1.5378, 'grad_norm': 3.728440046310425, 'learning_rate': 4.133993148077656e-06, 'epoch': 0.78} +{'loss': 1.5607, 'grad_norm': 3.5068819522857666, 'learning_rate': 4.132089836315189e-06, 'epoch': 0.78} +{'loss': 1.5751, 'grad_norm': 3.941760540008545, 'learning_rate': 4.1301865245527226e-06, 'epoch': 0.78} +{'loss': 1.5239, 'grad_norm': 3.434079170227051, 'learning_rate': 4.128283212790256e-06, 'epoch': 0.79} +{'loss': 1.4902, 'grad_norm': 3.3499388694763184, 'learning_rate': 4.1263799010277884e-06, 'epoch': 0.79} +{'loss': 1.6226, 'grad_norm': 3.413666009902954, 'learning_rate': 4.124476589265322e-06, 'epoch': 0.79} +{'loss': 1.5485, 'grad_norm': 3.3497467041015625, 'learning_rate': 4.122573277502855e-06, 'epoch': 0.79} +{'loss': 1.5433, 'grad_norm': 3.391866683959961, 'learning_rate': 4.1206699657403885e-06, 'epoch': 0.79} +{'loss': 1.5801, 'grad_norm': 3.5316786766052246, 'learning_rate': 4.118766653977922e-06, 'epoch': 0.79} +{'eval_loss': 1.9219926595687866, 'eval_runtime': 19.5457, 'eval_samples_per_second': 51.162, 'eval_steps_per_second': 2.149, 'epoch': 0.79} +{'loss': 1.5539, 'grad_norm': 3.8433356285095215, 'learning_rate': 4.116863342215455e-06, 'epoch': 0.79} +{'loss': 1.5357, 'grad_norm': 3.3780910968780518, 'learning_rate': 4.114960030452989e-06, 'epoch': 0.79} +{'loss': 1.5647, 'grad_norm': 3.5539674758911133, 'learning_rate': 4.113056718690522e-06, 'epoch': 0.79} +{'loss': 1.5147, 'grad_norm': 3.340595006942749, 'learning_rate': 4.111153406928055e-06, 'epoch': 0.8} +{'loss': 1.6003, 'grad_norm': 3.3783743381500244, 'learning_rate': 4.109250095165589e-06, 'epoch': 0.8} +{'loss': 1.6151, 'grad_norm': 3.453640937805176, 'learning_rate': 4.107346783403122e-06, 'epoch': 0.8} +{'loss': 1.5507, 'grad_norm': 3.0727345943450928, 'learning_rate': 4.105443471640655e-06, 'epoch': 0.8} +{'loss': 1.531, 'grad_norm': 3.5906646251678467, 'learning_rate': 4.103540159878188e-06, 'epoch': 0.8} +{'loss': 1.5804, 'grad_norm': 3.396772623062134, 'learning_rate': 4.101636848115721e-06, 'epoch': 0.8} +{'loss': 1.5346, 'grad_norm': 3.358675956726074, 'learning_rate': 4.099733536353255e-06, 'epoch': 0.8} +{'eval_loss': 1.9109572172164917, 'eval_runtime': 19.5444, 'eval_samples_per_second': 51.166, 'eval_steps_per_second': 2.149, 'epoch': 0.8} +{'loss': 1.5775, 'grad_norm': 3.513740301132202, 'learning_rate': 4.097830224590788e-06, 'epoch': 0.8} +{'loss': 1.6669, 'grad_norm': 3.5253264904022217, 'learning_rate': 4.095926912828322e-06, 'epoch': 0.8} +{'loss': 1.6177, 'grad_norm': 3.437427282333374, 'learning_rate': 4.094023601065855e-06, 'epoch': 0.8} +{'loss': 1.5659, 'grad_norm': 3.1867613792419434, 'learning_rate': 4.092120289303388e-06, 'epoch': 0.81} +{'loss': 1.5705, 'grad_norm': 3.2600531578063965, 'learning_rate': 4.0902169775409215e-06, 'epoch': 0.81} +{'loss': 1.5196, 'grad_norm': 3.3169174194335938, 'learning_rate': 4.088313665778455e-06, 'epoch': 0.81} +{'loss': 1.5561, 'grad_norm': 3.5691440105438232, 'learning_rate': 4.086410354015988e-06, 'epoch': 0.81} +{'loss': 1.5322, 'grad_norm': 3.5287435054779053, 'learning_rate': 4.0845070422535216e-06, 'epoch': 0.81} +{'loss': 1.5444, 'grad_norm': 3.7393321990966797, 'learning_rate': 4.082603730491055e-06, 'epoch': 0.81} +{'loss': 1.587, 'grad_norm': 3.682798385620117, 'learning_rate': 4.080700418728588e-06, 'epoch': 0.81} +{'eval_loss': 1.9288097620010376, 'eval_runtime': 19.5515, 'eval_samples_per_second': 51.147, 'eval_steps_per_second': 2.148, 'epoch': 0.81} +{'loss': 1.5674, 'grad_norm': 3.786658763885498, 'learning_rate': 4.078797106966122e-06, 'epoch': 0.81} +{'loss': 1.4679, 'grad_norm': 3.2689034938812256, 'learning_rate': 4.076893795203655e-06, 'epoch': 0.81} +{'loss': 1.5332, 'grad_norm': 3.36384654045105, 'learning_rate': 4.074990483441188e-06, 'epoch': 0.81} +{'loss': 1.6152, 'grad_norm': 3.4345686435699463, 'learning_rate': 4.073087171678721e-06, 'epoch': 0.82} +{'loss': 1.5916, 'grad_norm': 3.3679862022399902, 'learning_rate': 4.071183859916254e-06, 'epoch': 0.82} +{'loss': 1.5673, 'grad_norm': 3.316446304321289, 'learning_rate': 4.069280548153788e-06, 'epoch': 0.82} +{'loss': 1.52, 'grad_norm': 3.40260648727417, 'learning_rate': 4.067377236391321e-06, 'epoch': 0.82} +{'loss': 1.5606, 'grad_norm': 3.45881986618042, 'learning_rate': 4.065473924628854e-06, 'epoch': 0.82} +{'loss': 1.5966, 'grad_norm': 3.4122838973999023, 'learning_rate': 4.063570612866388e-06, 'epoch': 0.82} +{'loss': 1.5878, 'grad_norm': 3.7200167179107666, 'learning_rate': 4.061667301103921e-06, 'epoch': 0.82} +{'eval_loss': 1.936495304107666, 'eval_runtime': 19.5528, 'eval_samples_per_second': 51.143, 'eval_steps_per_second': 2.148, 'epoch': 0.82} +{'loss': 1.5533, 'grad_norm': 3.638312816619873, 'learning_rate': 4.0597639893414544e-06, 'epoch': 0.82} +{'loss': 1.5643, 'grad_norm': 3.3524715900421143, 'learning_rate': 4.057860677578988e-06, 'epoch': 0.82} +{'loss': 1.5806, 'grad_norm': 3.435063123703003, 'learning_rate': 4.055957365816521e-06, 'epoch': 0.82} +{'loss': 1.5279, 'grad_norm': 3.757676124572754, 'learning_rate': 4.0540540540540545e-06, 'epoch': 0.83} +{'loss': 1.5205, 'grad_norm': 3.4565200805664062, 'learning_rate': 4.052150742291588e-06, 'epoch': 0.83} +{'loss': 1.5141, 'grad_norm': 3.3716468811035156, 'learning_rate': 4.050247430529121e-06, 'epoch': 0.83} +{'loss': 1.5984, 'grad_norm': 4.090661525726318, 'learning_rate': 4.048344118766655e-06, 'epoch': 0.83} +{'loss': 1.5971, 'grad_norm': 3.572295665740967, 'learning_rate': 4.046440807004188e-06, 'epoch': 0.83} +{'loss': 1.5648, 'grad_norm': 3.6160526275634766, 'learning_rate': 4.044537495241721e-06, 'epoch': 0.83} +{'loss': 1.5387, 'grad_norm': 3.7830121517181396, 'learning_rate': 4.042634183479255e-06, 'epoch': 0.83} +{'eval_loss': 1.916287899017334, 'eval_runtime': 19.56, 'eval_samples_per_second': 51.125, 'eval_steps_per_second': 2.147, 'epoch': 0.83} +{'loss': 1.5969, 'grad_norm': 3.651431083679199, 'learning_rate': 4.040730871716787e-06, 'epoch': 0.83} +{'loss': 1.5634, 'grad_norm': 3.3558826446533203, 'learning_rate': 4.038827559954321e-06, 'epoch': 0.83} +{'loss': 1.6133, 'grad_norm': 3.4205617904663086, 'learning_rate': 4.036924248191854e-06, 'epoch': 0.84} +{'loss': 1.5876, 'grad_norm': 3.4842634201049805, 'learning_rate': 4.035020936429387e-06, 'epoch': 0.84} +{'loss': 1.5537, 'grad_norm': 3.456399440765381, 'learning_rate': 4.033117624666921e-06, 'epoch': 0.84} +{'loss': 1.5321, 'grad_norm': 3.5076637268066406, 'learning_rate': 4.031214312904454e-06, 'epoch': 0.84} +{'loss': 1.517, 'grad_norm': 3.677919864654541, 'learning_rate': 4.029311001141987e-06, 'epoch': 0.84} +{'loss': 1.4867, 'grad_norm': 3.3499341011047363, 'learning_rate': 4.027407689379521e-06, 'epoch': 0.84} +{'loss': 1.5505, 'grad_norm': 3.3247077465057373, 'learning_rate': 4.025504377617054e-06, 'epoch': 0.84} +{'loss': 1.5264, 'grad_norm': 3.278609037399292, 'learning_rate': 4.0236010658545875e-06, 'epoch': 0.84} +{'eval_loss': 1.9164146184921265, 'eval_runtime': 19.5997, 'eval_samples_per_second': 51.021, 'eval_steps_per_second': 2.143, 'epoch': 0.84} +{'loss': 1.5749, 'grad_norm': 3.4496920108795166, 'learning_rate': 4.02169775409212e-06, 'epoch': 0.84} +{'loss': 1.6287, 'grad_norm': 3.541743040084839, 'learning_rate': 4.019794442329653e-06, 'epoch': 0.84} +{'loss': 1.4995, 'grad_norm': 3.3573708534240723, 'learning_rate': 4.0178911305671876e-06, 'epoch': 0.85} +{'loss': 1.5876, 'grad_norm': 3.6396191120147705, 'learning_rate': 4.015987818804721e-06, 'epoch': 0.85} +{'loss': 1.5223, 'grad_norm': 3.3554818630218506, 'learning_rate': 4.014084507042254e-06, 'epoch': 0.85} +{'loss': 1.4942, 'grad_norm': 3.3701114654541016, 'learning_rate': 4.012181195279788e-06, 'epoch': 0.85} +{'loss': 1.5975, 'grad_norm': 3.7770161628723145, 'learning_rate': 4.010277883517321e-06, 'epoch': 0.85} +{'loss': 1.5925, 'grad_norm': 3.463390588760376, 'learning_rate': 4.0083745717548535e-06, 'epoch': 0.85} +{'loss': 1.5486, 'grad_norm': 3.477566957473755, 'learning_rate': 4.006471259992387e-06, 'epoch': 0.85} +{'loss': 1.5715, 'grad_norm': 3.5830488204956055, 'learning_rate': 4.00456794822992e-06, 'epoch': 0.85} +{'eval_loss': 1.9171477556228638, 'eval_runtime': 19.5497, 'eval_samples_per_second': 51.152, 'eval_steps_per_second': 2.148, 'epoch': 0.85} +{'loss': 1.614, 'grad_norm': 3.3341407775878906, 'learning_rate': 4.002664636467454e-06, 'epoch': 0.85} +{'loss': 1.5701, 'grad_norm': 3.5394139289855957, 'learning_rate': 4.000761324704987e-06, 'epoch': 0.85} +{'loss': 1.5085, 'grad_norm': 3.667840003967285, 'learning_rate': 3.99885801294252e-06, 'epoch': 0.86} +{'loss': 1.5684, 'grad_norm': 3.418546438217163, 'learning_rate': 3.996954701180054e-06, 'epoch': 0.86} +{'loss': 1.492, 'grad_norm': 3.2795956134796143, 'learning_rate': 3.995051389417587e-06, 'epoch': 0.86} +{'loss': 1.514, 'grad_norm': 3.8579261302948, 'learning_rate': 3.9931480776551204e-06, 'epoch': 0.86} +{'loss': 1.6095, 'grad_norm': 3.4903883934020996, 'learning_rate': 3.991244765892654e-06, 'epoch': 0.86} +{'loss': 1.5299, 'grad_norm': 3.348982334136963, 'learning_rate': 3.989341454130186e-06, 'epoch': 0.86} +{'loss': 1.5012, 'grad_norm': 3.5125813484191895, 'learning_rate': 3.98743814236772e-06, 'epoch': 0.86} +{'loss': 1.5475, 'grad_norm': 3.5819289684295654, 'learning_rate': 3.985534830605253e-06, 'epoch': 0.86} +{'eval_loss': 1.9310911893844604, 'eval_runtime': 19.5479, 'eval_samples_per_second': 51.156, 'eval_steps_per_second': 2.149, 'epoch': 0.86} +{'loss': 1.5689, 'grad_norm': 3.4986045360565186, 'learning_rate': 3.983631518842786e-06, 'epoch': 0.86} +{'loss': 1.5965, 'grad_norm': 3.9191901683807373, 'learning_rate': 3.98172820708032e-06, 'epoch': 0.86} +{'loss': 1.5357, 'grad_norm': 3.6584315299987793, 'learning_rate': 3.979824895317854e-06, 'epoch': 0.87} +{'loss': 1.5116, 'grad_norm': 3.8193325996398926, 'learning_rate': 3.977921583555387e-06, 'epoch': 0.87} +{'loss': 1.5751, 'grad_norm': 3.656949758529663, 'learning_rate': 3.97601827179292e-06, 'epoch': 0.87} +{'loss': 1.5187, 'grad_norm': 3.8172154426574707, 'learning_rate': 3.974114960030453e-06, 'epoch': 0.87} +{'loss': 1.5581, 'grad_norm': 3.66030216217041, 'learning_rate': 3.9722116482679866e-06, 'epoch': 0.87} +{'loss': 1.546, 'grad_norm': 3.8292956352233887, 'learning_rate': 3.97030833650552e-06, 'epoch': 0.87} +{'loss': 1.5541, 'grad_norm': 4.047486782073975, 'learning_rate': 3.968405024743053e-06, 'epoch': 0.87} +{'loss': 1.4707, 'grad_norm': 3.248772382736206, 'learning_rate': 3.966501712980587e-06, 'epoch': 0.87} +{'eval_loss': 1.9297550916671753, 'eval_runtime': 19.5377, 'eval_samples_per_second': 51.183, 'eval_steps_per_second': 2.15, 'epoch': 0.87} +{'loss': 1.5289, 'grad_norm': 4.014291763305664, 'learning_rate': 3.96459840121812e-06, 'epoch': 0.87} +{'loss': 1.5385, 'grad_norm': 3.6265480518341064, 'learning_rate': 3.962695089455653e-06, 'epoch': 0.88} +{'loss': 1.5837, 'grad_norm': 3.5628662109375, 'learning_rate': 3.960791777693187e-06, 'epoch': 0.88} +{'loss': 1.5788, 'grad_norm': 4.0754714012146, 'learning_rate': 3.95888846593072e-06, 'epoch': 0.88} +{'loss': 1.5634, 'grad_norm': 3.143691062927246, 'learning_rate': 3.956985154168253e-06, 'epoch': 0.88} +{'loss': 1.6052, 'grad_norm': 3.70245361328125, 'learning_rate': 3.955081842405786e-06, 'epoch': 0.88} +{'loss': 1.6234, 'grad_norm': 3.468219041824341, 'learning_rate': 3.953178530643319e-06, 'epoch': 0.88} +{'loss': 1.6475, 'grad_norm': 3.613459825515747, 'learning_rate': 3.951275218880853e-06, 'epoch': 0.88} +{'loss': 1.4767, 'grad_norm': 3.5653300285339355, 'learning_rate': 3.949371907118386e-06, 'epoch': 0.88} +{'loss': 1.5902, 'grad_norm': 3.6431901454925537, 'learning_rate': 3.9474685953559194e-06, 'epoch': 0.88} +{'eval_loss': 1.9259147644042969, 'eval_runtime': 19.5373, 'eval_samples_per_second': 51.184, 'eval_steps_per_second': 2.15, 'epoch': 0.88} +{'loss': 1.5439, 'grad_norm': 3.527979612350464, 'learning_rate': 3.945565283593453e-06, 'epoch': 0.88} +{'loss': 1.5431, 'grad_norm': 3.52705717086792, 'learning_rate': 3.943661971830986e-06, 'epoch': 0.89} +{'loss': 1.5932, 'grad_norm': 3.2424206733703613, 'learning_rate': 3.9417586600685195e-06, 'epoch': 0.89} +{'loss': 1.5719, 'grad_norm': 3.498383045196533, 'learning_rate': 3.939855348306053e-06, 'epoch': 0.89} +{'loss': 1.5623, 'grad_norm': 3.585148811340332, 'learning_rate': 3.937952036543586e-06, 'epoch': 0.89} +{'loss': 1.5614, 'grad_norm': 3.381115436553955, 'learning_rate': 3.93604872478112e-06, 'epoch': 0.89} +{'loss': 1.5758, 'grad_norm': 3.9484786987304688, 'learning_rate': 3.934145413018653e-06, 'epoch': 0.89} +{'loss': 1.5212, 'grad_norm': 3.3912556171417236, 'learning_rate': 3.932242101256186e-06, 'epoch': 0.89} +{'loss': 1.5355, 'grad_norm': 3.4417104721069336, 'learning_rate': 3.93033878949372e-06, 'epoch': 0.89} +{'loss': 1.4937, 'grad_norm': 3.7835729122161865, 'learning_rate': 3.928435477731253e-06, 'epoch': 0.89} +{'eval_loss': 1.9156649112701416, 'eval_runtime': 19.5387, 'eval_samples_per_second': 51.18, 'eval_steps_per_second': 2.15, 'epoch': 0.89} +{'loss': 1.5822, 'grad_norm': 3.4857535362243652, 'learning_rate': 3.9265321659687864e-06, 'epoch': 0.89} +{'loss': 1.5939, 'grad_norm': 3.9433531761169434, 'learning_rate': 3.924628854206319e-06, 'epoch': 0.9} +{'loss': 1.5573, 'grad_norm': 3.438563585281372, 'learning_rate': 3.922725542443852e-06, 'epoch': 0.9} +{'loss': 1.5254, 'grad_norm': 3.606097936630249, 'learning_rate': 3.920822230681386e-06, 'epoch': 0.9} +{'loss': 1.5344, 'grad_norm': 3.5617189407348633, 'learning_rate': 3.918918918918919e-06, 'epoch': 0.9} +{'loss': 1.5599, 'grad_norm': 3.31109619140625, 'learning_rate': 3.917015607156452e-06, 'epoch': 0.9} +{'loss': 1.5404, 'grad_norm': 3.6074013710021973, 'learning_rate': 3.915112295393986e-06, 'epoch': 0.9} +{'loss': 1.634, 'grad_norm': 4.1906046867370605, 'learning_rate': 3.913208983631519e-06, 'epoch': 0.9} +{'loss': 1.6055, 'grad_norm': 3.3201050758361816, 'learning_rate': 3.9113056718690525e-06, 'epoch': 0.9} +{'loss': 1.5261, 'grad_norm': 3.9941864013671875, 'learning_rate': 3.909402360106586e-06, 'epoch': 0.9} +{'eval_loss': 1.9122051000595093, 'eval_runtime': 19.5443, 'eval_samples_per_second': 51.166, 'eval_steps_per_second': 2.149, 'epoch': 0.9} +{'loss': 1.5692, 'grad_norm': 3.733293056488037, 'learning_rate': 3.907499048344119e-06, 'epoch': 0.9} +{'loss': 1.5759, 'grad_norm': 3.5064730644226074, 'learning_rate': 3.905595736581652e-06, 'epoch': 0.91} +{'loss': 1.5766, 'grad_norm': 3.4425575733184814, 'learning_rate': 3.903692424819186e-06, 'epoch': 0.91} +{'loss': 1.5288, 'grad_norm': 3.305995225906372, 'learning_rate': 3.901789113056719e-06, 'epoch': 0.91} +{'loss': 1.5891, 'grad_norm': 3.8600997924804688, 'learning_rate': 3.899885801294253e-06, 'epoch': 0.91} +{'loss': 1.5568, 'grad_norm': 3.294036865234375, 'learning_rate': 3.897982489531786e-06, 'epoch': 0.91} +{'loss': 1.6169, 'grad_norm': 3.7429909706115723, 'learning_rate': 3.896079177769319e-06, 'epoch': 0.91} +{'loss': 1.5837, 'grad_norm': 3.2871479988098145, 'learning_rate': 3.894175866006853e-06, 'epoch': 0.91} +{'loss': 1.6079, 'grad_norm': 3.37500262260437, 'learning_rate': 3.892272554244385e-06, 'epoch': 0.91} +{'loss': 1.5206, 'grad_norm': 3.2661280632019043, 'learning_rate': 3.890369242481919e-06, 'epoch': 0.91} +{'eval_loss': 1.9233078956604004, 'eval_runtime': 19.5326, 'eval_samples_per_second': 51.196, 'eval_steps_per_second': 2.15, 'epoch': 0.91} +{'loss': 1.6006, 'grad_norm': 3.396087169647217, 'learning_rate': 3.888465930719452e-06, 'epoch': 0.92} +{'loss': 1.5325, 'grad_norm': 3.3297715187072754, 'learning_rate': 3.886562618956985e-06, 'epoch': 0.92} +{'loss': 1.5465, 'grad_norm': 3.2287917137145996, 'learning_rate': 3.884659307194519e-06, 'epoch': 0.92} +{'loss': 1.4902, 'grad_norm': 3.152458667755127, 'learning_rate': 3.882755995432052e-06, 'epoch': 0.92} +{'loss': 1.5665, 'grad_norm': 3.3514606952667236, 'learning_rate': 3.8808526836695854e-06, 'epoch': 0.92} +{'loss': 1.5217, 'grad_norm': 3.244168758392334, 'learning_rate': 3.878949371907119e-06, 'epoch': 0.92} +{'loss': 1.5156, 'grad_norm': 3.3506486415863037, 'learning_rate': 3.877046060144652e-06, 'epoch': 0.92} +{'loss': 1.5106, 'grad_norm': 3.148120641708374, 'learning_rate': 3.8751427483821855e-06, 'epoch': 0.92} +{'loss': 1.5471, 'grad_norm': 3.2761573791503906, 'learning_rate': 3.873239436619718e-06, 'epoch': 0.92} +{'loss': 1.4947, 'grad_norm': 3.1500706672668457, 'learning_rate': 3.871336124857251e-06, 'epoch': 0.92} +{'eval_loss': 1.9386425018310547, 'eval_runtime': 19.5374, 'eval_samples_per_second': 51.184, 'eval_steps_per_second': 2.15, 'epoch': 0.92} +{'loss': 1.561, 'grad_norm': 3.5470309257507324, 'learning_rate': 3.869432813094785e-06, 'epoch': 0.93} +{'loss': 1.572, 'grad_norm': 3.289841413497925, 'learning_rate': 3.867529501332318e-06, 'epoch': 0.93} +{'loss': 1.5578, 'grad_norm': 3.2713468074798584, 'learning_rate': 3.8656261895698515e-06, 'epoch': 0.93} +{'loss': 1.5474, 'grad_norm': 3.3061606884002686, 'learning_rate': 3.863722877807386e-06, 'epoch': 0.93} +{'loss': 1.5855, 'grad_norm': 3.4750723838806152, 'learning_rate': 3.861819566044919e-06, 'epoch': 0.93} +{'loss': 1.6158, 'grad_norm': 3.415891170501709, 'learning_rate': 3.859916254282452e-06, 'epoch': 0.93} +{'loss': 1.5328, 'grad_norm': 3.558656692504883, 'learning_rate': 3.858012942519985e-06, 'epoch': 0.93} +{'loss': 1.5554, 'grad_norm': 3.4502146244049072, 'learning_rate': 3.856109630757518e-06, 'epoch': 0.93} +{'loss': 1.5313, 'grad_norm': 3.6229748725891113, 'learning_rate': 3.854206318995052e-06, 'epoch': 0.93} +{'loss': 1.5792, 'grad_norm': 3.5197527408599854, 'learning_rate': 3.852303007232585e-06, 'epoch': 0.93} +{'eval_loss': 1.9303314685821533, 'eval_runtime': 19.5509, 'eval_samples_per_second': 51.149, 'eval_steps_per_second': 2.148, 'epoch': 0.93} +{'loss': 1.5154, 'grad_norm': 3.5002551078796387, 'learning_rate': 3.850399695470118e-06, 'epoch': 0.94} +{'loss': 1.526, 'grad_norm': 3.516631841659546, 'learning_rate': 3.848496383707652e-06, 'epoch': 0.94} +{'loss': 1.513, 'grad_norm': 3.4825432300567627, 'learning_rate': 3.846593071945185e-06, 'epoch': 0.94} +{'loss': 1.4949, 'grad_norm': 3.3698201179504395, 'learning_rate': 3.8446897601827185e-06, 'epoch': 0.94} +{'loss': 1.6213, 'grad_norm': 3.5728020668029785, 'learning_rate': 3.842786448420252e-06, 'epoch': 0.94} +{'loss': 1.6316, 'grad_norm': 3.9078235626220703, 'learning_rate': 3.840883136657785e-06, 'epoch': 0.94} +{'loss': 1.6128, 'grad_norm': 3.4761011600494385, 'learning_rate': 3.838979824895318e-06, 'epoch': 0.94} +{'loss': 1.4942, 'grad_norm': 3.681131601333618, 'learning_rate': 3.837076513132851e-06, 'epoch': 0.94} +{'loss': 1.5709, 'grad_norm': 3.5017917156219482, 'learning_rate': 3.8351732013703844e-06, 'epoch': 0.94} +{'loss': 1.5226, 'grad_norm': 3.396813154220581, 'learning_rate': 3.833269889607918e-06, 'epoch': 0.95} +{'eval_loss': 1.920267105102539, 'eval_runtime': 19.5562, 'eval_samples_per_second': 51.135, 'eval_steps_per_second': 2.148, 'epoch': 0.95} +{'loss': 1.5073, 'grad_norm': 4.064131736755371, 'learning_rate': 3.831366577845451e-06, 'epoch': 0.95} +{'loss': 1.5022, 'grad_norm': 3.629967212677002, 'learning_rate': 3.8294632660829845e-06, 'epoch': 0.95} +{'loss': 1.5407, 'grad_norm': 3.9132370948791504, 'learning_rate': 3.827559954320518e-06, 'epoch': 0.95} +{'loss': 1.5581, 'grad_norm': 3.9181604385375977, 'learning_rate': 3.825656642558051e-06, 'epoch': 0.95} +{'loss': 1.5675, 'grad_norm': 3.5980870723724365, 'learning_rate': 3.823753330795585e-06, 'epoch': 0.95} +{'loss': 1.5556, 'grad_norm': 3.874753475189209, 'learning_rate': 3.821850019033118e-06, 'epoch': 0.95} +{'loss': 1.4953, 'grad_norm': 3.434699058532715, 'learning_rate': 3.819946707270651e-06, 'epoch': 0.95} +{'loss': 1.5367, 'grad_norm': 3.426010847091675, 'learning_rate': 3.818043395508185e-06, 'epoch': 0.95} +{'loss': 1.5443, 'grad_norm': 3.3872179985046387, 'learning_rate': 3.816140083745718e-06, 'epoch': 0.95} +{'loss': 1.6271, 'grad_norm': 3.4363842010498047, 'learning_rate': 3.8142367719832514e-06, 'epoch': 0.96} +{'eval_loss': 1.908841848373413, 'eval_runtime': 19.5815, 'eval_samples_per_second': 51.069, 'eval_steps_per_second': 2.145, 'epoch': 0.96} +{'loss': 1.5137, 'grad_norm': 3.350131034851074, 'learning_rate': 3.8123334602207844e-06, 'epoch': 0.96} +{'loss': 1.5199, 'grad_norm': 3.6111559867858887, 'learning_rate': 3.8104301484583177e-06, 'epoch': 0.96} +{'loss': 1.5162, 'grad_norm': 3.4766688346862793, 'learning_rate': 3.808526836695851e-06, 'epoch': 0.96} +{'loss': 1.5622, 'grad_norm': 3.508544683456421, 'learning_rate': 3.8066235249333845e-06, 'epoch': 0.96} +{'loss': 1.5542, 'grad_norm': 3.3499271869659424, 'learning_rate': 3.804720213170918e-06, 'epoch': 0.96} +{'loss': 1.5442, 'grad_norm': 3.688570499420166, 'learning_rate': 3.8028169014084508e-06, 'epoch': 0.96} +{'loss': 1.5659, 'grad_norm': 3.5620317459106445, 'learning_rate': 3.800913589645984e-06, 'epoch': 0.96} +{'loss': 1.5659, 'grad_norm': 3.589094877243042, 'learning_rate': 3.7990102778835175e-06, 'epoch': 0.96} +{'loss': 1.5585, 'grad_norm': 3.741499662399292, 'learning_rate': 3.797106966121051e-06, 'epoch': 0.96} +{'loss': 1.6238, 'grad_norm': 3.556412696838379, 'learning_rate': 3.7952036543585842e-06, 'epoch': 0.97} +{'eval_loss': 1.9019454717636108, 'eval_runtime': 19.5506, 'eval_samples_per_second': 51.149, 'eval_steps_per_second': 2.148, 'epoch': 0.97} +{'loss': 1.5737, 'grad_norm': 3.8290817737579346, 'learning_rate': 3.793300342596117e-06, 'epoch': 0.97} +{'loss': 1.5925, 'grad_norm': 3.465275526046753, 'learning_rate': 3.7913970308336505e-06, 'epoch': 0.97} +{'loss': 1.5688, 'grad_norm': 3.419767379760742, 'learning_rate': 3.789493719071184e-06, 'epoch': 0.97} +{'loss': 1.5248, 'grad_norm': 3.319511651992798, 'learning_rate': 3.7875904073087177e-06, 'epoch': 0.97} +{'loss': 1.4872, 'grad_norm': 3.669585704803467, 'learning_rate': 3.785687095546251e-06, 'epoch': 0.97} +{'loss': 1.5365, 'grad_norm': 3.569004774093628, 'learning_rate': 3.7837837837837844e-06, 'epoch': 0.97} +{'loss': 1.5032, 'grad_norm': 3.6222283840179443, 'learning_rate': 3.7818804720213178e-06, 'epoch': 0.97} +{'loss': 1.4841, 'grad_norm': 3.3928351402282715, 'learning_rate': 3.7799771602588507e-06, 'epoch': 0.97} +{'loss': 1.4751, 'grad_norm': 3.2599754333496094, 'learning_rate': 3.778073848496384e-06, 'epoch': 0.97} +{'loss': 1.5492, 'grad_norm': 3.322974681854248, 'learning_rate': 3.7761705367339174e-06, 'epoch': 0.98} +{'eval_loss': 1.9191237688064575, 'eval_runtime': 19.5461, 'eval_samples_per_second': 51.161, 'eval_steps_per_second': 2.149, 'epoch': 0.98} +{'loss': 1.5738, 'grad_norm': 3.683272123336792, 'learning_rate': 3.7742672249714508e-06, 'epoch': 0.98} +{'loss': 1.5673, 'grad_norm': 3.0766491889953613, 'learning_rate': 3.772363913208984e-06, 'epoch': 0.98} +{'loss': 1.53, 'grad_norm': 3.14711856842041, 'learning_rate': 3.770460601446517e-06, 'epoch': 0.98} +{'loss': 1.563, 'grad_norm': 3.2930068969726562, 'learning_rate': 3.7685572896840504e-06, 'epoch': 0.98} +{'loss': 1.4984, 'grad_norm': 3.466487169265747, 'learning_rate': 3.766653977921584e-06, 'epoch': 0.98} +{'loss': 1.4479, 'grad_norm': 3.461385488510132, 'learning_rate': 3.764750666159117e-06, 'epoch': 0.98} +{'loss': 1.5492, 'grad_norm': 3.6675615310668945, 'learning_rate': 3.7628473543966505e-06, 'epoch': 0.98} +{'loss': 1.5076, 'grad_norm': 3.6086831092834473, 'learning_rate': 3.7609440426341835e-06, 'epoch': 0.98} +{'loss': 1.5997, 'grad_norm': 3.4152567386627197, 'learning_rate': 3.759040730871717e-06, 'epoch': 0.99} +{'loss': 1.4908, 'grad_norm': 3.4595847129821777, 'learning_rate': 3.75713741910925e-06, 'epoch': 0.99} +{'eval_loss': 1.9150792360305786, 'eval_runtime': 19.544, 'eval_samples_per_second': 51.166, 'eval_steps_per_second': 2.149, 'epoch': 0.99} +{'loss': 1.4826, 'grad_norm': 3.312623977661133, 'learning_rate': 3.7552341073467836e-06, 'epoch': 0.99} +{'loss': 1.5256, 'grad_norm': 3.511017084121704, 'learning_rate': 3.753330795584317e-06, 'epoch': 0.99} +{'loss': 1.5988, 'grad_norm': 3.6644725799560547, 'learning_rate': 3.75142748382185e-06, 'epoch': 0.99} +{'loss': 1.4898, 'grad_norm': 3.262586832046509, 'learning_rate': 3.749524172059384e-06, 'epoch': 0.99} +{'loss': 1.5442, 'grad_norm': 3.4980480670928955, 'learning_rate': 3.747620860296917e-06, 'epoch': 0.99} +{'loss': 1.5791, 'grad_norm': 3.505993127822876, 'learning_rate': 3.7457175485344504e-06, 'epoch': 0.99} +{'loss': 1.5936, 'grad_norm': 3.4161531925201416, 'learning_rate': 3.7438142367719837e-06, 'epoch': 0.99} +{'loss': 1.5552, 'grad_norm': 3.5883264541625977, 'learning_rate': 3.741910925009517e-06, 'epoch': 0.99} +{'loss': 1.5101, 'grad_norm': 3.2147250175476074, 'learning_rate': 3.7400076132470505e-06, 'epoch': 1.0} +{'loss': 1.5309, 'grad_norm': 3.712390661239624, 'learning_rate': 3.7381043014845834e-06, 'epoch': 1.0} +{'eval_loss': 1.9176290035247803, 'eval_runtime': 19.5662, 'eval_samples_per_second': 51.108, 'eval_steps_per_second': 2.147, 'epoch': 1.0} +{'loss': 1.561, 'grad_norm': 3.7538580894470215, 'learning_rate': 3.7362009897221168e-06, 'epoch': 1.0} +{'loss': 1.5373, 'grad_norm': 3.8403542041778564, 'learning_rate': 3.73429767795965e-06, 'epoch': 1.0} +{'loss': 1.5525, 'grad_norm': 3.389779567718506, 'learning_rate': 3.7323943661971835e-06, 'epoch': 1.0} +{'loss': 1.4708, 'grad_norm': 3.331282138824463, 'learning_rate': 3.730491054434717e-06, 'epoch': 1.0} +{'loss': 1.4368, 'grad_norm': 3.8428895473480225, 'learning_rate': 3.72858774267225e-06, 'epoch': 1.0} +{'loss': 1.506, 'grad_norm': 3.8069820404052734, 'learning_rate': 3.726684430909783e-06, 'epoch': 1.0} +{'loss': 1.5389, 'grad_norm': 3.3033525943756104, 'learning_rate': 3.7247811191473165e-06, 'epoch': 1.0} +{'loss': 1.5438, 'grad_norm': 3.8770864009857178, 'learning_rate': 3.72287780738485e-06, 'epoch': 1.0} +{'loss': 1.5265, 'grad_norm': 3.8660783767700195, 'learning_rate': 3.7209744956223832e-06, 'epoch': 1.01} +{'loss': 1.5338, 'grad_norm': 3.398698568344116, 'learning_rate': 3.719071183859916e-06, 'epoch': 1.01} +{'eval_loss': 1.9005818367004395, 'eval_runtime': 19.5051, 'eval_samples_per_second': 51.269, 'eval_steps_per_second': 2.153, 'epoch': 1.01} +{'loss': 1.4584, 'grad_norm': 3.5642073154449463, 'learning_rate': 3.7171678720974495e-06, 'epoch': 1.01} +{'loss': 1.5691, 'grad_norm': 3.704798698425293, 'learning_rate': 3.715264560334983e-06, 'epoch': 1.01} +{'loss': 1.5156, 'grad_norm': 3.442699670791626, 'learning_rate': 3.7133612485725163e-06, 'epoch': 1.01} +{'loss': 1.5326, 'grad_norm': 3.3977108001708984, 'learning_rate': 3.71145793681005e-06, 'epoch': 1.01} +{'loss': 1.4625, 'grad_norm': 3.5223193168640137, 'learning_rate': 3.7095546250475834e-06, 'epoch': 1.01} +{'loss': 1.5728, 'grad_norm': 3.890803575515747, 'learning_rate': 3.7076513132851168e-06, 'epoch': 1.01} +{'loss': 1.4909, 'grad_norm': 3.7792088985443115, 'learning_rate': 3.7057480015226497e-06, 'epoch': 1.01} +{'loss': 1.457, 'grad_norm': 3.811621904373169, 'learning_rate': 3.703844689760183e-06, 'epoch': 1.01} +{'loss': 1.547, 'grad_norm': 3.9113893508911133, 'learning_rate': 3.7019413779977164e-06, 'epoch': 1.02} +{'loss': 1.5185, 'grad_norm': 3.647035598754883, 'learning_rate': 3.70003806623525e-06, 'epoch': 1.02} +{'eval_loss': 1.9221603870391846, 'eval_runtime': 19.5336, 'eval_samples_per_second': 51.194, 'eval_steps_per_second': 2.15, 'epoch': 1.02} +{'loss': 1.5002, 'grad_norm': 3.6495511531829834, 'learning_rate': 3.698134754472783e-06, 'epoch': 1.02} +{'loss': 1.5689, 'grad_norm': 3.6710431575775146, 'learning_rate': 3.696231442710316e-06, 'epoch': 1.02} +{'loss': 1.5043, 'grad_norm': 3.7992377281188965, 'learning_rate': 3.6943281309478495e-06, 'epoch': 1.02} +{'loss': 1.5307, 'grad_norm': 3.817946672439575, 'learning_rate': 3.692424819185383e-06, 'epoch': 1.02} +{'loss': 1.5373, 'grad_norm': 4.153408527374268, 'learning_rate': 3.690521507422916e-06, 'epoch': 1.02} +{'loss': 1.474, 'grad_norm': 3.5114359855651855, 'learning_rate': 3.6886181956604496e-06, 'epoch': 1.02} +{'loss': 1.4832, 'grad_norm': 3.623908281326294, 'learning_rate': 3.686714883897983e-06, 'epoch': 1.02} +{'loss': 1.5491, 'grad_norm': 3.554457664489746, 'learning_rate': 3.684811572135516e-06, 'epoch': 1.03} +{'loss': 1.5465, 'grad_norm': 3.5737009048461914, 'learning_rate': 3.6829082603730492e-06, 'epoch': 1.03} +{'loss': 1.51, 'grad_norm': 3.597360849380493, 'learning_rate': 3.6810049486105826e-06, 'epoch': 1.03} +{'eval_loss': 1.9088037014007568, 'eval_runtime': 19.5161, 'eval_samples_per_second': 51.24, 'eval_steps_per_second': 2.152, 'epoch': 1.03} +{'loss': 1.5398, 'grad_norm': 3.6122491359710693, 'learning_rate': 3.679101636848116e-06, 'epoch': 1.03} +{'loss': 1.5364, 'grad_norm': 3.705017328262329, 'learning_rate': 3.6771983250856493e-06, 'epoch': 1.03} +{'loss': 1.5039, 'grad_norm': 3.631542205810547, 'learning_rate': 3.6752950133231822e-06, 'epoch': 1.03} +{'loss': 1.5378, 'grad_norm': 3.378002643585205, 'learning_rate': 3.6733917015607156e-06, 'epoch': 1.03} +{'loss': 1.5459, 'grad_norm': 3.60477352142334, 'learning_rate': 3.6714883897982494e-06, 'epoch': 1.03} +{'loss': 1.4835, 'grad_norm': 3.816406011581421, 'learning_rate': 3.6695850780357828e-06, 'epoch': 1.03} +{'loss': 1.4746, 'grad_norm': 3.5475008487701416, 'learning_rate': 3.667681766273316e-06, 'epoch': 1.03} +{'loss': 1.4988, 'grad_norm': 3.4700183868408203, 'learning_rate': 3.6657784545108495e-06, 'epoch': 1.04} +{'loss': 1.5801, 'grad_norm': 3.7459216117858887, 'learning_rate': 3.6638751427483824e-06, 'epoch': 1.04} +{'loss': 1.5326, 'grad_norm': 3.7247753143310547, 'learning_rate': 3.6619718309859158e-06, 'epoch': 1.04} +{'eval_loss': 1.8970634937286377, 'eval_runtime': 19.5286, 'eval_samples_per_second': 51.207, 'eval_steps_per_second': 2.151, 'epoch': 1.04} +{'loss': 1.5666, 'grad_norm': 3.8357791900634766, 'learning_rate': 3.660068519223449e-06, 'epoch': 1.04} +{'loss': 1.5313, 'grad_norm': 3.597970724105835, 'learning_rate': 3.6581652074609825e-06, 'epoch': 1.04} +{'loss': 1.5027, 'grad_norm': 3.4596781730651855, 'learning_rate': 3.656261895698516e-06, 'epoch': 1.04} +{'loss': 1.4877, 'grad_norm': 3.5452489852905273, 'learning_rate': 3.6543585839360492e-06, 'epoch': 1.04} +{'loss': 1.4961, 'grad_norm': 3.4546804428100586, 'learning_rate': 3.652455272173582e-06, 'epoch': 1.04} +{'loss': 1.494, 'grad_norm': 3.4546804428100586, 'learning_rate': 3.652455272173582e-06, 'epoch': 1.04} +{'loss': 1.511, 'grad_norm': 3.575840473175049, 'learning_rate': 3.6505519604111155e-06, 'epoch': 1.04} +{'loss': 1.552, 'grad_norm': 3.735212564468384, 'learning_rate': 3.648648648648649e-06, 'epoch': 1.05} +{'loss': 1.4967, 'grad_norm': 3.7571630477905273, 'learning_rate': 3.6467453368861823e-06, 'epoch': 1.05} +{'loss': 1.5229, 'grad_norm': 3.7550652027130127, 'learning_rate': 3.6448420251237156e-06, 'epoch': 1.05} +{'eval_loss': 1.910804271697998, 'eval_runtime': 19.5032, 'eval_samples_per_second': 51.274, 'eval_steps_per_second': 2.153, 'epoch': 1.05} +{'loss': 1.5558, 'grad_norm': 3.744152784347534, 'learning_rate': 3.6429387133612486e-06, 'epoch': 1.05} +{'loss': 1.4884, 'grad_norm': 3.62556529045105, 'learning_rate': 3.641035401598782e-06, 'epoch': 1.05} +{'loss': 1.5294, 'grad_norm': 3.7813358306884766, 'learning_rate': 3.6391320898363153e-06, 'epoch': 1.05} +{'loss': 1.5103, 'grad_norm': 3.5019783973693848, 'learning_rate': 3.6372287780738487e-06, 'epoch': 1.05} +{'loss': 1.5223, 'grad_norm': 3.561685085296631, 'learning_rate': 3.635325466311382e-06, 'epoch': 1.05} +{'loss': 1.5557, 'grad_norm': 3.8196139335632324, 'learning_rate': 3.633422154548916e-06, 'epoch': 1.05} +{'loss': 1.5174, 'grad_norm': 3.592857837677002, 'learning_rate': 3.6315188427864487e-06, 'epoch': 1.05} +{'loss': 1.4724, 'grad_norm': 3.643143892288208, 'learning_rate': 3.629615531023982e-06, 'epoch': 1.06} +{'loss': 1.534, 'grad_norm': 4.109361171722412, 'learning_rate': 3.6277122192615155e-06, 'epoch': 1.06} +{'loss': 1.4943, 'grad_norm': 3.65876841545105, 'learning_rate': 3.625808907499049e-06, 'epoch': 1.06} +{'eval_loss': 1.905645728111267, 'eval_runtime': 19.5197, 'eval_samples_per_second': 51.23, 'eval_steps_per_second': 2.152, 'epoch': 1.06} +{'loss': 1.4902, 'grad_norm': 3.6420438289642334, 'learning_rate': 3.623905595736582e-06, 'epoch': 1.06} +{'loss': 1.4746, 'grad_norm': 3.7955899238586426, 'learning_rate': 3.6220022839741156e-06, 'epoch': 1.06} +{'loss': 1.5026, 'grad_norm': 3.8086647987365723, 'learning_rate': 3.6200989722116485e-06, 'epoch': 1.06} +{'loss': 1.5095, 'grad_norm': 3.502751111984253, 'learning_rate': 3.618195660449182e-06, 'epoch': 1.06} +{'loss': 1.4582, 'grad_norm': 3.848308801651001, 'learning_rate': 3.6162923486867152e-06, 'epoch': 1.06} +{'loss': 1.4751, 'grad_norm': 3.4599502086639404, 'learning_rate': 3.6143890369242486e-06, 'epoch': 1.06} +{'loss': 1.5008, 'grad_norm': 3.4907546043395996, 'learning_rate': 3.612485725161782e-06, 'epoch': 1.07} +{'loss': 1.4887, 'grad_norm': 3.7708027362823486, 'learning_rate': 3.610582413399315e-06, 'epoch': 1.07} +{'loss': 1.5032, 'grad_norm': 3.5700554847717285, 'learning_rate': 3.6086791016368482e-06, 'epoch': 1.07} +{'loss': 1.5696, 'grad_norm': 3.7084248065948486, 'learning_rate': 3.6067757898743816e-06, 'epoch': 1.07} +{'eval_loss': 1.9035502672195435, 'eval_runtime': 19.5364, 'eval_samples_per_second': 51.186, 'eval_steps_per_second': 2.15, 'epoch': 1.07} +{'loss': 1.5653, 'grad_norm': 3.3420350551605225, 'learning_rate': 3.604872478111915e-06, 'epoch': 1.07} +{'loss': 1.5142, 'grad_norm': 3.41103458404541, 'learning_rate': 3.6029691663494483e-06, 'epoch': 1.07} +{'loss': 1.5473, 'grad_norm': 3.6000564098358154, 'learning_rate': 3.6010658545869813e-06, 'epoch': 1.07} +{'loss': 1.5323, 'grad_norm': 3.5071351528167725, 'learning_rate': 3.5991625428245146e-06, 'epoch': 1.07} +{'loss': 1.4716, 'grad_norm': 3.4709973335266113, 'learning_rate': 3.597259231062048e-06, 'epoch': 1.07} +{'loss': 1.5157, 'grad_norm': 3.7037558555603027, 'learning_rate': 3.5953559192995818e-06, 'epoch': 1.07} +{'loss': 1.5803, 'grad_norm': 3.5425498485565186, 'learning_rate': 3.593452607537115e-06, 'epoch': 1.08} +{'loss': 1.511, 'grad_norm': 3.4428577423095703, 'learning_rate': 3.5915492957746485e-06, 'epoch': 1.08} +{'loss': 1.5347, 'grad_norm': 3.8640990257263184, 'learning_rate': 3.589645984012182e-06, 'epoch': 1.08} +{'loss': 1.4936, 'grad_norm': 3.549441337585449, 'learning_rate': 3.587742672249715e-06, 'epoch': 1.08} +{'eval_loss': 1.9113357067108154, 'eval_runtime': 19.5509, 'eval_samples_per_second': 51.149, 'eval_steps_per_second': 2.148, 'epoch': 1.08} +{'loss': 1.4873, 'grad_norm': 3.4109599590301514, 'learning_rate': 3.585839360487248e-06, 'epoch': 1.08} +{'loss': 1.486, 'grad_norm': 3.3219544887542725, 'learning_rate': 3.5839360487247815e-06, 'epoch': 1.08} +{'loss': 1.5631, 'grad_norm': 3.705603837966919, 'learning_rate': 3.582032736962315e-06, 'epoch': 1.08} +{'loss': 1.4558, 'grad_norm': 3.2593443393707275, 'learning_rate': 3.5801294251998483e-06, 'epoch': 1.08} +{'loss': 1.5031, 'grad_norm': 3.6344857215881348, 'learning_rate': 3.578226113437381e-06, 'epoch': 1.08} +{'loss': 1.585, 'grad_norm': 3.5324461460113525, 'learning_rate': 3.5763228016749146e-06, 'epoch': 1.08} +{'loss': 1.5201, 'grad_norm': 3.434372663497925, 'learning_rate': 3.574419489912448e-06, 'epoch': 1.09} +{'loss': 1.4996, 'grad_norm': 3.5189168453216553, 'learning_rate': 3.5725161781499813e-06, 'epoch': 1.09} +{'loss': 1.5166, 'grad_norm': 3.2778172492980957, 'learning_rate': 3.5706128663875147e-06, 'epoch': 1.09} +{'loss': 1.5411, 'grad_norm': 3.533616304397583, 'learning_rate': 3.5687095546250476e-06, 'epoch': 1.09} +{'eval_loss': 1.9267067909240723, 'eval_runtime': 19.5431, 'eval_samples_per_second': 51.169, 'eval_steps_per_second': 2.149, 'epoch': 1.09} +{'loss': 1.5123, 'grad_norm': 3.4437334537506104, 'learning_rate': 3.566806242862581e-06, 'epoch': 1.09} +{'loss': 1.4866, 'grad_norm': 3.4394350051879883, 'learning_rate': 3.5649029311001143e-06, 'epoch': 1.09} +{'loss': 1.5033, 'grad_norm': 3.6861770153045654, 'learning_rate': 3.5629996193376477e-06, 'epoch': 1.09} +{'loss': 1.5308, 'grad_norm': 3.8002209663391113, 'learning_rate': 3.561096307575181e-06, 'epoch': 1.09} +{'loss': 1.5616, 'grad_norm': 3.7347514629364014, 'learning_rate': 3.559192995812714e-06, 'epoch': 1.09} +{'loss': 1.5421, 'grad_norm': 3.610152006149292, 'learning_rate': 3.557289684050248e-06, 'epoch': 1.1} +{'loss': 1.5492, 'grad_norm': 3.5219168663024902, 'learning_rate': 3.555386372287781e-06, 'epoch': 1.1} +{'loss': 1.5158, 'grad_norm': 3.5082759857177734, 'learning_rate': 3.5534830605253145e-06, 'epoch': 1.1} +{'loss': 1.5137, 'grad_norm': 3.588376522064209, 'learning_rate': 3.551579748762848e-06, 'epoch': 1.1} +{'loss': 1.4954, 'grad_norm': 3.7916743755340576, 'learning_rate': 3.5496764370003812e-06, 'epoch': 1.1} +{'eval_loss': 1.9160947799682617, 'eval_runtime': 19.576, 'eval_samples_per_second': 51.083, 'eval_steps_per_second': 2.145, 'epoch': 1.1} +{'loss': 1.4568, 'grad_norm': 3.471205949783325, 'learning_rate': 3.5477731252379146e-06, 'epoch': 1.1} +{'loss': 1.464, 'grad_norm': 3.5498523712158203, 'learning_rate': 3.5458698134754475e-06, 'epoch': 1.1} +{'loss': 1.4897, 'grad_norm': 3.4293758869171143, 'learning_rate': 3.543966501712981e-06, 'epoch': 1.1} +{'loss': 1.5842, 'grad_norm': 3.613922119140625, 'learning_rate': 3.5420631899505142e-06, 'epoch': 1.1} +{'loss': 1.4076, 'grad_norm': 3.4664764404296875, 'learning_rate': 3.5401598781880476e-06, 'epoch': 1.1} +{'loss': 1.5428, 'grad_norm': 3.558049201965332, 'learning_rate': 3.538256566425581e-06, 'epoch': 1.11} +{'loss': 1.4974, 'grad_norm': 3.3384342193603516, 'learning_rate': 3.536353254663114e-06, 'epoch': 1.11} +{'loss': 1.4763, 'grad_norm': 3.377072811126709, 'learning_rate': 3.5344499429006473e-06, 'epoch': 1.11} +{'loss': 1.5354, 'grad_norm': 3.7713782787323, 'learning_rate': 3.5325466311381806e-06, 'epoch': 1.11} +{'loss': 1.4463, 'grad_norm': 3.3044493198394775, 'learning_rate': 3.530643319375714e-06, 'epoch': 1.11} +{'eval_loss': 1.9128236770629883, 'eval_runtime': 19.5416, 'eval_samples_per_second': 51.173, 'eval_steps_per_second': 2.149, 'epoch': 1.11} +{'loss': 1.4877, 'grad_norm': 3.421248435974121, 'learning_rate': 3.5287400076132474e-06, 'epoch': 1.11} +{'loss': 1.5199, 'grad_norm': 3.7856926918029785, 'learning_rate': 3.5268366958507803e-06, 'epoch': 1.11} +{'loss': 1.5133, 'grad_norm': 3.4652209281921387, 'learning_rate': 3.5249333840883137e-06, 'epoch': 1.11} +{'loss': 1.4697, 'grad_norm': 3.6450560092926025, 'learning_rate': 3.523030072325847e-06, 'epoch': 1.11} +{'loss': 1.4821, 'grad_norm': 3.4510908126831055, 'learning_rate': 3.5211267605633804e-06, 'epoch': 1.11} +{'loss': 1.502, 'grad_norm': 3.7094366550445557, 'learning_rate': 3.519223448800914e-06, 'epoch': 1.12} +{'loss': 1.5033, 'grad_norm': 3.80910587310791, 'learning_rate': 3.5173201370384475e-06, 'epoch': 1.12} +{'loss': 1.5064, 'grad_norm': 3.387409210205078, 'learning_rate': 3.515416825275981e-06, 'epoch': 1.12} +{'loss': 1.5162, 'grad_norm': 3.728191375732422, 'learning_rate': 3.513513513513514e-06, 'epoch': 1.12} +{'loss': 1.4273, 'grad_norm': 3.734043836593628, 'learning_rate': 3.511610201751047e-06, 'epoch': 1.12} +{'eval_loss': 1.9193809032440186, 'eval_runtime': 19.572, 'eval_samples_per_second': 51.093, 'eval_steps_per_second': 2.146, 'epoch': 1.12} +{'loss': 1.4993, 'grad_norm': 3.7857792377471924, 'learning_rate': 3.5097068899885806e-06, 'epoch': 1.12} +{'loss': 1.5783, 'grad_norm': 4.398693561553955, 'learning_rate': 3.507803578226114e-06, 'epoch': 1.12} +{'loss': 1.4348, 'grad_norm': 3.3071231842041016, 'learning_rate': 3.5059002664636473e-06, 'epoch': 1.12} +{'loss': 1.4953, 'grad_norm': 3.5895280838012695, 'learning_rate': 3.5039969547011802e-06, 'epoch': 1.12} +{'loss': 1.5033, 'grad_norm': 3.6018621921539307, 'learning_rate': 3.5020936429387136e-06, 'epoch': 1.12} +{'loss': 1.5499, 'grad_norm': 3.589604139328003, 'learning_rate': 3.500190331176247e-06, 'epoch': 1.13} +{'loss': 1.483, 'grad_norm': 3.67411470413208, 'learning_rate': 3.4982870194137803e-06, 'epoch': 1.13} +{'loss': 1.4835, 'grad_norm': 3.643655776977539, 'learning_rate': 3.4963837076513137e-06, 'epoch': 1.13} +{'loss': 1.4288, 'grad_norm': 3.825059175491333, 'learning_rate': 3.4944803958888466e-06, 'epoch': 1.13} +{'loss': 1.5476, 'grad_norm': 4.00059175491333, 'learning_rate': 3.49257708412638e-06, 'epoch': 1.13} +{'eval_loss': 1.9116798639297485, 'eval_runtime': 19.5404, 'eval_samples_per_second': 51.176, 'eval_steps_per_second': 2.149, 'epoch': 1.13} +{'loss': 1.5614, 'grad_norm': 3.8289403915405273, 'learning_rate': 3.4906737723639133e-06, 'epoch': 1.13} +{'loss': 1.5111, 'grad_norm': 3.558891773223877, 'learning_rate': 3.4887704606014467e-06, 'epoch': 1.13} +{'loss': 1.5462, 'grad_norm': 3.495654344558716, 'learning_rate': 3.48686714883898e-06, 'epoch': 1.13} +{'loss': 1.4888, 'grad_norm': 3.9037580490112305, 'learning_rate': 3.484963837076513e-06, 'epoch': 1.13} +{'loss': 1.4383, 'grad_norm': 3.5056333541870117, 'learning_rate': 3.4830605253140464e-06, 'epoch': 1.14} +{'loss': 1.4842, 'grad_norm': 3.427194118499756, 'learning_rate': 3.4811572135515797e-06, 'epoch': 1.14} +{'loss': 1.4529, 'grad_norm': 3.503519058227539, 'learning_rate': 3.4792539017891135e-06, 'epoch': 1.14} +{'loss': 1.5296, 'grad_norm': 3.833681344985962, 'learning_rate': 3.477350590026647e-06, 'epoch': 1.14} +{'loss': 1.5357, 'grad_norm': 3.71791672706604, 'learning_rate': 3.4754472782641802e-06, 'epoch': 1.14} +{'loss': 1.4928, 'grad_norm': 3.5328214168548584, 'learning_rate': 3.4735439665017136e-06, 'epoch': 1.14} +{'eval_loss': 1.894834041595459, 'eval_runtime': 19.5154, 'eval_samples_per_second': 51.242, 'eval_steps_per_second': 2.152, 'epoch': 1.14} +{'loss': 1.497, 'grad_norm': 3.429631233215332, 'learning_rate': 3.4716406547392465e-06, 'epoch': 1.14} +{'loss': 1.5117, 'grad_norm': 3.477991819381714, 'learning_rate': 3.46973734297678e-06, 'epoch': 1.14} +{'loss': 1.5522, 'grad_norm': 3.6750600337982178, 'learning_rate': 3.4678340312143133e-06, 'epoch': 1.14} +{'loss': 1.5289, 'grad_norm': 3.457629680633545, 'learning_rate': 3.4659307194518466e-06, 'epoch': 1.14} +{'loss': 1.4672, 'grad_norm': 3.5894389152526855, 'learning_rate': 3.46402740768938e-06, 'epoch': 1.15} +{'loss': 1.4619, 'grad_norm': 3.640599250793457, 'learning_rate': 3.462124095926913e-06, 'epoch': 1.15} +{'loss': 1.5113, 'grad_norm': 3.5256636142730713, 'learning_rate': 3.4602207841644463e-06, 'epoch': 1.15} +{'loss': 1.559, 'grad_norm': 3.575544834136963, 'learning_rate': 3.4583174724019797e-06, 'epoch': 1.15} +{'loss': 1.5267, 'grad_norm': 3.4617910385131836, 'learning_rate': 3.456414160639513e-06, 'epoch': 1.15} +{'loss': 1.5111, 'grad_norm': 3.5557379722595215, 'learning_rate': 3.4545108488770464e-06, 'epoch': 1.15} +{'eval_loss': 1.8748478889465332, 'eval_runtime': 19.526, 'eval_samples_per_second': 51.214, 'eval_steps_per_second': 2.151, 'epoch': 1.15} +{'loss': 1.5173, 'grad_norm': 3.471031427383423, 'learning_rate': 3.4526075371145793e-06, 'epoch': 1.15} +{'loss': 1.5713, 'grad_norm': 3.6226091384887695, 'learning_rate': 3.4507042253521127e-06, 'epoch': 1.15} +{'loss': 1.4887, 'grad_norm': 3.6367220878601074, 'learning_rate': 3.448800913589646e-06, 'epoch': 1.15} +{'loss': 1.5827, 'grad_norm': 3.6253139972686768, 'learning_rate': 3.4468976018271794e-06, 'epoch': 1.15} +{'loss': 1.54, 'grad_norm': 3.445167064666748, 'learning_rate': 3.4449942900647128e-06, 'epoch': 1.16} +{'loss': 1.5287, 'grad_norm': 3.5699548721313477, 'learning_rate': 3.443090978302246e-06, 'epoch': 1.16} +{'loss': 1.5178, 'grad_norm': 3.45539927482605, 'learning_rate': 3.44118766653978e-06, 'epoch': 1.16} +{'loss': 1.5185, 'grad_norm': 3.541158437728882, 'learning_rate': 3.439284354777313e-06, 'epoch': 1.16} +{'loss': 1.4626, 'grad_norm': 3.557049512863159, 'learning_rate': 3.4373810430148462e-06, 'epoch': 1.16} +{'loss': 1.4897, 'grad_norm': 3.5796844959259033, 'learning_rate': 3.4354777312523796e-06, 'epoch': 1.16} +{'eval_loss': 1.8686708211898804, 'eval_runtime': 19.5135, 'eval_samples_per_second': 51.247, 'eval_steps_per_second': 2.152, 'epoch': 1.16} +{'loss': 1.5254, 'grad_norm': 3.4616944789886475, 'learning_rate': 3.433574419489913e-06, 'epoch': 1.16} +{'loss': 1.4854, 'grad_norm': 3.463737726211548, 'learning_rate': 3.4316711077274463e-06, 'epoch': 1.16} +{'loss': 1.5486, 'grad_norm': 3.595198154449463, 'learning_rate': 3.4297677959649792e-06, 'epoch': 1.16} +{'loss': 1.4708, 'grad_norm': 3.390246629714966, 'learning_rate': 3.4278644842025126e-06, 'epoch': 1.16} +{'loss': 1.5923, 'grad_norm': 3.712277412414551, 'learning_rate': 3.425961172440046e-06, 'epoch': 1.17} +{'loss': 1.4997, 'grad_norm': 3.5129363536834717, 'learning_rate': 3.4240578606775793e-06, 'epoch': 1.17} +{'loss': 1.4889, 'grad_norm': 3.5264368057250977, 'learning_rate': 3.4221545489151127e-06, 'epoch': 1.17} +{'loss': 1.5915, 'grad_norm': 3.639275074005127, 'learning_rate': 3.4202512371526456e-06, 'epoch': 1.17} +{'loss': 1.5416, 'grad_norm': 3.6508326530456543, 'learning_rate': 3.418347925390179e-06, 'epoch': 1.17} +{'loss': 1.5049, 'grad_norm': 3.3845608234405518, 'learning_rate': 3.4164446136277124e-06, 'epoch': 1.17} +{'eval_loss': 1.8865256309509277, 'eval_runtime': 19.5252, 'eval_samples_per_second': 51.216, 'eval_steps_per_second': 2.151, 'epoch': 1.17} +{'loss': 1.5307, 'grad_norm': 3.3509881496429443, 'learning_rate': 3.4145413018652457e-06, 'epoch': 1.17} +{'loss': 1.5536, 'grad_norm': 3.651043653488159, 'learning_rate': 3.412637990102779e-06, 'epoch': 1.17} +{'loss': 1.4629, 'grad_norm': 3.6111178398132324, 'learning_rate': 3.4107346783403125e-06, 'epoch': 1.17} +{'loss': 1.4945, 'grad_norm': 3.45613956451416, 'learning_rate': 3.4088313665778454e-06, 'epoch': 1.18} +{'loss': 1.4863, 'grad_norm': 3.612942695617676, 'learning_rate': 3.4069280548153788e-06, 'epoch': 1.18} +{'loss': 1.4959, 'grad_norm': 3.5480880737304688, 'learning_rate': 3.405024743052912e-06, 'epoch': 1.18} +{'loss': 1.4961, 'grad_norm': 4.082576751708984, 'learning_rate': 3.403121431290446e-06, 'epoch': 1.18} +{'loss': 1.4831, 'grad_norm': 3.664128065109253, 'learning_rate': 3.4012181195279793e-06, 'epoch': 1.18} +{'loss': 1.4987, 'grad_norm': 3.650513172149658, 'learning_rate': 3.3993148077655126e-06, 'epoch': 1.18} +{'loss': 1.4417, 'grad_norm': 3.5394937992095947, 'learning_rate': 3.3974114960030456e-06, 'epoch': 1.18} +{'eval_loss': 1.90912926197052, 'eval_runtime': 19.5555, 'eval_samples_per_second': 51.137, 'eval_steps_per_second': 2.148, 'epoch': 1.18} +{'loss': 1.4761, 'grad_norm': 3.558004140853882, 'learning_rate': 3.395508184240579e-06, 'epoch': 1.18} +{'loss': 1.5715, 'grad_norm': 3.714168071746826, 'learning_rate': 3.3936048724781123e-06, 'epoch': 1.18} +{'loss': 1.44, 'grad_norm': 3.532557725906372, 'learning_rate': 3.3917015607156457e-06, 'epoch': 1.18} +{'loss': 1.4659, 'grad_norm': 3.475353717803955, 'learning_rate': 3.389798248953179e-06, 'epoch': 1.19} +{'loss': 1.5064, 'grad_norm': 3.4950778484344482, 'learning_rate': 3.387894937190712e-06, 'epoch': 1.19} +{'loss': 1.5113, 'grad_norm': 3.853489398956299, 'learning_rate': 3.3859916254282453e-06, 'epoch': 1.19} +{'loss': 1.5656, 'grad_norm': 3.6265385150909424, 'learning_rate': 3.3840883136657787e-06, 'epoch': 1.19} +{'loss': 1.4788, 'grad_norm': 3.5415945053100586, 'learning_rate': 3.382185001903312e-06, 'epoch': 1.19} +{'loss': 1.512, 'grad_norm': 3.520357370376587, 'learning_rate': 3.3802816901408454e-06, 'epoch': 1.19} +{'loss': 1.4973, 'grad_norm': 3.431596279144287, 'learning_rate': 3.3783783783783788e-06, 'epoch': 1.19} +{'eval_loss': 1.8912651538848877, 'eval_runtime': 19.5389, 'eval_samples_per_second': 51.18, 'eval_steps_per_second': 2.15, 'epoch': 1.19} +{'loss': 1.4915, 'grad_norm': 3.518064260482788, 'learning_rate': 3.3764750666159117e-06, 'epoch': 1.19} +{'loss': 1.4852, 'grad_norm': 3.373807907104492, 'learning_rate': 3.374571754853445e-06, 'epoch': 1.19} +{'loss': 1.4425, 'grad_norm': 3.4895248413085938, 'learning_rate': 3.3726684430909784e-06, 'epoch': 1.19} +{'loss': 1.4732, 'grad_norm': 3.5565741062164307, 'learning_rate': 3.370765131328512e-06, 'epoch': 1.2} +{'loss': 1.4903, 'grad_norm': 3.6646194458007812, 'learning_rate': 3.368861819566045e-06, 'epoch': 1.2} +{'loss': 1.504, 'grad_norm': 3.715594530105591, 'learning_rate': 3.366958507803578e-06, 'epoch': 1.2} +{'loss': 1.4548, 'grad_norm': 3.6260979175567627, 'learning_rate': 3.365055196041112e-06, 'epoch': 1.2} +{'loss': 1.5423, 'grad_norm': 3.589029550552368, 'learning_rate': 3.3631518842786452e-06, 'epoch': 1.2} +{'loss': 1.4654, 'grad_norm': 3.494671106338501, 'learning_rate': 3.3612485725161786e-06, 'epoch': 1.2} +{'loss': 1.5549, 'grad_norm': 3.431565284729004, 'learning_rate': 3.359345260753712e-06, 'epoch': 1.2} +{'eval_loss': 1.8966683149337769, 'eval_runtime': 19.5219, 'eval_samples_per_second': 51.225, 'eval_steps_per_second': 2.151, 'epoch': 1.2} +{'loss': 1.5311, 'grad_norm': 3.6503396034240723, 'learning_rate': 3.3574419489912453e-06, 'epoch': 1.2} +{'loss': 1.5625, 'grad_norm': 3.736973285675049, 'learning_rate': 3.3555386372287783e-06, 'epoch': 1.2} +{'loss': 1.4632, 'grad_norm': 3.428126573562622, 'learning_rate': 3.3536353254663116e-06, 'epoch': 1.2} +{'loss': 1.5143, 'grad_norm': 3.6814732551574707, 'learning_rate': 3.351732013703845e-06, 'epoch': 1.21} +{'loss': 1.5206, 'grad_norm': 3.634695291519165, 'learning_rate': 3.3498287019413784e-06, 'epoch': 1.21} +{'loss': 1.5394, 'grad_norm': 3.8396401405334473, 'learning_rate': 3.3479253901789117e-06, 'epoch': 1.21} +{'loss': 1.4315, 'grad_norm': 3.694897174835205, 'learning_rate': 3.346022078416445e-06, 'epoch': 1.21} +{'loss': 1.5167, 'grad_norm': 3.721278429031372, 'learning_rate': 3.344118766653978e-06, 'epoch': 1.21} +{'loss': 1.4876, 'grad_norm': 3.5180704593658447, 'learning_rate': 3.3422154548915114e-06, 'epoch': 1.21} +{'loss': 1.5183, 'grad_norm': 3.66559100151062, 'learning_rate': 3.3403121431290448e-06, 'epoch': 1.21} +{'eval_loss': 1.8969014883041382, 'eval_runtime': 19.5483, 'eval_samples_per_second': 51.155, 'eval_steps_per_second': 2.149, 'epoch': 1.21} +{'loss': 1.4398, 'grad_norm': 3.4632375240325928, 'learning_rate': 3.338408831366578e-06, 'epoch': 1.21} +{'loss': 1.5276, 'grad_norm': 3.792841672897339, 'learning_rate': 3.3365055196041115e-06, 'epoch': 1.21} +{'loss': 1.5679, 'grad_norm': 4.066747188568115, 'learning_rate': 3.3346022078416444e-06, 'epoch': 1.22} +{'loss': 1.4871, 'grad_norm': 3.495617389678955, 'learning_rate': 3.3326988960791778e-06, 'epoch': 1.22} +{'loss': 1.4156, 'grad_norm': 3.707266092300415, 'learning_rate': 3.330795584316711e-06, 'epoch': 1.22} +{'loss': 1.4805, 'grad_norm': 4.0972580909729, 'learning_rate': 3.3288922725542445e-06, 'epoch': 1.22} +{'loss': 1.4972, 'grad_norm': 3.551191806793213, 'learning_rate': 3.326988960791778e-06, 'epoch': 1.22} +{'loss': 1.5558, 'grad_norm': 3.532569169998169, 'learning_rate': 3.3250856490293117e-06, 'epoch': 1.22} +{'loss': 1.5365, 'grad_norm': 3.6782970428466797, 'learning_rate': 3.323182337266845e-06, 'epoch': 1.22} +{'loss': 1.5296, 'grad_norm': 3.499028205871582, 'learning_rate': 3.321279025504378e-06, 'epoch': 1.22} +{'eval_loss': 1.8920588493347168, 'eval_runtime': 19.541, 'eval_samples_per_second': 51.174, 'eval_steps_per_second': 2.149, 'epoch': 1.22} +{'loss': 1.5189, 'grad_norm': 3.6891355514526367, 'learning_rate': 3.3193757137419113e-06, 'epoch': 1.22} +{'loss': 1.556, 'grad_norm': 3.8072047233581543, 'learning_rate': 3.3174724019794447e-06, 'epoch': 1.22} +{'loss': 1.4697, 'grad_norm': 3.673137664794922, 'learning_rate': 3.315569090216978e-06, 'epoch': 1.23} +{'loss': 1.5571, 'grad_norm': 3.989132881164551, 'learning_rate': 3.3136657784545114e-06, 'epoch': 1.23} +{'loss': 1.4777, 'grad_norm': 3.6898393630981445, 'learning_rate': 3.3117624666920443e-06, 'epoch': 1.23} +{'loss': 1.454, 'grad_norm': 3.5465087890625, 'learning_rate': 3.3098591549295777e-06, 'epoch': 1.23} +{'loss': 1.5283, 'grad_norm': 3.5209648609161377, 'learning_rate': 3.307955843167111e-06, 'epoch': 1.23} +{'loss': 1.5052, 'grad_norm': 3.5159339904785156, 'learning_rate': 3.3060525314046444e-06, 'epoch': 1.23} +{'loss': 1.4961, 'grad_norm': 3.298933982849121, 'learning_rate': 3.304149219642178e-06, 'epoch': 1.23} +{'loss': 1.5049, 'grad_norm': 3.457930564880371, 'learning_rate': 3.3022459078797107e-06, 'epoch': 1.23} +{'eval_loss': 1.900202989578247, 'eval_runtime': 19.5985, 'eval_samples_per_second': 51.024, 'eval_steps_per_second': 2.143, 'epoch': 1.23} +{'loss': 1.5028, 'grad_norm': 3.5035901069641113, 'learning_rate': 3.300342596117244e-06, 'epoch': 1.23} +{'loss': 1.4592, 'grad_norm': 3.654954195022583, 'learning_rate': 3.2984392843547775e-06, 'epoch': 1.23} +{'loss': 1.4452, 'grad_norm': 3.667118549346924, 'learning_rate': 3.296535972592311e-06, 'epoch': 1.24} +{'loss': 1.4772, 'grad_norm': 3.818599224090576, 'learning_rate': 3.294632660829844e-06, 'epoch': 1.24} +{'loss': 1.4735, 'grad_norm': 3.4481749534606934, 'learning_rate': 3.292729349067377e-06, 'epoch': 1.24} +{'loss': 1.4702, 'grad_norm': 3.5016086101531982, 'learning_rate': 3.2908260373049105e-06, 'epoch': 1.24} +{'loss': 1.5223, 'grad_norm': 3.7107155323028564, 'learning_rate': 3.288922725542444e-06, 'epoch': 1.24} +{'loss': 1.4781, 'grad_norm': 3.2854039669036865, 'learning_rate': 3.2870194137799776e-06, 'epoch': 1.24} +{'loss': 1.4525, 'grad_norm': 3.360025405883789, 'learning_rate': 3.285116102017511e-06, 'epoch': 1.24} +{'loss': 1.5328, 'grad_norm': 3.775726795196533, 'learning_rate': 3.2832127902550444e-06, 'epoch': 1.24} +{'eval_loss': 1.9010729789733887, 'eval_runtime': 19.5926, 'eval_samples_per_second': 51.04, 'eval_steps_per_second': 2.144, 'epoch': 1.24} +{'loss': 1.5072, 'grad_norm': 3.777946710586548, 'learning_rate': 3.2813094784925777e-06, 'epoch': 1.24} +{'loss': 1.4428, 'grad_norm': 3.410236358642578, 'learning_rate': 3.2794061667301107e-06, 'epoch': 1.24} +{'loss': 1.4829, 'grad_norm': 3.4890036582946777, 'learning_rate': 3.277502854967644e-06, 'epoch': 1.25} +{'loss': 1.469, 'grad_norm': 3.312319278717041, 'learning_rate': 3.2755995432051774e-06, 'epoch': 1.25} +{'loss': 1.4744, 'grad_norm': 3.426799774169922, 'learning_rate': 3.2736962314427107e-06, 'epoch': 1.25} +{'loss': 1.492, 'grad_norm': 3.4342689514160156, 'learning_rate': 3.271792919680244e-06, 'epoch': 1.25} +{'loss': 1.5477, 'grad_norm': 3.503075361251831, 'learning_rate': 3.269889607917777e-06, 'epoch': 1.25} +{'loss': 1.4658, 'grad_norm': 3.480652093887329, 'learning_rate': 3.2679862961553104e-06, 'epoch': 1.25} +{'loss': 1.4993, 'grad_norm': 3.571729898452759, 'learning_rate': 3.2660829843928438e-06, 'epoch': 1.25} +{'loss': 1.537, 'grad_norm': 3.632207155227661, 'learning_rate': 3.264179672630377e-06, 'epoch': 1.25} +{'eval_loss': 1.92387056350708, 'eval_runtime': 19.6058, 'eval_samples_per_second': 51.005, 'eval_steps_per_second': 2.142, 'epoch': 1.25} +{'loss': 1.5103, 'grad_norm': 3.7279932498931885, 'learning_rate': 3.2622763608679105e-06, 'epoch': 1.25} +{'loss': 1.4626, 'grad_norm': 3.501619338989258, 'learning_rate': 3.2603730491054434e-06, 'epoch': 1.26} +{'loss': 1.482, 'grad_norm': 3.638873815536499, 'learning_rate': 3.258469737342977e-06, 'epoch': 1.26} +{'loss': 1.4462, 'grad_norm': 3.5394251346588135, 'learning_rate': 3.25656642558051e-06, 'epoch': 1.26} +{'loss': 1.4748, 'grad_norm': 3.5057172775268555, 'learning_rate': 3.2546631138180435e-06, 'epoch': 1.26} +{'loss': 1.4695, 'grad_norm': 3.4633405208587646, 'learning_rate': 3.252759802055577e-06, 'epoch': 1.26} +{'loss': 1.4783, 'grad_norm': 3.4021005630493164, 'learning_rate': 3.25085649029311e-06, 'epoch': 1.26} +{'loss': 1.5149, 'grad_norm': 3.6344058513641357, 'learning_rate': 3.248953178530644e-06, 'epoch': 1.26} +{'loss': 1.4664, 'grad_norm': 3.3715789318084717, 'learning_rate': 3.247049866768177e-06, 'epoch': 1.26} +{'loss': 1.4255, 'grad_norm': 3.5089826583862305, 'learning_rate': 3.2451465550057103e-06, 'epoch': 1.26} +{'eval_loss': 1.9270000457763672, 'eval_runtime': 19.6099, 'eval_samples_per_second': 50.995, 'eval_steps_per_second': 2.142, 'epoch': 1.26} +{'loss': 1.4781, 'grad_norm': 3.4973366260528564, 'learning_rate': 3.2432432432432437e-06, 'epoch': 1.26} +{'loss': 1.5188, 'grad_norm': 3.551081418991089, 'learning_rate': 3.241339931480777e-06, 'epoch': 1.27} +{'loss': 1.507, 'grad_norm': 3.4629201889038086, 'learning_rate': 3.2394366197183104e-06, 'epoch': 1.27} +{'loss': 1.4766, 'grad_norm': 3.710367441177368, 'learning_rate': 3.2375333079558434e-06, 'epoch': 1.27} +{'loss': 1.5046, 'grad_norm': 3.446091413497925, 'learning_rate': 3.2356299961933767e-06, 'epoch': 1.27} +{'loss': 1.5113, 'grad_norm': 3.5539767742156982, 'learning_rate': 3.23372668443091e-06, 'epoch': 1.27} +{'loss': 1.5246, 'grad_norm': 3.542217254638672, 'learning_rate': 3.2318233726684435e-06, 'epoch': 1.27} +{'loss': 1.4676, 'grad_norm': 3.477060556411743, 'learning_rate': 3.229920060905977e-06, 'epoch': 1.27} +{'loss': 1.4665, 'grad_norm': 3.6180789470672607, 'learning_rate': 3.2280167491435098e-06, 'epoch': 1.27} +{'loss': 1.4734, 'grad_norm': 3.4398298263549805, 'learning_rate': 3.226113437381043e-06, 'epoch': 1.27} +{'eval_loss': 1.9118551015853882, 'eval_runtime': 19.529, 'eval_samples_per_second': 51.206, 'eval_steps_per_second': 2.151, 'epoch': 1.27} +{'loss': 1.4862, 'grad_norm': 3.4753944873809814, 'learning_rate': 3.2242101256185765e-06, 'epoch': 1.27} +{'loss': 1.551, 'grad_norm': 3.2855687141418457, 'learning_rate': 3.22230681385611e-06, 'epoch': 1.28} +{'loss': 1.4951, 'grad_norm': 3.428571939468384, 'learning_rate': 3.220403502093643e-06, 'epoch': 1.28} +{'loss': 1.5297, 'grad_norm': 3.286008834838867, 'learning_rate': 3.218500190331176e-06, 'epoch': 1.28} +{'loss': 1.4824, 'grad_norm': 3.3884329795837402, 'learning_rate': 3.2165968785687095e-06, 'epoch': 1.28} +{'loss': 1.5063, 'grad_norm': 3.5676679611206055, 'learning_rate': 3.214693566806243e-06, 'epoch': 1.28} +{'loss': 1.488, 'grad_norm': 3.5483181476593018, 'learning_rate': 3.2127902550437762e-06, 'epoch': 1.28} +{'loss': 1.4893, 'grad_norm': 3.539029359817505, 'learning_rate': 3.21088694328131e-06, 'epoch': 1.28} +{'loss': 1.4743, 'grad_norm': 3.4424350261688232, 'learning_rate': 3.2089836315188434e-06, 'epoch': 1.28} +{'loss': 1.4773, 'grad_norm': 3.3906919956207275, 'learning_rate': 3.2070803197563767e-06, 'epoch': 1.28} +{'eval_loss': 1.8996930122375488, 'eval_runtime': 19.6538, 'eval_samples_per_second': 50.881, 'eval_steps_per_second': 2.137, 'epoch': 1.28} +{'loss': 1.4974, 'grad_norm': 3.39992356300354, 'learning_rate': 3.2051770079939097e-06, 'epoch': 1.29} +{'loss': 1.531, 'grad_norm': 3.52290940284729, 'learning_rate': 3.203273696231443e-06, 'epoch': 1.29} +{'loss': 1.4671, 'grad_norm': 3.5336732864379883, 'learning_rate': 3.2013703844689764e-06, 'epoch': 1.29} +{'loss': 1.5118, 'grad_norm': 3.5422017574310303, 'learning_rate': 3.1994670727065098e-06, 'epoch': 1.29} +{'loss': 1.5312, 'grad_norm': 3.7440381050109863, 'learning_rate': 3.197563760944043e-06, 'epoch': 1.29} +{'loss': 1.5244, 'grad_norm': 3.5770304203033447, 'learning_rate': 3.195660449181576e-06, 'epoch': 1.29} +{'loss': 1.4621, 'grad_norm': 3.3159420490264893, 'learning_rate': 3.1937571374191094e-06, 'epoch': 1.29} +{'loss': 1.4937, 'grad_norm': 3.456700563430786, 'learning_rate': 3.191853825656643e-06, 'epoch': 1.29} +{'loss': 1.4812, 'grad_norm': 3.505206346511841, 'learning_rate': 3.189950513894176e-06, 'epoch': 1.29} +{'loss': 1.4849, 'grad_norm': 3.454866647720337, 'learning_rate': 3.1880472021317095e-06, 'epoch': 1.29} +{'eval_loss': 1.8908112049102783, 'eval_runtime': 19.5582, 'eval_samples_per_second': 51.129, 'eval_steps_per_second': 2.147, 'epoch': 1.29} +{'loss': 1.4345, 'grad_norm': 3.4157841205596924, 'learning_rate': 3.1861438903692425e-06, 'epoch': 1.3} +{'loss': 1.4726, 'grad_norm': 3.5255420207977295, 'learning_rate': 3.184240578606776e-06, 'epoch': 1.3} +{'loss': 1.4947, 'grad_norm': 3.521389961242676, 'learning_rate': 3.182337266844309e-06, 'epoch': 1.3} +{'loss': 1.5261, 'grad_norm': 3.456571102142334, 'learning_rate': 3.1804339550818426e-06, 'epoch': 1.3} +{'loss': 1.492, 'grad_norm': 3.476719856262207, 'learning_rate': 3.178530643319376e-06, 'epoch': 1.3} +{'loss': 1.4504, 'grad_norm': 3.5944302082061768, 'learning_rate': 3.176627331556909e-06, 'epoch': 1.3} +{'loss': 1.4866, 'grad_norm': 3.584378480911255, 'learning_rate': 3.1747240197944422e-06, 'epoch': 1.3} +{'loss': 1.4557, 'grad_norm': 3.2301833629608154, 'learning_rate': 3.172820708031976e-06, 'epoch': 1.3} +{'loss': 1.5294, 'grad_norm': 3.6874125003814697, 'learning_rate': 3.1709173962695094e-06, 'epoch': 1.3} +{'loss': 1.5144, 'grad_norm': 3.5736050605773926, 'learning_rate': 3.1690140845070427e-06, 'epoch': 1.3} +{'eval_loss': 1.9086660146713257, 'eval_runtime': 19.5453, 'eval_samples_per_second': 51.163, 'eval_steps_per_second': 2.149, 'epoch': 1.3} +{'loss': 1.4905, 'grad_norm': 3.6582255363464355, 'learning_rate': 3.167110772744576e-06, 'epoch': 1.31} +{'loss': 1.4842, 'grad_norm': 3.7300994396209717, 'learning_rate': 3.1652074609821095e-06, 'epoch': 1.31} +{'loss': 1.4734, 'grad_norm': 3.418839931488037, 'learning_rate': 3.1633041492196424e-06, 'epoch': 1.31} +{'loss': 1.5004, 'grad_norm': 3.7277026176452637, 'learning_rate': 3.1614008374571758e-06, 'epoch': 1.31} +{'loss': 1.4966, 'grad_norm': 3.6491987705230713, 'learning_rate': 3.159497525694709e-06, 'epoch': 1.31} +{'loss': 1.4733, 'grad_norm': 3.313399314880371, 'learning_rate': 3.1575942139322425e-06, 'epoch': 1.31} +{'loss': 1.5065, 'grad_norm': 3.718188524246216, 'learning_rate': 3.155690902169776e-06, 'epoch': 1.31} +{'loss': 1.4612, 'grad_norm': 3.7664339542388916, 'learning_rate': 3.1537875904073088e-06, 'epoch': 1.31} +{'loss': 1.4921, 'grad_norm': 3.6484899520874023, 'learning_rate': 3.151884278644842e-06, 'epoch': 1.31} +{'loss': 1.4661, 'grad_norm': 3.4585325717926025, 'learning_rate': 3.1499809668823755e-06, 'epoch': 1.31} +{'eval_loss': 1.894254207611084, 'eval_runtime': 19.5896, 'eval_samples_per_second': 51.047, 'eval_steps_per_second': 2.144, 'epoch': 1.31} +{'loss': 1.5132, 'grad_norm': 3.59190034866333, 'learning_rate': 3.148077655119909e-06, 'epoch': 1.32} +{'loss': 1.4531, 'grad_norm': 3.5601041316986084, 'learning_rate': 3.1461743433574422e-06, 'epoch': 1.32} +{'loss': 1.5189, 'grad_norm': 3.4624147415161133, 'learning_rate': 3.144271031594975e-06, 'epoch': 1.32} +{'loss': 1.5094, 'grad_norm': 3.364370107650757, 'learning_rate': 3.1423677198325085e-06, 'epoch': 1.32} +{'loss': 1.47, 'grad_norm': 3.6840009689331055, 'learning_rate': 3.140464408070042e-06, 'epoch': 1.32} +{'loss': 1.4584, 'grad_norm': 3.6294214725494385, 'learning_rate': 3.1385610963075753e-06, 'epoch': 1.32} +{'loss': 1.4491, 'grad_norm': 3.6182212829589844, 'learning_rate': 3.1366577845451086e-06, 'epoch': 1.32} +{'loss': 1.4398, 'grad_norm': 3.3501813411712646, 'learning_rate': 3.134754472782642e-06, 'epoch': 1.32} +{'loss': 1.4271, 'grad_norm': 3.437358856201172, 'learning_rate': 3.1328511610201758e-06, 'epoch': 1.32} +{'loss': 1.4226, 'grad_norm': 3.3304011821746826, 'learning_rate': 3.1309478492577087e-06, 'epoch': 1.33} +{'eval_loss': 1.8980953693389893, 'eval_runtime': 19.5846, 'eval_samples_per_second': 51.061, 'eval_steps_per_second': 2.145, 'epoch': 1.33} +{'loss': 1.4797, 'grad_norm': 3.4304802417755127, 'learning_rate': 3.129044537495242e-06, 'epoch': 1.33} +{'loss': 1.5113, 'grad_norm': 3.7087957859039307, 'learning_rate': 3.1271412257327754e-06, 'epoch': 1.33} +{'loss': 1.458, 'grad_norm': 3.488309144973755, 'learning_rate': 3.125237913970309e-06, 'epoch': 1.33} +{'loss': 1.4324, 'grad_norm': 3.620741367340088, 'learning_rate': 3.123334602207842e-06, 'epoch': 1.33} +{'loss': 1.5188, 'grad_norm': 3.520190477371216, 'learning_rate': 3.121431290445375e-06, 'epoch': 1.33} +{'loss': 1.5811, 'grad_norm': 3.71356201171875, 'learning_rate': 3.1195279786829085e-06, 'epoch': 1.33} +{'loss': 1.4678, 'grad_norm': 3.4246623516082764, 'learning_rate': 3.117624666920442e-06, 'epoch': 1.33} +{'loss': 1.4577, 'grad_norm': 3.394747734069824, 'learning_rate': 3.115721355157975e-06, 'epoch': 1.33} +{'loss': 1.4136, 'grad_norm': 3.323732376098633, 'learning_rate': 3.1138180433955085e-06, 'epoch': 1.33} +{'loss': 1.4995, 'grad_norm': 3.549511194229126, 'learning_rate': 3.1119147316330415e-06, 'epoch': 1.34} +{'eval_loss': 1.8902671337127686, 'eval_runtime': 19.5621, 'eval_samples_per_second': 51.119, 'eval_steps_per_second': 2.147, 'epoch': 1.34} +{'loss': 1.5349, 'grad_norm': 3.464460849761963, 'learning_rate': 3.110011419870575e-06, 'epoch': 1.34} +{'loss': 1.4645, 'grad_norm': 3.3578104972839355, 'learning_rate': 3.1081081081081082e-06, 'epoch': 1.34} +{'loss': 1.5093, 'grad_norm': 3.399073839187622, 'learning_rate': 3.1062047963456416e-06, 'epoch': 1.34} +{'loss': 1.5469, 'grad_norm': 3.550574779510498, 'learning_rate': 3.104301484583175e-06, 'epoch': 1.34} +{'loss': 1.5296, 'grad_norm': 3.487017869949341, 'learning_rate': 3.1023981728207083e-06, 'epoch': 1.34} +{'loss': 1.4765, 'grad_norm': 3.648475170135498, 'learning_rate': 3.1004948610582412e-06, 'epoch': 1.34} +{'loss': 1.4692, 'grad_norm': 3.416947603225708, 'learning_rate': 3.0985915492957746e-06, 'epoch': 1.34} +{'loss': 1.4377, 'grad_norm': 3.5845181941986084, 'learning_rate': 3.096688237533308e-06, 'epoch': 1.34} +{'loss': 1.5292, 'grad_norm': 3.4890127182006836, 'learning_rate': 3.0947849257708418e-06, 'epoch': 1.34} +{'loss': 1.4837, 'grad_norm': 3.5025761127471924, 'learning_rate': 3.092881614008375e-06, 'epoch': 1.35} +{'eval_loss': 1.8951964378356934, 'eval_runtime': 19.597, 'eval_samples_per_second': 51.028, 'eval_steps_per_second': 2.143, 'epoch': 1.35} +{'loss': 1.5349, 'grad_norm': 3.550020217895508, 'learning_rate': 3.0909783022459085e-06, 'epoch': 1.35} +{'loss': 1.4645, 'grad_norm': 3.3705122470855713, 'learning_rate': 3.0890749904834414e-06, 'epoch': 1.35} +{'loss': 1.5309, 'grad_norm': 3.3449292182922363, 'learning_rate': 3.0871716787209748e-06, 'epoch': 1.35} +{'loss': 1.4987, 'grad_norm': 3.616222381591797, 'learning_rate': 3.085268366958508e-06, 'epoch': 1.35} +{'loss': 1.5102, 'grad_norm': 3.526900291442871, 'learning_rate': 3.0833650551960415e-06, 'epoch': 1.35} +{'loss': 1.4585, 'grad_norm': 3.3756065368652344, 'learning_rate': 3.081461743433575e-06, 'epoch': 1.35} +{'loss': 1.48, 'grad_norm': 3.6922779083251953, 'learning_rate': 3.079558431671108e-06, 'epoch': 1.35} +{'loss': 1.4402, 'grad_norm': 3.5264687538146973, 'learning_rate': 3.077655119908641e-06, 'epoch': 1.35} +{'loss': 1.4689, 'grad_norm': 3.331678867340088, 'learning_rate': 3.0757518081461745e-06, 'epoch': 1.35} +{'loss': 1.4906, 'grad_norm': 3.755675792694092, 'learning_rate': 3.073848496383708e-06, 'epoch': 1.36} +{'eval_loss': 1.8909857273101807, 'eval_runtime': 19.5771, 'eval_samples_per_second': 51.08, 'eval_steps_per_second': 2.145, 'epoch': 1.36} +{'loss': 1.4639, 'grad_norm': 3.4811606407165527, 'learning_rate': 3.0719451846212413e-06, 'epoch': 1.36} +{'loss': 1.4958, 'grad_norm': 3.3411293029785156, 'learning_rate': 3.0700418728587746e-06, 'epoch': 1.36} +{'loss': 1.4585, 'grad_norm': 3.656400442123413, 'learning_rate': 3.0681385610963076e-06, 'epoch': 1.36} +{'loss': 1.5094, 'grad_norm': 3.8461031913757324, 'learning_rate': 3.066235249333841e-06, 'epoch': 1.36} +{'loss': 1.5378, 'grad_norm': 3.564850091934204, 'learning_rate': 3.0643319375713743e-06, 'epoch': 1.36} +{'loss': 1.4759, 'grad_norm': 3.4947054386138916, 'learning_rate': 3.0624286258089076e-06, 'epoch': 1.36} +{'loss': 1.4552, 'grad_norm': 3.413567543029785, 'learning_rate': 3.060525314046441e-06, 'epoch': 1.36} +{'loss': 1.5065, 'grad_norm': 3.5766429901123047, 'learning_rate': 3.058622002283974e-06, 'epoch': 1.36} +{'loss': 1.5193, 'grad_norm': 3.6552627086639404, 'learning_rate': 3.0567186905215077e-06, 'epoch': 1.37} +{'loss': 1.5219, 'grad_norm': 3.60090970993042, 'learning_rate': 3.054815378759041e-06, 'epoch': 1.37} +{'eval_loss': 1.8973960876464844, 'eval_runtime': 19.6256, 'eval_samples_per_second': 50.954, 'eval_steps_per_second': 2.14, 'epoch': 1.37} +{'loss': 1.5101, 'grad_norm': 3.4845283031463623, 'learning_rate': 3.0529120669965745e-06, 'epoch': 1.37} +{'loss': 1.4401, 'grad_norm': 3.433117151260376, 'learning_rate': 3.051008755234108e-06, 'epoch': 1.37} +{'loss': 1.4599, 'grad_norm': 3.5837252140045166, 'learning_rate': 3.049105443471641e-06, 'epoch': 1.37} +{'loss': 1.5962, 'grad_norm': 3.6097147464752197, 'learning_rate': 3.0472021317091745e-06, 'epoch': 1.37} +{'loss': 1.5117, 'grad_norm': 3.3721201419830322, 'learning_rate': 3.0452988199467075e-06, 'epoch': 1.37} +{'loss': 1.5064, 'grad_norm': 3.649167776107788, 'learning_rate': 3.043395508184241e-06, 'epoch': 1.37} +{'loss': 1.4528, 'grad_norm': 3.580226421356201, 'learning_rate': 3.041492196421774e-06, 'epoch': 1.37} +{'loss': 1.5004, 'grad_norm': 3.650916576385498, 'learning_rate': 3.0395888846593076e-06, 'epoch': 1.37} +{'loss': 1.477, 'grad_norm': 3.7565805912017822, 'learning_rate': 3.037685572896841e-06, 'epoch': 1.38} +{'loss': 1.4951, 'grad_norm': 3.646820545196533, 'learning_rate': 3.035782261134374e-06, 'epoch': 1.38} +{'eval_loss': 1.906795859336853, 'eval_runtime': 19.5722, 'eval_samples_per_second': 51.093, 'eval_steps_per_second': 2.146, 'epoch': 1.38} +{'loss': 1.5059, 'grad_norm': 3.6518476009368896, 'learning_rate': 3.0338789493719072e-06, 'epoch': 1.38} +{'loss': 1.5027, 'grad_norm': 3.6130800247192383, 'learning_rate': 3.0319756376094406e-06, 'epoch': 1.38} +{'loss': 1.4954, 'grad_norm': 3.6340603828430176, 'learning_rate': 3.030072325846974e-06, 'epoch': 1.38} +{'loss': 1.4503, 'grad_norm': 3.4152019023895264, 'learning_rate': 3.0281690140845073e-06, 'epoch': 1.38} +{'loss': 1.496, 'grad_norm': 3.5878539085388184, 'learning_rate': 3.0262657023220403e-06, 'epoch': 1.38} +{'loss': 1.5012, 'grad_norm': 3.5941483974456787, 'learning_rate': 3.0243623905595736e-06, 'epoch': 1.38} +{'loss': 1.4824, 'grad_norm': 3.7437148094177246, 'learning_rate': 3.022459078797107e-06, 'epoch': 1.38} +{'loss': 1.4924, 'grad_norm': 3.4894044399261475, 'learning_rate': 3.0205557670346404e-06, 'epoch': 1.38} +{'loss': 1.4504, 'grad_norm': 3.450178861618042, 'learning_rate': 3.018652455272174e-06, 'epoch': 1.39} +{'loss': 1.5866, 'grad_norm': 3.8060994148254395, 'learning_rate': 3.0167491435097075e-06, 'epoch': 1.39} +{'eval_loss': 1.9019533395767212, 'eval_runtime': 19.5686, 'eval_samples_per_second': 51.102, 'eval_steps_per_second': 2.146, 'epoch': 1.39} +{'loss': 1.4843, 'grad_norm': 3.5194859504699707, 'learning_rate': 3.014845831747241e-06, 'epoch': 1.39} +{'loss': 1.4619, 'grad_norm': 3.6644959449768066, 'learning_rate': 3.012942519984774e-06, 'epoch': 1.39} +{'loss': 1.4699, 'grad_norm': 3.324085235595703, 'learning_rate': 3.011039208222307e-06, 'epoch': 1.39} +{'loss': 1.4348, 'grad_norm': 3.5504794120788574, 'learning_rate': 3.0091358964598405e-06, 'epoch': 1.39} +{'loss': 1.4834, 'grad_norm': 3.4768283367156982, 'learning_rate': 3.007232584697374e-06, 'epoch': 1.39} +{'loss': 1.4943, 'grad_norm': 3.3828842639923096, 'learning_rate': 3.0053292729349073e-06, 'epoch': 1.39} +{'loss': 1.4939, 'grad_norm': 3.3939690589904785, 'learning_rate': 3.00342596117244e-06, 'epoch': 1.39} +{'loss': 1.4908, 'grad_norm': 3.836578369140625, 'learning_rate': 3.0015226494099736e-06, 'epoch': 1.39} +{'loss': 1.4516, 'grad_norm': 3.2970104217529297, 'learning_rate': 2.999619337647507e-06, 'epoch': 1.4} +{'loss': 1.4696, 'grad_norm': 3.7323384284973145, 'learning_rate': 2.9977160258850403e-06, 'epoch': 1.4} +{'eval_loss': 1.9022077322006226, 'eval_runtime': 19.5386, 'eval_samples_per_second': 51.181, 'eval_steps_per_second': 2.15, 'epoch': 1.4} +{'loss': 1.447, 'grad_norm': 3.666674852371216, 'learning_rate': 2.9958127141225736e-06, 'epoch': 1.4} +{'loss': 1.5231, 'grad_norm': 3.5165741443634033, 'learning_rate': 2.9939094023601066e-06, 'epoch': 1.4} +{'loss': 1.4924, 'grad_norm': 3.537316083908081, 'learning_rate': 2.99200609059764e-06, 'epoch': 1.4} +{'loss': 1.4298, 'grad_norm': 3.592864990234375, 'learning_rate': 2.9901027788351733e-06, 'epoch': 1.4} +{'loss': 1.4523, 'grad_norm': 3.736992120742798, 'learning_rate': 2.9881994670727067e-06, 'epoch': 1.4} +{'loss': 1.4732, 'grad_norm': 3.3955721855163574, 'learning_rate': 2.98629615531024e-06, 'epoch': 1.4} +{'loss': 1.4513, 'grad_norm': 3.487550973892212, 'learning_rate': 2.984392843547773e-06, 'epoch': 1.4} +{'loss': 1.4652, 'grad_norm': 3.544178009033203, 'learning_rate': 2.9824895317853063e-06, 'epoch': 1.41} +{'loss': 1.4743, 'grad_norm': 3.6225168704986572, 'learning_rate': 2.98058622002284e-06, 'epoch': 1.41} +{'loss': 1.5263, 'grad_norm': 3.4267966747283936, 'learning_rate': 2.9786829082603735e-06, 'epoch': 1.41} +{'eval_loss': 1.9004613161087036, 'eval_runtime': 19.512, 'eval_samples_per_second': 51.251, 'eval_steps_per_second': 2.153, 'epoch': 1.41} +{'loss': 1.5226, 'grad_norm': 3.799076795578003, 'learning_rate': 2.976779596497907e-06, 'epoch': 1.41} +{'loss': 1.5347, 'grad_norm': 3.5705244541168213, 'learning_rate': 2.97487628473544e-06, 'epoch': 1.41} +{'loss': 1.4917, 'grad_norm': 3.5650413036346436, 'learning_rate': 2.9729729729729736e-06, 'epoch': 1.41} +{'loss': 1.5255, 'grad_norm': 3.674274444580078, 'learning_rate': 2.9710696612105065e-06, 'epoch': 1.41} +{'loss': 1.4691, 'grad_norm': 3.469870090484619, 'learning_rate': 2.96916634944804e-06, 'epoch': 1.41} +{'loss': 1.5312, 'grad_norm': 3.7833914756774902, 'learning_rate': 2.9672630376855732e-06, 'epoch': 1.41} +{'loss': 1.4954, 'grad_norm': 3.6105148792266846, 'learning_rate': 2.9653597259231066e-06, 'epoch': 1.41} +{'loss': 1.4489, 'grad_norm': 3.502509832382202, 'learning_rate': 2.96345641416064e-06, 'epoch': 1.42} +{'loss': 1.382, 'grad_norm': 3.4045069217681885, 'learning_rate': 2.961553102398173e-06, 'epoch': 1.42} +{'loss': 1.5223, 'grad_norm': 3.5020158290863037, 'learning_rate': 2.9596497906357063e-06, 'epoch': 1.42} +{'eval_loss': 1.8955446481704712, 'eval_runtime': 19.506, 'eval_samples_per_second': 51.266, 'eval_steps_per_second': 2.153, 'epoch': 1.42} +{'loss': 1.5311, 'grad_norm': 3.7870781421661377, 'learning_rate': 2.9577464788732396e-06, 'epoch': 1.42} +{'loss': 1.4586, 'grad_norm': 3.5985190868377686, 'learning_rate': 2.955843167110773e-06, 'epoch': 1.42} +{'loss': 1.4733, 'grad_norm': 3.6743879318237305, 'learning_rate': 2.9539398553483064e-06, 'epoch': 1.42} +{'loss': 1.4885, 'grad_norm': 3.596088171005249, 'learning_rate': 2.9520365435858393e-06, 'epoch': 1.42} +{'loss': 1.5553, 'grad_norm': 3.734295606613159, 'learning_rate': 2.9501332318233727e-06, 'epoch': 1.42} +{'loss': 1.4952, 'grad_norm': 3.2503511905670166, 'learning_rate': 2.948229920060906e-06, 'epoch': 1.42} +{'loss': 1.457, 'grad_norm': 3.198627233505249, 'learning_rate': 2.9463266082984394e-06, 'epoch': 1.42} +{'loss': 1.4771, 'grad_norm': 3.5034215450286865, 'learning_rate': 2.9444232965359727e-06, 'epoch': 1.43} +{'loss': 1.496, 'grad_norm': 3.501101016998291, 'learning_rate': 2.9425199847735057e-06, 'epoch': 1.43} +{'loss': 1.4594, 'grad_norm': 3.3698441982269287, 'learning_rate': 2.94061667301104e-06, 'epoch': 1.43} +{'eval_loss': 1.9009376764297485, 'eval_runtime': 19.5388, 'eval_samples_per_second': 51.18, 'eval_steps_per_second': 2.15, 'epoch': 1.43} +{'loss': 1.401, 'grad_norm': 3.3776280879974365, 'learning_rate': 2.938713361248573e-06, 'epoch': 1.43} +{'loss': 1.4689, 'grad_norm': 3.573373794555664, 'learning_rate': 2.936810049486106e-06, 'epoch': 1.43} +{'loss': 1.4817, 'grad_norm': 3.4017117023468018, 'learning_rate': 2.9349067377236396e-06, 'epoch': 1.43} +{'loss': 1.4388, 'grad_norm': 3.5077953338623047, 'learning_rate': 2.933003425961173e-06, 'epoch': 1.43} +{'loss': 1.4625, 'grad_norm': 3.7072088718414307, 'learning_rate': 2.9311001141987063e-06, 'epoch': 1.43} +{'loss': 1.4522, 'grad_norm': 3.4292428493499756, 'learning_rate': 2.9291968024362392e-06, 'epoch': 1.43} +{'loss': 1.4338, 'grad_norm': 3.4341254234313965, 'learning_rate': 2.9272934906737726e-06, 'epoch': 1.44} +{'loss': 1.4522, 'grad_norm': 3.324857234954834, 'learning_rate': 2.925390178911306e-06, 'epoch': 1.44} +{'loss': 1.4753, 'grad_norm': 3.543111562728882, 'learning_rate': 2.9234868671488393e-06, 'epoch': 1.44} +{'loss': 1.482, 'grad_norm': 3.446840763092041, 'learning_rate': 2.9215835553863727e-06, 'epoch': 1.44} +{'eval_loss': 1.9123198986053467, 'eval_runtime': 19.5636, 'eval_samples_per_second': 51.115, 'eval_steps_per_second': 2.147, 'epoch': 1.44} +{'loss': 1.4729, 'grad_norm': 3.4904143810272217, 'learning_rate': 2.9196802436239056e-06, 'epoch': 1.44} +{'loss': 1.4478, 'grad_norm': 3.5760629177093506, 'learning_rate': 2.917776931861439e-06, 'epoch': 1.44} +{'loss': 1.4447, 'grad_norm': 3.5186285972595215, 'learning_rate': 2.9158736200989723e-06, 'epoch': 1.44} +{'loss': 1.4507, 'grad_norm': 3.4340295791625977, 'learning_rate': 2.9139703083365057e-06, 'epoch': 1.44} +{'loss': 1.479, 'grad_norm': 3.503673791885376, 'learning_rate': 2.912066996574039e-06, 'epoch': 1.44} +{'loss': 1.4153, 'grad_norm': 3.693777561187744, 'learning_rate': 2.910163684811572e-06, 'epoch': 1.44} +{'loss': 1.5386, 'grad_norm': 3.6448662281036377, 'learning_rate': 2.9082603730491054e-06, 'epoch': 1.45} +{'loss': 1.5018, 'grad_norm': 3.712688446044922, 'learning_rate': 2.9063570612866387e-06, 'epoch': 1.45} +{'loss': 1.4244, 'grad_norm': 3.758652925491333, 'learning_rate': 2.904453749524172e-06, 'epoch': 1.45} +{'loss': 1.4513, 'grad_norm': 3.415750503540039, 'learning_rate': 2.902550437761706e-06, 'epoch': 1.45} +{'eval_loss': 1.9042681455612183, 'eval_runtime': 19.5798, 'eval_samples_per_second': 51.073, 'eval_steps_per_second': 2.145, 'epoch': 1.45} +{'loss': 1.4693, 'grad_norm': 3.822981595993042, 'learning_rate': 2.9006471259992392e-06, 'epoch': 1.45} +{'loss': 1.4667, 'grad_norm': 3.5210466384887695, 'learning_rate': 2.8987438142367726e-06, 'epoch': 1.45} +{'loss': 1.5189, 'grad_norm': 3.660736560821533, 'learning_rate': 2.8968405024743055e-06, 'epoch': 1.45} +{'loss': 1.4221, 'grad_norm': 3.8035154342651367, 'learning_rate': 2.894937190711839e-06, 'epoch': 1.45} +{'loss': 1.4923, 'grad_norm': 3.6179070472717285, 'learning_rate': 2.8930338789493723e-06, 'epoch': 1.45} +{'loss': 1.4717, 'grad_norm': 3.3527944087982178, 'learning_rate': 2.8911305671869056e-06, 'epoch': 1.45} +{'loss': 1.4115, 'grad_norm': 3.4813122749328613, 'learning_rate': 2.889227255424439e-06, 'epoch': 1.46} +{'loss': 1.4674, 'grad_norm': 3.6203453540802, 'learning_rate': 2.887323943661972e-06, 'epoch': 1.46} +{'loss': 1.4452, 'grad_norm': 3.3381266593933105, 'learning_rate': 2.8854206318995053e-06, 'epoch': 1.46} +{'loss': 1.4656, 'grad_norm': 3.678907871246338, 'learning_rate': 2.8835173201370386e-06, 'epoch': 1.46} +{'eval_loss': 1.9010992050170898, 'eval_runtime': 35.6339, 'eval_samples_per_second': 28.063, 'eval_steps_per_second': 1.179, 'epoch': 1.46} +{'loss': 1.509, 'grad_norm': 4.0514421463012695, 'learning_rate': 2.881614008374572e-06, 'epoch': 1.46} +{'loss': 1.4889, 'grad_norm': 3.697770357131958, 'learning_rate': 2.8797106966121054e-06, 'epoch': 1.46} +{'loss': 1.4698, 'grad_norm': 3.4625465869903564, 'learning_rate': 2.8778073848496383e-06, 'epoch': 1.46} +{'loss': 1.4987, 'grad_norm': 3.5478508472442627, 'learning_rate': 2.8759040730871717e-06, 'epoch': 1.46} +{'loss': 1.4607, 'grad_norm': 3.5000874996185303, 'learning_rate': 2.874000761324705e-06, 'epoch': 1.46} +{'loss': 1.4932, 'grad_norm': 3.5087668895721436, 'learning_rate': 2.8720974495622384e-06, 'epoch': 1.46} +{'loss': 1.48, 'grad_norm': 3.5349714756011963, 'learning_rate': 2.8701941377997718e-06, 'epoch': 1.47} +{'loss': 1.5117, 'grad_norm': 3.5005040168762207, 'learning_rate': 2.8682908260373047e-06, 'epoch': 1.47} +{'loss': 1.492, 'grad_norm': 3.4692845344543457, 'learning_rate': 2.866387514274838e-06, 'epoch': 1.47} +{'loss': 1.4947, 'grad_norm': 3.516383409500122, 'learning_rate': 2.864484202512372e-06, 'epoch': 1.47} +{'eval_loss': 1.888146162033081, 'eval_runtime': 25.4025, 'eval_samples_per_second': 39.366, 'eval_steps_per_second': 1.653, 'epoch': 1.47} +{'loss': 1.456, 'grad_norm': 3.379704236984253, 'learning_rate': 2.8625808907499052e-06, 'epoch': 1.47} +{'loss': 1.4946, 'grad_norm': 3.304861545562744, 'learning_rate': 2.8606775789874386e-06, 'epoch': 1.47} +{'loss': 1.4617, 'grad_norm': 3.3411974906921387, 'learning_rate': 2.858774267224972e-06, 'epoch': 1.47} +{'loss': 1.4786, 'grad_norm': 3.4056971073150635, 'learning_rate': 2.8568709554625053e-06, 'epoch': 1.47} +{'loss': 1.5331, 'grad_norm': 3.448061227798462, 'learning_rate': 2.8549676437000382e-06, 'epoch': 1.47} +{'loss': 1.4995, 'grad_norm': 3.4297866821289062, 'learning_rate': 2.8530643319375716e-06, 'epoch': 1.48} +{'loss': 1.4632, 'grad_norm': 3.6503026485443115, 'learning_rate': 2.851161020175105e-06, 'epoch': 1.48} +{'loss': 1.4366, 'grad_norm': 3.526366949081421, 'learning_rate': 2.8492577084126383e-06, 'epoch': 1.48} +{'loss': 1.4791, 'grad_norm': 3.556513547897339, 'learning_rate': 2.8473543966501717e-06, 'epoch': 1.48} +{'loss': 1.5195, 'grad_norm': 3.581660509109497, 'learning_rate': 2.8454510848877046e-06, 'epoch': 1.48} +{'eval_loss': 1.8943085670471191, 'eval_runtime': 37.903, 'eval_samples_per_second': 26.383, 'eval_steps_per_second': 1.108, 'epoch': 1.48} +{'loss': 1.4733, 'grad_norm': 3.4455294609069824, 'learning_rate': 2.843547773125238e-06, 'epoch': 1.48} +{'loss': 1.4782, 'grad_norm': 3.497152805328369, 'learning_rate': 2.8416444613627714e-06, 'epoch': 1.48} +{'loss': 1.4289, 'grad_norm': 3.5996174812316895, 'learning_rate': 2.8397411496003047e-06, 'epoch': 1.48} +{'loss': 1.4763, 'grad_norm': 3.571659564971924, 'learning_rate': 2.837837837837838e-06, 'epoch': 1.48} +{'loss': 1.4984, 'grad_norm': 3.6680657863616943, 'learning_rate': 2.835934526075371e-06, 'epoch': 1.48} +{'loss': 1.4418, 'grad_norm': 3.5274746417999268, 'learning_rate': 2.8340312143129044e-06, 'epoch': 1.49} +{'loss': 1.4293, 'grad_norm': 3.379484176635742, 'learning_rate': 2.8321279025504377e-06, 'epoch': 1.49} +{'loss': 1.5417, 'grad_norm': 3.786261558532715, 'learning_rate': 2.830224590787971e-06, 'epoch': 1.49} +{'loss': 1.484, 'grad_norm': 3.6364946365356445, 'learning_rate': 2.8283212790255045e-06, 'epoch': 1.49} +{'loss': 1.5251, 'grad_norm': 3.4303221702575684, 'learning_rate': 2.8264179672630383e-06, 'epoch': 1.49} +{'eval_loss': 1.886722207069397, 'eval_runtime': 19.5303, 'eval_samples_per_second': 51.203, 'eval_steps_per_second': 2.151, 'epoch': 1.49} +{'loss': 1.4545, 'grad_norm': 3.5480268001556396, 'learning_rate': 2.8245146555005716e-06, 'epoch': 1.49} +{'loss': 1.4136, 'grad_norm': 3.5498242378234863, 'learning_rate': 2.8226113437381046e-06, 'epoch': 1.49} +{'loss': 1.4744, 'grad_norm': 3.422705888748169, 'learning_rate': 2.820708031975638e-06, 'epoch': 1.49} +{'loss': 1.5074, 'grad_norm': 3.7367637157440186, 'learning_rate': 2.8188047202131713e-06, 'epoch': 1.49} +{'loss': 1.5205, 'grad_norm': 3.5924689769744873, 'learning_rate': 2.8169014084507046e-06, 'epoch': 1.49} +{'loss': 1.4697, 'grad_norm': 3.557173490524292, 'learning_rate': 2.814998096688238e-06, 'epoch': 1.5} +{'loss': 1.422, 'grad_norm': 3.4056918621063232, 'learning_rate': 2.813094784925771e-06, 'epoch': 1.5} +{'loss': 1.5339, 'grad_norm': 3.5907366275787354, 'learning_rate': 2.8111914731633043e-06, 'epoch': 1.5} +{'loss': 1.4711, 'grad_norm': 3.55116868019104, 'learning_rate': 2.8092881614008377e-06, 'epoch': 1.5} +{'loss': 1.4337, 'grad_norm': 3.4128639698028564, 'learning_rate': 2.807384849638371e-06, 'epoch': 1.5} +{'eval_loss': 1.8867028951644897, 'eval_runtime': 19.5256, 'eval_samples_per_second': 51.215, 'eval_steps_per_second': 2.151, 'epoch': 1.5} +{'loss': 1.4948, 'grad_norm': 3.401756763458252, 'learning_rate': 2.8054815378759044e-06, 'epoch': 1.5} +{'loss': 1.4652, 'grad_norm': 3.4811344146728516, 'learning_rate': 2.8035782261134378e-06, 'epoch': 1.5} +{'loss': 1.4635, 'grad_norm': 3.5507752895355225, 'learning_rate': 2.8016749143509707e-06, 'epoch': 1.5} +{'loss': 1.4618, 'grad_norm': 3.493058919906616, 'learning_rate': 2.799771602588504e-06, 'epoch': 1.5} +{'loss': 1.4923, 'grad_norm': 3.7348647117614746, 'learning_rate': 2.7978682908260374e-06, 'epoch': 1.5} +{'loss': 1.5038, 'grad_norm': 3.834883451461792, 'learning_rate': 2.795964979063571e-06, 'epoch': 1.51} +{'loss': 1.4864, 'grad_norm': 3.410924196243286, 'learning_rate': 2.794061667301104e-06, 'epoch': 1.51} +{'loss': 1.4906, 'grad_norm': 3.6676347255706787, 'learning_rate': 2.792158355538637e-06, 'epoch': 1.51} +{'loss': 1.4905, 'grad_norm': 3.7822535037994385, 'learning_rate': 2.7902550437761705e-06, 'epoch': 1.51} +{'loss': 1.4418, 'grad_norm': 3.323343515396118, 'learning_rate': 2.7883517320137042e-06, 'epoch': 1.51} +{'eval_loss': 1.885093331336975, 'eval_runtime': 19.5199, 'eval_samples_per_second': 51.23, 'eval_steps_per_second': 2.152, 'epoch': 1.51} +{'loss': 1.4428, 'grad_norm': 3.6783649921417236, 'learning_rate': 2.7864484202512376e-06, 'epoch': 1.51} +{'loss': 1.4723, 'grad_norm': 3.541440963745117, 'learning_rate': 2.784545108488771e-06, 'epoch': 1.51} +{'loss': 1.4782, 'grad_norm': 3.6733627319335938, 'learning_rate': 2.7826417967263043e-06, 'epoch': 1.51} +{'loss': 1.517, 'grad_norm': 3.7048590183258057, 'learning_rate': 2.7807384849638373e-06, 'epoch': 1.51} +{'loss': 1.4699, 'grad_norm': 3.5511550903320312, 'learning_rate': 2.7788351732013706e-06, 'epoch': 1.52} +{'loss': 1.4663, 'grad_norm': 3.471226930618286, 'learning_rate': 2.776931861438904e-06, 'epoch': 1.52} +{'loss': 1.4714, 'grad_norm': 3.3670783042907715, 'learning_rate': 2.7750285496764374e-06, 'epoch': 1.52} +{'loss': 1.4801, 'grad_norm': 3.5483298301696777, 'learning_rate': 2.7731252379139707e-06, 'epoch': 1.52} +{'loss': 1.4169, 'grad_norm': 3.6219935417175293, 'learning_rate': 2.771221926151504e-06, 'epoch': 1.52} +{'loss': 1.4871, 'grad_norm': 3.6648623943328857, 'learning_rate': 2.769318614389037e-06, 'epoch': 1.52} +{'eval_loss': 1.8961957693099976, 'eval_runtime': 19.5148, 'eval_samples_per_second': 51.243, 'eval_steps_per_second': 2.152, 'epoch': 1.52} +{'loss': 1.4825, 'grad_norm': 3.78617525100708, 'learning_rate': 2.7674153026265704e-06, 'epoch': 1.52} +{'loss': 1.4912, 'grad_norm': 3.760444402694702, 'learning_rate': 2.7655119908641037e-06, 'epoch': 1.52} +{'loss': 1.4672, 'grad_norm': 3.6661674976348877, 'learning_rate': 2.763608679101637e-06, 'epoch': 1.52} +{'loss': 1.5064, 'grad_norm': 3.690605878829956, 'learning_rate': 2.7617053673391705e-06, 'epoch': 1.52} +{'loss': 1.5061, 'grad_norm': 3.645209550857544, 'learning_rate': 2.7598020555767034e-06, 'epoch': 1.53} +{'loss': 1.4348, 'grad_norm': 3.4887142181396484, 'learning_rate': 2.7578987438142368e-06, 'epoch': 1.53} +{'loss': 1.4849, 'grad_norm': 3.507307291030884, 'learning_rate': 2.75599543205177e-06, 'epoch': 1.53} +{'loss': 1.5066, 'grad_norm': 3.7039289474487305, 'learning_rate': 2.7540921202893035e-06, 'epoch': 1.53} +{'loss': 1.4811, 'grad_norm': 3.6331074237823486, 'learning_rate': 2.752188808526837e-06, 'epoch': 1.53} +{'loss': 1.4626, 'grad_norm': 3.5897464752197266, 'learning_rate': 2.75028549676437e-06, 'epoch': 1.53} +{'eval_loss': 1.8879802227020264, 'eval_runtime': 19.4937, 'eval_samples_per_second': 51.299, 'eval_steps_per_second': 2.155, 'epoch': 1.53} +{'loss': 1.4724, 'grad_norm': 3.6207938194274902, 'learning_rate': 2.7483821850019036e-06, 'epoch': 1.53} +{'loss': 1.4639, 'grad_norm': 3.607470989227295, 'learning_rate': 2.746478873239437e-06, 'epoch': 1.53} +{'loss': 1.4622, 'grad_norm': 3.603444814682007, 'learning_rate': 2.7445755614769703e-06, 'epoch': 1.53} +{'loss': 1.4365, 'grad_norm': 3.3372488021850586, 'learning_rate': 2.7426722497145037e-06, 'epoch': 1.53} +{'loss': 1.4458, 'grad_norm': 3.4883337020874023, 'learning_rate': 2.740768937952037e-06, 'epoch': 1.54} +{'loss': 1.4612, 'grad_norm': 3.580230474472046, 'learning_rate': 2.7388656261895704e-06, 'epoch': 1.54} +{'loss': 1.5464, 'grad_norm': 3.9022483825683594, 'learning_rate': 2.7369623144271033e-06, 'epoch': 1.54} +{'loss': 1.4557, 'grad_norm': 3.644609212875366, 'learning_rate': 2.7350590026646367e-06, 'epoch': 1.54} +{'loss': 1.4155, 'grad_norm': 3.5716593265533447, 'learning_rate': 2.73315569090217e-06, 'epoch': 1.54} +{'loss': 1.5131, 'grad_norm': 3.6449248790740967, 'learning_rate': 2.7312523791397034e-06, 'epoch': 1.54} +{'eval_loss': 1.8926544189453125, 'eval_runtime': 19.497, 'eval_samples_per_second': 51.29, 'eval_steps_per_second': 2.154, 'epoch': 1.54} +{'loss': 1.5348, 'grad_norm': 3.693988561630249, 'learning_rate': 2.7293490673772368e-06, 'epoch': 1.54} +{'loss': 1.4924, 'grad_norm': 3.458172559738159, 'learning_rate': 2.7274457556147697e-06, 'epoch': 1.54} +{'loss': 1.4771, 'grad_norm': 3.484513521194458, 'learning_rate': 2.725542443852303e-06, 'epoch': 1.54} +{'loss': 1.4846, 'grad_norm': 3.495142936706543, 'learning_rate': 2.7236391320898365e-06, 'epoch': 1.54} +{'loss': 1.4427, 'grad_norm': 3.5176403522491455, 'learning_rate': 2.72173582032737e-06, 'epoch': 1.55} +{'loss': 1.4753, 'grad_norm': 3.4579029083251953, 'learning_rate': 2.719832508564903e-06, 'epoch': 1.55} +{'loss': 1.4761, 'grad_norm': 3.6559529304504395, 'learning_rate': 2.717929196802436e-06, 'epoch': 1.55} +{'loss': 1.4541, 'grad_norm': 3.761859655380249, 'learning_rate': 2.7160258850399695e-06, 'epoch': 1.55} +{'loss': 1.4685, 'grad_norm': 3.6241519451141357, 'learning_rate': 2.714122573277503e-06, 'epoch': 1.55} +{'loss': 1.4623, 'grad_norm': 3.3880300521850586, 'learning_rate': 2.712219261515036e-06, 'epoch': 1.55} +{'eval_loss': 1.8897600173950195, 'eval_runtime': 19.5248, 'eval_samples_per_second': 51.217, 'eval_steps_per_second': 2.151, 'epoch': 1.55} +{'loss': 1.4808, 'grad_norm': 3.5921647548675537, 'learning_rate': 2.71031594975257e-06, 'epoch': 1.55} +{'loss': 1.4882, 'grad_norm': 3.4420578479766846, 'learning_rate': 2.7084126379901034e-06, 'epoch': 1.55} +{'loss': 1.5089, 'grad_norm': 3.722755193710327, 'learning_rate': 2.7065093262276367e-06, 'epoch': 1.55} +{'loss': 1.5161, 'grad_norm': 3.4246301651000977, 'learning_rate': 2.7046060144651697e-06, 'epoch': 1.56} +{'loss': 1.5277, 'grad_norm': 3.4514970779418945, 'learning_rate': 2.702702702702703e-06, 'epoch': 1.56} +{'loss': 1.3921, 'grad_norm': 3.40913724899292, 'learning_rate': 2.7007993909402364e-06, 'epoch': 1.56} +{'loss': 1.5298, 'grad_norm': 3.6044957637786865, 'learning_rate': 2.6988960791777697e-06, 'epoch': 1.56} +{'loss': 1.4753, 'grad_norm': 3.5198075771331787, 'learning_rate': 2.696992767415303e-06, 'epoch': 1.56} +{'loss': 1.469, 'grad_norm': 3.5519540309906006, 'learning_rate': 2.695089455652836e-06, 'epoch': 1.56} +{'loss': 1.4389, 'grad_norm': 3.588982343673706, 'learning_rate': 2.6931861438903694e-06, 'epoch': 1.56} +{'eval_loss': 1.8880659341812134, 'eval_runtime': 19.5333, 'eval_samples_per_second': 51.195, 'eval_steps_per_second': 2.15, 'epoch': 1.56} +{'loss': 1.4945, 'grad_norm': 3.5364952087402344, 'learning_rate': 2.6912828321279028e-06, 'epoch': 1.56} +{'loss': 1.4726, 'grad_norm': 3.6485745906829834, 'learning_rate': 2.689379520365436e-06, 'epoch': 1.56} +{'loss': 1.5013, 'grad_norm': 3.4896011352539062, 'learning_rate': 2.6874762086029695e-06, 'epoch': 1.56} +{'loss': 1.4494, 'grad_norm': 3.589754104614258, 'learning_rate': 2.6855728968405024e-06, 'epoch': 1.57} +{'loss': 1.4806, 'grad_norm': 3.5715506076812744, 'learning_rate': 2.683669585078036e-06, 'epoch': 1.57} +{'loss': 1.4284, 'grad_norm': 3.7296478748321533, 'learning_rate': 2.681766273315569e-06, 'epoch': 1.57} +{'loss': 1.4671, 'grad_norm': 3.4697155952453613, 'learning_rate': 2.6798629615531025e-06, 'epoch': 1.57} +{'loss': 1.4709, 'grad_norm': 3.5673766136169434, 'learning_rate': 2.677959649790636e-06, 'epoch': 1.57} +{'loss': 1.5039, 'grad_norm': 3.67797589302063, 'learning_rate': 2.676056338028169e-06, 'epoch': 1.57} +{'loss': 1.4892, 'grad_norm': 3.712921380996704, 'learning_rate': 2.674153026265702e-06, 'epoch': 1.57} +{'eval_loss': 1.8850891590118408, 'eval_runtime': 19.5377, 'eval_samples_per_second': 51.183, 'eval_steps_per_second': 2.15, 'epoch': 1.57} +{'loss': 1.5048, 'grad_norm': 3.6252105236053467, 'learning_rate': 2.672249714503236e-06, 'epoch': 1.57} +{'loss': 1.4511, 'grad_norm': 3.4601283073425293, 'learning_rate': 2.6703464027407693e-06, 'epoch': 1.57} +{'loss': 1.48, 'grad_norm': 3.7170798778533936, 'learning_rate': 2.6684430909783027e-06, 'epoch': 1.57} +{'loss': 1.4696, 'grad_norm': 3.641054153442383, 'learning_rate': 2.666539779215836e-06, 'epoch': 1.58} +{'loss': 1.4477, 'grad_norm': 3.452446460723877, 'learning_rate': 2.6646364674533694e-06, 'epoch': 1.58} +{'loss': 1.3725, 'grad_norm': 3.312429904937744, 'learning_rate': 2.6627331556909024e-06, 'epoch': 1.58} +{'loss': 1.4587, 'grad_norm': 3.6765859127044678, 'learning_rate': 2.6608298439284357e-06, 'epoch': 1.58} +{'loss': 1.4586, 'grad_norm': 3.392179012298584, 'learning_rate': 2.658926532165969e-06, 'epoch': 1.58} +{'loss': 1.423, 'grad_norm': 3.4510936737060547, 'learning_rate': 2.6570232204035024e-06, 'epoch': 1.58} +{'loss': 1.4579, 'grad_norm': 3.671384334564209, 'learning_rate': 2.655119908641036e-06, 'epoch': 1.58} +{'eval_loss': 1.877853274345398, 'eval_runtime': 19.5272, 'eval_samples_per_second': 51.211, 'eval_steps_per_second': 2.151, 'epoch': 1.58} +{'loss': 1.4691, 'grad_norm': 3.6903204917907715, 'learning_rate': 2.6532165968785687e-06, 'epoch': 1.58} +{'loss': 1.4248, 'grad_norm': 3.4317266941070557, 'learning_rate': 2.651313285116102e-06, 'epoch': 1.58} +{'loss': 1.4363, 'grad_norm': 3.389103889465332, 'learning_rate': 2.6494099733536355e-06, 'epoch': 1.59} +{'loss': 1.6224, 'grad_norm': 3.8953285217285156, 'learning_rate': 2.647506661591169e-06, 'epoch': 1.59} +{'loss': 1.4886, 'grad_norm': 3.3988535404205322, 'learning_rate': 2.645603349828702e-06, 'epoch': 1.59} +{'loss': 1.4267, 'grad_norm': 3.77156138420105, 'learning_rate': 2.643700038066235e-06, 'epoch': 1.59} +{'loss': 1.4799, 'grad_norm': 3.6133017539978027, 'learning_rate': 2.6417967263037685e-06, 'epoch': 1.59} +{'loss': 1.507, 'grad_norm': 3.7896270751953125, 'learning_rate': 2.639893414541302e-06, 'epoch': 1.59} +{'loss': 1.5263, 'grad_norm': 3.668889284133911, 'learning_rate': 2.6379901027788352e-06, 'epoch': 1.59} +{'loss': 1.4543, 'grad_norm': 3.5803921222686768, 'learning_rate': 2.6360867910163686e-06, 'epoch': 1.59} +{'eval_loss': 1.8805551528930664, 'eval_runtime': 19.5277, 'eval_samples_per_second': 51.209, 'eval_steps_per_second': 2.151, 'epoch': 1.59} +{'loss': 1.4727, 'grad_norm': 3.5142929553985596, 'learning_rate': 2.6341834792539024e-06, 'epoch': 1.59} +{'loss': 1.4824, 'grad_norm': 3.5427823066711426, 'learning_rate': 2.6322801674914357e-06, 'epoch': 1.59} +{'loss': 1.4665, 'grad_norm': 3.692046642303467, 'learning_rate': 2.6303768557289687e-06, 'epoch': 1.6} +{'loss': 1.4296, 'grad_norm': 3.325547456741333, 'learning_rate': 2.628473543966502e-06, 'epoch': 1.6} +{'loss': 1.4967, 'grad_norm': 3.5919787883758545, 'learning_rate': 2.6265702322040354e-06, 'epoch': 1.6} +{'loss': 1.4486, 'grad_norm': 3.744837999343872, 'learning_rate': 2.6246669204415688e-06, 'epoch': 1.6} +{'loss': 1.4272, 'grad_norm': 3.6402266025543213, 'learning_rate': 2.622763608679102e-06, 'epoch': 1.6} +{'loss': 1.4546, 'grad_norm': 3.3132426738739014, 'learning_rate': 2.620860296916635e-06, 'epoch': 1.6} +{'loss': 1.3953, 'grad_norm': 3.6351873874664307, 'learning_rate': 2.6189569851541684e-06, 'epoch': 1.6} +{'loss': 1.5038, 'grad_norm': 3.7452409267425537, 'learning_rate': 2.617053673391702e-06, 'epoch': 1.6} +{'eval_loss': 1.8828495740890503, 'eval_runtime': 19.5802, 'eval_samples_per_second': 51.072, 'eval_steps_per_second': 2.145, 'epoch': 1.6} +{'loss': 1.5334, 'grad_norm': 3.564208984375, 'learning_rate': 2.615150361629235e-06, 'epoch': 1.6} +{'loss': 1.4885, 'grad_norm': 3.401663064956665, 'learning_rate': 2.6132470498667685e-06, 'epoch': 1.6} +{'loss': 1.4713, 'grad_norm': 3.4774837493896484, 'learning_rate': 2.6113437381043015e-06, 'epoch': 1.61} +{'loss': 1.4531, 'grad_norm': 3.6255974769592285, 'learning_rate': 2.609440426341835e-06, 'epoch': 1.61} +{'loss': 1.4919, 'grad_norm': 3.787445545196533, 'learning_rate': 2.607537114579368e-06, 'epoch': 1.61} +{'loss': 1.4604, 'grad_norm': 3.480534076690674, 'learning_rate': 2.6056338028169015e-06, 'epoch': 1.61} +{'loss': 1.4375, 'grad_norm': 3.608654499053955, 'learning_rate': 2.603730491054435e-06, 'epoch': 1.61} +{'loss': 1.4883, 'grad_norm': 3.7310800552368164, 'learning_rate': 2.601827179291968e-06, 'epoch': 1.61} +{'loss': 1.4814, 'grad_norm': 3.7872259616851807, 'learning_rate': 2.599923867529501e-06, 'epoch': 1.61} +{'loss': 1.4652, 'grad_norm': 3.598719358444214, 'learning_rate': 2.5980205557670346e-06, 'epoch': 1.61} +{'eval_loss': 1.8743778467178345, 'eval_runtime': 19.6046, 'eval_samples_per_second': 51.008, 'eval_steps_per_second': 2.142, 'epoch': 1.61} +{'loss': 1.5048, 'grad_norm': 3.7258942127227783, 'learning_rate': 2.5961172440045684e-06, 'epoch': 1.61} +{'loss': 1.5037, 'grad_norm': 3.5065975189208984, 'learning_rate': 2.5942139322421017e-06, 'epoch': 1.61} +{'loss': 1.4553, 'grad_norm': 3.427415132522583, 'learning_rate': 2.592310620479635e-06, 'epoch': 1.62} +{'loss': 1.4169, 'grad_norm': 3.433415651321411, 'learning_rate': 2.5904073087171684e-06, 'epoch': 1.62} +{'loss': 1.4838, 'grad_norm': 3.44563627243042, 'learning_rate': 2.5885039969547014e-06, 'epoch': 1.62} +{'loss': 1.4315, 'grad_norm': 3.5431487560272217, 'learning_rate': 2.5866006851922347e-06, 'epoch': 1.62} +{'loss': 1.4399, 'grad_norm': 3.5764646530151367, 'learning_rate': 2.584697373429768e-06, 'epoch': 1.62} +{'loss': 1.4533, 'grad_norm': 3.730766773223877, 'learning_rate': 2.5827940616673015e-06, 'epoch': 1.62} +{'loss': 1.5033, 'grad_norm': 3.5377936363220215, 'learning_rate': 2.580890749904835e-06, 'epoch': 1.62} +{'loss': 1.4523, 'grad_norm': 3.411972999572754, 'learning_rate': 2.5789874381423678e-06, 'epoch': 1.62} +{'eval_loss': 1.8753681182861328, 'eval_runtime': 19.6003, 'eval_samples_per_second': 51.02, 'eval_steps_per_second': 2.143, 'epoch': 1.62} +{'loss': 1.4523, 'grad_norm': 3.665228843688965, 'learning_rate': 2.577084126379901e-06, 'epoch': 1.62} +{'loss': 1.4227, 'grad_norm': 3.7950446605682373, 'learning_rate': 2.5751808146174345e-06, 'epoch': 1.63} +{'loss': 1.4708, 'grad_norm': 3.508463144302368, 'learning_rate': 2.573277502854968e-06, 'epoch': 1.63} +{'loss': 1.4384, 'grad_norm': 3.294672727584839, 'learning_rate': 2.5713741910925012e-06, 'epoch': 1.63} +{'loss': 1.4402, 'grad_norm': 3.435230016708374, 'learning_rate': 2.569470879330034e-06, 'epoch': 1.63} +{'loss': 1.5394, 'grad_norm': 4.114372730255127, 'learning_rate': 2.5675675675675675e-06, 'epoch': 1.63} +{'loss': 1.4504, 'grad_norm': 3.4619836807250977, 'learning_rate': 2.565664255805101e-06, 'epoch': 1.63} +{'loss': 1.4852, 'grad_norm': 3.4026148319244385, 'learning_rate': 2.5637609440426343e-06, 'epoch': 1.63} +{'loss': 1.4232, 'grad_norm': 3.586623430252075, 'learning_rate': 2.5618576322801676e-06, 'epoch': 1.63} +{'loss': 1.5238, 'grad_norm': 3.7083818912506104, 'learning_rate': 2.5599543205177006e-06, 'epoch': 1.63} +{'eval_loss': 1.8861656188964844, 'eval_runtime': 19.6044, 'eval_samples_per_second': 51.009, 'eval_steps_per_second': 2.142, 'epoch': 1.63} +{'loss': 1.4844, 'grad_norm': 3.604175567626953, 'learning_rate': 2.558051008755234e-06, 'epoch': 1.63} +{'loss': 1.5293, 'grad_norm': 3.602494478225708, 'learning_rate': 2.5561476969927677e-06, 'epoch': 1.64} +{'loss': 1.4397, 'grad_norm': 3.4710710048675537, 'learning_rate': 2.554244385230301e-06, 'epoch': 1.64} +{'loss': 1.4712, 'grad_norm': 3.518333673477173, 'learning_rate': 2.5523410734678344e-06, 'epoch': 1.64} +{'loss': 1.4048, 'grad_norm': 3.655710220336914, 'learning_rate': 2.550437761705368e-06, 'epoch': 1.64} +{'loss': 1.4366, 'grad_norm': 3.3649561405181885, 'learning_rate': 2.548534449942901e-06, 'epoch': 1.64} +{'loss': 1.483, 'grad_norm': 3.6434574127197266, 'learning_rate': 2.546631138180434e-06, 'epoch': 1.64} +{'loss': 1.4689, 'grad_norm': 3.794060230255127, 'learning_rate': 2.5447278264179675e-06, 'epoch': 1.64} +{'loss': 1.4266, 'grad_norm': 3.469646692276001, 'learning_rate': 2.542824514655501e-06, 'epoch': 1.64} +{'loss': 1.5228, 'grad_norm': 3.5447559356689453, 'learning_rate': 2.540921202893034e-06, 'epoch': 1.64} +{'eval_loss': 1.8912339210510254, 'eval_runtime': 38.0666, 'eval_samples_per_second': 26.27, 'eval_steps_per_second': 1.103, 'epoch': 1.64} +{'loss': 1.5057, 'grad_norm': 3.615476608276367, 'learning_rate': 2.5390178911305675e-06, 'epoch': 1.64} +{'loss': 1.4683, 'grad_norm': 3.673374652862549, 'learning_rate': 2.5371145793681005e-06, 'epoch': 1.65} +{'loss': 1.463, 'grad_norm': 3.555816650390625, 'learning_rate': 2.535211267605634e-06, 'epoch': 1.65} +{'loss': 1.4035, 'grad_norm': 3.501431941986084, 'learning_rate': 2.533307955843167e-06, 'epoch': 1.65} +{'loss': 1.4658, 'grad_norm': 3.407412052154541, 'learning_rate': 2.5314046440807006e-06, 'epoch': 1.65} +{'loss': 1.4451, 'grad_norm': 3.5396435260772705, 'learning_rate': 2.529501332318234e-06, 'epoch': 1.65} +{'loss': 1.5089, 'grad_norm': 3.5976850986480713, 'learning_rate': 2.5275980205557673e-06, 'epoch': 1.65} +{'loss': 1.5426, 'grad_norm': 3.8909552097320557, 'learning_rate': 2.5256947087933002e-06, 'epoch': 1.65} +{'loss': 1.5132, 'grad_norm': 3.96783447265625, 'learning_rate': 2.5237913970308336e-06, 'epoch': 1.65} +{'loss': 1.4655, 'grad_norm': 3.638995885848999, 'learning_rate': 2.521888085268367e-06, 'epoch': 1.65} +{'eval_loss': 1.9035515785217285, 'eval_runtime': 38.512, 'eval_samples_per_second': 25.966, 'eval_steps_per_second': 1.091, 'epoch': 1.65} +{'loss': 1.512, 'grad_norm': 3.5379230976104736, 'learning_rate': 2.5199847735059003e-06, 'epoch': 1.65} +{'loss': 1.4401, 'grad_norm': 3.5413174629211426, 'learning_rate': 2.518081461743434e-06, 'epoch': 1.66} +{'loss': 1.4802, 'grad_norm': 3.3848090171813965, 'learning_rate': 2.5161781499809675e-06, 'epoch': 1.66} +{'loss': 1.4321, 'grad_norm': 3.261345148086548, 'learning_rate': 2.5142748382185004e-06, 'epoch': 1.66} +{'loss': 1.4273, 'grad_norm': 3.4942400455474854, 'learning_rate': 2.5123715264560338e-06, 'epoch': 1.66} +{'loss': 1.4472, 'grad_norm': 3.3935062885284424, 'learning_rate': 2.510468214693567e-06, 'epoch': 1.66} +{'loss': 1.3972, 'grad_norm': 3.69679594039917, 'learning_rate': 2.5085649029311005e-06, 'epoch': 1.66} +{'loss': 1.3562, 'grad_norm': 3.4592719078063965, 'learning_rate': 2.506661591168634e-06, 'epoch': 1.66} +{'loss': 1.4458, 'grad_norm': 3.4359240531921387, 'learning_rate': 2.504758279406167e-06, 'epoch': 1.66} +{'loss': 1.4929, 'grad_norm': 3.830765962600708, 'learning_rate': 2.5028549676437e-06, 'epoch': 1.66} +{'eval_loss': 1.897517442703247, 'eval_runtime': 38.3305, 'eval_samples_per_second': 26.089, 'eval_steps_per_second': 1.096, 'epoch': 1.66} +{'loss': 1.4355, 'grad_norm': 3.617687463760376, 'learning_rate': 2.5009516558812335e-06, 'epoch': 1.67} +{'loss': 1.4845, 'grad_norm': 3.386122226715088, 'learning_rate': 2.499048344118767e-06, 'epoch': 1.67} +{'loss': 1.5259, 'grad_norm': 3.433354377746582, 'learning_rate': 2.4971450323563002e-06, 'epoch': 1.67} +{'loss': 1.4366, 'grad_norm': 3.6345431804656982, 'learning_rate': 2.4952417205938336e-06, 'epoch': 1.67} +{'loss': 1.4492, 'grad_norm': 3.5302839279174805, 'learning_rate': 2.4933384088313666e-06, 'epoch': 1.67} +{'loss': 1.4341, 'grad_norm': 3.4282331466674805, 'learning_rate': 2.4914350970689e-06, 'epoch': 1.67} +{'loss': 1.4516, 'grad_norm': 3.859999656677246, 'learning_rate': 2.4895317853064337e-06, 'epoch': 1.67} +{'loss': 1.5059, 'grad_norm': 3.993675947189331, 'learning_rate': 2.4876284735439666e-06, 'epoch': 1.67} +{'loss': 1.4669, 'grad_norm': 3.5195322036743164, 'learning_rate': 2.4857251617815e-06, 'epoch': 1.67} +{'loss': 1.4749, 'grad_norm': 3.742225408554077, 'learning_rate': 2.4838218500190334e-06, 'epoch': 1.67} +{'eval_loss': 1.895235300064087, 'eval_runtime': 38.6961, 'eval_samples_per_second': 25.842, 'eval_steps_per_second': 1.085, 'epoch': 1.67} +{'loss': 1.4577, 'grad_norm': 3.5240163803100586, 'learning_rate': 2.4819185382565667e-06, 'epoch': 1.68} +{'loss': 1.4577, 'grad_norm': 3.611736297607422, 'learning_rate': 2.4800152264941e-06, 'epoch': 1.68} +{'loss': 1.424, 'grad_norm': 3.4664523601531982, 'learning_rate': 2.478111914731633e-06, 'epoch': 1.68} +{'loss': 1.458, 'grad_norm': 3.681832790374756, 'learning_rate': 2.4762086029691664e-06, 'epoch': 1.68} +{'loss': 1.5864, 'grad_norm': 4.062270641326904, 'learning_rate': 2.4743052912066998e-06, 'epoch': 1.68} +{'loss': 1.4379, 'grad_norm': 3.507434129714966, 'learning_rate': 2.472401979444233e-06, 'epoch': 1.68} +{'loss': 1.5239, 'grad_norm': 3.734678268432617, 'learning_rate': 2.4704986676817665e-06, 'epoch': 1.68} +{'loss': 1.4573, 'grad_norm': 3.796865463256836, 'learning_rate': 2.4685953559193e-06, 'epoch': 1.68} +{'loss': 1.501, 'grad_norm': 3.5453238487243652, 'learning_rate': 2.466692044156833e-06, 'epoch': 1.68} +{'loss': 1.4433, 'grad_norm': 3.498037576675415, 'learning_rate': 2.4647887323943666e-06, 'epoch': 1.68} +{'eval_loss': 1.8877137899398804, 'eval_runtime': 38.2256, 'eval_samples_per_second': 26.16, 'eval_steps_per_second': 1.099, 'epoch': 1.68} +{'loss': 1.4157, 'grad_norm': 3.7359251976013184, 'learning_rate': 2.4628854206319e-06, 'epoch': 1.69} +{'loss': 1.4099, 'grad_norm': 3.8247249126434326, 'learning_rate': 2.460982108869433e-06, 'epoch': 1.69} +{'loss': 1.5029, 'grad_norm': 3.622331142425537, 'learning_rate': 2.4590787971069662e-06, 'epoch': 1.69} +{'loss': 1.4376, 'grad_norm': 3.744528293609619, 'learning_rate': 2.4571754853444996e-06, 'epoch': 1.69} +{'loss': 1.4282, 'grad_norm': 3.633970022201538, 'learning_rate': 2.455272173582033e-06, 'epoch': 1.69} +{'loss': 1.4382, 'grad_norm': 3.5168251991271973, 'learning_rate': 2.4533688618195663e-06, 'epoch': 1.69} +{'loss': 1.3766, 'grad_norm': 3.4192118644714355, 'learning_rate': 2.4514655500570997e-06, 'epoch': 1.69} +{'loss': 1.4228, 'grad_norm': 3.770392656326294, 'learning_rate': 2.449562238294633e-06, 'epoch': 1.69} +{'loss': 1.3839, 'grad_norm': 3.7755722999572754, 'learning_rate': 2.4476589265321664e-06, 'epoch': 1.69} +{'loss': 1.4997, 'grad_norm': 4.072326183319092, 'learning_rate': 2.4457556147696993e-06, 'epoch': 1.69} +{'eval_loss': 1.8901031017303467, 'eval_runtime': 38.6785, 'eval_samples_per_second': 25.854, 'eval_steps_per_second': 1.086, 'epoch': 1.69} +{'loss': 1.4702, 'grad_norm': 3.7973761558532715, 'learning_rate': 2.4438523030072327e-06, 'epoch': 1.7} +{'loss': 1.4837, 'grad_norm': 3.9860458374023438, 'learning_rate': 2.441948991244766e-06, 'epoch': 1.7} +{'loss': 1.4743, 'grad_norm': 3.826643228530884, 'learning_rate': 2.4400456794822994e-06, 'epoch': 1.7} +{'loss': 1.4939, 'grad_norm': 3.494385004043579, 'learning_rate': 2.438142367719833e-06, 'epoch': 1.7} +{'loss': 1.4555, 'grad_norm': 3.6979312896728516, 'learning_rate': 2.4362390559573657e-06, 'epoch': 1.7} +{'loss': 1.4431, 'grad_norm': 3.837308168411255, 'learning_rate': 2.434335744194899e-06, 'epoch': 1.7} +{'loss': 1.4968, 'grad_norm': 3.7889695167541504, 'learning_rate': 2.432432432432433e-06, 'epoch': 1.7} +{'loss': 1.3987, 'grad_norm': 3.696317195892334, 'learning_rate': 2.4305291206699662e-06, 'epoch': 1.7} +{'loss': 1.4855, 'grad_norm': 3.9010350704193115, 'learning_rate': 2.428625808907499e-06, 'epoch': 1.7} +{'loss': 1.5021, 'grad_norm': 3.8719170093536377, 'learning_rate': 2.4267224971450325e-06, 'epoch': 1.71} +{'eval_loss': 1.8936494588851929, 'eval_runtime': 38.9429, 'eval_samples_per_second': 25.679, 'eval_steps_per_second': 1.079, 'epoch': 1.71} +{'loss': 1.4607, 'grad_norm': 3.666379928588867, 'learning_rate': 2.424819185382566e-06, 'epoch': 1.71} +{'loss': 1.4123, 'grad_norm': 3.495058059692383, 'learning_rate': 2.4229158736200993e-06, 'epoch': 1.71} +{'loss': 1.4192, 'grad_norm': 3.7880940437316895, 'learning_rate': 2.4210125618576326e-06, 'epoch': 1.71} +{'loss': 1.4444, 'grad_norm': 3.678636074066162, 'learning_rate': 2.4191092500951656e-06, 'epoch': 1.71} +{'loss': 1.476, 'grad_norm': 3.613985300064087, 'learning_rate': 2.417205938332699e-06, 'epoch': 1.71} +{'loss': 1.401, 'grad_norm': 3.4821643829345703, 'learning_rate': 2.4153026265702323e-06, 'epoch': 1.71} +{'loss': 1.4337, 'grad_norm': 3.6803171634674072, 'learning_rate': 2.4133993148077657e-06, 'epoch': 1.71} +{'loss': 1.4906, 'grad_norm': 3.5309746265411377, 'learning_rate': 2.411496003045299e-06, 'epoch': 1.71} +{'loss': 1.435, 'grad_norm': 3.4136171340942383, 'learning_rate': 2.4095926912828324e-06, 'epoch': 1.71} +{'loss': 1.4623, 'grad_norm': 3.7317070960998535, 'learning_rate': 2.4076893795203658e-06, 'epoch': 1.72} +{'eval_loss': 1.8755422830581665, 'eval_runtime': 38.5026, 'eval_samples_per_second': 25.972, 'eval_steps_per_second': 1.091, 'epoch': 1.72} +{'loss': 1.4452, 'grad_norm': 4.077658653259277, 'learning_rate': 2.405786067757899e-06, 'epoch': 1.72} +{'loss': 1.5291, 'grad_norm': 3.7566580772399902, 'learning_rate': 2.403882755995432e-06, 'epoch': 1.72} +{'loss': 1.457, 'grad_norm': 3.5571210384368896, 'learning_rate': 2.4019794442329654e-06, 'epoch': 1.72} +{'loss': 1.4342, 'grad_norm': 3.724270820617676, 'learning_rate': 2.4000761324704988e-06, 'epoch': 1.72} +{'loss': 1.4717, 'grad_norm': 3.663053274154663, 'learning_rate': 2.398172820708032e-06, 'epoch': 1.72} +{'loss': 1.4367, 'grad_norm': 3.414332389831543, 'learning_rate': 2.3962695089455655e-06, 'epoch': 1.72} +{'loss': 1.4207, 'grad_norm': 3.416717290878296, 'learning_rate': 2.3943661971830984e-06, 'epoch': 1.72} +{'loss': 1.434, 'grad_norm': 3.69801664352417, 'learning_rate': 2.3924628854206322e-06, 'epoch': 1.72} +{'loss': 1.4439, 'grad_norm': 3.8784968852996826, 'learning_rate': 2.3905595736581656e-06, 'epoch': 1.72} +{'loss': 1.496, 'grad_norm': 3.691215991973877, 'learning_rate': 2.388656261895699e-06, 'epoch': 1.73} +{'eval_loss': 1.8887341022491455, 'eval_runtime': 39.0438, 'eval_samples_per_second': 25.612, 'eval_steps_per_second': 1.076, 'epoch': 1.73} +{'loss': 1.4704, 'grad_norm': 4.1367387771606445, 'learning_rate': 2.386752950133232e-06, 'epoch': 1.73} +{'loss': 1.4715, 'grad_norm': 4.144986629486084, 'learning_rate': 2.3848496383707653e-06, 'epoch': 1.73} +{'loss': 1.4479, 'grad_norm': 3.668769359588623, 'learning_rate': 2.3829463266082986e-06, 'epoch': 1.73} +{'loss': 1.4766, 'grad_norm': 3.5820510387420654, 'learning_rate': 2.381043014845832e-06, 'epoch': 1.73} +{'loss': 1.4623, 'grad_norm': 3.679180145263672, 'learning_rate': 2.3791397030833653e-06, 'epoch': 1.73} +{'loss': 1.4419, 'grad_norm': 3.720754861831665, 'learning_rate': 2.3772363913208983e-06, 'epoch': 1.73} +{'loss': 1.4263, 'grad_norm': 3.7267398834228516, 'learning_rate': 2.3753330795584316e-06, 'epoch': 1.73} +{'loss': 1.4492, 'grad_norm': 3.8593533039093018, 'learning_rate': 2.3734297677959654e-06, 'epoch': 1.73} +{'loss': 1.4925, 'grad_norm': 3.812912940979004, 'learning_rate': 2.3715264560334984e-06, 'epoch': 1.73} +{'loss': 1.4773, 'grad_norm': 3.7618532180786133, 'learning_rate': 2.3696231442710317e-06, 'epoch': 1.74} +{'eval_loss': 1.8964438438415527, 'eval_runtime': 38.4199, 'eval_samples_per_second': 26.028, 'eval_steps_per_second': 1.093, 'epoch': 1.74} +{'loss': 1.4772, 'grad_norm': 3.6677284240722656, 'learning_rate': 2.367719832508565e-06, 'epoch': 1.74} +{'loss': 1.5042, 'grad_norm': 3.6003386974334717, 'learning_rate': 2.3658165207460985e-06, 'epoch': 1.74} +{'loss': 1.4653, 'grad_norm': 3.762824296951294, 'learning_rate': 2.363913208983632e-06, 'epoch': 1.74} +{'loss': 1.4807, 'grad_norm': 3.492278575897217, 'learning_rate': 2.362009897221165e-06, 'epoch': 1.74} +{'loss': 1.4676, 'grad_norm': 3.613609552383423, 'learning_rate': 2.360106585458698e-06, 'epoch': 1.74} +{'loss': 1.4244, 'grad_norm': 3.5085160732269287, 'learning_rate': 2.3582032736962315e-06, 'epoch': 1.74} +{'loss': 1.455, 'grad_norm': 3.5404748916625977, 'learning_rate': 2.356299961933765e-06, 'epoch': 1.74} +{'loss': 1.4326, 'grad_norm': 3.411201238632202, 'learning_rate': 2.354396650171298e-06, 'epoch': 1.74} +{'loss': 1.4248, 'grad_norm': 3.664170980453491, 'learning_rate': 2.3524933384088316e-06, 'epoch': 1.75} +{'loss': 1.5195, 'grad_norm': 3.7808725833892822, 'learning_rate': 2.350590026646365e-06, 'epoch': 1.75} +{'eval_loss': 1.9015717506408691, 'eval_runtime': 39.1364, 'eval_samples_per_second': 25.552, 'eval_steps_per_second': 1.073, 'epoch': 1.75} +{'loss': 1.4415, 'grad_norm': 3.596306800842285, 'learning_rate': 2.3486867148838983e-06, 'epoch': 1.75} +{'loss': 1.4362, 'grad_norm': 3.6116063594818115, 'learning_rate': 2.3467834031214317e-06, 'epoch': 1.75} +{'loss': 1.4473, 'grad_norm': 3.707850217819214, 'learning_rate': 2.3448800913589646e-06, 'epoch': 1.75} +{'loss': 1.4622, 'grad_norm': 3.7297089099884033, 'learning_rate': 2.342976779596498e-06, 'epoch': 1.75} +{'loss': 1.4599, 'grad_norm': 3.5759084224700928, 'learning_rate': 2.3410734678340313e-06, 'epoch': 1.75} +{'loss': 1.4952, 'grad_norm': 3.6391496658325195, 'learning_rate': 2.3391701560715647e-06, 'epoch': 1.75} +{'loss': 1.4543, 'grad_norm': 3.5105698108673096, 'learning_rate': 2.337266844309098e-06, 'epoch': 1.75} +{'loss': 1.4463, 'grad_norm': 3.5098063945770264, 'learning_rate': 2.3353635325466314e-06, 'epoch': 1.75} +{'loss': 1.4617, 'grad_norm': 3.7759642601013184, 'learning_rate': 2.3334602207841648e-06, 'epoch': 1.76} +{'loss': 1.4646, 'grad_norm': 3.5089807510375977, 'learning_rate': 2.331556909021698e-06, 'epoch': 1.76} +{'eval_loss': 1.8883657455444336, 'eval_runtime': 39.0003, 'eval_samples_per_second': 25.641, 'eval_steps_per_second': 1.077, 'epoch': 1.76} +{'loss': 1.4936, 'grad_norm': 3.6554079055786133, 'learning_rate': 2.3296535972592315e-06, 'epoch': 1.76} +{'loss': 1.4309, 'grad_norm': 3.4367165565490723, 'learning_rate': 2.3277502854967644e-06, 'epoch': 1.76} +{'loss': 1.4989, 'grad_norm': 3.6012988090515137, 'learning_rate': 2.325846973734298e-06, 'epoch': 1.76} +{'loss': 1.439, 'grad_norm': 3.5463366508483887, 'learning_rate': 2.323943661971831e-06, 'epoch': 1.76} +{'loss': 1.4821, 'grad_norm': 3.6648261547088623, 'learning_rate': 2.3220403502093645e-06, 'epoch': 1.76} +{'loss': 1.4941, 'grad_norm': 3.7296884059906006, 'learning_rate': 2.320137038446898e-06, 'epoch': 1.76} +{'loss': 1.4995, 'grad_norm': 3.7008771896362305, 'learning_rate': 2.318233726684431e-06, 'epoch': 1.76} +{'loss': 1.486, 'grad_norm': 3.5892093181610107, 'learning_rate': 2.3163304149219646e-06, 'epoch': 1.76} +{'loss': 1.4647, 'grad_norm': 3.8021533489227295, 'learning_rate': 2.314427103159498e-06, 'epoch': 1.77} +{'loss': 1.445, 'grad_norm': 3.456345558166504, 'learning_rate': 2.312523791397031e-06, 'epoch': 1.77} +{'eval_loss': 1.8882861137390137, 'eval_runtime': 38.6356, 'eval_samples_per_second': 25.883, 'eval_steps_per_second': 1.087, 'epoch': 1.77} +{'loss': 1.4101, 'grad_norm': 3.3932695388793945, 'learning_rate': 2.3106204796345643e-06, 'epoch': 1.77} +{'loss': 1.4449, 'grad_norm': 3.7964017391204834, 'learning_rate': 2.3087171678720976e-06, 'epoch': 1.77} +{'loss': 1.5033, 'grad_norm': 3.8719701766967773, 'learning_rate': 2.306813856109631e-06, 'epoch': 1.77} +{'loss': 1.4713, 'grad_norm': 3.9021968841552734, 'learning_rate': 2.3049105443471644e-06, 'epoch': 1.77} +{'loss': 1.446, 'grad_norm': 3.5928256511688232, 'learning_rate': 2.3030072325846973e-06, 'epoch': 1.77} +{'loss': 1.4343, 'grad_norm': 3.7034060955047607, 'learning_rate': 2.3011039208222307e-06, 'epoch': 1.77} +{'loss': 1.4355, 'grad_norm': 3.615835428237915, 'learning_rate': 2.299200609059764e-06, 'epoch': 1.77} +{'loss': 1.441, 'grad_norm': 3.5135746002197266, 'learning_rate': 2.297297297297298e-06, 'epoch': 1.78} +{'loss': 1.3982, 'grad_norm': 3.499718189239502, 'learning_rate': 2.2953939855348308e-06, 'epoch': 1.78} +{'loss': 1.4929, 'grad_norm': 3.5525877475738525, 'learning_rate': 2.293490673772364e-06, 'epoch': 1.78} +{'eval_loss': 1.889686107635498, 'eval_runtime': 39.2179, 'eval_samples_per_second': 25.499, 'eval_steps_per_second': 1.071, 'epoch': 1.78} +{'loss': 1.4775, 'grad_norm': 3.5001494884490967, 'learning_rate': 2.2915873620098975e-06, 'epoch': 1.78} +{'loss': 1.4666, 'grad_norm': 3.5272305011749268, 'learning_rate': 2.289684050247431e-06, 'epoch': 1.78} +{'loss': 1.4368, 'grad_norm': 3.5905961990356445, 'learning_rate': 2.287780738484964e-06, 'epoch': 1.78} +{'loss': 1.4707, 'grad_norm': 3.6039724349975586, 'learning_rate': 2.285877426722497e-06, 'epoch': 1.78} +{'loss': 1.4964, 'grad_norm': 3.643965482711792, 'learning_rate': 2.2839741149600305e-06, 'epoch': 1.78} +{'loss': 1.4909, 'grad_norm': 3.4614548683166504, 'learning_rate': 2.282070803197564e-06, 'epoch': 1.78} +{'loss': 1.4632, 'grad_norm': 3.5052123069763184, 'learning_rate': 2.2801674914350972e-06, 'epoch': 1.78} +{'loss': 1.4511, 'grad_norm': 3.3444948196411133, 'learning_rate': 2.2782641796726306e-06, 'epoch': 1.79} +{'loss': 1.4805, 'grad_norm': 3.4643707275390625, 'learning_rate': 2.276360867910164e-06, 'epoch': 1.79} +{'loss': 1.4148, 'grad_norm': 3.397580623626709, 'learning_rate': 2.2744575561476973e-06, 'epoch': 1.79} +{'eval_loss': 1.8934959173202515, 'eval_runtime': 39.5666, 'eval_samples_per_second': 25.274, 'eval_steps_per_second': 1.062, 'epoch': 1.79} +{'loss': 1.445, 'grad_norm': 3.385331869125366, 'learning_rate': 2.2725542443852307e-06, 'epoch': 1.79} +{'loss': 1.4963, 'grad_norm': 3.5462424755096436, 'learning_rate': 2.2706509326227636e-06, 'epoch': 1.79} +{'loss': 1.4666, 'grad_norm': 3.719648599624634, 'learning_rate': 2.268747620860297e-06, 'epoch': 1.79} +{'loss': 1.4649, 'grad_norm': 3.837676525115967, 'learning_rate': 2.2668443090978303e-06, 'epoch': 1.79} +{'loss': 1.4799, 'grad_norm': 3.4848811626434326, 'learning_rate': 2.2649409973353637e-06, 'epoch': 1.79} +{'loss': 1.4306, 'grad_norm': 3.4575588703155518, 'learning_rate': 2.263037685572897e-06, 'epoch': 1.79} +{'loss': 1.4264, 'grad_norm': 3.271036386489868, 'learning_rate': 2.26113437381043e-06, 'epoch': 1.79} +{'loss': 1.4694, 'grad_norm': 3.790252447128296, 'learning_rate': 2.259231062047964e-06, 'epoch': 1.8} +{'loss': 1.485, 'grad_norm': 3.595132350921631, 'learning_rate': 2.257327750285497e-06, 'epoch': 1.8} +{'loss': 1.4466, 'grad_norm': 3.476789712905884, 'learning_rate': 2.2554244385230305e-06, 'epoch': 1.8} +{'eval_loss': 1.9018653631210327, 'eval_runtime': 38.9662, 'eval_samples_per_second': 25.663, 'eval_steps_per_second': 1.078, 'epoch': 1.8} +{'loss': 1.4498, 'grad_norm': 3.514662504196167, 'learning_rate': 2.2535211267605635e-06, 'epoch': 1.8} +{'loss': 1.4672, 'grad_norm': 3.5332884788513184, 'learning_rate': 2.251617814998097e-06, 'epoch': 1.8} +{'loss': 1.3808, 'grad_norm': 3.475616216659546, 'learning_rate': 2.24971450323563e-06, 'epoch': 1.8} +{'loss': 1.4687, 'grad_norm': 3.7221732139587402, 'learning_rate': 2.2478111914731636e-06, 'epoch': 1.8} +{'loss': 1.3378, 'grad_norm': 3.56778621673584, 'learning_rate': 2.245907879710697e-06, 'epoch': 1.8} +{'loss': 1.4803, 'grad_norm': 3.6230456829071045, 'learning_rate': 2.24400456794823e-06, 'epoch': 1.8} +{'loss': 1.4296, 'grad_norm': 3.3937320709228516, 'learning_rate': 2.2421012561857632e-06, 'epoch': 1.8} +{'loss': 1.4356, 'grad_norm': 3.503523111343384, 'learning_rate': 2.240197944423297e-06, 'epoch': 1.81} +{'loss': 1.4352, 'grad_norm': 3.4580845832824707, 'learning_rate': 2.23829463266083e-06, 'epoch': 1.81} +{'loss': 1.4241, 'grad_norm': 3.514845609664917, 'learning_rate': 2.2363913208983633e-06, 'epoch': 1.81} +{'eval_loss': 1.9051522016525269, 'eval_runtime': 39.4462, 'eval_samples_per_second': 25.351, 'eval_steps_per_second': 1.065, 'epoch': 1.81} +{'loss': 1.4644, 'grad_norm': 3.5541889667510986, 'learning_rate': 2.2344880091358967e-06, 'epoch': 1.81} +{'loss': 1.4348, 'grad_norm': 3.4365334510803223, 'learning_rate': 2.23258469737343e-06, 'epoch': 1.81} +{'loss': 1.4843, 'grad_norm': 3.743832588195801, 'learning_rate': 2.2306813856109634e-06, 'epoch': 1.81} +{'loss': 1.4344, 'grad_norm': 3.5101191997528076, 'learning_rate': 2.2287780738484963e-06, 'epoch': 1.81} +{'loss': 1.5191, 'grad_norm': 3.5511999130249023, 'learning_rate': 2.2268747620860297e-06, 'epoch': 1.81} +{'loss': 1.4555, 'grad_norm': 3.6212966442108154, 'learning_rate': 2.224971450323563e-06, 'epoch': 1.81} +{'loss': 1.3792, 'grad_norm': 3.7049856185913086, 'learning_rate': 2.2230681385610964e-06, 'epoch': 1.82} +{'loss': 1.4436, 'grad_norm': 3.6553220748901367, 'learning_rate': 2.2211648267986298e-06, 'epoch': 1.82} +{'loss': 1.4301, 'grad_norm': 3.3817148208618164, 'learning_rate': 2.219261515036163e-06, 'epoch': 1.82} +{'loss': 1.4707, 'grad_norm': 3.6438188552856445, 'learning_rate': 2.2173582032736965e-06, 'epoch': 1.82} +{'eval_loss': 1.892125129699707, 'eval_runtime': 39.1796, 'eval_samples_per_second': 25.524, 'eval_steps_per_second': 1.072, 'epoch': 1.82} +{'loss': 1.4869, 'grad_norm': 3.642857789993286, 'learning_rate': 2.21545489151123e-06, 'epoch': 1.82} +{'loss': 1.4472, 'grad_norm': 3.4803924560546875, 'learning_rate': 2.2135515797487632e-06, 'epoch': 1.82} +{'loss': 1.445, 'grad_norm': 3.4874632358551025, 'learning_rate': 2.211648267986296e-06, 'epoch': 1.82} +{'loss': 1.4436, 'grad_norm': 3.514256715774536, 'learning_rate': 2.2097449562238295e-06, 'epoch': 1.82} +{'loss': 1.4424, 'grad_norm': 3.463587760925293, 'learning_rate': 2.207841644461363e-06, 'epoch': 1.82} +{'loss': 1.4931, 'grad_norm': 3.763096570968628, 'learning_rate': 2.2059383326988963e-06, 'epoch': 1.82} +{'loss': 1.4618, 'grad_norm': 3.7167205810546875, 'learning_rate': 2.2040350209364296e-06, 'epoch': 1.83} +{'loss': 1.4319, 'grad_norm': 3.4547367095947266, 'learning_rate': 2.2021317091739626e-06, 'epoch': 1.83} +{'loss': 1.4541, 'grad_norm': 3.422720432281494, 'learning_rate': 2.2002283974114963e-06, 'epoch': 1.83} +{'loss': 1.3826, 'grad_norm': 3.3542587757110596, 'learning_rate': 2.1983250856490297e-06, 'epoch': 1.83} +{'eval_loss': 1.8945997953414917, 'eval_runtime': 39.5512, 'eval_samples_per_second': 25.284, 'eval_steps_per_second': 1.062, 'epoch': 1.83} +{'loss': 1.4415, 'grad_norm': 3.5338187217712402, 'learning_rate': 2.1964217738865626e-06, 'epoch': 1.83} +{'loss': 1.4664, 'grad_norm': 3.688912868499756, 'learning_rate': 2.194518462124096e-06, 'epoch': 1.83} +{'loss': 1.507, 'grad_norm': 3.6614632606506348, 'learning_rate': 2.1926151503616294e-06, 'epoch': 1.83} +{'loss': 1.4648, 'grad_norm': 3.5228211879730225, 'learning_rate': 2.1907118385991627e-06, 'epoch': 1.83} +{'loss': 1.423, 'grad_norm': 3.4034955501556396, 'learning_rate': 2.188808526836696e-06, 'epoch': 1.83} +{'loss': 1.4551, 'grad_norm': 3.447815179824829, 'learning_rate': 2.1869052150742295e-06, 'epoch': 1.83} +{'loss': 1.3796, 'grad_norm': 3.4723832607269287, 'learning_rate': 2.1850019033117624e-06, 'epoch': 1.84} +{'loss': 1.4465, 'grad_norm': 3.5548415184020996, 'learning_rate': 2.1830985915492958e-06, 'epoch': 1.84} +{'loss': 1.461, 'grad_norm': 3.4193859100341797, 'learning_rate': 2.1811952797868295e-06, 'epoch': 1.84} +{'loss': 1.425, 'grad_norm': 3.4917654991149902, 'learning_rate': 2.1792919680243625e-06, 'epoch': 1.84} +{'eval_loss': 1.902342677116394, 'eval_runtime': 39.4427, 'eval_samples_per_second': 25.353, 'eval_steps_per_second': 1.065, 'epoch': 1.84} +{'loss': 1.4599, 'grad_norm': 3.6898601055145264, 'learning_rate': 2.177388656261896e-06, 'epoch': 1.84} +{'loss': 1.5302, 'grad_norm': 3.686903953552246, 'learning_rate': 2.1754853444994292e-06, 'epoch': 1.84} +{'loss': 1.3477, 'grad_norm': 3.509903907775879, 'learning_rate': 2.1735820327369626e-06, 'epoch': 1.84} +{'loss': 1.4801, 'grad_norm': 3.514153003692627, 'learning_rate': 2.171678720974496e-06, 'epoch': 1.84} +{'loss': 1.4797, 'grad_norm': 3.5297913551330566, 'learning_rate': 2.169775409212029e-06, 'epoch': 1.84} +{'loss': 1.4789, 'grad_norm': 3.5820188522338867, 'learning_rate': 2.1678720974495622e-06, 'epoch': 1.84} +{'loss': 1.4664, 'grad_norm': 3.5850486755371094, 'learning_rate': 2.1659687856870956e-06, 'epoch': 1.85} +{'loss': 1.4705, 'grad_norm': 3.534773826599121, 'learning_rate': 2.164065473924629e-06, 'epoch': 1.85} +{'loss': 1.4161, 'grad_norm': 3.5675203800201416, 'learning_rate': 2.1621621621621623e-06, 'epoch': 1.85} +{'loss': 1.4748, 'grad_norm': 3.653181791305542, 'learning_rate': 2.1602588503996957e-06, 'epoch': 1.85} +{'eval_loss': 1.8928773403167725, 'eval_runtime': 39.4009, 'eval_samples_per_second': 25.38, 'eval_steps_per_second': 1.066, 'epoch': 1.85} +{'loss': 1.4494, 'grad_norm': 3.639472723007202, 'learning_rate': 2.158355538637229e-06, 'epoch': 1.85} +{'loss': 1.4387, 'grad_norm': 3.610908031463623, 'learning_rate': 2.1564522268747624e-06, 'epoch': 1.85} +{'loss': 1.5164, 'grad_norm': 3.661078453063965, 'learning_rate': 2.1545489151122958e-06, 'epoch': 1.85} +{'loss': 1.4624, 'grad_norm': 3.5154707431793213, 'learning_rate': 2.1526456033498287e-06, 'epoch': 1.85} +{'loss': 1.4677, 'grad_norm': 3.524573802947998, 'learning_rate': 2.150742291587362e-06, 'epoch': 1.85} +{'loss': 1.4568, 'grad_norm': 3.4164857864379883, 'learning_rate': 2.1488389798248954e-06, 'epoch': 1.86} +{'loss': 1.492, 'grad_norm': 3.6285064220428467, 'learning_rate': 2.146935668062429e-06, 'epoch': 1.86} +{'loss': 1.4025, 'grad_norm': 3.4221484661102295, 'learning_rate': 2.145032356299962e-06, 'epoch': 1.86} +{'loss': 1.4546, 'grad_norm': 3.3887946605682373, 'learning_rate': 2.1431290445374955e-06, 'epoch': 1.86} +{'loss': 1.4189, 'grad_norm': 3.3935163021087646, 'learning_rate': 2.141225732775029e-06, 'epoch': 1.86} +{'eval_loss': 1.890660047531128, 'eval_runtime': 39.4969, 'eval_samples_per_second': 25.318, 'eval_steps_per_second': 1.063, 'epoch': 1.86} +{'loss': 1.4551, 'grad_norm': 3.677288770675659, 'learning_rate': 2.1393224210125623e-06, 'epoch': 1.86} +{'loss': 1.4451, 'grad_norm': 3.4312140941619873, 'learning_rate': 2.137419109250095e-06, 'epoch': 1.86} +{'loss': 1.5021, 'grad_norm': 3.741480588912964, 'learning_rate': 2.1355157974876286e-06, 'epoch': 1.86} +{'loss': 1.4882, 'grad_norm': 3.707773447036743, 'learning_rate': 2.133612485725162e-06, 'epoch': 1.86} +{'loss': 1.4018, 'grad_norm': 3.313340187072754, 'learning_rate': 2.1317091739626953e-06, 'epoch': 1.86} +{'loss': 1.4294, 'grad_norm': 3.598076343536377, 'learning_rate': 2.1298058622002286e-06, 'epoch': 1.87} +{'loss': 1.4453, 'grad_norm': 3.6260178089141846, 'learning_rate': 2.1279025504377616e-06, 'epoch': 1.87} +{'loss': 1.4379, 'grad_norm': 3.4067490100860596, 'learning_rate': 2.125999238675295e-06, 'epoch': 1.87} +{'loss': 1.4802, 'grad_norm': 3.652594804763794, 'learning_rate': 2.1240959269128287e-06, 'epoch': 1.87} +{'loss': 1.447, 'grad_norm': 3.511345624923706, 'learning_rate': 2.122192615150362e-06, 'epoch': 1.87} +{'eval_loss': 1.8905056715011597, 'eval_runtime': 38.7863, 'eval_samples_per_second': 25.782, 'eval_steps_per_second': 1.083, 'epoch': 1.87} +{'loss': 1.3924, 'grad_norm': 3.4852428436279297, 'learning_rate': 2.120289303387895e-06, 'epoch': 1.87} +{'loss': 1.4405, 'grad_norm': 3.826545476913452, 'learning_rate': 2.1183859916254284e-06, 'epoch': 1.87} +{'loss': 1.4745, 'grad_norm': 3.396336317062378, 'learning_rate': 2.1164826798629618e-06, 'epoch': 1.87} +{'loss': 1.4335, 'grad_norm': 3.4580719470977783, 'learning_rate': 2.114579368100495e-06, 'epoch': 1.87} +{'loss': 1.4202, 'grad_norm': 3.5681469440460205, 'learning_rate': 2.1126760563380285e-06, 'epoch': 1.87} +{'loss': 1.3898, 'grad_norm': 3.267469882965088, 'learning_rate': 2.1107727445755614e-06, 'epoch': 1.88} +{'loss': 1.5283, 'grad_norm': 3.8846282958984375, 'learning_rate': 2.1088694328130948e-06, 'epoch': 1.88} +{'loss': 1.411, 'grad_norm': 3.4394569396972656, 'learning_rate': 2.106966121050628e-06, 'epoch': 1.88} +{'loss': 1.4825, 'grad_norm': 3.710031032562256, 'learning_rate': 2.1050628092881615e-06, 'epoch': 1.88} +{'loss': 1.5115, 'grad_norm': 3.480318069458008, 'learning_rate': 2.103159497525695e-06, 'epoch': 1.88} +{'eval_loss': 1.8945716619491577, 'eval_runtime': 39.215, 'eval_samples_per_second': 25.5, 'eval_steps_per_second': 1.071, 'epoch': 1.88} +{'loss': 1.4313, 'grad_norm': 3.708608388900757, 'learning_rate': 2.1012561857632282e-06, 'epoch': 1.88} +{'loss': 1.4849, 'grad_norm': 3.6906962394714355, 'learning_rate': 2.0993528740007616e-06, 'epoch': 1.88} +{'loss': 1.4933, 'grad_norm': 3.4862051010131836, 'learning_rate': 2.097449562238295e-06, 'epoch': 1.88} +{'loss': 1.4431, 'grad_norm': 3.3403091430664062, 'learning_rate': 2.095546250475828e-06, 'epoch': 1.88} +{'loss': 1.4536, 'grad_norm': 3.4892570972442627, 'learning_rate': 2.0936429387133613e-06, 'epoch': 1.88} +{'loss': 1.5062, 'grad_norm': 3.749154806137085, 'learning_rate': 2.0917396269508946e-06, 'epoch': 1.89} +{'loss': 1.4786, 'grad_norm': 3.678331136703491, 'learning_rate': 2.089836315188428e-06, 'epoch': 1.89} +{'loss': 1.4899, 'grad_norm': 3.6372768878936768, 'learning_rate': 2.0879330034259614e-06, 'epoch': 1.89} +{'loss': 1.4301, 'grad_norm': 3.512761116027832, 'learning_rate': 2.0860296916634947e-06, 'epoch': 1.89} +{'loss': 1.5182, 'grad_norm': 3.5614986419677734, 'learning_rate': 2.084126379901028e-06, 'epoch': 1.89} +{'eval_loss': 1.8903381824493408, 'eval_runtime': 39.2848, 'eval_samples_per_second': 25.455, 'eval_steps_per_second': 1.069, 'epoch': 1.89} +{'loss': 1.4192, 'grad_norm': 3.717592239379883, 'learning_rate': 2.0822230681385614e-06, 'epoch': 1.89} +{'loss': 1.4733, 'grad_norm': 3.584731101989746, 'learning_rate': 2.080319756376095e-06, 'epoch': 1.89} +{'loss': 1.4725, 'grad_norm': 3.51308536529541, 'learning_rate': 2.0784164446136277e-06, 'epoch': 1.89} +{'loss': 1.4306, 'grad_norm': 3.4310622215270996, 'learning_rate': 2.076513132851161e-06, 'epoch': 1.89} +{'loss': 1.4731, 'grad_norm': 3.768308639526367, 'learning_rate': 2.0746098210886945e-06, 'epoch': 1.9} +{'loss': 1.4397, 'grad_norm': 3.537625551223755, 'learning_rate': 2.072706509326228e-06, 'epoch': 1.9} +{'loss': 1.4521, 'grad_norm': 3.685905933380127, 'learning_rate': 2.070803197563761e-06, 'epoch': 1.9} +{'loss': 1.4443, 'grad_norm': 3.7887933254241943, 'learning_rate': 2.068899885801294e-06, 'epoch': 1.9} +{'loss': 1.4603, 'grad_norm': 3.579270362854004, 'learning_rate': 2.066996574038828e-06, 'epoch': 1.9} +{'loss': 1.4995, 'grad_norm': 3.596040964126587, 'learning_rate': 2.0650932622763613e-06, 'epoch': 1.9} +{'eval_loss': 1.8911840915679932, 'eval_runtime': 38.994, 'eval_samples_per_second': 25.645, 'eval_steps_per_second': 1.077, 'epoch': 1.9} +{'loss': 1.386, 'grad_norm': 3.750377655029297, 'learning_rate': 2.0631899505138942e-06, 'epoch': 1.9} +{'loss': 1.4268, 'grad_norm': 3.4379935264587402, 'learning_rate': 2.0612866387514276e-06, 'epoch': 1.9} +{'loss': 1.4672, 'grad_norm': 3.55523943901062, 'learning_rate': 2.059383326988961e-06, 'epoch': 1.9} +{'loss': 1.4414, 'grad_norm': 3.636821985244751, 'learning_rate': 2.0574800152264943e-06, 'epoch': 1.9} +{'loss': 1.4265, 'grad_norm': 3.5707204341888428, 'learning_rate': 2.0555767034640277e-06, 'epoch': 1.91} +{'loss': 1.3821, 'grad_norm': 3.5450851917266846, 'learning_rate': 2.053673391701561e-06, 'epoch': 1.91} +{'loss': 1.4593, 'grad_norm': 3.606229305267334, 'learning_rate': 2.051770079939094e-06, 'epoch': 1.91} +{'loss': 1.4202, 'grad_norm': 3.5193092823028564, 'learning_rate': 2.0498667681766273e-06, 'epoch': 1.91} +{'loss': 1.4171, 'grad_norm': 3.497385025024414, 'learning_rate': 2.047963456414161e-06, 'epoch': 1.91} +{'loss': 1.4497, 'grad_norm': 3.6150004863739014, 'learning_rate': 2.046060144651694e-06, 'epoch': 1.91} +{'eval_loss': 1.8844013214111328, 'eval_runtime': 39.2679, 'eval_samples_per_second': 25.466, 'eval_steps_per_second': 1.07, 'epoch': 1.91} +{'loss': 1.4543, 'grad_norm': 3.3783135414123535, 'learning_rate': 2.0441568328892274e-06, 'epoch': 1.91} +{'loss': 1.4587, 'grad_norm': 3.4945452213287354, 'learning_rate': 2.0422535211267608e-06, 'epoch': 1.91} +{'loss': 1.4363, 'grad_norm': 3.407348155975342, 'learning_rate': 2.040350209364294e-06, 'epoch': 1.91} +{'loss': 1.427, 'grad_norm': 3.6301255226135254, 'learning_rate': 2.0384468976018275e-06, 'epoch': 1.91} +{'loss': 1.4237, 'grad_norm': 3.5864486694335938, 'learning_rate': 2.0365435858393604e-06, 'epoch': 1.92} +{'loss': 1.4181, 'grad_norm': 3.6446657180786133, 'learning_rate': 2.034640274076894e-06, 'epoch': 1.92} +{'loss': 1.4895, 'grad_norm': 3.7792136669158936, 'learning_rate': 2.032736962314427e-06, 'epoch': 1.92} +{'loss': 1.4327, 'grad_norm': 3.4916372299194336, 'learning_rate': 2.0308336505519605e-06, 'epoch': 1.92} +{'loss': 1.4953, 'grad_norm': 3.5477612018585205, 'learning_rate': 2.028930338789494e-06, 'epoch': 1.92} +{'loss': 1.4151, 'grad_norm': 3.6497409343719482, 'learning_rate': 2.0270270270270273e-06, 'epoch': 1.92} +{'eval_loss': 1.8946868181228638, 'eval_runtime': 39.3207, 'eval_samples_per_second': 25.432, 'eval_steps_per_second': 1.068, 'epoch': 1.92} +{'loss': 1.4311, 'grad_norm': 3.6108758449554443, 'learning_rate': 2.0251237152645606e-06, 'epoch': 1.92} +{'loss': 1.4227, 'grad_norm': 3.493025779724121, 'learning_rate': 2.023220403502094e-06, 'epoch': 1.92} +{'loss': 1.393, 'grad_norm': 3.614938735961914, 'learning_rate': 2.0213170917396273e-06, 'epoch': 1.92} +{'loss': 1.4296, 'grad_norm': 3.8534555435180664, 'learning_rate': 2.0194137799771603e-06, 'epoch': 1.93} +{'loss': 1.443, 'grad_norm': 3.650941848754883, 'learning_rate': 2.0175104682146937e-06, 'epoch': 1.93} +{'loss': 1.3788, 'grad_norm': 3.613111972808838, 'learning_rate': 2.015607156452227e-06, 'epoch': 1.93} +{'loss': 1.4203, 'grad_norm': 3.8737235069274902, 'learning_rate': 2.0137038446897604e-06, 'epoch': 1.93} +{'loss': 1.4103, 'grad_norm': 3.8021459579467773, 'learning_rate': 2.0118005329272937e-06, 'epoch': 1.93} +{'loss': 1.4359, 'grad_norm': 3.540128707885742, 'learning_rate': 2.0098972211648267e-06, 'epoch': 1.93} +{'loss': 1.4273, 'grad_norm': 3.4836935997009277, 'learning_rate': 2.0079939094023605e-06, 'epoch': 1.93} +{'eval_loss': 1.8922477960586548, 'eval_runtime': 39.5724, 'eval_samples_per_second': 25.27, 'eval_steps_per_second': 1.061, 'epoch': 1.93} +{'loss': 1.4756, 'grad_norm': 3.6354784965515137, 'learning_rate': 2.006090597639894e-06, 'epoch': 1.93} +{'loss': 1.5038, 'grad_norm': 3.562559127807617, 'learning_rate': 2.0041872858774268e-06, 'epoch': 1.93} +{'loss': 1.4682, 'grad_norm': 3.600203037261963, 'learning_rate': 2.00228397411496e-06, 'epoch': 1.93} +{'loss': 1.4549, 'grad_norm': 3.7266688346862793, 'learning_rate': 2.0003806623524935e-06, 'epoch': 1.94} +{'loss': 1.3714, 'grad_norm': 3.6591978073120117, 'learning_rate': 1.998477350590027e-06, 'epoch': 1.94} +{'loss': 1.463, 'grad_norm': 3.6303930282592773, 'learning_rate': 1.9965740388275602e-06, 'epoch': 1.94} +{'loss': 1.4453, 'grad_norm': 3.4547266960144043, 'learning_rate': 1.994670727065093e-06, 'epoch': 1.94} +{'loss': 1.402, 'grad_norm': 3.774893045425415, 'learning_rate': 1.9927674153026265e-06, 'epoch': 1.94} +{'loss': 1.4691, 'grad_norm': 3.6460556983947754, 'learning_rate': 1.99086410354016e-06, 'epoch': 1.94} +{'loss': 1.43, 'grad_norm': 3.5169243812561035, 'learning_rate': 1.9889607917776937e-06, 'epoch': 1.94} +{'eval_loss': 1.8907979726791382, 'eval_runtime': 39.5529, 'eval_samples_per_second': 25.283, 'eval_steps_per_second': 1.062, 'epoch': 1.94} +{'loss': 1.4451, 'grad_norm': 3.613104820251465, 'learning_rate': 1.9870574800152266e-06, 'epoch': 1.94} +{'loss': 1.4662, 'grad_norm': 3.5212597846984863, 'learning_rate': 1.98515416825276e-06, 'epoch': 1.94} +{'loss': 1.4574, 'grad_norm': 3.425621509552002, 'learning_rate': 1.9832508564902933e-06, 'epoch': 1.94} +{'loss': 1.3702, 'grad_norm': 3.464221239089966, 'learning_rate': 1.9813475447278267e-06, 'epoch': 1.95} +{'loss': 1.4533, 'grad_norm': 3.5837228298187256, 'learning_rate': 1.97944423296536e-06, 'epoch': 1.95} +{'loss': 1.4351, 'grad_norm': 3.6390135288238525, 'learning_rate': 1.977540921202893e-06, 'epoch': 1.95} +{'loss': 1.4153, 'grad_norm': 3.4677658081054688, 'learning_rate': 1.9756376094404264e-06, 'epoch': 1.95} +{'loss': 1.3963, 'grad_norm': 3.5055925846099854, 'learning_rate': 1.9737342976779597e-06, 'epoch': 1.95} +{'loss': 1.4757, 'grad_norm': 3.767453908920288, 'learning_rate': 1.971830985915493e-06, 'epoch': 1.95} +{'loss': 1.4405, 'grad_norm': 3.6364212036132812, 'learning_rate': 1.9699276741530264e-06, 'epoch': 1.95} +{'eval_loss': 1.8991445302963257, 'eval_runtime': 39.1695, 'eval_samples_per_second': 25.53, 'eval_steps_per_second': 1.072, 'epoch': 1.95} +{'loss': 1.4403, 'grad_norm': 3.652681589126587, 'learning_rate': 1.96802436239056e-06, 'epoch': 1.95} +{'loss': 1.4278, 'grad_norm': 3.3259754180908203, 'learning_rate': 1.966121050628093e-06, 'epoch': 1.95} +{'loss': 1.4632, 'grad_norm': 3.561762571334839, 'learning_rate': 1.9642177388656265e-06, 'epoch': 1.95} +{'loss': 1.4938, 'grad_norm': 3.7611937522888184, 'learning_rate': 1.9623144271031595e-06, 'epoch': 1.96} +{'loss': 1.4955, 'grad_norm': 3.6294333934783936, 'learning_rate': 1.960411115340693e-06, 'epoch': 1.96} +{'loss': 1.4475, 'grad_norm': 3.391306161880493, 'learning_rate': 1.958507803578226e-06, 'epoch': 1.96} +{'loss': 1.4579, 'grad_norm': 3.5956716537475586, 'learning_rate': 1.9566044918157596e-06, 'epoch': 1.96} +{'loss': 1.4706, 'grad_norm': 3.583899974822998, 'learning_rate': 1.954701180053293e-06, 'epoch': 1.96} +{'loss': 1.4271, 'grad_norm': 3.6752536296844482, 'learning_rate': 1.952797868290826e-06, 'epoch': 1.96} +{'loss': 1.4832, 'grad_norm': 3.929912805557251, 'learning_rate': 1.9508945565283596e-06, 'epoch': 1.96} +{'eval_loss': 1.8852365016937256, 'eval_runtime': 39.4527, 'eval_samples_per_second': 25.347, 'eval_steps_per_second': 1.065, 'epoch': 1.96} +{'loss': 1.4346, 'grad_norm': 3.5828683376312256, 'learning_rate': 1.948991244765893e-06, 'epoch': 1.96} +{'loss': 1.4747, 'grad_norm': 3.603971004486084, 'learning_rate': 1.9470879330034264e-06, 'epoch': 1.96} +{'loss': 1.4048, 'grad_norm': 3.351708173751831, 'learning_rate': 1.9451846212409593e-06, 'epoch': 1.97} +{'loss': 1.4662, 'grad_norm': 3.816876173019409, 'learning_rate': 1.9432813094784927e-06, 'epoch': 1.97} +{'loss': 1.4023, 'grad_norm': 3.46870493888855, 'learning_rate': 1.941377997716026e-06, 'epoch': 1.97} +{'loss': 1.4282, 'grad_norm': 3.3724565505981445, 'learning_rate': 1.9394746859535594e-06, 'epoch': 1.97} +{'loss': 1.3986, 'grad_norm': 3.518724203109741, 'learning_rate': 1.9375713741910928e-06, 'epoch': 1.97} +{'loss': 1.43, 'grad_norm': 3.5220441818237305, 'learning_rate': 1.9356680624286257e-06, 'epoch': 1.97} +{'loss': 1.3794, 'grad_norm': 3.507056951522827, 'learning_rate': 1.933764750666159e-06, 'epoch': 1.97} +{'loss': 1.4384, 'grad_norm': 3.5616719722747803, 'learning_rate': 1.931861438903693e-06, 'epoch': 1.97} +{'eval_loss': 1.8802683353424072, 'eval_runtime': 39.4879, 'eval_samples_per_second': 25.324, 'eval_steps_per_second': 1.064, 'epoch': 1.97} +{'loss': 1.4079, 'grad_norm': 3.481914758682251, 'learning_rate': 1.929958127141226e-06, 'epoch': 1.97} +{'loss': 1.4281, 'grad_norm': 3.5840163230895996, 'learning_rate': 1.928054815378759e-06, 'epoch': 1.97} +{'loss': 1.5111, 'grad_norm': 3.8909904956817627, 'learning_rate': 1.9261515036162925e-06, 'epoch': 1.98} +{'loss': 1.4767, 'grad_norm': 3.548985242843628, 'learning_rate': 1.924248191853826e-06, 'epoch': 1.98} +{'loss': 1.4477, 'grad_norm': 3.5904016494750977, 'learning_rate': 1.9223448800913592e-06, 'epoch': 1.98} +{'loss': 1.4357, 'grad_norm': 3.433415174484253, 'learning_rate': 1.9204415683288926e-06, 'epoch': 1.98} +{'loss': 1.4704, 'grad_norm': 3.741858959197998, 'learning_rate': 1.9185382565664255e-06, 'epoch': 1.98} +{'loss': 1.479, 'grad_norm': 3.4985711574554443, 'learning_rate': 1.916634944803959e-06, 'epoch': 1.98} +{'loss': 1.4532, 'grad_norm': 3.560875415802002, 'learning_rate': 1.9147316330414923e-06, 'epoch': 1.98} +{'loss': 1.4408, 'grad_norm': 3.4896459579467773, 'learning_rate': 1.9128283212790256e-06, 'epoch': 1.98} +{'eval_loss': 1.8891727924346924, 'eval_runtime': 39.0465, 'eval_samples_per_second': 25.61, 'eval_steps_per_second': 1.076, 'epoch': 1.98} +{'loss': 1.4872, 'grad_norm': 3.449495553970337, 'learning_rate': 1.910925009516559e-06, 'epoch': 1.98} +{'loss': 1.4536, 'grad_norm': 3.4946038722991943, 'learning_rate': 1.9090216977540924e-06, 'epoch': 1.98} +{'loss': 1.5008, 'grad_norm': 3.7030792236328125, 'learning_rate': 1.9071183859916257e-06, 'epoch': 1.99} +{'loss': 1.4667, 'grad_norm': 3.6974480152130127, 'learning_rate': 1.9052150742291589e-06, 'epoch': 1.99} +{'loss': 1.4586, 'grad_norm': 3.5509986877441406, 'learning_rate': 1.9033117624666922e-06, 'epoch': 1.99} +{'loss': 1.4249, 'grad_norm': 3.4987192153930664, 'learning_rate': 1.9014084507042254e-06, 'epoch': 1.99} +{'loss': 1.4096, 'grad_norm': 3.6572329998016357, 'learning_rate': 1.8995051389417587e-06, 'epoch': 1.99} +{'loss': 1.3974, 'grad_norm': 3.546966314315796, 'learning_rate': 1.8976018271792921e-06, 'epoch': 1.99} +{'loss': 1.4471, 'grad_norm': 3.568060874938965, 'learning_rate': 1.8956985154168253e-06, 'epoch': 1.99} +{'loss': 1.4316, 'grad_norm': 3.5834546089172363, 'learning_rate': 1.8937952036543588e-06, 'epoch': 1.99} +{'eval_loss': 1.8757987022399902, 'eval_runtime': 39.1558, 'eval_samples_per_second': 25.539, 'eval_steps_per_second': 1.073, 'epoch': 1.99} +{'loss': 1.3894, 'grad_norm': 3.4199063777923584, 'learning_rate': 1.8918918918918922e-06, 'epoch': 1.99} +{'loss': 1.4466, 'grad_norm': 3.352961540222168, 'learning_rate': 1.8899885801294253e-06, 'epoch': 1.99} +{'loss': 1.4576, 'grad_norm': 3.668595790863037, 'learning_rate': 1.8880852683669587e-06, 'epoch': 2.0} +{'loss': 1.4365, 'grad_norm': 3.6364426612854004, 'learning_rate': 1.886181956604492e-06, 'epoch': 2.0} +{'loss': 1.4285, 'grad_norm': 3.520859479904175, 'learning_rate': 1.8842786448420252e-06, 'epoch': 2.0} +{'loss': 1.4637, 'grad_norm': 3.7487142086029053, 'learning_rate': 1.8823753330795586e-06, 'epoch': 2.0} +{'loss': 1.3555, 'grad_norm': 3.5696628093719482, 'learning_rate': 1.8804720213170917e-06, 'epoch': 2.0} +{'loss': 1.403, 'grad_norm': 3.6427314281463623, 'learning_rate': 1.878568709554625e-06, 'epoch': 2.0} +{'loss': 1.4522, 'grad_norm': 3.735656261444092, 'learning_rate': 1.8766653977921585e-06, 'epoch': 2.0} +{'loss': 1.4193, 'grad_norm': 3.6570093631744385, 'learning_rate': 1.874762086029692e-06, 'epoch': 2.0} +{'eval_loss': 1.8887497186660767, 'eval_runtime': 38.808, 'eval_samples_per_second': 25.768, 'eval_steps_per_second': 1.082, 'epoch': 2.0} +{'loss': 1.4499, 'grad_norm': 3.8690154552459717, 'learning_rate': 1.8728587742672252e-06, 'epoch': 2.0} +{'loss': 1.4601, 'grad_norm': 3.988176107406616, 'learning_rate': 1.8709554625047585e-06, 'epoch': 2.01} +{'loss': 1.3868, 'grad_norm': 3.6991844177246094, 'learning_rate': 1.8690521507422917e-06, 'epoch': 2.01} +{'loss': 1.4131, 'grad_norm': 3.826028823852539, 'learning_rate': 1.867148838979825e-06, 'epoch': 2.01} +{'loss': 1.4465, 'grad_norm': 3.70851731300354, 'learning_rate': 1.8652455272173584e-06, 'epoch': 2.01} +{'loss': 1.4571, 'grad_norm': 3.6638004779815674, 'learning_rate': 1.8633422154548916e-06, 'epoch': 2.01} +{'loss': 1.3601, 'grad_norm': 3.7089900970458984, 'learning_rate': 1.861438903692425e-06, 'epoch': 2.01} +{'loss': 1.3856, 'grad_norm': 3.747080087661743, 'learning_rate': 1.859535591929958e-06, 'epoch': 2.01} +{'loss': 1.4691, 'grad_norm': 3.9451565742492676, 'learning_rate': 1.8576322801674915e-06, 'epoch': 2.01} +{'loss': 1.4004, 'grad_norm': 3.860267162322998, 'learning_rate': 1.855728968405025e-06, 'epoch': 2.01} +{'eval_loss': 1.8911617994308472, 'eval_runtime': 39.2555, 'eval_samples_per_second': 25.474, 'eval_steps_per_second': 1.07, 'epoch': 2.01} +{'loss': 1.4264, 'grad_norm': 3.8440234661102295, 'learning_rate': 1.8538256566425584e-06, 'epoch': 2.01} +{'loss': 1.4574, 'grad_norm': 3.7536094188690186, 'learning_rate': 1.8519223448800915e-06, 'epoch': 2.02} +{'loss': 1.4034, 'grad_norm': 3.7992007732391357, 'learning_rate': 1.850019033117625e-06, 'epoch': 2.02} +{'loss': 1.4082, 'grad_norm': 3.787818670272827, 'learning_rate': 1.848115721355158e-06, 'epoch': 2.02} +{'loss': 1.4205, 'grad_norm': 3.739194869995117, 'learning_rate': 1.8462124095926914e-06, 'epoch': 2.02} +{'loss': 1.4316, 'grad_norm': 3.9704442024230957, 'learning_rate': 1.8443090978302248e-06, 'epoch': 2.02} +{'loss': 1.4726, 'grad_norm': 3.922250747680664, 'learning_rate': 1.842405786067758e-06, 'epoch': 2.02} +{'loss': 1.4088, 'grad_norm': 3.6453888416290283, 'learning_rate': 1.8405024743052913e-06, 'epoch': 2.02} +{'loss': 1.4504, 'grad_norm': 3.766308546066284, 'learning_rate': 1.8385991625428247e-06, 'epoch': 2.02} +{'loss': 1.3905, 'grad_norm': 3.6479172706604004, 'learning_rate': 1.8366958507803578e-06, 'epoch': 2.02} +{'eval_loss': 1.888603925704956, 'eval_runtime': 39.3429, 'eval_samples_per_second': 25.418, 'eval_steps_per_second': 1.068, 'epoch': 2.02} +{'loss': 1.432, 'grad_norm': 3.7478854656219482, 'learning_rate': 1.8347925390178914e-06, 'epoch': 2.02} +{'loss': 1.372, 'grad_norm': 3.579448699951172, 'learning_rate': 1.8328892272554247e-06, 'epoch': 2.03} +{'loss': 1.4403, 'grad_norm': 3.7010204792022705, 'learning_rate': 1.8309859154929579e-06, 'epoch': 2.03} +{'loss': 1.4291, 'grad_norm': 3.8068289756774902, 'learning_rate': 1.8290826037304913e-06, 'epoch': 2.03} +{'loss': 1.391, 'grad_norm': 3.618842840194702, 'learning_rate': 1.8271792919680246e-06, 'epoch': 2.03} +{'loss': 1.4041, 'grad_norm': 3.7741870880126953, 'learning_rate': 1.8252759802055578e-06, 'epoch': 2.03} +{'loss': 1.4471, 'grad_norm': 3.796035051345825, 'learning_rate': 1.8233726684430911e-06, 'epoch': 2.03} +{'loss': 1.3818, 'grad_norm': 3.8931241035461426, 'learning_rate': 1.8214693566806243e-06, 'epoch': 2.03} +{'loss': 1.4221, 'grad_norm': 3.8217387199401855, 'learning_rate': 1.8195660449181576e-06, 'epoch': 2.03} +{'loss': 1.4246, 'grad_norm': 3.8785173892974854, 'learning_rate': 1.817662733155691e-06, 'epoch': 2.03} +{'eval_loss': 1.892845869064331, 'eval_runtime': 39.1581, 'eval_samples_per_second': 25.537, 'eval_steps_per_second': 1.073, 'epoch': 2.03} +{'loss': 1.4129, 'grad_norm': 3.6765635013580322, 'learning_rate': 1.8157594213932244e-06, 'epoch': 2.03} +{'loss': 1.4282, 'grad_norm': 3.76060152053833, 'learning_rate': 1.8138561096307577e-06, 'epoch': 2.04} +{'loss': 1.3706, 'grad_norm': 3.7311484813690186, 'learning_rate': 1.811952797868291e-06, 'epoch': 2.04} +{'loss': 1.4048, 'grad_norm': 3.8937771320343018, 'learning_rate': 1.8100494861058242e-06, 'epoch': 2.04} +{'loss': 1.4248, 'grad_norm': 3.827505111694336, 'learning_rate': 1.8081461743433576e-06, 'epoch': 2.04} +{'loss': 1.3698, 'grad_norm': 3.5888681411743164, 'learning_rate': 1.806242862580891e-06, 'epoch': 2.04} +{'loss': 1.4236, 'grad_norm': 3.864014148712158, 'learning_rate': 1.8043395508184241e-06, 'epoch': 2.04} +{'loss': 1.4262, 'grad_norm': 3.8208773136138916, 'learning_rate': 1.8024362390559575e-06, 'epoch': 2.04} +{'loss': 1.385, 'grad_norm': 3.9098780155181885, 'learning_rate': 1.8005329272934906e-06, 'epoch': 2.04} +{'loss': 1.4853, 'grad_norm': 4.151137351989746, 'learning_rate': 1.798629615531024e-06, 'epoch': 2.04} +{'eval_loss': 1.900230050086975, 'eval_runtime': 39.4877, 'eval_samples_per_second': 25.324, 'eval_steps_per_second': 1.064, 'epoch': 2.04} +{'loss': 1.4048, 'grad_norm': 4.014110565185547, 'learning_rate': 1.7967263037685576e-06, 'epoch': 2.05} +{'loss': 1.4049, 'grad_norm': 3.817169427871704, 'learning_rate': 1.794822992006091e-06, 'epoch': 2.05} +{'loss': 1.4623, 'grad_norm': 3.753967523574829, 'learning_rate': 1.792919680243624e-06, 'epoch': 2.05} +{'loss': 1.3802, 'grad_norm': 3.9619951248168945, 'learning_rate': 1.7910163684811574e-06, 'epoch': 2.05} +{'loss': 1.4054, 'grad_norm': 3.593874216079712, 'learning_rate': 1.7891130567186906e-06, 'epoch': 2.05} +{'loss': 1.3857, 'grad_norm': 3.739203929901123, 'learning_rate': 1.787209744956224e-06, 'epoch': 2.05} +{'loss': 1.4147, 'grad_norm': 3.824517011642456, 'learning_rate': 1.7853064331937573e-06, 'epoch': 2.05} +{'loss': 1.4493, 'grad_norm': 3.6705546379089355, 'learning_rate': 1.7834031214312905e-06, 'epoch': 2.05} +{'loss': 1.3877, 'grad_norm': 3.7453954219818115, 'learning_rate': 1.7814998096688238e-06, 'epoch': 2.05} +{'loss': 1.4229, 'grad_norm': 3.894132614135742, 'learning_rate': 1.779596497906357e-06, 'epoch': 2.05} +{'eval_loss': 1.8847311735153198, 'eval_runtime': 38.4946, 'eval_samples_per_second': 25.978, 'eval_steps_per_second': 1.091, 'epoch': 2.05} +{'loss': 1.4619, 'grad_norm': 3.6707732677459717, 'learning_rate': 1.7776931861438906e-06, 'epoch': 2.06} +{'loss': 1.4569, 'grad_norm': 3.989419460296631, 'learning_rate': 1.775789874381424e-06, 'epoch': 2.06} +{'loss': 1.4485, 'grad_norm': 3.918484687805176, 'learning_rate': 1.7738865626189573e-06, 'epoch': 2.06} +{'loss': 1.3763, 'grad_norm': 3.9125781059265137, 'learning_rate': 1.7719832508564904e-06, 'epoch': 2.06} +{'loss': 1.4006, 'grad_norm': 3.899907112121582, 'learning_rate': 1.7700799390940238e-06, 'epoch': 2.06} +{'loss': 1.425, 'grad_norm': 3.7762441635131836, 'learning_rate': 1.768176627331557e-06, 'epoch': 2.06} +{'loss': 1.4235, 'grad_norm': 3.668630599975586, 'learning_rate': 1.7662733155690903e-06, 'epoch': 2.06} +{'loss': 1.457, 'grad_norm': 3.942714214324951, 'learning_rate': 1.7643700038066237e-06, 'epoch': 2.06} +{'loss': 1.3756, 'grad_norm': 3.63503098487854, 'learning_rate': 1.7624666920441568e-06, 'epoch': 2.06} +{'loss': 1.3741, 'grad_norm': 3.594670534133911, 'learning_rate': 1.7605633802816902e-06, 'epoch': 2.06} +{'eval_loss': 1.8899147510528564, 'eval_runtime': 39.0273, 'eval_samples_per_second': 25.623, 'eval_steps_per_second': 1.076, 'epoch': 2.06} +{'loss': 1.4754, 'grad_norm': 3.8125579357147217, 'learning_rate': 1.7586600685192238e-06, 'epoch': 2.07} +{'loss': 1.4722, 'grad_norm': 3.8670105934143066, 'learning_rate': 1.756756756756757e-06, 'epoch': 2.07} +{'loss': 1.438, 'grad_norm': 3.874603033065796, 'learning_rate': 1.7548534449942903e-06, 'epoch': 2.07} +{'loss': 1.4189, 'grad_norm': 3.9304723739624023, 'learning_rate': 1.7529501332318236e-06, 'epoch': 2.07} +{'loss': 1.4028, 'grad_norm': 3.861403226852417, 'learning_rate': 1.7510468214693568e-06, 'epoch': 2.07} +{'loss': 1.382, 'grad_norm': 3.6960110664367676, 'learning_rate': 1.7491435097068902e-06, 'epoch': 2.07} +{'loss': 1.4291, 'grad_norm': 3.6960110664367676, 'learning_rate': 1.7491435097068902e-06, 'epoch': 2.07} +{'loss': 1.3549, 'grad_norm': 3.792027711868286, 'learning_rate': 1.7472401979444233e-06, 'epoch': 2.07} +{'loss': 1.4086, 'grad_norm': 3.815934419631958, 'learning_rate': 1.7453368861819567e-06, 'epoch': 2.07} +{'loss': 1.4625, 'grad_norm': 4.1008782386779785, 'learning_rate': 1.74343357441949e-06, 'epoch': 2.07} +{'eval_loss': 1.8921340703964233, 'eval_runtime': 39.1401, 'eval_samples_per_second': 25.549, 'eval_steps_per_second': 1.073, 'epoch': 2.07} +{'loss': 1.4047, 'grad_norm': 3.5898642539978027, 'learning_rate': 1.7415302626570232e-06, 'epoch': 2.08} +{'loss': 1.3789, 'grad_norm': 3.8074820041656494, 'learning_rate': 1.7396269508945568e-06, 'epoch': 2.08} +{'loss': 1.4504, 'grad_norm': 3.9214768409729004, 'learning_rate': 1.7377236391320901e-06, 'epoch': 2.08} +{'loss': 1.4125, 'grad_norm': 3.812035083770752, 'learning_rate': 1.7358203273696233e-06, 'epoch': 2.08} +{'loss': 1.4022, 'grad_norm': 3.833949327468872, 'learning_rate': 1.7339170156071566e-06, 'epoch': 2.08} +{'loss': 1.4144, 'grad_norm': 3.8817965984344482, 'learning_rate': 1.73201370384469e-06, 'epoch': 2.08} +{'loss': 1.4277, 'grad_norm': 3.902334213256836, 'learning_rate': 1.7301103920822231e-06, 'epoch': 2.08} +{'loss': 1.4109, 'grad_norm': 3.784571647644043, 'learning_rate': 1.7282070803197565e-06, 'epoch': 2.08} +{'loss': 1.4348, 'grad_norm': 3.985269546508789, 'learning_rate': 1.7263037685572897e-06, 'epoch': 2.08} +{'loss': 1.3968, 'grad_norm': 3.6933484077453613, 'learning_rate': 1.724400456794823e-06, 'epoch': 2.09} +{'eval_loss': 1.8923474550247192, 'eval_runtime': 38.8548, 'eval_samples_per_second': 25.737, 'eval_steps_per_second': 1.081, 'epoch': 2.09} +{'loss': 1.4025, 'grad_norm': 3.6797826290130615, 'learning_rate': 1.7224971450323564e-06, 'epoch': 2.09} +{'loss': 1.485, 'grad_norm': 3.924562931060791, 'learning_rate': 1.72059383326989e-06, 'epoch': 2.09} +{'loss': 1.3899, 'grad_norm': 3.846958637237549, 'learning_rate': 1.7186905215074231e-06, 'epoch': 2.09} +{'loss': 1.419, 'grad_norm': 4.0274834632873535, 'learning_rate': 1.7167872097449565e-06, 'epoch': 2.09} +{'loss': 1.4085, 'grad_norm': 3.779113531112671, 'learning_rate': 1.7148838979824896e-06, 'epoch': 2.09} +{'loss': 1.3926, 'grad_norm': 3.83450984954834, 'learning_rate': 1.712980586220023e-06, 'epoch': 2.09} +{'loss': 1.4061, 'grad_norm': 3.87125563621521, 'learning_rate': 1.7110772744575563e-06, 'epoch': 2.09} +{'loss': 1.4429, 'grad_norm': 3.7037150859832764, 'learning_rate': 1.7091739626950895e-06, 'epoch': 2.09} +{'loss': 1.4108, 'grad_norm': 3.8723442554473877, 'learning_rate': 1.7072706509326229e-06, 'epoch': 2.09} +{'loss': 1.3849, 'grad_norm': 3.8358163833618164, 'learning_rate': 1.7053673391701562e-06, 'epoch': 2.1} +{'eval_loss': 1.884762167930603, 'eval_runtime': 39.3752, 'eval_samples_per_second': 25.397, 'eval_steps_per_second': 1.067, 'epoch': 2.1} +{'loss': 1.4595, 'grad_norm': 3.998617649078369, 'learning_rate': 1.7034640274076894e-06, 'epoch': 2.1} +{'loss': 1.378, 'grad_norm': 3.7093710899353027, 'learning_rate': 1.701560715645223e-06, 'epoch': 2.1} +{'loss': 1.4225, 'grad_norm': 3.7816097736358643, 'learning_rate': 1.6996574038827563e-06, 'epoch': 2.1} +{'loss': 1.3973, 'grad_norm': 3.976496696472168, 'learning_rate': 1.6977540921202895e-06, 'epoch': 2.1} +{'loss': 1.4263, 'grad_norm': 3.907363176345825, 'learning_rate': 1.6958507803578228e-06, 'epoch': 2.1} +{'loss': 1.441, 'grad_norm': 4.120336532592773, 'learning_rate': 1.693947468595356e-06, 'epoch': 2.1} +{'loss': 1.371, 'grad_norm': 3.7310473918914795, 'learning_rate': 1.6920441568328893e-06, 'epoch': 2.1} +{'loss': 1.4017, 'grad_norm': 3.791879177093506, 'learning_rate': 1.6901408450704227e-06, 'epoch': 2.1} +{'loss': 1.4563, 'grad_norm': 3.986079216003418, 'learning_rate': 1.6882375333079559e-06, 'epoch': 2.1} +{'loss': 1.4464, 'grad_norm': 3.706230401992798, 'learning_rate': 1.6863342215454892e-06, 'epoch': 2.11} +{'eval_loss': 1.8859559297561646, 'eval_runtime': 39.1728, 'eval_samples_per_second': 25.528, 'eval_steps_per_second': 1.072, 'epoch': 2.11} +{'loss': 1.4274, 'grad_norm': 3.8091201782226562, 'learning_rate': 1.6844309097830226e-06, 'epoch': 2.11} +{'loss': 1.458, 'grad_norm': 3.931368827819824, 'learning_rate': 1.682527598020556e-06, 'epoch': 2.11} +{'loss': 1.3824, 'grad_norm': 3.5865678787231445, 'learning_rate': 1.6806242862580893e-06, 'epoch': 2.11} +{'loss': 1.3756, 'grad_norm': 3.7363500595092773, 'learning_rate': 1.6787209744956227e-06, 'epoch': 2.11} +{'loss': 1.4193, 'grad_norm': 3.652477979660034, 'learning_rate': 1.6768176627331558e-06, 'epoch': 2.11} +{'loss': 1.4063, 'grad_norm': 3.756624460220337, 'learning_rate': 1.6749143509706892e-06, 'epoch': 2.11} +{'loss': 1.4179, 'grad_norm': 3.8477284908294678, 'learning_rate': 1.6730110392082225e-06, 'epoch': 2.11} +{'loss': 1.3744, 'grad_norm': 3.9322173595428467, 'learning_rate': 1.6711077274457557e-06, 'epoch': 2.11} +{'loss': 1.4306, 'grad_norm': 3.906801462173462, 'learning_rate': 1.669204415683289e-06, 'epoch': 2.12} +{'loss': 1.4145, 'grad_norm': 3.605224609375, 'learning_rate': 1.6673011039208222e-06, 'epoch': 2.12} +{'eval_loss': 1.879068374633789, 'eval_runtime': 39.1842, 'eval_samples_per_second': 25.521, 'eval_steps_per_second': 1.072, 'epoch': 2.12} +{'loss': 1.4327, 'grad_norm': 3.8868417739868164, 'learning_rate': 1.6653977921583556e-06, 'epoch': 2.12} +{'loss': 1.3977, 'grad_norm': 3.8862948417663574, 'learning_rate': 1.663494480395889e-06, 'epoch': 2.12} +{'loss': 1.4144, 'grad_norm': 3.8731849193573, 'learning_rate': 1.6615911686334225e-06, 'epoch': 2.12} +{'loss': 1.4014, 'grad_norm': 4.008052349090576, 'learning_rate': 1.6596878568709557e-06, 'epoch': 2.12} +{'loss': 1.4527, 'grad_norm': 4.020989418029785, 'learning_rate': 1.657784545108489e-06, 'epoch': 2.12} +{'loss': 1.4429, 'grad_norm': 3.923231601715088, 'learning_rate': 1.6558812333460222e-06, 'epoch': 2.12} +{'loss': 1.3953, 'grad_norm': 3.6388964653015137, 'learning_rate': 1.6539779215835555e-06, 'epoch': 2.12} +{'loss': 1.3931, 'grad_norm': 3.7071149349212646, 'learning_rate': 1.652074609821089e-06, 'epoch': 2.12} +{'loss': 1.403, 'grad_norm': 3.8841397762298584, 'learning_rate': 1.650171298058622e-06, 'epoch': 2.13} +{'loss': 1.3969, 'grad_norm': 3.8592262268066406, 'learning_rate': 1.6482679862961554e-06, 'epoch': 2.13} +{'eval_loss': 1.8843810558319092, 'eval_runtime': 39.0817, 'eval_samples_per_second': 25.587, 'eval_steps_per_second': 1.075, 'epoch': 2.13} +{'loss': 1.3702, 'grad_norm': 3.675586700439453, 'learning_rate': 1.6463646745336886e-06, 'epoch': 2.13} +{'loss': 1.4171, 'grad_norm': 3.8904383182525635, 'learning_rate': 1.644461362771222e-06, 'epoch': 2.13} +{'loss': 1.3828, 'grad_norm': 3.7897231578826904, 'learning_rate': 1.6425580510087555e-06, 'epoch': 2.13} +{'loss': 1.3776, 'grad_norm': 3.7733633518218994, 'learning_rate': 1.6406547392462889e-06, 'epoch': 2.13} +{'loss': 1.4139, 'grad_norm': 3.8648877143859863, 'learning_rate': 1.638751427483822e-06, 'epoch': 2.13} +{'loss': 1.3975, 'grad_norm': 3.8082759380340576, 'learning_rate': 1.6368481157213554e-06, 'epoch': 2.13} +{'loss': 1.4661, 'grad_norm': 3.91013503074646, 'learning_rate': 1.6349448039588885e-06, 'epoch': 2.13} +{'loss': 1.3732, 'grad_norm': 3.7294511795043945, 'learning_rate': 1.6330414921964219e-06, 'epoch': 2.13} +{'loss': 1.4292, 'grad_norm': 3.8252320289611816, 'learning_rate': 1.6311381804339553e-06, 'epoch': 2.14} +{'loss': 1.3959, 'grad_norm': 3.763751745223999, 'learning_rate': 1.6292348686714884e-06, 'epoch': 2.14} +{'eval_loss': 1.8865312337875366, 'eval_runtime': 39.1295, 'eval_samples_per_second': 25.556, 'eval_steps_per_second': 1.073, 'epoch': 2.14} +{'loss': 1.478, 'grad_norm': 3.7793376445770264, 'learning_rate': 1.6273315569090218e-06, 'epoch': 2.14} +{'loss': 1.4581, 'grad_norm': 3.8865175247192383, 'learning_rate': 1.625428245146555e-06, 'epoch': 2.14} +{'loss': 1.4184, 'grad_norm': 3.757173776626587, 'learning_rate': 1.6235249333840885e-06, 'epoch': 2.14} +{'loss': 1.4469, 'grad_norm': 3.6030209064483643, 'learning_rate': 1.6216216216216219e-06, 'epoch': 2.14} +{'loss': 1.4202, 'grad_norm': 3.760394811630249, 'learning_rate': 1.6197183098591552e-06, 'epoch': 2.14} +{'loss': 1.3898, 'grad_norm': 3.6963648796081543, 'learning_rate': 1.6178149980966884e-06, 'epoch': 2.14} +{'loss': 1.4687, 'grad_norm': 3.785379409790039, 'learning_rate': 1.6159116863342217e-06, 'epoch': 2.14} +{'loss': 1.3762, 'grad_norm': 3.7960989475250244, 'learning_rate': 1.6140083745717549e-06, 'epoch': 2.14} +{'loss': 1.4558, 'grad_norm': 3.7413229942321777, 'learning_rate': 1.6121050628092882e-06, 'epoch': 2.15} +{'loss': 1.3654, 'grad_norm': 3.5880978107452393, 'learning_rate': 1.6102017510468216e-06, 'epoch': 2.15} +{'eval_loss': 1.8930236101150513, 'eval_runtime': 39.4969, 'eval_samples_per_second': 25.318, 'eval_steps_per_second': 1.063, 'epoch': 2.15} +{'loss': 1.4197, 'grad_norm': 3.8222718238830566, 'learning_rate': 1.6082984392843548e-06, 'epoch': 2.15} +{'loss': 1.4678, 'grad_norm': 3.7593390941619873, 'learning_rate': 1.6063951275218881e-06, 'epoch': 2.15} +{'loss': 1.3974, 'grad_norm': 3.819197416305542, 'learning_rate': 1.6044918157594217e-06, 'epoch': 2.15} +{'loss': 1.416, 'grad_norm': 3.6722075939178467, 'learning_rate': 1.6025885039969548e-06, 'epoch': 2.15} +{'loss': 1.4332, 'grad_norm': 3.8644683361053467, 'learning_rate': 1.6006851922344882e-06, 'epoch': 2.15} +{'loss': 1.4378, 'grad_norm': 3.710681438446045, 'learning_rate': 1.5987818804720216e-06, 'epoch': 2.15} +{'loss': 1.3704, 'grad_norm': 3.744978904724121, 'learning_rate': 1.5968785687095547e-06, 'epoch': 2.15} +{'loss': 1.4372, 'grad_norm': 3.6079931259155273, 'learning_rate': 1.594975256947088e-06, 'epoch': 2.16} +{'loss': 1.3855, 'grad_norm': 3.629657030105591, 'learning_rate': 1.5930719451846212e-06, 'epoch': 2.16} +{'loss': 1.4328, 'grad_norm': 3.7351815700531006, 'learning_rate': 1.5911686334221546e-06, 'epoch': 2.16} +{'eval_loss': 1.8911535739898682, 'eval_runtime': 39.0679, 'eval_samples_per_second': 25.596, 'eval_steps_per_second': 1.075, 'epoch': 2.16} +{'loss': 1.3404, 'grad_norm': 3.6025283336639404, 'learning_rate': 1.589265321659688e-06, 'epoch': 2.16} +{'loss': 1.3859, 'grad_norm': 3.7653889656066895, 'learning_rate': 1.5873620098972211e-06, 'epoch': 2.16} +{'loss': 1.4352, 'grad_norm': 4.092021942138672, 'learning_rate': 1.5854586981347547e-06, 'epoch': 2.16} +{'loss': 1.4474, 'grad_norm': 3.8096868991851807, 'learning_rate': 1.583555386372288e-06, 'epoch': 2.16} +{'loss': 1.412, 'grad_norm': 3.650540351867676, 'learning_rate': 1.5816520746098212e-06, 'epoch': 2.16} +{'loss': 1.367, 'grad_norm': 3.660374164581299, 'learning_rate': 1.5797487628473546e-06, 'epoch': 2.16} +{'loss': 1.4224, 'grad_norm': 3.8286571502685547, 'learning_rate': 1.577845451084888e-06, 'epoch': 2.16} +{'loss': 1.3332, 'grad_norm': 3.7125942707061768, 'learning_rate': 1.575942139322421e-06, 'epoch': 2.17} +{'loss': 1.4182, 'grad_norm': 3.68766450881958, 'learning_rate': 1.5740388275599544e-06, 'epoch': 2.17} +{'loss': 1.4056, 'grad_norm': 3.8403890132904053, 'learning_rate': 1.5721355157974876e-06, 'epoch': 2.17} +{'eval_loss': 1.8906915187835693, 'eval_runtime': 38.7828, 'eval_samples_per_second': 25.785, 'eval_steps_per_second': 1.083, 'epoch': 2.17} +{'loss': 1.41, 'grad_norm': 3.884368658065796, 'learning_rate': 1.570232204035021e-06, 'epoch': 2.17} +{'loss': 1.4169, 'grad_norm': 3.830977439880371, 'learning_rate': 1.5683288922725543e-06, 'epoch': 2.17} +{'loss': 1.3982, 'grad_norm': 3.832181453704834, 'learning_rate': 1.5664255805100879e-06, 'epoch': 2.17} +{'loss': 1.4175, 'grad_norm': 3.7909111976623535, 'learning_rate': 1.564522268747621e-06, 'epoch': 2.17} +{'loss': 1.3462, 'grad_norm': 3.822723865509033, 'learning_rate': 1.5626189569851544e-06, 'epoch': 2.17} +{'loss': 1.417, 'grad_norm': 3.976454734802246, 'learning_rate': 1.5607156452226875e-06, 'epoch': 2.17} +{'loss': 1.4191, 'grad_norm': 3.7159669399261475, 'learning_rate': 1.558812333460221e-06, 'epoch': 2.17} +{'loss': 1.4093, 'grad_norm': 3.771538496017456, 'learning_rate': 1.5569090216977543e-06, 'epoch': 2.18} +{'loss': 1.4309, 'grad_norm': 3.6531739234924316, 'learning_rate': 1.5550057099352874e-06, 'epoch': 2.18} +{'loss': 1.3856, 'grad_norm': 3.642829656600952, 'learning_rate': 1.5531023981728208e-06, 'epoch': 2.18} +{'eval_loss': 1.8838125467300415, 'eval_runtime': 39.0727, 'eval_samples_per_second': 25.593, 'eval_steps_per_second': 1.075, 'epoch': 2.18} +{'loss': 1.4177, 'grad_norm': 3.6914288997650146, 'learning_rate': 1.5511990864103542e-06, 'epoch': 2.18} +{'loss': 1.445, 'grad_norm': 4.001401424407959, 'learning_rate': 1.5492957746478873e-06, 'epoch': 2.18} +{'loss': 1.4123, 'grad_norm': 3.829439401626587, 'learning_rate': 1.5473924628854209e-06, 'epoch': 2.18} +{'loss': 1.4074, 'grad_norm': 3.765784978866577, 'learning_rate': 1.5454891511229542e-06, 'epoch': 2.18} +{'loss': 1.4363, 'grad_norm': 3.868715524673462, 'learning_rate': 1.5435858393604874e-06, 'epoch': 2.18} +{'loss': 1.3883, 'grad_norm': 4.0136399269104, 'learning_rate': 1.5416825275980208e-06, 'epoch': 2.18} +{'loss': 1.3886, 'grad_norm': 3.7096469402313232, 'learning_rate': 1.539779215835554e-06, 'epoch': 2.18} +{'loss': 1.4586, 'grad_norm': 3.7758445739746094, 'learning_rate': 1.5378759040730873e-06, 'epoch': 2.19} +{'loss': 1.4276, 'grad_norm': 3.8597636222839355, 'learning_rate': 1.5359725923106206e-06, 'epoch': 2.19} +{'loss': 1.3914, 'grad_norm': 3.6662185192108154, 'learning_rate': 1.5340692805481538e-06, 'epoch': 2.19} +{'eval_loss': 1.8835080862045288, 'eval_runtime': 38.7384, 'eval_samples_per_second': 25.814, 'eval_steps_per_second': 1.084, 'epoch': 2.19} +{'loss': 1.3854, 'grad_norm': 3.752978563308716, 'learning_rate': 1.5321659687856871e-06, 'epoch': 2.19} +{'loss': 1.3511, 'grad_norm': 3.6560189723968506, 'learning_rate': 1.5302626570232205e-06, 'epoch': 2.19} +{'loss': 1.4618, 'grad_norm': 3.927957057952881, 'learning_rate': 1.5283593452607539e-06, 'epoch': 2.19} +{'loss': 1.3977, 'grad_norm': 3.6453375816345215, 'learning_rate': 1.5264560334982872e-06, 'epoch': 2.19} +{'loss': 1.4288, 'grad_norm': 3.7945237159729004, 'learning_rate': 1.5245527217358206e-06, 'epoch': 2.19} +{'loss': 1.3898, 'grad_norm': 3.728896141052246, 'learning_rate': 1.5226494099733537e-06, 'epoch': 2.19} +{'loss': 1.4626, 'grad_norm': 3.9501233100891113, 'learning_rate': 1.520746098210887e-06, 'epoch': 2.2} +{'loss': 1.3806, 'grad_norm': 3.8677923679351807, 'learning_rate': 1.5188427864484205e-06, 'epoch': 2.2} +{'loss': 1.3909, 'grad_norm': 3.8189618587493896, 'learning_rate': 1.5169394746859536e-06, 'epoch': 2.2} +{'loss': 1.4183, 'grad_norm': 3.910177230834961, 'learning_rate': 1.515036162923487e-06, 'epoch': 2.2} +{'eval_loss': 1.8927663564682007, 'eval_runtime': 39.2193, 'eval_samples_per_second': 25.498, 'eval_steps_per_second': 1.071, 'epoch': 2.2} +{'loss': 1.4509, 'grad_norm': 4.016850471496582, 'learning_rate': 1.5131328511610201e-06, 'epoch': 2.2} +{'loss': 1.4375, 'grad_norm': 3.818112850189209, 'learning_rate': 1.5112295393985535e-06, 'epoch': 2.2} +{'loss': 1.3485, 'grad_norm': 3.580780029296875, 'learning_rate': 1.509326227636087e-06, 'epoch': 2.2} +{'loss': 1.4043, 'grad_norm': 3.743756055831909, 'learning_rate': 1.5074229158736204e-06, 'epoch': 2.2} +{'loss': 1.4038, 'grad_norm': 3.7317755222320557, 'learning_rate': 1.5055196041111536e-06, 'epoch': 2.2} +{'loss': 1.4521, 'grad_norm': 3.995760917663574, 'learning_rate': 1.503616292348687e-06, 'epoch': 2.2} +{'loss': 1.3508, 'grad_norm': 3.8809351921081543, 'learning_rate': 1.50171298058622e-06, 'epoch': 2.21} +{'loss': 1.4288, 'grad_norm': 3.975825071334839, 'learning_rate': 1.4998096688237535e-06, 'epoch': 2.21} +{'loss': 1.4231, 'grad_norm': 3.8478012084960938, 'learning_rate': 1.4979063570612868e-06, 'epoch': 2.21} +{'loss': 1.3827, 'grad_norm': 3.7383739948272705, 'learning_rate': 1.49600304529882e-06, 'epoch': 2.21} +{'eval_loss': 1.881961464881897, 'eval_runtime': 39.317, 'eval_samples_per_second': 25.434, 'eval_steps_per_second': 1.068, 'epoch': 2.21} +{'loss': 1.4299, 'grad_norm': 3.7537779808044434, 'learning_rate': 1.4940997335363533e-06, 'epoch': 2.21} +{'loss': 1.3982, 'grad_norm': 3.6476807594299316, 'learning_rate': 1.4921964217738865e-06, 'epoch': 2.21} +{'loss': 1.423, 'grad_norm': 3.8064823150634766, 'learning_rate': 1.49029311001142e-06, 'epoch': 2.21} +{'loss': 1.4401, 'grad_norm': 3.855590343475342, 'learning_rate': 1.4883897982489534e-06, 'epoch': 2.21} +{'loss': 1.391, 'grad_norm': 3.7072765827178955, 'learning_rate': 1.4864864864864868e-06, 'epoch': 2.21} +{'loss': 1.4101, 'grad_norm': 3.8890511989593506, 'learning_rate': 1.48458317472402e-06, 'epoch': 2.21} +{'loss': 1.3597, 'grad_norm': 3.7216806411743164, 'learning_rate': 1.4826798629615533e-06, 'epoch': 2.22} +{'loss': 1.3863, 'grad_norm': 3.8384571075439453, 'learning_rate': 1.4807765511990864e-06, 'epoch': 2.22} +{'loss': 1.3948, 'grad_norm': 3.8343822956085205, 'learning_rate': 1.4788732394366198e-06, 'epoch': 2.22} +{'loss': 1.3917, 'grad_norm': 3.9467062950134277, 'learning_rate': 1.4769699276741532e-06, 'epoch': 2.22} +{'eval_loss': 1.890305519104004, 'eval_runtime': 38.9083, 'eval_samples_per_second': 25.701, 'eval_steps_per_second': 1.079, 'epoch': 2.22} +{'loss': 1.3582, 'grad_norm': 3.7977960109710693, 'learning_rate': 1.4750666159116863e-06, 'epoch': 2.22} +{'loss': 1.4293, 'grad_norm': 3.8876988887786865, 'learning_rate': 1.4731633041492197e-06, 'epoch': 2.22} +{'loss': 1.4453, 'grad_norm': 3.9785125255584717, 'learning_rate': 1.4712599923867528e-06, 'epoch': 2.22} +{'loss': 1.3737, 'grad_norm': 3.630812406539917, 'learning_rate': 1.4693566806242864e-06, 'epoch': 2.22} +{'loss': 1.3981, 'grad_norm': 3.898693799972534, 'learning_rate': 1.4674533688618198e-06, 'epoch': 2.22} +{'loss': 1.3969, 'grad_norm': 3.7654495239257812, 'learning_rate': 1.4655500570993531e-06, 'epoch': 2.22} +{'loss': 1.4629, 'grad_norm': 3.7944958209991455, 'learning_rate': 1.4636467453368863e-06, 'epoch': 2.23} +{'loss': 1.4406, 'grad_norm': 3.9004244804382324, 'learning_rate': 1.4617434335744197e-06, 'epoch': 2.23} +{'loss': 1.4296, 'grad_norm': 3.940141201019287, 'learning_rate': 1.4598401218119528e-06, 'epoch': 2.23} +{'loss': 1.4101, 'grad_norm': 3.817246198654175, 'learning_rate': 1.4579368100494862e-06, 'epoch': 2.23} +{'eval_loss': 1.8858181238174438, 'eval_runtime': 39.3798, 'eval_samples_per_second': 25.394, 'eval_steps_per_second': 1.067, 'epoch': 2.23} +{'loss': 1.4246, 'grad_norm': 4.0129313468933105, 'learning_rate': 1.4560334982870195e-06, 'epoch': 2.23} +{'loss': 1.4292, 'grad_norm': 4.027838230133057, 'learning_rate': 1.4541301865245527e-06, 'epoch': 2.23} +{'loss': 1.381, 'grad_norm': 3.715360641479492, 'learning_rate': 1.452226874762086e-06, 'epoch': 2.23} +{'loss': 1.4525, 'grad_norm': 3.980217695236206, 'learning_rate': 1.4503235629996196e-06, 'epoch': 2.23} +{'loss': 1.35, 'grad_norm': 3.730584144592285, 'learning_rate': 1.4484202512371528e-06, 'epoch': 2.23} +{'loss': 1.4701, 'grad_norm': 3.903085708618164, 'learning_rate': 1.4465169394746861e-06, 'epoch': 2.24} +{'loss': 1.3628, 'grad_norm': 3.700035810470581, 'learning_rate': 1.4446136277122195e-06, 'epoch': 2.24} +{'loss': 1.3924, 'grad_norm': 3.7635107040405273, 'learning_rate': 1.4427103159497526e-06, 'epoch': 2.24} +{'loss': 1.4165, 'grad_norm': 3.853732109069824, 'learning_rate': 1.440807004187286e-06, 'epoch': 2.24} +{'loss': 1.434, 'grad_norm': 3.855116605758667, 'learning_rate': 1.4389036924248192e-06, 'epoch': 2.24} +{'eval_loss': 1.8864396810531616, 'eval_runtime': 38.5346, 'eval_samples_per_second': 25.951, 'eval_steps_per_second': 1.09, 'epoch': 2.24} +{'loss': 1.4, 'grad_norm': 3.8182194232940674, 'learning_rate': 1.4370003806623525e-06, 'epoch': 2.24} +{'loss': 1.444, 'grad_norm': 3.7903404235839844, 'learning_rate': 1.4350970688998859e-06, 'epoch': 2.24} +{'loss': 1.3977, 'grad_norm': 3.633317470550537, 'learning_rate': 1.433193757137419e-06, 'epoch': 2.24} +{'loss': 1.3664, 'grad_norm': 3.6548566818237305, 'learning_rate': 1.4312904453749526e-06, 'epoch': 2.24} +{'loss': 1.4134, 'grad_norm': 3.7453954219818115, 'learning_rate': 1.429387133612486e-06, 'epoch': 2.24} +{'loss': 1.4227, 'grad_norm': 3.830434799194336, 'learning_rate': 1.4274838218500191e-06, 'epoch': 2.25} +{'loss': 1.4816, 'grad_norm': 4.004784107208252, 'learning_rate': 1.4255805100875525e-06, 'epoch': 2.25} +{'loss': 1.4023, 'grad_norm': 3.8402459621429443, 'learning_rate': 1.4236771983250858e-06, 'epoch': 2.25} +{'loss': 1.375, 'grad_norm': 3.620270252227783, 'learning_rate': 1.421773886562619e-06, 'epoch': 2.25} +{'loss': 1.4046, 'grad_norm': 3.651615619659424, 'learning_rate': 1.4198705748001524e-06, 'epoch': 2.25} +{'eval_loss': 1.8843915462493896, 'eval_runtime': 39.1985, 'eval_samples_per_second': 25.511, 'eval_steps_per_second': 1.071, 'epoch': 2.25} +{'loss': 1.3731, 'grad_norm': 3.696775436401367, 'learning_rate': 1.4179672630376855e-06, 'epoch': 2.25} +{'loss': 1.3927, 'grad_norm': 3.79319167137146, 'learning_rate': 1.4160639512752189e-06, 'epoch': 2.25} +{'loss': 1.367, 'grad_norm': 3.7621428966522217, 'learning_rate': 1.4141606395127522e-06, 'epoch': 2.25} +{'loss': 1.4214, 'grad_norm': 3.8129916191101074, 'learning_rate': 1.4122573277502858e-06, 'epoch': 2.25} +{'loss': 1.4009, 'grad_norm': 3.826503276824951, 'learning_rate': 1.410354015987819e-06, 'epoch': 2.25} +{'loss': 1.4257, 'grad_norm': 3.7033872604370117, 'learning_rate': 1.4084507042253523e-06, 'epoch': 2.26} +{'loss': 1.4025, 'grad_norm': 3.755481481552124, 'learning_rate': 1.4065473924628855e-06, 'epoch': 2.26} +{'loss': 1.396, 'grad_norm': 3.9239840507507324, 'learning_rate': 1.4046440807004188e-06, 'epoch': 2.26} +{'loss': 1.3844, 'grad_norm': 3.722135066986084, 'learning_rate': 1.4027407689379522e-06, 'epoch': 2.26} +{'loss': 1.4164, 'grad_norm': 3.7343573570251465, 'learning_rate': 1.4008374571754854e-06, 'epoch': 2.26} +{'eval_loss': 1.8877151012420654, 'eval_runtime': 39.0809, 'eval_samples_per_second': 25.588, 'eval_steps_per_second': 1.075, 'epoch': 2.26} +{'loss': 1.4076, 'grad_norm': 3.7343573570251465, 'learning_rate': 1.4008374571754854e-06, 'epoch': 2.26} +{'loss': 1.4236, 'grad_norm': 3.871894359588623, 'learning_rate': 1.3989341454130187e-06, 'epoch': 2.26} +{'loss': 1.488, 'grad_norm': 4.107873916625977, 'learning_rate': 1.397030833650552e-06, 'epoch': 2.26} +{'loss': 1.38, 'grad_norm': 3.676144599914551, 'learning_rate': 1.3951275218880852e-06, 'epoch': 2.26} +{'loss': 1.4169, 'grad_norm': 3.9853713512420654, 'learning_rate': 1.3932242101256188e-06, 'epoch': 2.27} +{'loss': 1.3654, 'grad_norm': 3.764937400817871, 'learning_rate': 1.3913208983631522e-06, 'epoch': 2.27} +{'loss': 1.3545, 'grad_norm': 3.728503465652466, 'learning_rate': 1.3894175866006853e-06, 'epoch': 2.27} +{'loss': 1.4045, 'grad_norm': 3.7056779861450195, 'learning_rate': 1.3875142748382187e-06, 'epoch': 2.27} +{'loss': 1.4363, 'grad_norm': 3.8273932933807373, 'learning_rate': 1.385610963075752e-06, 'epoch': 2.27} +{'loss': 1.4157, 'grad_norm': 3.9768242835998535, 'learning_rate': 1.3837076513132852e-06, 'epoch': 2.27} +{'eval_loss': 1.8918414115905762, 'eval_runtime': 38.7741, 'eval_samples_per_second': 25.79, 'eval_steps_per_second': 1.083, 'epoch': 2.27} +{'loss': 1.3968, 'grad_norm': 3.513251543045044, 'learning_rate': 1.3818043395508186e-06, 'epoch': 2.27} +{'loss': 1.4237, 'grad_norm': 3.692544460296631, 'learning_rate': 1.3799010277883517e-06, 'epoch': 2.27} +{'loss': 1.4453, 'grad_norm': 3.7865631580352783, 'learning_rate': 1.377997716025885e-06, 'epoch': 2.27} +{'loss': 1.4179, 'grad_norm': 3.867410659790039, 'learning_rate': 1.3760944042634184e-06, 'epoch': 2.27} +{'loss': 1.4315, 'grad_norm': 3.901146650314331, 'learning_rate': 1.3741910925009518e-06, 'epoch': 2.28} +{'loss': 1.3974, 'grad_norm': 3.744940757751465, 'learning_rate': 1.3722877807384852e-06, 'epoch': 2.28} +{'loss': 1.3485, 'grad_norm': 3.703552007675171, 'learning_rate': 1.3703844689760185e-06, 'epoch': 2.28} +{'loss': 1.4288, 'grad_norm': 4.170780181884766, 'learning_rate': 1.3684811572135517e-06, 'epoch': 2.28} +{'loss': 1.4284, 'grad_norm': 4.1385498046875, 'learning_rate': 1.366577845451085e-06, 'epoch': 2.28} +{'loss': 1.3759, 'grad_norm': 3.78433895111084, 'learning_rate': 1.3646745336886184e-06, 'epoch': 2.28} +{'eval_loss': 1.891239047050476, 'eval_runtime': 38.8528, 'eval_samples_per_second': 25.738, 'eval_steps_per_second': 1.081, 'epoch': 2.28} +{'loss': 1.41, 'grad_norm': 3.8275749683380127, 'learning_rate': 1.3627712219261515e-06, 'epoch': 2.28} +{'loss': 1.4084, 'grad_norm': 4.003354072570801, 'learning_rate': 1.360867910163685e-06, 'epoch': 2.28} +{'loss': 1.3932, 'grad_norm': 3.9215543270111084, 'learning_rate': 1.358964598401218e-06, 'epoch': 2.28} +{'loss': 1.4617, 'grad_norm': 4.124348163604736, 'learning_rate': 1.3570612866387514e-06, 'epoch': 2.28} +{'loss': 1.4164, 'grad_norm': 3.8298404216766357, 'learning_rate': 1.355157974876285e-06, 'epoch': 2.29} +{'loss': 1.3993, 'grad_norm': 3.8481974601745605, 'learning_rate': 1.3532546631138184e-06, 'epoch': 2.29} +{'loss': 1.3938, 'grad_norm': 3.8297905921936035, 'learning_rate': 1.3513513513513515e-06, 'epoch': 2.29} +{'loss': 1.3802, 'grad_norm': 3.7777862548828125, 'learning_rate': 1.3494480395888849e-06, 'epoch': 2.29} +{'loss': 1.4227, 'grad_norm': 3.9423277378082275, 'learning_rate': 1.347544727826418e-06, 'epoch': 2.29} +{'loss': 1.4184, 'grad_norm': 3.8949787616729736, 'learning_rate': 1.3456414160639514e-06, 'epoch': 2.29} +{'eval_loss': 1.8907074928283691, 'eval_runtime': 38.9787, 'eval_samples_per_second': 25.655, 'eval_steps_per_second': 1.078, 'epoch': 2.29} +{'loss': 1.406, 'grad_norm': 3.905834197998047, 'learning_rate': 1.3437381043014847e-06, 'epoch': 2.29} +{'loss': 1.414, 'grad_norm': 3.8927206993103027, 'learning_rate': 1.341834792539018e-06, 'epoch': 2.29} +{'loss': 1.4176, 'grad_norm': 4.02139949798584, 'learning_rate': 1.3399314807765513e-06, 'epoch': 2.29} +{'loss': 1.3718, 'grad_norm': 3.977431058883667, 'learning_rate': 1.3380281690140844e-06, 'epoch': 2.29} +{'loss': 1.3739, 'grad_norm': 4.093680381774902, 'learning_rate': 1.336124857251618e-06, 'epoch': 2.3} +{'loss': 1.4073, 'grad_norm': 3.6997435092926025, 'learning_rate': 1.3342215454891513e-06, 'epoch': 2.3} +{'loss': 1.4474, 'grad_norm': 3.8596630096435547, 'learning_rate': 1.3323182337266847e-06, 'epoch': 2.3} +{'loss': 1.3933, 'grad_norm': 3.836728572845459, 'learning_rate': 1.3304149219642179e-06, 'epoch': 2.3} +{'loss': 1.4036, 'grad_norm': 3.6852171421051025, 'learning_rate': 1.3285116102017512e-06, 'epoch': 2.3} +{'loss': 1.3874, 'grad_norm': 3.7992279529571533, 'learning_rate': 1.3266082984392844e-06, 'epoch': 2.3} +{'eval_loss': 1.8807792663574219, 'eval_runtime': 38.425, 'eval_samples_per_second': 26.025, 'eval_steps_per_second': 1.093, 'epoch': 2.3} +{'loss': 1.3802, 'grad_norm': 3.682476043701172, 'learning_rate': 1.3247049866768177e-06, 'epoch': 2.3} +{'loss': 1.4134, 'grad_norm': 3.8788857460021973, 'learning_rate': 1.322801674914351e-06, 'epoch': 2.3} +{'loss': 1.398, 'grad_norm': 3.819302797317505, 'learning_rate': 1.3208983631518843e-06, 'epoch': 2.3} +{'loss': 1.4135, 'grad_norm': 3.7616729736328125, 'learning_rate': 1.3189950513894176e-06, 'epoch': 2.31} +{'loss': 1.439, 'grad_norm': 3.891249418258667, 'learning_rate': 1.3170917396269512e-06, 'epoch': 2.31} +{'loss': 1.3796, 'grad_norm': 3.711500883102417, 'learning_rate': 1.3151884278644843e-06, 'epoch': 2.31} +{'loss': 1.4717, 'grad_norm': 4.244138717651367, 'learning_rate': 1.3132851161020177e-06, 'epoch': 2.31} +{'loss': 1.3876, 'grad_norm': 3.7272891998291016, 'learning_rate': 1.311381804339551e-06, 'epoch': 2.31} +{'loss': 1.4179, 'grad_norm': 3.6459193229675293, 'learning_rate': 1.3094784925770842e-06, 'epoch': 2.31} +{'loss': 1.373, 'grad_norm': 3.802356243133545, 'learning_rate': 1.3075751808146176e-06, 'epoch': 2.31} +{'eval_loss': 1.8756392002105713, 'eval_runtime': 38.9591, 'eval_samples_per_second': 25.668, 'eval_steps_per_second': 1.078, 'epoch': 2.31} +{'loss': 1.3891, 'grad_norm': 3.695012092590332, 'learning_rate': 1.3056718690521507e-06, 'epoch': 2.31} +{'loss': 1.385, 'grad_norm': 3.780090808868408, 'learning_rate': 1.303768557289684e-06, 'epoch': 2.31} +{'loss': 1.3892, 'grad_norm': 3.6370489597320557, 'learning_rate': 1.3018652455272175e-06, 'epoch': 2.31} +{'loss': 1.4267, 'grad_norm': 3.7199184894561768, 'learning_rate': 1.2999619337647506e-06, 'epoch': 2.32} +{'loss': 1.4386, 'grad_norm': 3.7514007091522217, 'learning_rate': 1.2980586220022842e-06, 'epoch': 2.32} +{'loss': 1.4249, 'grad_norm': 3.860673189163208, 'learning_rate': 1.2961553102398175e-06, 'epoch': 2.32} +{'loss': 1.3777, 'grad_norm': 3.9224369525909424, 'learning_rate': 1.2942519984773507e-06, 'epoch': 2.32} +{'loss': 1.4281, 'grad_norm': 3.7801625728607178, 'learning_rate': 1.292348686714884e-06, 'epoch': 2.32} +{'loss': 1.3508, 'grad_norm': 3.7595367431640625, 'learning_rate': 1.2904453749524174e-06, 'epoch': 2.32} +{'loss': 1.388, 'grad_norm': 3.821812868118286, 'learning_rate': 1.2885420631899506e-06, 'epoch': 2.32} +{'eval_loss': 1.8754379749298096, 'eval_runtime': 38.6329, 'eval_samples_per_second': 25.885, 'eval_steps_per_second': 1.087, 'epoch': 2.32} +{'loss': 1.4191, 'grad_norm': 3.703956365585327, 'learning_rate': 1.286638751427484e-06, 'epoch': 2.32} +{'loss': 1.4301, 'grad_norm': 3.726445198059082, 'learning_rate': 1.284735439665017e-06, 'epoch': 2.32} +{'loss': 1.4005, 'grad_norm': 3.7294535636901855, 'learning_rate': 1.2828321279025504e-06, 'epoch': 2.32} +{'loss': 1.384, 'grad_norm': 3.755748748779297, 'learning_rate': 1.2809288161400838e-06, 'epoch': 2.33} +{'loss': 1.4497, 'grad_norm': 3.935171127319336, 'learning_rate': 1.279025504377617e-06, 'epoch': 2.33} +{'loss': 1.3947, 'grad_norm': 3.7325186729431152, 'learning_rate': 1.2771221926151505e-06, 'epoch': 2.33} +{'loss': 1.3969, 'grad_norm': 3.8117432594299316, 'learning_rate': 1.275218880852684e-06, 'epoch': 2.33} +{'loss': 1.4622, 'grad_norm': 3.922048330307007, 'learning_rate': 1.273315569090217e-06, 'epoch': 2.33} +{'loss': 1.4634, 'grad_norm': 3.8676812648773193, 'learning_rate': 1.2714122573277504e-06, 'epoch': 2.33} +{'loss': 1.421, 'grad_norm': 3.851907730102539, 'learning_rate': 1.2695089455652838e-06, 'epoch': 2.33} +{'eval_loss': 1.8673007488250732, 'eval_runtime': 39.1077, 'eval_samples_per_second': 25.57, 'eval_steps_per_second': 1.074, 'epoch': 2.33} +{'loss': 1.4195, 'grad_norm': 3.751912832260132, 'learning_rate': 1.267605633802817e-06, 'epoch': 2.33} +{'loss': 1.4023, 'grad_norm': 3.849126100540161, 'learning_rate': 1.2657023220403503e-06, 'epoch': 2.33} +{'loss': 1.4014, 'grad_norm': 3.92042875289917, 'learning_rate': 1.2637990102778836e-06, 'epoch': 2.33} +{'loss': 1.4422, 'grad_norm': 3.9508938789367676, 'learning_rate': 1.2618956985154168e-06, 'epoch': 2.34} +{'loss': 1.4103, 'grad_norm': 3.6862587928771973, 'learning_rate': 1.2599923867529502e-06, 'epoch': 2.34} +{'loss': 1.4395, 'grad_norm': 3.871906042098999, 'learning_rate': 1.2580890749904837e-06, 'epoch': 2.34} +{'loss': 1.4678, 'grad_norm': 4.084641933441162, 'learning_rate': 1.2561857632280169e-06, 'epoch': 2.34} +{'loss': 1.4352, 'grad_norm': 3.98874568939209, 'learning_rate': 1.2542824514655502e-06, 'epoch': 2.34} +{'loss': 1.3999, 'grad_norm': 3.7319843769073486, 'learning_rate': 1.2523791397030834e-06, 'epoch': 2.34} +{'loss': 1.4373, 'grad_norm': 3.9538984298706055, 'learning_rate': 1.2504758279406168e-06, 'epoch': 2.34} +{'eval_loss': 1.8792237043380737, 'eval_runtime': 39.0442, 'eval_samples_per_second': 25.612, 'eval_steps_per_second': 1.076, 'epoch': 2.34} +{'loss': 1.51, 'grad_norm': 3.916919708251953, 'learning_rate': 1.2485725161781501e-06, 'epoch': 2.34} +{'loss': 1.3645, 'grad_norm': 3.671508312225342, 'learning_rate': 1.2466692044156833e-06, 'epoch': 2.34} +{'loss': 1.4102, 'grad_norm': 3.6340858936309814, 'learning_rate': 1.2447658926532168e-06, 'epoch': 2.35} +{'loss': 1.3866, 'grad_norm': 3.8072052001953125, 'learning_rate': 1.24286258089075e-06, 'epoch': 2.35} +{'loss': 1.4199, 'grad_norm': 4.02155065536499, 'learning_rate': 1.2409592691282834e-06, 'epoch': 2.35} +{'loss': 1.4145, 'grad_norm': 3.8325469493865967, 'learning_rate': 1.2390559573658165e-06, 'epoch': 2.35} +{'loss': 1.3802, 'grad_norm': 3.6448421478271484, 'learning_rate': 1.2371526456033499e-06, 'epoch': 2.35} +{'loss': 1.4975, 'grad_norm': 4.091626167297363, 'learning_rate': 1.2352493338408832e-06, 'epoch': 2.35} +{'loss': 1.4228, 'grad_norm': 4.044641017913818, 'learning_rate': 1.2333460220784166e-06, 'epoch': 2.35} +{'loss': 1.4513, 'grad_norm': 3.976301670074463, 'learning_rate': 1.23144271031595e-06, 'epoch': 2.35} +{'eval_loss': 1.8856838941574097, 'eval_runtime': 38.8895, 'eval_samples_per_second': 25.714, 'eval_steps_per_second': 1.08, 'epoch': 2.35} +{'loss': 1.331, 'grad_norm': 4.033593654632568, 'learning_rate': 1.2295393985534831e-06, 'epoch': 2.35} +{'loss': 1.4536, 'grad_norm': 4.06403923034668, 'learning_rate': 1.2276360867910165e-06, 'epoch': 2.35} +{'loss': 1.3719, 'grad_norm': 3.916027784347534, 'learning_rate': 1.2257327750285498e-06, 'epoch': 2.36} +{'loss': 1.3792, 'grad_norm': 3.760979652404785, 'learning_rate': 1.2238294632660832e-06, 'epoch': 2.36} +{'loss': 1.4297, 'grad_norm': 3.896545648574829, 'learning_rate': 1.2219261515036164e-06, 'epoch': 2.36} +{'loss': 1.463, 'grad_norm': 3.707213878631592, 'learning_rate': 1.2200228397411497e-06, 'epoch': 2.36} +{'loss': 1.3911, 'grad_norm': 3.708606243133545, 'learning_rate': 1.2181195279786829e-06, 'epoch': 2.36} +{'loss': 1.3813, 'grad_norm': 4.080479621887207, 'learning_rate': 1.2162162162162164e-06, 'epoch': 2.36} +{'loss': 1.3906, 'grad_norm': 3.949032783508301, 'learning_rate': 1.2143129044537496e-06, 'epoch': 2.36} +{'loss': 1.4121, 'grad_norm': 3.9459354877471924, 'learning_rate': 1.212409592691283e-06, 'epoch': 2.36} +{'eval_loss': 1.8794779777526855, 'eval_runtime': 39.5132, 'eval_samples_per_second': 25.308, 'eval_steps_per_second': 1.063, 'epoch': 2.36} +{'loss': 1.4233, 'grad_norm': 3.8748619556427, 'learning_rate': 1.2105062809288163e-06, 'epoch': 2.36} +{'loss': 1.3634, 'grad_norm': 3.5708930492401123, 'learning_rate': 1.2086029691663495e-06, 'epoch': 2.36} +{'loss': 1.4358, 'grad_norm': 4.04731559753418, 'learning_rate': 1.2066996574038828e-06, 'epoch': 2.37} +{'loss': 1.412, 'grad_norm': 3.8527228832244873, 'learning_rate': 1.2047963456414162e-06, 'epoch': 2.37} +{'loss': 1.3518, 'grad_norm': 3.785165309906006, 'learning_rate': 1.2028930338789496e-06, 'epoch': 2.37} +{'loss': 1.4176, 'grad_norm': 3.978516101837158, 'learning_rate': 1.2009897221164827e-06, 'epoch': 2.37} +{'loss': 1.386, 'grad_norm': 3.832094669342041, 'learning_rate': 1.199086410354016e-06, 'epoch': 2.37} +{'loss': 1.3804, 'grad_norm': 3.782668113708496, 'learning_rate': 1.1971830985915492e-06, 'epoch': 2.37} +{'loss': 1.4111, 'grad_norm': 3.890467643737793, 'learning_rate': 1.1952797868290828e-06, 'epoch': 2.37} +{'loss': 1.4185, 'grad_norm': 3.762063980102539, 'learning_rate': 1.193376475066616e-06, 'epoch': 2.37} +{'eval_loss': 1.884463906288147, 'eval_runtime': 38.9143, 'eval_samples_per_second': 25.697, 'eval_steps_per_second': 1.079, 'epoch': 2.37} +{'loss': 1.3829, 'grad_norm': 3.776261806488037, 'learning_rate': 1.1914731633041493e-06, 'epoch': 2.37} +{'loss': 1.3712, 'grad_norm': 3.703554630279541, 'learning_rate': 1.1895698515416827e-06, 'epoch': 2.37} +{'loss': 1.4225, 'grad_norm': 3.9105772972106934, 'learning_rate': 1.1876665397792158e-06, 'epoch': 2.38} +{'loss': 1.3476, 'grad_norm': 3.7660679817199707, 'learning_rate': 1.1857632280167492e-06, 'epoch': 2.38} +{'loss': 1.4301, 'grad_norm': 3.827824592590332, 'learning_rate': 1.1838599162542825e-06, 'epoch': 2.38} +{'loss': 1.3988, 'grad_norm': 3.8088977336883545, 'learning_rate': 1.181956604491816e-06, 'epoch': 2.38} +{'loss': 1.4871, 'grad_norm': 4.236826419830322, 'learning_rate': 1.180053292729349e-06, 'epoch': 2.38} +{'loss': 1.406, 'grad_norm': 3.8484609127044678, 'learning_rate': 1.1781499809668824e-06, 'epoch': 2.38} +{'loss': 1.4318, 'grad_norm': 3.7540841102600098, 'learning_rate': 1.1762466692044158e-06, 'epoch': 2.38} +{'loss': 1.3793, 'grad_norm': 3.767538070678711, 'learning_rate': 1.1743433574419491e-06, 'epoch': 2.38} +{'eval_loss': 1.8822261095046997, 'eval_runtime': 39.1354, 'eval_samples_per_second': 25.552, 'eval_steps_per_second': 1.073, 'epoch': 2.38} +{'loss': 1.4355, 'grad_norm': 3.9159727096557617, 'learning_rate': 1.1724400456794823e-06, 'epoch': 2.38} +{'loss': 1.4192, 'grad_norm': 4.3853631019592285, 'learning_rate': 1.1705367339170157e-06, 'epoch': 2.39} +{'loss': 1.3623, 'grad_norm': 3.9850170612335205, 'learning_rate': 1.168633422154549e-06, 'epoch': 2.39} +{'loss': 1.3423, 'grad_norm': 3.5114099979400635, 'learning_rate': 1.1667301103920824e-06, 'epoch': 2.39} +{'loss': 1.3977, 'grad_norm': 4.100709438323975, 'learning_rate': 1.1648267986296157e-06, 'epoch': 2.39} +{'loss': 1.4114, 'grad_norm': 3.8762168884277344, 'learning_rate': 1.162923486867149e-06, 'epoch': 2.39} +{'loss': 1.4664, 'grad_norm': 4.394121170043945, 'learning_rate': 1.1610201751046823e-06, 'epoch': 2.39} +{'loss': 1.4252, 'grad_norm': 4.014926433563232, 'learning_rate': 1.1591168633422154e-06, 'epoch': 2.39} +{'loss': 1.4044, 'grad_norm': 3.825906276702881, 'learning_rate': 1.157213551579749e-06, 'epoch': 2.39} +{'loss': 1.3797, 'grad_norm': 3.7653579711914062, 'learning_rate': 1.1553102398172821e-06, 'epoch': 2.39} +{'eval_loss': 1.8852497339248657, 'eval_runtime': 38.8482, 'eval_samples_per_second': 25.741, 'eval_steps_per_second': 1.081, 'epoch': 2.39} +{'loss': 1.4255, 'grad_norm': 3.908841848373413, 'learning_rate': 1.1534069280548155e-06, 'epoch': 2.39} +{'loss': 1.3836, 'grad_norm': 3.8772692680358887, 'learning_rate': 1.1515036162923487e-06, 'epoch': 2.4} +{'loss': 1.3972, 'grad_norm': 3.7436907291412354, 'learning_rate': 1.149600304529882e-06, 'epoch': 2.4} +{'loss': 1.3929, 'grad_norm': 3.6557600498199463, 'learning_rate': 1.1476969927674154e-06, 'epoch': 2.4} +{'loss': 1.44, 'grad_norm': 3.9747960567474365, 'learning_rate': 1.1457936810049487e-06, 'epoch': 2.4} +{'loss': 1.3613, 'grad_norm': 3.7209384441375732, 'learning_rate': 1.143890369242482e-06, 'epoch': 2.4} +{'loss': 1.3933, 'grad_norm': 3.6955058574676514, 'learning_rate': 1.1419870574800153e-06, 'epoch': 2.4} +{'loss': 1.3609, 'grad_norm': 3.73045015335083, 'learning_rate': 1.1400837457175486e-06, 'epoch': 2.4} +{'loss': 1.4295, 'grad_norm': 3.814049005508423, 'learning_rate': 1.138180433955082e-06, 'epoch': 2.4} +{'loss': 1.4074, 'grad_norm': 4.025428295135498, 'learning_rate': 1.1362771221926153e-06, 'epoch': 2.4} +{'eval_loss': 1.8910856246948242, 'eval_runtime': 39.1984, 'eval_samples_per_second': 25.511, 'eval_steps_per_second': 1.071, 'epoch': 2.4} +{'loss': 1.3804, 'grad_norm': 3.8834328651428223, 'learning_rate': 1.1343738104301485e-06, 'epoch': 2.4} +{'loss': 1.4475, 'grad_norm': 4.029475212097168, 'learning_rate': 1.1324704986676819e-06, 'epoch': 2.41} +{'loss': 1.3876, 'grad_norm': 3.9325878620147705, 'learning_rate': 1.130567186905215e-06, 'epoch': 2.41} +{'loss': 1.3847, 'grad_norm': 3.8090524673461914, 'learning_rate': 1.1286638751427486e-06, 'epoch': 2.41} +{'loss': 1.4322, 'grad_norm': 4.055549144744873, 'learning_rate': 1.1267605633802817e-06, 'epoch': 2.41} +{'loss': 1.4626, 'grad_norm': 4.0241570472717285, 'learning_rate': 1.124857251617815e-06, 'epoch': 2.41} +{'loss': 1.3986, 'grad_norm': 3.8859708309173584, 'learning_rate': 1.1229539398553485e-06, 'epoch': 2.41} +{'loss': 1.4352, 'grad_norm': 3.802727460861206, 'learning_rate': 1.1210506280928816e-06, 'epoch': 2.41} +{'loss': 1.4087, 'grad_norm': 4.0616350173950195, 'learning_rate': 1.119147316330415e-06, 'epoch': 2.41} +{'loss': 1.4089, 'grad_norm': 3.7449159622192383, 'learning_rate': 1.1172440045679483e-06, 'epoch': 2.41} +{'eval_loss': 1.879085659980774, 'eval_runtime': 39.1098, 'eval_samples_per_second': 25.569, 'eval_steps_per_second': 1.074, 'epoch': 2.41} +{'loss': 1.4345, 'grad_norm': 3.759726047515869, 'learning_rate': 1.1153406928054817e-06, 'epoch': 2.41} +{'loss': 1.4465, 'grad_norm': 4.014988422393799, 'learning_rate': 1.1134373810430148e-06, 'epoch': 2.42} +{'loss': 1.4021, 'grad_norm': 3.874109983444214, 'learning_rate': 1.1115340692805482e-06, 'epoch': 2.42} +{'loss': 1.371, 'grad_norm': 3.817092180252075, 'learning_rate': 1.1096307575180816e-06, 'epoch': 2.42} +{'loss': 1.3823, 'grad_norm': 3.7893261909484863, 'learning_rate': 1.107727445755615e-06, 'epoch': 2.42} +{'loss': 1.4086, 'grad_norm': 3.873870372772217, 'learning_rate': 1.105824133993148e-06, 'epoch': 2.42} +{'loss': 1.4026, 'grad_norm': 3.859938859939575, 'learning_rate': 1.1039208222306814e-06, 'epoch': 2.42} +{'loss': 1.388, 'grad_norm': 3.8468515872955322, 'learning_rate': 1.1020175104682148e-06, 'epoch': 2.42} +{'loss': 1.41, 'grad_norm': 3.9764840602874756, 'learning_rate': 1.1001141987057482e-06, 'epoch': 2.42} +{'loss': 1.4366, 'grad_norm': 3.903120517730713, 'learning_rate': 1.0982108869432813e-06, 'epoch': 2.42} +{'eval_loss': 1.8877712488174438, 'eval_runtime': 39.0898, 'eval_samples_per_second': 25.582, 'eval_steps_per_second': 1.074, 'epoch': 2.42} +{'loss': 1.4036, 'grad_norm': 3.688204526901245, 'learning_rate': 1.0963075751808147e-06, 'epoch': 2.43} +{'loss': 1.4031, 'grad_norm': 3.891559362411499, 'learning_rate': 1.094404263418348e-06, 'epoch': 2.43} +{'loss': 1.3926, 'grad_norm': 3.7046892642974854, 'learning_rate': 1.0925009516558812e-06, 'epoch': 2.43} +{'loss': 1.4169, 'grad_norm': 3.924924612045288, 'learning_rate': 1.0905976398934148e-06, 'epoch': 2.43} +{'loss': 1.3593, 'grad_norm': 3.7725932598114014, 'learning_rate': 1.088694328130948e-06, 'epoch': 2.43} +{'loss': 1.4073, 'grad_norm': 3.826176643371582, 'learning_rate': 1.0867910163684813e-06, 'epoch': 2.43} +{'loss': 1.3947, 'grad_norm': 3.7417454719543457, 'learning_rate': 1.0848877046060144e-06, 'epoch': 2.43} +{'loss': 1.3991, 'grad_norm': 3.7589855194091797, 'learning_rate': 1.0829843928435478e-06, 'epoch': 2.43} +{'loss': 1.4126, 'grad_norm': 3.802074670791626, 'learning_rate': 1.0810810810810812e-06, 'epoch': 2.43} +{'loss': 1.4016, 'grad_norm': 3.96932315826416, 'learning_rate': 1.0791777693186145e-06, 'epoch': 2.43} +{'eval_loss': 1.8820111751556396, 'eval_runtime': 39.126, 'eval_samples_per_second': 25.558, 'eval_steps_per_second': 1.073, 'epoch': 2.43} +{'loss': 1.3859, 'grad_norm': 3.7459776401519775, 'learning_rate': 1.0772744575561479e-06, 'epoch': 2.44} +{'loss': 1.3831, 'grad_norm': 3.711710214614868, 'learning_rate': 1.075371145793681e-06, 'epoch': 2.44} +{'loss': 1.3797, 'grad_norm': 3.758971691131592, 'learning_rate': 1.0734678340312144e-06, 'epoch': 2.44} +{'loss': 1.3855, 'grad_norm': 3.734750986099243, 'learning_rate': 1.0715645222687478e-06, 'epoch': 2.44} +{'loss': 1.4181, 'grad_norm': 3.872424602508545, 'learning_rate': 1.0696612105062811e-06, 'epoch': 2.44} +{'loss': 1.368, 'grad_norm': 3.6354382038116455, 'learning_rate': 1.0677578987438143e-06, 'epoch': 2.44} +{'loss': 1.3824, 'grad_norm': 3.7328145503997803, 'learning_rate': 1.0658545869813476e-06, 'epoch': 2.44} +{'loss': 1.3955, 'grad_norm': 3.7629051208496094, 'learning_rate': 1.0639512752188808e-06, 'epoch': 2.44} +{'loss': 1.3652, 'grad_norm': 3.819352388381958, 'learning_rate': 1.0620479634564144e-06, 'epoch': 2.44} +{'loss': 1.3634, 'grad_norm': 3.8065381050109863, 'learning_rate': 1.0601446516939475e-06, 'epoch': 2.44} +{'eval_loss': 1.8805789947509766, 'eval_runtime': 38.9052, 'eval_samples_per_second': 25.704, 'eval_steps_per_second': 1.08, 'epoch': 2.44} +{'loss': 1.3796, 'grad_norm': 3.9771666526794434, 'learning_rate': 1.0582413399314809e-06, 'epoch': 2.45} +{'loss': 1.3791, 'grad_norm': 3.7634780406951904, 'learning_rate': 1.0563380281690142e-06, 'epoch': 2.45} +{'loss': 1.4083, 'grad_norm': 3.841782569885254, 'learning_rate': 1.0544347164065474e-06, 'epoch': 2.45} +{'loss': 1.3817, 'grad_norm': 3.8366568088531494, 'learning_rate': 1.0525314046440808e-06, 'epoch': 2.45} +{'loss': 1.388, 'grad_norm': 3.883749485015869, 'learning_rate': 1.0506280928816141e-06, 'epoch': 2.45} +{'loss': 1.4037, 'grad_norm': 3.8496763706207275, 'learning_rate': 1.0487247811191475e-06, 'epoch': 2.45} +{'loss': 1.418, 'grad_norm': 3.893866777420044, 'learning_rate': 1.0468214693566806e-06, 'epoch': 2.45} +{'loss': 1.3748, 'grad_norm': 3.908076763153076, 'learning_rate': 1.044918157594214e-06, 'epoch': 2.45} +{'loss': 1.3684, 'grad_norm': 3.6190345287323, 'learning_rate': 1.0430148458317474e-06, 'epoch': 2.45} +{'loss': 1.4449, 'grad_norm': 4.007608890533447, 'learning_rate': 1.0411115340692807e-06, 'epoch': 2.46} +{'eval_loss': 1.8735251426696777, 'eval_runtime': 39.3147, 'eval_samples_per_second': 25.436, 'eval_steps_per_second': 1.068, 'epoch': 2.46} +{'loss': 1.4076, 'grad_norm': 3.9389219284057617, 'learning_rate': 1.0392082223068139e-06, 'epoch': 2.46} +{'loss': 1.4106, 'grad_norm': 3.783188581466675, 'learning_rate': 1.0373049105443472e-06, 'epoch': 2.46} +{'loss': 1.3718, 'grad_norm': 3.625424861907959, 'learning_rate': 1.0354015987818806e-06, 'epoch': 2.46} +{'loss': 1.429, 'grad_norm': 4.060116767883301, 'learning_rate': 1.033498287019414e-06, 'epoch': 2.46} +{'loss': 1.3603, 'grad_norm': 3.976637840270996, 'learning_rate': 1.0315949752569471e-06, 'epoch': 2.46} +{'loss': 1.4058, 'grad_norm': 3.96303391456604, 'learning_rate': 1.0296916634944805e-06, 'epoch': 2.46} +{'loss': 1.4355, 'grad_norm': 4.178105354309082, 'learning_rate': 1.0277883517320138e-06, 'epoch': 2.46} +{'loss': 1.3244, 'grad_norm': 3.9457364082336426, 'learning_rate': 1.025885039969547e-06, 'epoch': 2.46} +{'loss': 1.3757, 'grad_norm': 3.863892078399658, 'learning_rate': 1.0239817282070806e-06, 'epoch': 2.46} +{'loss': 1.3886, 'grad_norm': 3.7649881839752197, 'learning_rate': 1.0220784164446137e-06, 'epoch': 2.47} +{'eval_loss': 1.8845109939575195, 'eval_runtime': 39.1784, 'eval_samples_per_second': 25.524, 'eval_steps_per_second': 1.072, 'epoch': 2.47} +{'loss': 1.3958, 'grad_norm': 3.6952157020568848, 'learning_rate': 1.020175104682147e-06, 'epoch': 2.47} +{'loss': 1.4329, 'grad_norm': 3.8997414112091064, 'learning_rate': 1.0182717929196802e-06, 'epoch': 2.47} +{'loss': 1.3484, 'grad_norm': 3.8246068954467773, 'learning_rate': 1.0163684811572136e-06, 'epoch': 2.47} +{'loss': 1.3987, 'grad_norm': 3.954486846923828, 'learning_rate': 1.014465169394747e-06, 'epoch': 2.47} +{'loss': 1.3786, 'grad_norm': 3.877336025238037, 'learning_rate': 1.0125618576322803e-06, 'epoch': 2.47} +{'loss': 1.3756, 'grad_norm': 3.875992774963379, 'learning_rate': 1.0106585458698137e-06, 'epoch': 2.47} +{'loss': 1.4564, 'grad_norm': 4.042083263397217, 'learning_rate': 1.0087552341073468e-06, 'epoch': 2.47} +{'loss': 1.4345, 'grad_norm': 3.9419937133789062, 'learning_rate': 1.0068519223448802e-06, 'epoch': 2.47} +{'loss': 1.3732, 'grad_norm': 4.02181339263916, 'learning_rate': 1.0049486105824133e-06, 'epoch': 2.47} +{'loss': 1.3713, 'grad_norm': 4.01954460144043, 'learning_rate': 1.003045298819947e-06, 'epoch': 2.48} +{'eval_loss': 1.890286922454834, 'eval_runtime': 38.8229, 'eval_samples_per_second': 25.758, 'eval_steps_per_second': 1.082, 'epoch': 2.48} +{'loss': 1.3676, 'grad_norm': 3.7543082237243652, 'learning_rate': 1.00114198705748e-06, 'epoch': 2.48} +{'loss': 1.3631, 'grad_norm': 3.8260843753814697, 'learning_rate': 9.992386752950134e-07, 'epoch': 2.48} +{'loss': 1.3611, 'grad_norm': 3.82153582572937, 'learning_rate': 9.973353635325466e-07, 'epoch': 2.48} +{'loss': 1.3775, 'grad_norm': 3.7833526134490967, 'learning_rate': 9.9543205177008e-07, 'epoch': 2.48} +{'loss': 1.3537, 'grad_norm': 3.972115993499756, 'learning_rate': 9.935287400076133e-07, 'epoch': 2.48} +{'loss': 1.3989, 'grad_norm': 4.0531721115112305, 'learning_rate': 9.916254282451467e-07, 'epoch': 2.48} +{'loss': 1.4149, 'grad_norm': 3.7722175121307373, 'learning_rate': 9.8972211648268e-07, 'epoch': 2.48} +{'loss': 1.3883, 'grad_norm': 4.015010833740234, 'learning_rate': 9.878188047202132e-07, 'epoch': 2.48} +{'loss': 1.3594, 'grad_norm': 3.9410476684570312, 'learning_rate': 9.859154929577465e-07, 'epoch': 2.48} +{'loss': 1.3802, 'grad_norm': 4.01100492477417, 'learning_rate': 9.8401218119528e-07, 'epoch': 2.49} +{'eval_loss': 1.8873405456542969, 'eval_runtime': 38.7466, 'eval_samples_per_second': 25.809, 'eval_steps_per_second': 1.084, 'epoch': 2.49} +{'loss': 1.4214, 'grad_norm': 4.015278339385986, 'learning_rate': 9.821088694328133e-07, 'epoch': 2.49} +{'loss': 1.4019, 'grad_norm': 3.7989606857299805, 'learning_rate': 9.802055576703464e-07, 'epoch': 2.49} +{'loss': 1.3963, 'grad_norm': 3.8566884994506836, 'learning_rate': 9.783022459078798e-07, 'epoch': 2.49} +{'loss': 1.4126, 'grad_norm': 4.044495105743408, 'learning_rate': 9.76398934145413e-07, 'epoch': 2.49} +{'loss': 1.4219, 'grad_norm': 4.06904411315918, 'learning_rate': 9.744956223829465e-07, 'epoch': 2.49} +{'loss': 1.4454, 'grad_norm': 3.9633665084838867, 'learning_rate': 9.725923106204797e-07, 'epoch': 2.49} +{'loss': 1.4239, 'grad_norm': 3.9925265312194824, 'learning_rate': 9.70688998858013e-07, 'epoch': 2.49} +{'loss': 1.4105, 'grad_norm': 4.029801845550537, 'learning_rate': 9.687856870955464e-07, 'epoch': 2.49} +{'loss': 1.3509, 'grad_norm': 3.782688617706299, 'learning_rate': 9.668823753330795e-07, 'epoch': 2.5} +{'loss': 1.3941, 'grad_norm': 3.931802988052368, 'learning_rate': 9.64979063570613e-07, 'epoch': 2.5} +{'eval_loss': 1.8822802305221558, 'eval_runtime': 39.2329, 'eval_samples_per_second': 25.489, 'eval_steps_per_second': 1.071, 'epoch': 2.5} +{'loss': 1.3975, 'grad_norm': 3.8316829204559326, 'learning_rate': 9.630757518081463e-07, 'epoch': 2.5} +{'loss': 1.3701, 'grad_norm': 3.8386945724487305, 'learning_rate': 9.611724400456796e-07, 'epoch': 2.5} +{'loss': 1.3961, 'grad_norm': 3.789863109588623, 'learning_rate': 9.592691282832128e-07, 'epoch': 2.5} +{'loss': 1.4237, 'grad_norm': 3.784151315689087, 'learning_rate': 9.573658165207461e-07, 'epoch': 2.5} +{'loss': 1.364, 'grad_norm': 3.7614283561706543, 'learning_rate': 9.554625047582795e-07, 'epoch': 2.5} +{'loss': 1.4092, 'grad_norm': 3.906604051589966, 'learning_rate': 9.535591929958129e-07, 'epoch': 2.5} +{'loss': 1.409, 'grad_norm': 3.941166400909424, 'learning_rate': 9.516558812333461e-07, 'epoch': 2.5} +{'loss': 1.3857, 'grad_norm': 3.6770758628845215, 'learning_rate': 9.497525694708794e-07, 'epoch': 2.5} +{'loss': 1.3848, 'grad_norm': 3.8969197273254395, 'learning_rate': 9.478492577084126e-07, 'epoch': 2.51} +{'loss': 1.3736, 'grad_norm': 3.868818998336792, 'learning_rate': 9.459459459459461e-07, 'epoch': 2.51} +{'eval_loss': 1.884974479675293, 'eval_runtime': 39.5147, 'eval_samples_per_second': 25.307, 'eval_steps_per_second': 1.063, 'epoch': 2.51} +{'loss': 1.3899, 'grad_norm': 3.727487564086914, 'learning_rate': 9.440426341834794e-07, 'epoch': 2.51} +{'loss': 1.3953, 'grad_norm': 4.08335542678833, 'learning_rate': 9.421393224210126e-07, 'epoch': 2.51} +{'loss': 1.3524, 'grad_norm': 3.823840618133545, 'learning_rate': 9.402360106585459e-07, 'epoch': 2.51} +{'loss': 1.3548, 'grad_norm': 3.9261486530303955, 'learning_rate': 9.383326988960792e-07, 'epoch': 2.51} +{'loss': 1.3874, 'grad_norm': 3.944051742553711, 'learning_rate': 9.364293871336126e-07, 'epoch': 2.51} +{'loss': 1.4042, 'grad_norm': 3.8614444732666016, 'learning_rate': 9.345260753711458e-07, 'epoch': 2.51} +{'loss': 1.3486, 'grad_norm': 3.8116726875305176, 'learning_rate': 9.326227636086792e-07, 'epoch': 2.51} +{'loss': 1.4718, 'grad_norm': 4.096467971801758, 'learning_rate': 9.307194518462125e-07, 'epoch': 2.51} +{'loss': 1.3514, 'grad_norm': 3.746457815170288, 'learning_rate': 9.288161400837457e-07, 'epoch': 2.52} +{'loss': 1.3844, 'grad_norm': 3.676745891571045, 'learning_rate': 9.269128283212792e-07, 'epoch': 2.52} +{'eval_loss': 1.8801977634429932, 'eval_runtime': 39.6635, 'eval_samples_per_second': 25.212, 'eval_steps_per_second': 1.059, 'epoch': 2.52} +{'loss': 1.45, 'grad_norm': 3.831447124481201, 'learning_rate': 9.250095165588125e-07, 'epoch': 2.52} +{'loss': 1.4505, 'grad_norm': 3.869492530822754, 'learning_rate': 9.231062047963457e-07, 'epoch': 2.52} +{'loss': 1.4028, 'grad_norm': 3.906553268432617, 'learning_rate': 9.21202893033879e-07, 'epoch': 2.52} +{'loss': 1.4011, 'grad_norm': 3.8048441410064697, 'learning_rate': 9.192995812714123e-07, 'epoch': 2.52} +{'loss': 1.395, 'grad_norm': 3.854220151901245, 'learning_rate': 9.173962695089457e-07, 'epoch': 2.52} +{'loss': 1.4238, 'grad_norm': 3.9096851348876953, 'learning_rate': 9.154929577464789e-07, 'epoch': 2.52} +{'loss': 1.4064, 'grad_norm': 3.9367594718933105, 'learning_rate': 9.135896459840123e-07, 'epoch': 2.52} +{'loss': 1.4399, 'grad_norm': 3.9652819633483887, 'learning_rate': 9.116863342215456e-07, 'epoch': 2.52} +{'loss': 1.3504, 'grad_norm': 3.7570557594299316, 'learning_rate': 9.097830224590788e-07, 'epoch': 2.53} +{'loss': 1.4323, 'grad_norm': 3.8762097358703613, 'learning_rate': 9.078797106966122e-07, 'epoch': 2.53} +{'eval_loss': 1.8804926872253418, 'eval_runtime': 39.6482, 'eval_samples_per_second': 25.222, 'eval_steps_per_second': 1.059, 'epoch': 2.53} +{'loss': 1.3637, 'grad_norm': 3.863027572631836, 'learning_rate': 9.059763989341455e-07, 'epoch': 2.53} +{'loss': 1.3944, 'grad_norm': 3.7366769313812256, 'learning_rate': 9.040730871716788e-07, 'epoch': 2.53} +{'loss': 1.3957, 'grad_norm': 3.83339262008667, 'learning_rate': 9.021697754092121e-07, 'epoch': 2.53} +{'loss': 1.4155, 'grad_norm': 3.864362955093384, 'learning_rate': 9.002664636467453e-07, 'epoch': 2.53} +{'loss': 1.4344, 'grad_norm': 3.843515634536743, 'learning_rate': 8.983631518842788e-07, 'epoch': 2.53} +{'loss': 1.3896, 'grad_norm': 3.8208236694335938, 'learning_rate': 8.96459840121812e-07, 'epoch': 2.53} +{'loss': 1.4184, 'grad_norm': 3.736323833465576, 'learning_rate': 8.945565283593453e-07, 'epoch': 2.53} +{'loss': 1.4013, 'grad_norm': 3.874582290649414, 'learning_rate': 8.926532165968787e-07, 'epoch': 2.54} +{'loss': 1.361, 'grad_norm': 3.8031864166259766, 'learning_rate': 8.907499048344119e-07, 'epoch': 2.54} +{'loss': 1.3595, 'grad_norm': 3.6207268238067627, 'learning_rate': 8.888465930719453e-07, 'epoch': 2.54} +{'eval_loss': 1.8865208625793457, 'eval_runtime': 39.5593, 'eval_samples_per_second': 25.279, 'eval_steps_per_second': 1.062, 'epoch': 2.54} +{'loss': 1.3942, 'grad_norm': 3.8777174949645996, 'learning_rate': 8.869432813094786e-07, 'epoch': 2.54} +{'loss': 1.4159, 'grad_norm': 3.8689544200897217, 'learning_rate': 8.850399695470119e-07, 'epoch': 2.54} +{'loss': 1.3688, 'grad_norm': 3.7620460987091064, 'learning_rate': 8.831366577845452e-07, 'epoch': 2.54} +{'loss': 1.3455, 'grad_norm': 3.817760467529297, 'learning_rate': 8.812333460220784e-07, 'epoch': 2.54} +{'loss': 1.414, 'grad_norm': 3.751615524291992, 'learning_rate': 8.793300342596119e-07, 'epoch': 2.54} +{'loss': 1.4323, 'grad_norm': 3.9047205448150635, 'learning_rate': 8.774267224971451e-07, 'epoch': 2.54} +{'loss': 1.3596, 'grad_norm': 3.8950581550598145, 'learning_rate': 8.755234107346784e-07, 'epoch': 2.54} +{'loss': 1.3835, 'grad_norm': 3.673219680786133, 'learning_rate': 8.736200989722117e-07, 'epoch': 2.55} +{'loss': 1.4184, 'grad_norm': 3.797407388687134, 'learning_rate': 8.71716787209745e-07, 'epoch': 2.55} +{'loss': 1.3327, 'grad_norm': 3.838111639022827, 'learning_rate': 8.698134754472784e-07, 'epoch': 2.55} +{'eval_loss': 1.8825849294662476, 'eval_runtime': 39.6267, 'eval_samples_per_second': 25.236, 'eval_steps_per_second': 1.06, 'epoch': 2.55} +{'loss': 1.4131, 'grad_norm': 3.9529426097869873, 'learning_rate': 8.679101636848116e-07, 'epoch': 2.55} +{'loss': 1.3954, 'grad_norm': 3.6625475883483887, 'learning_rate': 8.66006851922345e-07, 'epoch': 2.55} +{'loss': 1.3762, 'grad_norm': 3.615492105484009, 'learning_rate': 8.641035401598783e-07, 'epoch': 2.55} +{'loss': 1.4124, 'grad_norm': 3.9706532955169678, 'learning_rate': 8.622002283974115e-07, 'epoch': 2.55} +{'loss': 1.3766, 'grad_norm': 3.911755323410034, 'learning_rate': 8.60296916634945e-07, 'epoch': 2.55} +{'loss': 1.3699, 'grad_norm': 3.855879306793213, 'learning_rate': 8.583936048724782e-07, 'epoch': 2.55} +{'loss': 1.3933, 'grad_norm': 3.891148090362549, 'learning_rate': 8.564902931100115e-07, 'epoch': 2.55} +{'loss': 1.3948, 'grad_norm': 3.763122320175171, 'learning_rate': 8.545869813475448e-07, 'epoch': 2.56} +{'loss': 1.4161, 'grad_norm': 3.916998863220215, 'learning_rate': 8.526836695850781e-07, 'epoch': 2.56} +{'loss': 1.4063, 'grad_norm': 4.001009941101074, 'learning_rate': 8.507803578226115e-07, 'epoch': 2.56} +{'eval_loss': 1.8867930173873901, 'eval_runtime': 39.6896, 'eval_samples_per_second': 25.196, 'eval_steps_per_second': 1.058, 'epoch': 2.56} +{'loss': 1.3658, 'grad_norm': 4.010340690612793, 'learning_rate': 8.488770460601447e-07, 'epoch': 2.56} +{'loss': 1.3736, 'grad_norm': 3.8056952953338623, 'learning_rate': 8.46973734297678e-07, 'epoch': 2.56} +{'loss': 1.3746, 'grad_norm': 3.9626150131225586, 'learning_rate': 8.450704225352114e-07, 'epoch': 2.56} +{'loss': 1.4084, 'grad_norm': 3.8164234161376953, 'learning_rate': 8.431671107727446e-07, 'epoch': 2.56} +{'loss': 1.3912, 'grad_norm': 3.808336019515991, 'learning_rate': 8.41263799010278e-07, 'epoch': 2.56} +{'loss': 1.3882, 'grad_norm': 3.9413936138153076, 'learning_rate': 8.393604872478113e-07, 'epoch': 2.56} +{'loss': 1.3907, 'grad_norm': 3.8775124549865723, 'learning_rate': 8.374571754853446e-07, 'epoch': 2.56} +{'loss': 1.3944, 'grad_norm': 3.887746572494507, 'learning_rate': 8.355538637228778e-07, 'epoch': 2.57} +{'loss': 1.4129, 'grad_norm': 3.780708074569702, 'learning_rate': 8.336505519604111e-07, 'epoch': 2.57} +{'loss': 1.438, 'grad_norm': 4.096318244934082, 'learning_rate': 8.317472401979445e-07, 'epoch': 2.57} +{'eval_loss': 1.89131760597229, 'eval_runtime': 39.7477, 'eval_samples_per_second': 25.159, 'eval_steps_per_second': 1.057, 'epoch': 2.57} +{'loss': 1.4175, 'grad_norm': 4.045408725738525, 'learning_rate': 8.298439284354778e-07, 'epoch': 2.57} +{'loss': 1.419, 'grad_norm': 3.8963425159454346, 'learning_rate': 8.279406166730111e-07, 'epoch': 2.57} +{'loss': 1.3956, 'grad_norm': 3.8797616958618164, 'learning_rate': 8.260373049105444e-07, 'epoch': 2.57} +{'loss': 1.4458, 'grad_norm': 3.832763671875, 'learning_rate': 8.241339931480777e-07, 'epoch': 2.57} +{'loss': 1.4051, 'grad_norm': 3.730623245239258, 'learning_rate': 8.22230681385611e-07, 'epoch': 2.57} +{'loss': 1.3592, 'grad_norm': 4.017776966094971, 'learning_rate': 8.203273696231444e-07, 'epoch': 2.57} +{'loss': 1.4086, 'grad_norm': 3.8183741569519043, 'learning_rate': 8.184240578606777e-07, 'epoch': 2.58} +{'loss': 1.3647, 'grad_norm': 3.6962153911590576, 'learning_rate': 8.165207460982109e-07, 'epoch': 2.58} +{'loss': 1.4083, 'grad_norm': 3.8572189807891846, 'learning_rate': 8.146174343357442e-07, 'epoch': 2.58} +{'loss': 1.4116, 'grad_norm': 3.806687355041504, 'learning_rate': 8.127141225732775e-07, 'epoch': 2.58} +{'eval_loss': 1.8837863206863403, 'eval_runtime': 39.7534, 'eval_samples_per_second': 25.155, 'eval_steps_per_second': 1.057, 'epoch': 2.58} +{'loss': 1.395, 'grad_norm': 3.9477732181549072, 'learning_rate': 8.108108108108109e-07, 'epoch': 2.58} +{'loss': 1.3924, 'grad_norm': 4.0018696784973145, 'learning_rate': 8.089074990483442e-07, 'epoch': 2.58} +{'loss': 1.4031, 'grad_norm': 3.997870683670044, 'learning_rate': 8.070041872858774e-07, 'epoch': 2.58} +{'loss': 1.3361, 'grad_norm': 3.7380313873291016, 'learning_rate': 8.051008755234108e-07, 'epoch': 2.58} +{'loss': 1.4005, 'grad_norm': 3.9691057205200195, 'learning_rate': 8.031975637609441e-07, 'epoch': 2.58} +{'loss': 1.4148, 'grad_norm': 4.008757591247559, 'learning_rate': 8.012942519984774e-07, 'epoch': 2.58} +{'loss': 1.4075, 'grad_norm': 3.8459408283233643, 'learning_rate': 7.993909402360108e-07, 'epoch': 2.59} +{'loss': 1.3964, 'grad_norm': 3.96577525138855, 'learning_rate': 7.97487628473544e-07, 'epoch': 2.59} +{'loss': 1.3515, 'grad_norm': 3.769768238067627, 'learning_rate': 7.955843167110773e-07, 'epoch': 2.59} +{'loss': 1.3652, 'grad_norm': 3.8873584270477295, 'learning_rate': 7.936810049486106e-07, 'epoch': 2.59} +{'eval_loss': 1.8840581178665161, 'eval_runtime': 39.741, 'eval_samples_per_second': 25.163, 'eval_steps_per_second': 1.057, 'epoch': 2.59} +{'loss': 1.4016, 'grad_norm': 3.9510862827301025, 'learning_rate': 7.91777693186144e-07, 'epoch': 2.59} +{'loss': 1.3855, 'grad_norm': 3.8778483867645264, 'learning_rate': 7.898743814236773e-07, 'epoch': 2.59} +{'loss': 1.4441, 'grad_norm': 4.035155773162842, 'learning_rate': 7.879710696612105e-07, 'epoch': 2.59} +{'loss': 1.3981, 'grad_norm': 3.9134511947631836, 'learning_rate': 7.860677578987438e-07, 'epoch': 2.59} +{'loss': 1.3706, 'grad_norm': 3.7907378673553467, 'learning_rate': 7.841644461362772e-07, 'epoch': 2.59} +{'loss': 1.4297, 'grad_norm': 3.9794673919677734, 'learning_rate': 7.822611343738105e-07, 'epoch': 2.59} +{'loss': 1.3715, 'grad_norm': 3.790806770324707, 'learning_rate': 7.803578226113438e-07, 'epoch': 2.6} +{'loss': 1.4004, 'grad_norm': 3.820833444595337, 'learning_rate': 7.784545108488771e-07, 'epoch': 2.6} +{'loss': 1.4543, 'grad_norm': 3.9757015705108643, 'learning_rate': 7.765511990864104e-07, 'epoch': 2.6} +{'loss': 1.3909, 'grad_norm': 3.7083210945129395, 'learning_rate': 7.746478873239437e-07, 'epoch': 2.6} +{'eval_loss': 1.8823740482330322, 'eval_runtime': 39.8439, 'eval_samples_per_second': 25.098, 'eval_steps_per_second': 1.054, 'epoch': 2.6} +{'loss': 1.4068, 'grad_norm': 3.8906664848327637, 'learning_rate': 7.727445755614771e-07, 'epoch': 2.6} +{'loss': 1.4127, 'grad_norm': 3.9688336849212646, 'learning_rate': 7.708412637990104e-07, 'epoch': 2.6} +{'loss': 1.3752, 'grad_norm': 3.795198440551758, 'learning_rate': 7.689379520365436e-07, 'epoch': 2.6} +{'loss': 1.4121, 'grad_norm': 3.9142467975616455, 'learning_rate': 7.670346402740769e-07, 'epoch': 2.6} +{'loss': 1.4219, 'grad_norm': 3.763429641723633, 'learning_rate': 7.651313285116103e-07, 'epoch': 2.6} +{'loss': 1.3555, 'grad_norm': 3.9204156398773193, 'learning_rate': 7.632280167491436e-07, 'epoch': 2.61} +{'loss': 1.3314, 'grad_norm': 3.8489737510681152, 'learning_rate': 7.613247049866769e-07, 'epoch': 2.61} +{'loss': 1.3721, 'grad_norm': 3.657639503479004, 'learning_rate': 7.594213932242102e-07, 'epoch': 2.61} +{'loss': 1.3896, 'grad_norm': 3.8925728797912598, 'learning_rate': 7.575180814617435e-07, 'epoch': 2.61} +{'loss': 1.4013, 'grad_norm': 3.8714938163757324, 'learning_rate': 7.556147696992767e-07, 'epoch': 2.61} +{'eval_loss': 1.8822592496871948, 'eval_runtime': 39.7479, 'eval_samples_per_second': 25.159, 'eval_steps_per_second': 1.057, 'epoch': 2.61} +{'loss': 1.3642, 'grad_norm': 3.8305976390838623, 'learning_rate': 7.537114579368102e-07, 'epoch': 2.61} +{'loss': 1.3769, 'grad_norm': 3.8417131900787354, 'learning_rate': 7.518081461743435e-07, 'epoch': 2.61} +{'loss': 1.4095, 'grad_norm': 3.970949649810791, 'learning_rate': 7.499048344118767e-07, 'epoch': 2.61} +{'loss': 1.3775, 'grad_norm': 3.74432373046875, 'learning_rate': 7.4800152264941e-07, 'epoch': 2.61} +{'loss': 1.3717, 'grad_norm': 3.760646104812622, 'learning_rate': 7.460982108869432e-07, 'epoch': 2.61} +{'loss': 1.4117, 'grad_norm': 3.9182989597320557, 'learning_rate': 7.441948991244767e-07, 'epoch': 2.62} +{'loss': 1.4202, 'grad_norm': 3.8708109855651855, 'learning_rate': 7.4229158736201e-07, 'epoch': 2.62} +{'loss': 1.3635, 'grad_norm': 3.8293240070343018, 'learning_rate': 7.403882755995432e-07, 'epoch': 2.62} +{'loss': 1.3253, 'grad_norm': 3.7582433223724365, 'learning_rate': 7.384849638370766e-07, 'epoch': 2.62} +{'loss': 1.3863, 'grad_norm': 3.825119733810425, 'learning_rate': 7.365816520746098e-07, 'epoch': 2.62} +{'eval_loss': 1.8822712898254395, 'eval_runtime': 39.854, 'eval_samples_per_second': 25.092, 'eval_steps_per_second': 1.054, 'epoch': 2.62} +{'loss': 1.372, 'grad_norm': 3.818189859390259, 'learning_rate': 7.346783403121432e-07, 'epoch': 2.62} +{'loss': 1.3871, 'grad_norm': 3.9651031494140625, 'learning_rate': 7.327750285496766e-07, 'epoch': 2.62} +{'loss': 1.3474, 'grad_norm': 3.674095869064331, 'learning_rate': 7.308717167872098e-07, 'epoch': 2.62} +{'loss': 1.4468, 'grad_norm': 3.8565824031829834, 'learning_rate': 7.289684050247431e-07, 'epoch': 2.62} +{'loss': 1.3302, 'grad_norm': 3.6978604793548584, 'learning_rate': 7.270650932622763e-07, 'epoch': 2.62} +{'loss': 1.4116, 'grad_norm': 3.8465383052825928, 'learning_rate': 7.251617814998098e-07, 'epoch': 2.63} +{'loss': 1.3667, 'grad_norm': 3.817852258682251, 'learning_rate': 7.232584697373431e-07, 'epoch': 2.63} +{'loss': 1.3926, 'grad_norm': 3.681070327758789, 'learning_rate': 7.213551579748763e-07, 'epoch': 2.63} +{'loss': 1.4701, 'grad_norm': 3.9513540267944336, 'learning_rate': 7.194518462124096e-07, 'epoch': 2.63} +{'loss': 1.3706, 'grad_norm': 3.857112407684326, 'learning_rate': 7.175485344499429e-07, 'epoch': 2.63} +{'eval_loss': 1.8817071914672852, 'eval_runtime': 39.7788, 'eval_samples_per_second': 25.139, 'eval_steps_per_second': 1.056, 'epoch': 2.63} +{'loss': 1.4675, 'grad_norm': 3.8915369510650635, 'learning_rate': 7.156452226874763e-07, 'epoch': 2.63} +{'loss': 1.3788, 'grad_norm': 3.814026117324829, 'learning_rate': 7.137419109250096e-07, 'epoch': 2.63} +{'loss': 1.3755, 'grad_norm': 3.8298401832580566, 'learning_rate': 7.118385991625429e-07, 'epoch': 2.63} +{'loss': 1.3813, 'grad_norm': 3.8778364658355713, 'learning_rate': 7.099352874000762e-07, 'epoch': 2.63} +{'loss': 1.3565, 'grad_norm': 3.711390495300293, 'learning_rate': 7.080319756376094e-07, 'epoch': 2.63} +{'loss': 1.3929, 'grad_norm': 3.873267412185669, 'learning_rate': 7.061286638751429e-07, 'epoch': 2.64} +{'loss': 1.3465, 'grad_norm': 3.8934457302093506, 'learning_rate': 7.042253521126762e-07, 'epoch': 2.64} +{'loss': 1.4257, 'grad_norm': 3.9411487579345703, 'learning_rate': 7.023220403502094e-07, 'epoch': 2.64} +{'loss': 1.4179, 'grad_norm': 3.9741506576538086, 'learning_rate': 7.004187285877427e-07, 'epoch': 2.64} +{'loss': 1.3893, 'grad_norm': 3.7731056213378906, 'learning_rate': 6.98515416825276e-07, 'epoch': 2.64} +{'eval_loss': 1.8830575942993164, 'eval_runtime': 39.8904, 'eval_samples_per_second': 25.069, 'eval_steps_per_second': 1.053, 'epoch': 2.64} +{'loss': 1.4404, 'grad_norm': 3.913270950317383, 'learning_rate': 6.966121050628094e-07, 'epoch': 2.64} +{'loss': 1.4007, 'grad_norm': 3.860281467437744, 'learning_rate': 6.947087933003427e-07, 'epoch': 2.64} +{'loss': 1.4414, 'grad_norm': 3.9857051372528076, 'learning_rate': 6.92805481537876e-07, 'epoch': 2.64} +{'loss': 1.4128, 'grad_norm': 3.735942840576172, 'learning_rate': 6.909021697754093e-07, 'epoch': 2.64} +{'loss': 1.3728, 'grad_norm': 3.7014577388763428, 'learning_rate': 6.889988580129425e-07, 'epoch': 2.65} +{'loss': 1.4169, 'grad_norm': 3.85150146484375, 'learning_rate': 6.870955462504759e-07, 'epoch': 2.65} +{'loss': 1.3972, 'grad_norm': 3.596527576446533, 'learning_rate': 6.851922344880093e-07, 'epoch': 2.65} +{'loss': 1.403, 'grad_norm': 4.0174241065979, 'learning_rate': 6.832889227255425e-07, 'epoch': 2.65} +{'loss': 1.4024, 'grad_norm': 3.818106174468994, 'learning_rate': 6.813856109630758e-07, 'epoch': 2.65} +{'loss': 1.3611, 'grad_norm': 3.7481136322021484, 'learning_rate': 6.79482299200609e-07, 'epoch': 2.65} +{'eval_loss': 1.8827629089355469, 'eval_runtime': 39.8462, 'eval_samples_per_second': 25.096, 'eval_steps_per_second': 1.054, 'epoch': 2.65} +{'loss': 1.4401, 'grad_norm': 4.032146453857422, 'learning_rate': 6.775789874381425e-07, 'epoch': 2.65} +{'loss': 1.4151, 'grad_norm': 4.041475772857666, 'learning_rate': 6.756756756756758e-07, 'epoch': 2.65} +{'loss': 1.4097, 'grad_norm': 3.8888063430786133, 'learning_rate': 6.73772363913209e-07, 'epoch': 2.65} +{'loss': 1.3519, 'grad_norm': 3.834587812423706, 'learning_rate': 6.718690521507424e-07, 'epoch': 2.65} +{'loss': 1.4229, 'grad_norm': 3.7665910720825195, 'learning_rate': 6.699657403882756e-07, 'epoch': 2.66} +{'loss': 1.4305, 'grad_norm': 3.7820255756378174, 'learning_rate': 6.68062428625809e-07, 'epoch': 2.66} +{'loss': 1.4143, 'grad_norm': 3.840729236602783, 'learning_rate': 6.661591168633424e-07, 'epoch': 2.66} +{'loss': 1.4096, 'grad_norm': 3.7404730319976807, 'learning_rate': 6.642558051008756e-07, 'epoch': 2.66} +{'loss': 1.4037, 'grad_norm': 3.9895193576812744, 'learning_rate': 6.623524933384089e-07, 'epoch': 2.66} +{'loss': 1.3708, 'grad_norm': 3.8034799098968506, 'learning_rate': 6.604491815759421e-07, 'epoch': 2.66} +{'eval_loss': 1.8798842430114746, 'eval_runtime': 40.0324, 'eval_samples_per_second': 24.98, 'eval_steps_per_second': 1.049, 'epoch': 2.66} +{'loss': 1.3426, 'grad_norm': 3.634685516357422, 'learning_rate': 6.585458698134756e-07, 'epoch': 2.66} +{'loss': 1.4102, 'grad_norm': 3.8531060218811035, 'learning_rate': 6.566425580510089e-07, 'epoch': 2.66} +{'loss': 1.4263, 'grad_norm': 3.7603073120117188, 'learning_rate': 6.547392462885421e-07, 'epoch': 2.66} +{'loss': 1.3852, 'grad_norm': 3.8385210037231445, 'learning_rate': 6.528359345260754e-07, 'epoch': 2.66} +{'loss': 1.4012, 'grad_norm': 3.7867016792297363, 'learning_rate': 6.509326227636087e-07, 'epoch': 2.67} +{'loss': 1.3896, 'grad_norm': 3.7447903156280518, 'learning_rate': 6.490293110011421e-07, 'epoch': 2.67} +{'loss': 1.4377, 'grad_norm': 3.809674024581909, 'learning_rate': 6.471259992386753e-07, 'epoch': 2.67} +{'loss': 1.3886, 'grad_norm': 3.888641834259033, 'learning_rate': 6.452226874762087e-07, 'epoch': 2.67} +{'loss': 1.4087, 'grad_norm': 3.8340442180633545, 'learning_rate': 6.43319375713742e-07, 'epoch': 2.67} +{'loss': 1.3968, 'grad_norm': 3.6957991123199463, 'learning_rate': 6.414160639512752e-07, 'epoch': 2.67} +{'eval_loss': 1.8869060277938843, 'eval_runtime': 40.2127, 'eval_samples_per_second': 24.868, 'eval_steps_per_second': 1.044, 'epoch': 2.67} +{'loss': 1.3691, 'grad_norm': 3.924255609512329, 'learning_rate': 6.395127521888085e-07, 'epoch': 2.67} +{'loss': 1.4472, 'grad_norm': 3.848115921020508, 'learning_rate': 6.37609440426342e-07, 'epoch': 2.67} +{'loss': 1.4098, 'grad_norm': 3.869439125061035, 'learning_rate': 6.357061286638752e-07, 'epoch': 2.67} +{'loss': 1.4394, 'grad_norm': 3.947949171066284, 'learning_rate': 6.338028169014085e-07, 'epoch': 2.67} +{'loss': 1.3112, 'grad_norm': 3.9090356826782227, 'learning_rate': 6.318995051389418e-07, 'epoch': 2.68} +{'loss': 1.4146, 'grad_norm': 3.969165563583374, 'learning_rate': 6.299961933764751e-07, 'epoch': 2.68} +{'loss': 1.486, 'grad_norm': 4.098067760467529, 'learning_rate': 6.280928816140084e-07, 'epoch': 2.68} +{'loss': 1.4267, 'grad_norm': 3.9414467811584473, 'learning_rate': 6.261895698515417e-07, 'epoch': 2.68} +{'loss': 1.3715, 'grad_norm': 3.9655747413635254, 'learning_rate': 6.242862580890751e-07, 'epoch': 2.68} +{'loss': 1.3318, 'grad_norm': 3.8533565998077393, 'learning_rate': 6.223829463266084e-07, 'epoch': 2.68} +{'eval_loss': 1.8865827322006226, 'eval_runtime': 39.8893, 'eval_samples_per_second': 25.069, 'eval_steps_per_second': 1.053, 'epoch': 2.68} +{'loss': 1.3977, 'grad_norm': 3.961475372314453, 'learning_rate': 6.204796345641417e-07, 'epoch': 2.68} +{'loss': 1.4292, 'grad_norm': 3.934936046600342, 'learning_rate': 6.185763228016749e-07, 'epoch': 2.68} +{'loss': 1.4465, 'grad_norm': 4.020622253417969, 'learning_rate': 6.166730110392083e-07, 'epoch': 2.68} +{'loss': 1.4033, 'grad_norm': 3.939596652984619, 'learning_rate': 6.147696992767416e-07, 'epoch': 2.69} +{'loss': 1.3913, 'grad_norm': 3.922213554382324, 'learning_rate': 6.128663875142749e-07, 'epoch': 2.69} +{'loss': 1.4594, 'grad_norm': 3.8975822925567627, 'learning_rate': 6.109630757518082e-07, 'epoch': 2.69} +{'loss': 1.3611, 'grad_norm': 3.819105863571167, 'learning_rate': 6.090597639893414e-07, 'epoch': 2.69} +{'loss': 1.4011, 'grad_norm': 3.7825939655303955, 'learning_rate': 6.071564522268748e-07, 'epoch': 2.69} +{'loss': 1.5104, 'grad_norm': 4.23924446105957, 'learning_rate': 6.052531404644082e-07, 'epoch': 2.69} +{'loss': 1.3833, 'grad_norm': 3.7282018661499023, 'learning_rate': 6.033498287019414e-07, 'epoch': 2.69} +{'eval_loss': 1.8851450681686401, 'eval_runtime': 39.9068, 'eval_samples_per_second': 25.058, 'eval_steps_per_second': 1.052, 'epoch': 2.69} +{'loss': 1.387, 'grad_norm': 3.9149973392486572, 'learning_rate': 6.014465169394748e-07, 'epoch': 2.69} +{'loss': 1.3803, 'grad_norm': 3.8654725551605225, 'learning_rate': 5.99543205177008e-07, 'epoch': 2.69} +{'loss': 1.379, 'grad_norm': 3.8922040462493896, 'learning_rate': 5.976398934145414e-07, 'epoch': 2.69} +{'loss': 1.4009, 'grad_norm': 3.9166221618652344, 'learning_rate': 5.957365816520747e-07, 'epoch': 2.7} +{'loss': 1.3813, 'grad_norm': 3.750443458557129, 'learning_rate': 5.938332698896079e-07, 'epoch': 2.7} +{'loss': 1.3928, 'grad_norm': 3.8591694831848145, 'learning_rate': 5.919299581271413e-07, 'epoch': 2.7} +{'loss': 1.3803, 'grad_norm': 3.6287693977355957, 'learning_rate': 5.900266463646745e-07, 'epoch': 2.7} +{'loss': 1.4274, 'grad_norm': 3.7702808380126953, 'learning_rate': 5.881233346022079e-07, 'epoch': 2.7} +{'loss': 1.4496, 'grad_norm': 3.975019693374634, 'learning_rate': 5.862200228397411e-07, 'epoch': 2.7} +{'loss': 1.4031, 'grad_norm': 3.732516050338745, 'learning_rate': 5.843167110772745e-07, 'epoch': 2.7} +{'eval_loss': 1.8859587907791138, 'eval_runtime': 40.0152, 'eval_samples_per_second': 24.991, 'eval_steps_per_second': 1.05, 'epoch': 2.7} +{'loss': 1.3597, 'grad_norm': 3.8270859718322754, 'learning_rate': 5.824133993148079e-07, 'epoch': 2.7} +{'loss': 1.3436, 'grad_norm': 3.7131991386413574, 'learning_rate': 5.805100875523411e-07, 'epoch': 2.7} +{'loss': 1.3688, 'grad_norm': 3.846182107925415, 'learning_rate': 5.786067757898745e-07, 'epoch': 2.7} +{'loss': 1.4652, 'grad_norm': 3.990445852279663, 'learning_rate': 5.767034640274078e-07, 'epoch': 2.71} +{'loss': 1.3671, 'grad_norm': 3.911424398422241, 'learning_rate': 5.74800152264941e-07, 'epoch': 2.71} +{'loss': 1.4469, 'grad_norm': 4.01072883605957, 'learning_rate': 5.728968405024744e-07, 'epoch': 2.71} +{'loss': 1.4065, 'grad_norm': 3.9382660388946533, 'learning_rate': 5.709935287400076e-07, 'epoch': 2.71} +{'loss': 1.3739, 'grad_norm': 3.8436427116394043, 'learning_rate': 5.69090216977541e-07, 'epoch': 2.71} +{'loss': 1.3798, 'grad_norm': 3.6897571086883545, 'learning_rate': 5.671869052150742e-07, 'epoch': 2.71} +{'loss': 1.3841, 'grad_norm': 3.790116786956787, 'learning_rate': 5.652835934526075e-07, 'epoch': 2.71} +{'eval_loss': 1.8809692859649658, 'eval_runtime': 39.8363, 'eval_samples_per_second': 25.103, 'eval_steps_per_second': 1.054, 'epoch': 2.71} +{'loss': 1.3646, 'grad_norm': 3.708942174911499, 'learning_rate': 5.633802816901409e-07, 'epoch': 2.71} +{'loss': 1.3844, 'grad_norm': 3.8552048206329346, 'learning_rate': 5.614769699276742e-07, 'epoch': 2.71} +{'loss': 1.3855, 'grad_norm': 3.7647571563720703, 'learning_rate': 5.595736581652075e-07, 'epoch': 2.71} +{'loss': 1.4147, 'grad_norm': 3.8995256423950195, 'learning_rate': 5.576703464027408e-07, 'epoch': 2.72} +{'loss': 1.394, 'grad_norm': 3.9707953929901123, 'learning_rate': 5.557670346402741e-07, 'epoch': 2.72} +{'loss': 1.4252, 'grad_norm': 3.9724373817443848, 'learning_rate': 5.538637228778075e-07, 'epoch': 2.72} +{'loss': 1.3544, 'grad_norm': 3.7825074195861816, 'learning_rate': 5.519604111153407e-07, 'epoch': 2.72} +{'loss': 1.4441, 'grad_norm': 3.947127103805542, 'learning_rate': 5.500570993528741e-07, 'epoch': 2.72} +{'loss': 1.4599, 'grad_norm': 3.914632558822632, 'learning_rate': 5.481537875904073e-07, 'epoch': 2.72} +{'loss': 1.3789, 'grad_norm': 3.91018009185791, 'learning_rate': 5.462504758279406e-07, 'epoch': 2.72} +{'eval_loss': 1.878528118133545, 'eval_runtime': 39.9131, 'eval_samples_per_second': 25.054, 'eval_steps_per_second': 1.052, 'epoch': 2.72} +{'loss': 1.3975, 'grad_norm': 3.774101734161377, 'learning_rate': 5.44347164065474e-07, 'epoch': 2.72} +{'loss': 1.4603, 'grad_norm': 4.146400451660156, 'learning_rate': 5.424438523030072e-07, 'epoch': 2.72} +{'loss': 1.4214, 'grad_norm': 3.725928783416748, 'learning_rate': 5.405405405405406e-07, 'epoch': 2.73} +{'loss': 1.4282, 'grad_norm': 3.8616065979003906, 'learning_rate': 5.386372287780739e-07, 'epoch': 2.73} +{'loss': 1.3908, 'grad_norm': 3.8246281147003174, 'learning_rate': 5.367339170156072e-07, 'epoch': 2.73} +{'loss': 1.4109, 'grad_norm': 3.9759585857391357, 'learning_rate': 5.348306052531406e-07, 'epoch': 2.73} +{'loss': 1.3572, 'grad_norm': 3.9480018615722656, 'learning_rate': 5.329272934906738e-07, 'epoch': 2.73} +{'loss': 1.436, 'grad_norm': 3.8713862895965576, 'learning_rate': 5.310239817282072e-07, 'epoch': 2.73} +{'loss': 1.3742, 'grad_norm': 3.8037495613098145, 'learning_rate': 5.291206699657404e-07, 'epoch': 2.73} +{'loss': 1.4028, 'grad_norm': 3.841688871383667, 'learning_rate': 5.272173582032737e-07, 'epoch': 2.73} +{'eval_loss': 1.8792674541473389, 'eval_runtime': 39.9996, 'eval_samples_per_second': 25.0, 'eval_steps_per_second': 1.05, 'epoch': 2.73} +{'loss': 1.3713, 'grad_norm': 3.6583614349365234, 'learning_rate': 5.253140464408071e-07, 'epoch': 2.73} +{'loss': 1.3636, 'grad_norm': 3.880516290664673, 'learning_rate': 5.234107346783403e-07, 'epoch': 2.73} +{'loss': 1.3818, 'grad_norm': 3.8144097328186035, 'learning_rate': 5.215074229158737e-07, 'epoch': 2.74} +{'loss': 1.3997, 'grad_norm': 3.9771406650543213, 'learning_rate': 5.196041111534069e-07, 'epoch': 2.74} +{'loss': 1.4176, 'grad_norm': 4.109496593475342, 'learning_rate': 5.177007993909403e-07, 'epoch': 2.74} +{'loss': 1.4086, 'grad_norm': 3.7382779121398926, 'learning_rate': 5.157974876284736e-07, 'epoch': 2.74} +{'loss': 1.3916, 'grad_norm': 3.69738507270813, 'learning_rate': 5.138941758660069e-07, 'epoch': 2.74} +{'loss': 1.351, 'grad_norm': 3.8113343715667725, 'learning_rate': 5.119908641035403e-07, 'epoch': 2.74} +{'loss': 1.3715, 'grad_norm': 3.838883399963379, 'learning_rate': 5.100875523410735e-07, 'epoch': 2.74} +{'loss': 1.3848, 'grad_norm': 4.007609844207764, 'learning_rate': 5.081842405786068e-07, 'epoch': 2.74} +{'eval_loss': 1.8782685995101929, 'eval_runtime': 40.064, 'eval_samples_per_second': 24.96, 'eval_steps_per_second': 1.048, 'epoch': 2.74} +{'loss': 1.4102, 'grad_norm': 4.021542072296143, 'learning_rate': 5.062809288161402e-07, 'epoch': 2.74} +{'loss': 1.4181, 'grad_norm': 3.822211742401123, 'learning_rate': 5.043776170536734e-07, 'epoch': 2.74} +{'loss': 1.4071, 'grad_norm': 3.9262492656707764, 'learning_rate': 5.024743052912067e-07, 'epoch': 2.75} +{'loss': 1.4325, 'grad_norm': 3.8525044918060303, 'learning_rate': 5.0057099352874e-07, 'epoch': 2.75} +{'loss': 1.4025, 'grad_norm': 3.8408403396606445, 'learning_rate': 4.986676817662733e-07, 'epoch': 2.75} +{'loss': 1.4127, 'grad_norm': 3.8544201850891113, 'learning_rate': 4.967643700038067e-07, 'epoch': 2.75} +{'loss': 1.4484, 'grad_norm': 3.9865224361419678, 'learning_rate': 4.9486105824134e-07, 'epoch': 2.75} +{'loss': 1.4228, 'grad_norm': 3.8513691425323486, 'learning_rate': 4.929577464788733e-07, 'epoch': 2.75} +{'loss': 1.3439, 'grad_norm': 3.7481565475463867, 'learning_rate': 4.910544347164066e-07, 'epoch': 2.75} +{'loss': 1.3991, 'grad_norm': 3.7829861640930176, 'learning_rate': 4.891511229539399e-07, 'epoch': 2.75} +{'eval_loss': 1.8755896091461182, 'eval_runtime': 40.1929, 'eval_samples_per_second': 24.88, 'eval_steps_per_second': 1.045, 'epoch': 2.75} +{'loss': 1.3726, 'grad_norm': 3.7487716674804688, 'learning_rate': 4.872478111914733e-07, 'epoch': 2.75} +{'loss': 1.3923, 'grad_norm': 3.861785650253296, 'learning_rate': 4.853444994290065e-07, 'epoch': 2.76} +{'loss': 1.3688, 'grad_norm': 3.7650818824768066, 'learning_rate': 4.834411876665398e-07, 'epoch': 2.76} +{'loss': 1.4171, 'grad_norm': 3.9085147380828857, 'learning_rate': 4.815378759040731e-07, 'epoch': 2.76} +{'loss': 1.3771, 'grad_norm': 3.9094717502593994, 'learning_rate': 4.796345641416064e-07, 'epoch': 2.76} +{'loss': 1.4028, 'grad_norm': 3.6745688915252686, 'learning_rate': 4.777312523791397e-07, 'epoch': 2.76} +{'loss': 1.4227, 'grad_norm': 4.000057220458984, 'learning_rate': 4.7582794061667306e-07, 'epoch': 2.76} +{'loss': 1.3918, 'grad_norm': 3.7640957832336426, 'learning_rate': 4.739246288542063e-07, 'epoch': 2.76} +{'loss': 1.4599, 'grad_norm': 3.8789303302764893, 'learning_rate': 4.720213170917397e-07, 'epoch': 2.76} +{'loss': 1.3642, 'grad_norm': 3.6986827850341797, 'learning_rate': 4.7011800532927293e-07, 'epoch': 2.76} +{'eval_loss': 1.8756881952285767, 'eval_runtime': 40.2434, 'eval_samples_per_second': 24.849, 'eval_steps_per_second': 1.044, 'epoch': 2.76} +{'loss': 1.4233, 'grad_norm': 3.954890489578247, 'learning_rate': 4.682146935668063e-07, 'epoch': 2.76} +{'loss': 1.3991, 'grad_norm': 4.0259552001953125, 'learning_rate': 4.663113818043396e-07, 'epoch': 2.77} +{'loss': 1.4064, 'grad_norm': 3.7902672290802, 'learning_rate': 4.6440807004187286e-07, 'epoch': 2.77} +{'loss': 1.3802, 'grad_norm': 3.850297212600708, 'learning_rate': 4.625047582794062e-07, 'epoch': 2.77} +{'loss': 1.3925, 'grad_norm': 3.7800862789154053, 'learning_rate': 4.606014465169395e-07, 'epoch': 2.77} +{'loss': 1.3941, 'grad_norm': 3.613258123397827, 'learning_rate': 4.5869813475447284e-07, 'epoch': 2.77} +{'loss': 1.4057, 'grad_norm': 3.944683790206909, 'learning_rate': 4.5679482299200615e-07, 'epoch': 2.77} +{'loss': 1.3352, 'grad_norm': 3.826795816421509, 'learning_rate': 4.548915112295394e-07, 'epoch': 2.77} +{'loss': 1.3898, 'grad_norm': 3.901362895965576, 'learning_rate': 4.529881994670728e-07, 'epoch': 2.77} +{'loss': 1.403, 'grad_norm': 3.8098769187927246, 'learning_rate': 4.5108488770460603e-07, 'epoch': 2.77} +{'eval_loss': 1.8781360387802124, 'eval_runtime': 40.3275, 'eval_samples_per_second': 24.797, 'eval_steps_per_second': 1.041, 'epoch': 2.77} +{'loss': 1.2881, 'grad_norm': 3.648611307144165, 'learning_rate': 4.491815759421394e-07, 'epoch': 2.77} +{'loss': 1.335, 'grad_norm': 4.0549774169921875, 'learning_rate': 4.4727826417967265e-07, 'epoch': 2.78} +{'loss': 1.3988, 'grad_norm': 3.799110174179077, 'learning_rate': 4.4537495241720596e-07, 'epoch': 2.78} +{'loss': 1.4039, 'grad_norm': 3.8234822750091553, 'learning_rate': 4.434716406547393e-07, 'epoch': 2.78} +{'loss': 1.3781, 'grad_norm': 4.1356096267700195, 'learning_rate': 4.415683288922726e-07, 'epoch': 2.78} +{'loss': 1.3514, 'grad_norm': 3.8113937377929688, 'learning_rate': 4.3966501712980594e-07, 'epoch': 2.78} +{'loss': 1.4492, 'grad_norm': 3.9750049114227295, 'learning_rate': 4.377617053673392e-07, 'epoch': 2.78} +{'loss': 1.4118, 'grad_norm': 3.987436056137085, 'learning_rate': 4.358583936048725e-07, 'epoch': 2.78} +{'loss': 1.4007, 'grad_norm': 3.8864099979400635, 'learning_rate': 4.339550818424058e-07, 'epoch': 2.78} +{'loss': 1.4255, 'grad_norm': 3.940422296524048, 'learning_rate': 4.3205177007993913e-07, 'epoch': 2.78} +{'eval_loss': 1.8807488679885864, 'eval_runtime': 39.8633, 'eval_samples_per_second': 25.086, 'eval_steps_per_second': 1.054, 'epoch': 2.78} +{'loss': 1.369, 'grad_norm': 3.852419853210449, 'learning_rate': 4.301484583174725e-07, 'epoch': 2.78} +{'loss': 1.3988, 'grad_norm': 3.8571765422821045, 'learning_rate': 4.2824514655500575e-07, 'epoch': 2.79} +{'loss': 1.3561, 'grad_norm': 3.7478137016296387, 'learning_rate': 4.2634183479253906e-07, 'epoch': 2.79} +{'loss': 1.4568, 'grad_norm': 3.9666666984558105, 'learning_rate': 4.2443852303007237e-07, 'epoch': 2.79} +{'loss': 1.3947, 'grad_norm': 3.79902720451355, 'learning_rate': 4.225352112676057e-07, 'epoch': 2.79} +{'loss': 1.3286, 'grad_norm': 3.9412965774536133, 'learning_rate': 4.20631899505139e-07, 'epoch': 2.79} +{'loss': 1.3642, 'grad_norm': 3.675896406173706, 'learning_rate': 4.187285877426723e-07, 'epoch': 2.79} +{'loss': 1.4735, 'grad_norm': 4.030459880828857, 'learning_rate': 4.1682527598020555e-07, 'epoch': 2.79} +{'loss': 1.3867, 'grad_norm': 3.7431952953338623, 'learning_rate': 4.149219642177389e-07, 'epoch': 2.79} +{'loss': 1.3802, 'grad_norm': 3.7733840942382812, 'learning_rate': 4.130186524552722e-07, 'epoch': 2.79} +{'eval_loss': 1.8798390626907349, 'eval_runtime': 40.0289, 'eval_samples_per_second': 24.982, 'eval_steps_per_second': 1.049, 'epoch': 2.79} +{'loss': 1.3553, 'grad_norm': 3.6864938735961914, 'learning_rate': 4.111153406928055e-07, 'epoch': 2.8} +{'loss': 1.4245, 'grad_norm': 3.868021249771118, 'learning_rate': 4.0921202893033884e-07, 'epoch': 2.8} +{'loss': 1.426, 'grad_norm': 3.8868701457977295, 'learning_rate': 4.073087171678721e-07, 'epoch': 2.8} +{'loss': 1.3606, 'grad_norm': 3.868267059326172, 'learning_rate': 4.0540540540540546e-07, 'epoch': 2.8} +{'loss': 1.3352, 'grad_norm': 3.8654685020446777, 'learning_rate': 4.035020936429387e-07, 'epoch': 2.8} +{'loss': 1.3951, 'grad_norm': 3.9675679206848145, 'learning_rate': 4.0159878188047203e-07, 'epoch': 2.8} +{'loss': 1.3712, 'grad_norm': 3.803370952606201, 'learning_rate': 3.996954701180054e-07, 'epoch': 2.8} +{'loss': 1.396, 'grad_norm': 3.9249520301818848, 'learning_rate': 3.9779215835553865e-07, 'epoch': 2.8} +{'loss': 1.38, 'grad_norm': 3.822579860687256, 'learning_rate': 3.95888846593072e-07, 'epoch': 2.8} +{'loss': 1.3962, 'grad_norm': 3.8149044513702393, 'learning_rate': 3.9398553483060527e-07, 'epoch': 2.8} +{'eval_loss': 1.8817750215530396, 'eval_runtime': 39.8793, 'eval_samples_per_second': 25.076, 'eval_steps_per_second': 1.053, 'epoch': 2.8} +{'loss': 1.3974, 'grad_norm': 3.805476427078247, 'learning_rate': 3.920822230681386e-07, 'epoch': 2.81} +{'loss': 1.3302, 'grad_norm': 3.8896853923797607, 'learning_rate': 3.901789113056719e-07, 'epoch': 2.81} +{'loss': 1.4644, 'grad_norm': 3.9144227504730225, 'learning_rate': 3.882755995432052e-07, 'epoch': 2.81} +{'loss': 1.367, 'grad_norm': 3.7619786262512207, 'learning_rate': 3.8637228778073856e-07, 'epoch': 2.81} +{'loss': 1.3604, 'grad_norm': 3.878241539001465, 'learning_rate': 3.844689760182718e-07, 'epoch': 2.81} +{'loss': 1.4221, 'grad_norm': 3.8346457481384277, 'learning_rate': 3.825656642558051e-07, 'epoch': 2.81} +{'loss': 1.4041, 'grad_norm': 3.7970917224884033, 'learning_rate': 3.8066235249333844e-07, 'epoch': 2.81} +{'loss': 1.438, 'grad_norm': 3.961592197418213, 'learning_rate': 3.7875904073087175e-07, 'epoch': 2.81} +{'loss': 1.4021, 'grad_norm': 3.825028657913208, 'learning_rate': 3.768557289684051e-07, 'epoch': 2.81} +{'loss': 1.3794, 'grad_norm': 3.870042562484741, 'learning_rate': 3.7495241720593836e-07, 'epoch': 2.81} +{'eval_loss': 1.8810781240463257, 'eval_runtime': 39.7514, 'eval_samples_per_second': 25.156, 'eval_steps_per_second': 1.057, 'epoch': 2.81} +{'loss': 1.39, 'grad_norm': 3.6341490745544434, 'learning_rate': 3.730491054434716e-07, 'epoch': 2.82} +{'loss': 1.4356, 'grad_norm': 3.9107167720794678, 'learning_rate': 3.71145793681005e-07, 'epoch': 2.82} +{'loss': 1.4181, 'grad_norm': 3.9296414852142334, 'learning_rate': 3.692424819185383e-07, 'epoch': 2.82} +{'loss': 1.3807, 'grad_norm': 3.772857666015625, 'learning_rate': 3.673391701560716e-07, 'epoch': 2.82} +{'loss': 1.3689, 'grad_norm': 3.741347312927246, 'learning_rate': 3.654358583936049e-07, 'epoch': 2.82} +{'loss': 1.3906, 'grad_norm': 3.7597832679748535, 'learning_rate': 3.6353254663113817e-07, 'epoch': 2.82} +{'loss': 1.3977, 'grad_norm': 3.7832183837890625, 'learning_rate': 3.6162923486867153e-07, 'epoch': 2.82} +{'loss': 1.339, 'grad_norm': 3.741288900375366, 'learning_rate': 3.597259231062048e-07, 'epoch': 2.82} +{'loss': 1.3517, 'grad_norm': 3.7187626361846924, 'learning_rate': 3.5782261134373815e-07, 'epoch': 2.82} +{'loss': 1.4167, 'grad_norm': 3.8858604431152344, 'learning_rate': 3.5591929958127146e-07, 'epoch': 2.82} +{'eval_loss': 1.8796318769454956, 'eval_runtime': 40.0089, 'eval_samples_per_second': 24.994, 'eval_steps_per_second': 1.05, 'epoch': 2.82} +{'loss': 1.414, 'grad_norm': 3.7923436164855957, 'learning_rate': 3.540159878188047e-07, 'epoch': 2.83} +{'loss': 1.4017, 'grad_norm': 3.831918954849243, 'learning_rate': 3.521126760563381e-07, 'epoch': 2.83} +{'loss': 1.3576, 'grad_norm': 3.7947099208831787, 'learning_rate': 3.5020936429387134e-07, 'epoch': 2.83} +{'loss': 1.4729, 'grad_norm': 4.066439151763916, 'learning_rate': 3.483060525314047e-07, 'epoch': 2.83} +{'loss': 1.3588, 'grad_norm': 3.780667304992676, 'learning_rate': 3.46402740768938e-07, 'epoch': 2.83} +{'loss': 1.4295, 'grad_norm': 3.9543967247009277, 'learning_rate': 3.4449942900647127e-07, 'epoch': 2.83} +{'loss': 1.4917, 'grad_norm': 3.887244701385498, 'learning_rate': 3.4259611724400463e-07, 'epoch': 2.83} +{'loss': 1.3756, 'grad_norm': 3.771094799041748, 'learning_rate': 3.406928054815379e-07, 'epoch': 2.83} +{'loss': 1.3595, 'grad_norm': 3.649413585662842, 'learning_rate': 3.3878949371907125e-07, 'epoch': 2.83} +{'loss': 1.3523, 'grad_norm': 3.767350435256958, 'learning_rate': 3.368861819566045e-07, 'epoch': 2.84} +{'eval_loss': 1.878659963607788, 'eval_runtime': 40.0455, 'eval_samples_per_second': 24.972, 'eval_steps_per_second': 1.049, 'epoch': 2.84} +{'loss': 1.3306, 'grad_norm': 3.7707576751708984, 'learning_rate': 3.349828701941378e-07, 'epoch': 2.84} +{'loss': 1.3625, 'grad_norm': 3.825338125228882, 'learning_rate': 3.330795584316712e-07, 'epoch': 2.84} +{'loss': 1.4346, 'grad_norm': 4.062577724456787, 'learning_rate': 3.3117624666920443e-07, 'epoch': 2.84} +{'loss': 1.3977, 'grad_norm': 3.8770904541015625, 'learning_rate': 3.292729349067378e-07, 'epoch': 2.84} +{'loss': 1.4383, 'grad_norm': 3.983837604522705, 'learning_rate': 3.2736962314427105e-07, 'epoch': 2.84} +{'loss': 1.3682, 'grad_norm': 3.7786667346954346, 'learning_rate': 3.2546631138180436e-07, 'epoch': 2.84} +{'loss': 1.4481, 'grad_norm': 3.8814167976379395, 'learning_rate': 3.2356299961933767e-07, 'epoch': 2.84} +{'loss': 1.3555, 'grad_norm': 3.8296091556549072, 'learning_rate': 3.21659687856871e-07, 'epoch': 2.84} +{'loss': 1.4411, 'grad_norm': 3.8836183547973633, 'learning_rate': 3.1975637609440424e-07, 'epoch': 2.84} +{'loss': 1.3873, 'grad_norm': 4.102138042449951, 'learning_rate': 3.178530643319376e-07, 'epoch': 2.85} +{'eval_loss': 1.8775180578231812, 'eval_runtime': 40.0111, 'eval_samples_per_second': 24.993, 'eval_steps_per_second': 1.05, 'epoch': 2.85} +{'loss': 1.4022, 'grad_norm': 3.9818458557128906, 'learning_rate': 3.159497525694709e-07, 'epoch': 2.85} +{'loss': 1.4748, 'grad_norm': 3.8949356079101562, 'learning_rate': 3.140464408070042e-07, 'epoch': 2.85} +{'loss': 1.3827, 'grad_norm': 3.7249393463134766, 'learning_rate': 3.1214312904453753e-07, 'epoch': 2.85} +{'loss': 1.4069, 'grad_norm': 3.6749801635742188, 'learning_rate': 3.1023981728207084e-07, 'epoch': 2.85} +{'loss': 1.3874, 'grad_norm': 3.930877447128296, 'learning_rate': 3.0833650551960415e-07, 'epoch': 2.85} +{'loss': 1.3926, 'grad_norm': 3.789501190185547, 'learning_rate': 3.0643319375713746e-07, 'epoch': 2.85} +{'loss': 1.3382, 'grad_norm': 3.7185535430908203, 'learning_rate': 3.045298819946707e-07, 'epoch': 2.85} +{'loss': 1.3589, 'grad_norm': 3.8357388973236084, 'learning_rate': 3.026265702322041e-07, 'epoch': 2.85} +{'loss': 1.364, 'grad_norm': 3.8080005645751953, 'learning_rate': 3.007232584697374e-07, 'epoch': 2.85} +{'loss': 1.3126, 'grad_norm': 3.756728410720825, 'learning_rate': 2.988199467072707e-07, 'epoch': 2.86} +{'eval_loss': 1.8784682750701904, 'eval_runtime': 39.9385, 'eval_samples_per_second': 25.039, 'eval_steps_per_second': 1.052, 'epoch': 2.86} +{'loss': 1.4184, 'grad_norm': 3.9970998764038086, 'learning_rate': 2.9691663494480396e-07, 'epoch': 2.86} +{'loss': 1.3635, 'grad_norm': 3.757476806640625, 'learning_rate': 2.9501332318233727e-07, 'epoch': 2.86} +{'loss': 1.391, 'grad_norm': 4.100804805755615, 'learning_rate': 2.931100114198706e-07, 'epoch': 2.86} +{'loss': 1.339, 'grad_norm': 3.6817245483398438, 'learning_rate': 2.9120669965740394e-07, 'epoch': 2.86} +{'loss': 1.3712, 'grad_norm': 3.8400635719299316, 'learning_rate': 2.8930338789493725e-07, 'epoch': 2.86} +{'loss': 1.4045, 'grad_norm': 3.810770273208618, 'learning_rate': 2.874000761324705e-07, 'epoch': 2.86} +{'loss': 1.3917, 'grad_norm': 3.851249933242798, 'learning_rate': 2.854967643700038e-07, 'epoch': 2.86} +{'loss': 1.402, 'grad_norm': 3.958754301071167, 'learning_rate': 2.835934526075371e-07, 'epoch': 2.86} +{'loss': 1.413, 'grad_norm': 3.960965871810913, 'learning_rate': 2.8169014084507043e-07, 'epoch': 2.86} +{'loss': 1.3891, 'grad_norm': 3.8343114852905273, 'learning_rate': 2.7978682908260374e-07, 'epoch': 2.87} +{'eval_loss': 1.8787450790405273, 'eval_runtime': 40.0609, 'eval_samples_per_second': 24.962, 'eval_steps_per_second': 1.048, 'epoch': 2.87} +{'loss': 1.3575, 'grad_norm': 3.7968180179595947, 'learning_rate': 2.7788351732013705e-07, 'epoch': 2.87} +{'loss': 1.3716, 'grad_norm': 3.837141752243042, 'learning_rate': 2.7598020555767036e-07, 'epoch': 2.87} +{'loss': 1.4415, 'grad_norm': 3.918738842010498, 'learning_rate': 2.7407689379520367e-07, 'epoch': 2.87} +{'loss': 1.4153, 'grad_norm': 3.793039083480835, 'learning_rate': 2.72173582032737e-07, 'epoch': 2.87} +{'loss': 1.3541, 'grad_norm': 3.81770396232605, 'learning_rate': 2.702702702702703e-07, 'epoch': 2.87} +{'loss': 1.4247, 'grad_norm': 3.904343605041504, 'learning_rate': 2.683669585078036e-07, 'epoch': 2.87} +{'loss': 1.3768, 'grad_norm': 3.822688579559326, 'learning_rate': 2.664636467453369e-07, 'epoch': 2.87} +{'loss': 1.3494, 'grad_norm': 3.664989709854126, 'learning_rate': 2.645603349828702e-07, 'epoch': 2.87} +{'loss': 1.3519, 'grad_norm': 3.7318038940429688, 'learning_rate': 2.6265702322040353e-07, 'epoch': 2.88} +{'loss': 1.3858, 'grad_norm': 4.023704528808594, 'learning_rate': 2.6075371145793684e-07, 'epoch': 2.88} +{'eval_loss': 1.8771593570709229, 'eval_runtime': 40.1279, 'eval_samples_per_second': 24.92, 'eval_steps_per_second': 1.047, 'epoch': 2.88} +{'loss': 1.3301, 'grad_norm': 3.894585132598877, 'learning_rate': 2.5885039969547015e-07, 'epoch': 2.88} +{'loss': 1.3723, 'grad_norm': 3.7344110012054443, 'learning_rate': 2.5694708793300346e-07, 'epoch': 2.88} +{'loss': 1.3897, 'grad_norm': 3.9429214000701904, 'learning_rate': 2.5504377617053677e-07, 'epoch': 2.88} +{'loss': 1.4022, 'grad_norm': 3.938826560974121, 'learning_rate': 2.531404644080701e-07, 'epoch': 2.88} +{'loss': 1.3923, 'grad_norm': 3.7782013416290283, 'learning_rate': 2.5123715264560333e-07, 'epoch': 2.88} +{'loss': 1.4048, 'grad_norm': 3.71818208694458, 'learning_rate': 2.4933384088313664e-07, 'epoch': 2.88} +{'loss': 1.3517, 'grad_norm': 3.7149171829223633, 'learning_rate': 2.4743052912067e-07, 'epoch': 2.88} +{'loss': 1.4218, 'grad_norm': 3.7881288528442383, 'learning_rate': 2.455272173582033e-07, 'epoch': 2.88} +{'loss': 1.4443, 'grad_norm': 3.8458030223846436, 'learning_rate': 2.436239055957366e-07, 'epoch': 2.89} +{'loss': 1.3979, 'grad_norm': 3.98480224609375, 'learning_rate': 2.417205938332699e-07, 'epoch': 2.89} +{'eval_loss': 1.8776938915252686, 'eval_runtime': 39.8236, 'eval_samples_per_second': 25.111, 'eval_steps_per_second': 1.055, 'epoch': 2.89} +{'loss': 1.4323, 'grad_norm': 3.8022677898406982, 'learning_rate': 2.398172820708032e-07, 'epoch': 2.89} +{'loss': 1.3934, 'grad_norm': 3.918951988220215, 'learning_rate': 2.3791397030833653e-07, 'epoch': 2.89} +{'loss': 1.3885, 'grad_norm': 3.82030987739563, 'learning_rate': 2.3601065854586984e-07, 'epoch': 2.89} +{'loss': 1.3594, 'grad_norm': 3.8477871417999268, 'learning_rate': 2.3410734678340315e-07, 'epoch': 2.89} +{'loss': 1.3829, 'grad_norm': 3.7906508445739746, 'learning_rate': 2.3220403502093643e-07, 'epoch': 2.89} +{'loss': 1.3358, 'grad_norm': 3.7985310554504395, 'learning_rate': 2.3030072325846974e-07, 'epoch': 2.89} +{'loss': 1.3955, 'grad_norm': 3.927086591720581, 'learning_rate': 2.2839741149600308e-07, 'epoch': 2.89} +{'loss': 1.4013, 'grad_norm': 4.046324729919434, 'learning_rate': 2.264940997335364e-07, 'epoch': 2.89} +{'loss': 1.3553, 'grad_norm': 3.8483235836029053, 'learning_rate': 2.245907879710697e-07, 'epoch': 2.9} +{'loss': 1.4044, 'grad_norm': 3.8777379989624023, 'learning_rate': 2.2268747620860298e-07, 'epoch': 2.9} +{'eval_loss': 1.876637578010559, 'eval_runtime': 39.969, 'eval_samples_per_second': 25.019, 'eval_steps_per_second': 1.051, 'epoch': 2.9} +{'loss': 1.3438, 'grad_norm': 3.7980751991271973, 'learning_rate': 2.207841644461363e-07, 'epoch': 2.9} +{'loss': 1.4921, 'grad_norm': 3.865647315979004, 'learning_rate': 2.188808526836696e-07, 'epoch': 2.9} +{'loss': 1.3674, 'grad_norm': 3.8148233890533447, 'learning_rate': 2.169775409212029e-07, 'epoch': 2.9} +{'loss': 1.4084, 'grad_norm': 3.9391632080078125, 'learning_rate': 2.1507422915873625e-07, 'epoch': 2.9} +{'loss': 1.4206, 'grad_norm': 3.774245262145996, 'learning_rate': 2.1317091739626953e-07, 'epoch': 2.9} +{'loss': 1.3643, 'grad_norm': 3.7833707332611084, 'learning_rate': 2.1126760563380284e-07, 'epoch': 2.9} +{'loss': 1.3771, 'grad_norm': 3.8323473930358887, 'learning_rate': 2.0936429387133615e-07, 'epoch': 2.9} +{'loss': 1.3899, 'grad_norm': 3.6737687587738037, 'learning_rate': 2.0746098210886946e-07, 'epoch': 2.9} +{'loss': 1.4548, 'grad_norm': 3.812310218811035, 'learning_rate': 2.0555767034640274e-07, 'epoch': 2.91} +{'loss': 1.3947, 'grad_norm': 3.816920280456543, 'learning_rate': 2.0365435858393605e-07, 'epoch': 2.91} +{'eval_loss': 1.8750535249710083, 'eval_runtime': 40.0365, 'eval_samples_per_second': 24.977, 'eval_steps_per_second': 1.049, 'epoch': 2.91} +{'loss': 1.4075, 'grad_norm': 3.8060126304626465, 'learning_rate': 2.0175104682146936e-07, 'epoch': 2.91} +{'loss': 1.371, 'grad_norm': 3.7699368000030518, 'learning_rate': 1.998477350590027e-07, 'epoch': 2.91} +{'loss': 1.43, 'grad_norm': 3.9050779342651367, 'learning_rate': 1.97944423296536e-07, 'epoch': 2.91} +{'loss': 1.416, 'grad_norm': 3.7673375606536865, 'learning_rate': 1.960411115340693e-07, 'epoch': 2.91} +{'loss': 1.3748, 'grad_norm': 3.808513641357422, 'learning_rate': 1.941377997716026e-07, 'epoch': 2.91} +{'loss': 1.3889, 'grad_norm': 3.9608469009399414, 'learning_rate': 1.922344880091359e-07, 'epoch': 2.91} +{'loss': 1.3308, 'grad_norm': 3.740614175796509, 'learning_rate': 1.9033117624666922e-07, 'epoch': 2.91} +{'loss': 1.3874, 'grad_norm': 3.846964120864868, 'learning_rate': 1.8842786448420255e-07, 'epoch': 2.92} +{'loss': 1.4469, 'grad_norm': 3.916963815689087, 'learning_rate': 1.865245527217358e-07, 'epoch': 2.92} +{'loss': 1.3365, 'grad_norm': 3.7335784435272217, 'learning_rate': 1.8462124095926915e-07, 'epoch': 2.92} +{'eval_loss': 1.8740971088409424, 'eval_runtime': 40.0616, 'eval_samples_per_second': 24.962, 'eval_steps_per_second': 1.048, 'epoch': 2.92} +{'loss': 1.4053, 'grad_norm': 3.8188748359680176, 'learning_rate': 1.8271792919680246e-07, 'epoch': 2.92} +{'loss': 1.3745, 'grad_norm': 3.7970457077026367, 'learning_rate': 1.8081461743433577e-07, 'epoch': 2.92} +{'loss': 1.4269, 'grad_norm': 3.8884096145629883, 'learning_rate': 1.7891130567186908e-07, 'epoch': 2.92} +{'loss': 1.3612, 'grad_norm': 3.6271514892578125, 'learning_rate': 1.7700799390940236e-07, 'epoch': 2.92} +{'loss': 1.4071, 'grad_norm': 3.821834087371826, 'learning_rate': 1.7510468214693567e-07, 'epoch': 2.92} +{'loss': 1.3986, 'grad_norm': 3.8108725547790527, 'learning_rate': 1.73201370384469e-07, 'epoch': 2.92} +{'loss': 1.3517, 'grad_norm': 3.64693546295166, 'learning_rate': 1.7129805862200231e-07, 'epoch': 2.92} +{'loss': 1.4098, 'grad_norm': 3.7155401706695557, 'learning_rate': 1.6939474685953562e-07, 'epoch': 2.93} +{'loss': 1.4139, 'grad_norm': 3.8417892456054688, 'learning_rate': 1.674914350970689e-07, 'epoch': 2.93} +{'loss': 1.3986, 'grad_norm': 3.7051541805267334, 'learning_rate': 1.6558812333460222e-07, 'epoch': 2.93} +{'eval_loss': 1.875389575958252, 'eval_runtime': 40.0579, 'eval_samples_per_second': 24.964, 'eval_steps_per_second': 1.048, 'epoch': 2.93} +{'loss': 1.3724, 'grad_norm': 3.767265558242798, 'learning_rate': 1.6368481157213553e-07, 'epoch': 2.93} +{'loss': 1.3924, 'grad_norm': 3.9107353687286377, 'learning_rate': 1.6178149980966884e-07, 'epoch': 2.93} +{'loss': 1.3853, 'grad_norm': 3.884481430053711, 'learning_rate': 1.5987818804720212e-07, 'epoch': 2.93} +{'loss': 1.3976, 'grad_norm': 3.764592170715332, 'learning_rate': 1.5797487628473546e-07, 'epoch': 2.93} +{'loss': 1.36, 'grad_norm': 3.720179796218872, 'learning_rate': 1.5607156452226877e-07, 'epoch': 2.93} +{'loss': 1.4155, 'grad_norm': 3.837712287902832, 'learning_rate': 1.5416825275980208e-07, 'epoch': 2.93} +{'loss': 1.4197, 'grad_norm': 3.7450170516967773, 'learning_rate': 1.5226494099733536e-07, 'epoch': 2.93} +{'loss': 1.3732, 'grad_norm': 3.8077738285064697, 'learning_rate': 1.503616292348687e-07, 'epoch': 2.94} +{'loss': 1.397, 'grad_norm': 3.8688950538635254, 'learning_rate': 1.4845831747240198e-07, 'epoch': 2.94} +{'loss': 1.4036, 'grad_norm': 3.8784477710723877, 'learning_rate': 1.465550057099353e-07, 'epoch': 2.94} +{'eval_loss': 1.8752580881118774, 'eval_runtime': 40.1129, 'eval_samples_per_second': 24.93, 'eval_steps_per_second': 1.047, 'epoch': 2.94} +{'loss': 1.3785, 'grad_norm': 3.959904432296753, 'learning_rate': 1.4465169394746862e-07, 'epoch': 2.94} +{'loss': 1.3352, 'grad_norm': 3.763336658477783, 'learning_rate': 1.427483821850019e-07, 'epoch': 2.94} +{'loss': 1.3657, 'grad_norm': 3.7203028202056885, 'learning_rate': 1.4084507042253522e-07, 'epoch': 2.94} +{'loss': 1.3609, 'grad_norm': 3.803619623184204, 'learning_rate': 1.3894175866006853e-07, 'epoch': 2.94} +{'loss': 1.3946, 'grad_norm': 3.883944272994995, 'learning_rate': 1.3703844689760184e-07, 'epoch': 2.94} +{'loss': 1.3666, 'grad_norm': 3.7392418384552, 'learning_rate': 1.3513513513513515e-07, 'epoch': 2.94} +{'loss': 1.4155, 'grad_norm': 3.9345693588256836, 'learning_rate': 1.3323182337266846e-07, 'epoch': 2.95} +{'loss': 1.3589, 'grad_norm': 3.7840828895568848, 'learning_rate': 1.3132851161020176e-07, 'epoch': 2.95} +{'loss': 1.3847, 'grad_norm': 3.7329013347625732, 'learning_rate': 1.2942519984773507e-07, 'epoch': 2.95} +{'loss': 1.4114, 'grad_norm': 3.814135789871216, 'learning_rate': 1.2752188808526838e-07, 'epoch': 2.95} +{'eval_loss': 1.8759804964065552, 'eval_runtime': 40.1207, 'eval_samples_per_second': 24.925, 'eval_steps_per_second': 1.047, 'epoch': 2.95} +{'loss': 1.44, 'grad_norm': 3.935213565826416, 'learning_rate': 1.2561857632280167e-07, 'epoch': 2.95} +{'loss': 1.3937, 'grad_norm': 3.749368667602539, 'learning_rate': 1.23715264560335e-07, 'epoch': 2.95} +{'loss': 1.4213, 'grad_norm': 3.906313419342041, 'learning_rate': 1.218119527978683e-07, 'epoch': 2.95} +{'loss': 1.3404, 'grad_norm': 3.701674699783325, 'learning_rate': 1.199086410354016e-07, 'epoch': 2.95} +{'loss': 1.3777, 'grad_norm': 3.7840311527252197, 'learning_rate': 1.1800532927293492e-07, 'epoch': 2.95} +{'loss': 1.3818, 'grad_norm': 3.9264726638793945, 'learning_rate': 1.1610201751046822e-07, 'epoch': 2.95} +{'loss': 1.4202, 'grad_norm': 3.9638118743896484, 'learning_rate': 1.1419870574800154e-07, 'epoch': 2.96} +{'loss': 1.3681, 'grad_norm': 3.7045352458953857, 'learning_rate': 1.1229539398553485e-07, 'epoch': 2.96} +{'loss': 1.3982, 'grad_norm': 3.815375566482544, 'learning_rate': 1.1039208222306814e-07, 'epoch': 2.96} +{'loss': 1.4053, 'grad_norm': 3.9582948684692383, 'learning_rate': 1.0848877046060145e-07, 'epoch': 2.96} +{'eval_loss': 1.8760737180709839, 'eval_runtime': 40.0764, 'eval_samples_per_second': 24.952, 'eval_steps_per_second': 1.048, 'epoch': 2.96} +{'loss': 1.3991, 'grad_norm': 3.9250643253326416, 'learning_rate': 1.0658545869813476e-07, 'epoch': 2.96} +{'loss': 1.3822, 'grad_norm': 3.6423048973083496, 'learning_rate': 1.0468214693566807e-07, 'epoch': 2.96} +{'loss': 1.386, 'grad_norm': 3.833054542541504, 'learning_rate': 1.0277883517320137e-07, 'epoch': 2.96} +{'loss': 1.385, 'grad_norm': 3.882209062576294, 'learning_rate': 1.0087552341073468e-07, 'epoch': 2.96} +{'loss': 1.3781, 'grad_norm': 3.882456064224243, 'learning_rate': 9.8972211648268e-08, 'epoch': 2.96} +{'loss': 1.3352, 'grad_norm': 3.668623685836792, 'learning_rate': 9.70688998858013e-08, 'epoch': 2.96} +{'loss': 1.3949, 'grad_norm': 3.8989810943603516, 'learning_rate': 9.516558812333461e-08, 'epoch': 2.97} +{'loss': 1.4023, 'grad_norm': 3.904904365539551, 'learning_rate': 9.32622763608679e-08, 'epoch': 2.97} +{'loss': 1.3798, 'grad_norm': 3.7808146476745605, 'learning_rate': 9.135896459840123e-08, 'epoch': 2.97} +{'loss': 1.4055, 'grad_norm': 3.98618745803833, 'learning_rate': 8.945565283593454e-08, 'epoch': 2.97} +{'eval_loss': 1.8772724866867065, 'eval_runtime': 40.1657, 'eval_samples_per_second': 24.897, 'eval_steps_per_second': 1.046, 'epoch': 2.97} +{'loss': 1.3593, 'grad_norm': 3.8462116718292236, 'learning_rate': 8.755234107346783e-08, 'epoch': 2.97} +{'loss': 1.3448, 'grad_norm': 3.830436944961548, 'learning_rate': 8.564902931100116e-08, 'epoch': 2.97} +{'loss': 1.4068, 'grad_norm': 3.754737377166748, 'learning_rate': 8.374571754853445e-08, 'epoch': 2.97} +{'loss': 1.4236, 'grad_norm': 4.041820049285889, 'learning_rate': 8.184240578606776e-08, 'epoch': 2.97} +{'loss': 1.3933, 'grad_norm': 3.8910529613494873, 'learning_rate': 7.993909402360106e-08, 'epoch': 2.97} +{'loss': 1.4421, 'grad_norm': 3.781639575958252, 'learning_rate': 7.803578226113438e-08, 'epoch': 2.97} +{'loss': 1.396, 'grad_norm': 3.9327352046966553, 'learning_rate': 7.613247049866768e-08, 'epoch': 2.98} +{'loss': 1.3634, 'grad_norm': 3.8365159034729004, 'learning_rate': 7.422915873620099e-08, 'epoch': 2.98} +{'loss': 1.4008, 'grad_norm': 3.915872573852539, 'learning_rate': 7.232584697373431e-08, 'epoch': 2.98} +{'loss': 1.3657, 'grad_norm': 3.733189582824707, 'learning_rate': 7.042253521126761e-08, 'epoch': 2.98} +{'eval_loss': 1.8770265579223633, 'eval_runtime': 40.1227, 'eval_samples_per_second': 24.924, 'eval_steps_per_second': 1.047, 'epoch': 2.98} +{'loss': 1.3674, 'grad_norm': 3.923065423965454, 'learning_rate': 6.851922344880092e-08, 'epoch': 2.98} +{'loss': 1.4553, 'grad_norm': 3.867604970932007, 'learning_rate': 6.661591168633423e-08, 'epoch': 2.98} +{'loss': 1.4208, 'grad_norm': 3.7667503356933594, 'learning_rate': 6.471259992386754e-08, 'epoch': 2.98} +{'loss': 1.3791, 'grad_norm': 3.8518869876861572, 'learning_rate': 6.280928816140083e-08, 'epoch': 2.98} +{'loss': 1.3772, 'grad_norm': 3.7714622020721436, 'learning_rate': 6.090597639893416e-08, 'epoch': 2.98} +{'loss': 1.3678, 'grad_norm': 3.6855297088623047, 'learning_rate': 5.900266463646746e-08, 'epoch': 2.99} +{'loss': 1.3635, 'grad_norm': 3.774871349334717, 'learning_rate': 5.709935287400077e-08, 'epoch': 2.99} +{'loss': 1.3088, 'grad_norm': 3.7525200843811035, 'learning_rate': 5.519604111153407e-08, 'epoch': 2.99} +{'loss': 1.3583, 'grad_norm': 3.7506844997406006, 'learning_rate': 5.329272934906738e-08, 'epoch': 2.99} +{'loss': 1.3428, 'grad_norm': 3.7373640537261963, 'learning_rate': 5.1389417586600685e-08, 'epoch': 2.99} +{'eval_loss': 1.8772258758544922, 'eval_runtime': 39.9021, 'eval_samples_per_second': 25.061, 'eval_steps_per_second': 1.053, 'epoch': 2.99} +{'loss': 1.4251, 'grad_norm': 3.9256906509399414, 'learning_rate': 4.9486105824134e-08, 'epoch': 2.99} +{'loss': 1.405, 'grad_norm': 3.6815032958984375, 'learning_rate': 4.7582794061667304e-08, 'epoch': 2.99} +{'loss': 1.4257, 'grad_norm': 3.846493721008301, 'learning_rate': 4.5679482299200614e-08, 'epoch': 2.99} +{'loss': 1.3998, 'grad_norm': 3.87013840675354, 'learning_rate': 4.377617053673392e-08, 'epoch': 2.99} +{'loss': 1.3759, 'grad_norm': 3.7858004570007324, 'learning_rate': 4.187285877426723e-08, 'epoch': 2.99} +{'loss': 1.3621, 'grad_norm': 3.7252180576324463, 'learning_rate': 3.996954701180053e-08, 'epoch': 3.0} +{'loss': 1.4283, 'grad_norm': 3.871983528137207, 'learning_rate': 3.806623524933384e-08, 'epoch': 3.0} +{'loss': 1.3602, 'grad_norm': 3.8000223636627197, 'learning_rate': 3.6162923486867156e-08, 'epoch': 3.0} +{'loss': 1.3689, 'grad_norm': 3.6668052673339844, 'learning_rate': 3.425961172440046e-08, 'epoch': 3.0} +{'train_runtime': 52789.9023, 'train_samples_per_second': 0.996, 'train_steps_per_second': 0.055, 'train_loss': 1.5552208020532732, 'epoch': 3.0} diff --git a/wandb/run-20241128_161554-9wf9o0ou/files/wandb-metadata.json b/wandb/run-20241128_161554-9wf9o0ou/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fea8bc1a7306e8a9a7cc6b083a8f1dbff8f260b3 --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-28T21:15:54.208363Z", + "args": [ + "--perturbation", + "reverse_control", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py", + "codePath": "train/train_llama_1B.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_llama_1B.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1723122159616" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161554-9wf9o0ou/files/wandb-summary.json b/wandb/run-20241128_161554-9wf9o0ou/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..ae48f2e2554f9504bf35fc74fe0335a5e71e80ff --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/files/wandb-summary.json @@ -0,0 +1 @@ +{"train/loss":1.3689,"_step":3210,"_timestamp":1.7328815553736565e+09,"_wandb":{"runtime":53001},"total_flos":3.141491282946294e+17,"eval/steps_per_second":1.053,"train/grad_norm":3.6668052673339844,"train_steps_per_second":0.055,"train/epoch":2.9984591679506933,"train_loss":1.5552208020532732,"eval/loss":1.8772258758544922,"eval/samples_per_second":25.061,"train_runtime":52789.9023,"eval/runtime":39.9021,"train_samples_per_second":0.996,"train/global_step":2919,"_runtime":53001.165421808,"train/learning_rate":3.425961172440046e-08} \ No newline at end of file diff --git a/wandb/run-20241128_161554-9wf9o0ou/logs/debug-internal.log b/wandb/run-20241128_161554-9wf9o0ou/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..1e28b8cc5607590d5aab577f78e7c4d664e79648 --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-11-28T16:15:54.215654996-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-28T16:15:54.215683856-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-9wf9o0ou/logs/debug-core.log"} +{"time":"2024-11-28T16:15:54.328479615-05:00","level":"INFO","msg":"created new stream","id":"9wf9o0ou"} +{"time":"2024-11-28T16:15:54.328565136-05:00","level":"INFO","msg":"stream: started","id":"9wf9o0ou"} +{"time":"2024-11-28T16:15:54.328903288-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"9wf9o0ou"}} +{"time":"2024-11-28T16:15:54.331100353-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"9wf9o0ou"}} +{"time":"2024-11-28T16:15:54.331138184-05:00","level":"INFO","msg":"sender: started","stream_id":"9wf9o0ou"} +{"time":"2024-11-28T16:15:54.591675463-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-28T21:15:00.365952818-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"} +{"time":"2024-11-29T06:59:15.38285535-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-29T06:59:15.388270397-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-29T06:59:15.893667875-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-29T06:59:16.064361699-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2024-11-29T06:59:17.074101017-05:00","level":"INFO","msg":"stream: closing","id":"9wf9o0ou"} +{"time":"2024-11-29T06:59:17.074123837-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"9wf9o0ou"}} +{"time":"2024-11-29T06:59:17.074143497-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"9wf9o0ou"}} +{"time":"2024-11-29T06:59:17.074180267-05:00","level":"INFO","msg":"sender: closed","stream_id":"9wf9o0ou"} +{"time":"2024-11-29T06:59:17.074217878-05:00","level":"INFO","msg":"stream: closed","id":"9wf9o0ou"} diff --git a/wandb/run-20241128_161554-9wf9o0ou/logs/debug.log b/wandb/run-20241128_161554-9wf9o0ou/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..85fb2124f89280cdffb896cdf0b0cb7ffb746531 --- /dev/null +++ b/wandb/run-20241128_161554-9wf9o0ou/logs/debug.log @@ -0,0 +1,36 @@ +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Configure stats pid to 3101594 +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-9wf9o0ou/logs/debug.log +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-9wf9o0ou/logs/debug-internal.log +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:init():671] starting backend +2024-11-28 16:15:54,206 INFO MainThread:3101594 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:15:54,207 INFO MainThread:3101594 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:15:54,208 INFO MainThread:3101594 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:15:54,212 INFO MainThread:3101594 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:15:54,241 INFO MainThread:3101594 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:15:54,587 INFO MainThread:3101594 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:15:54,714 INFO MainThread:3101594 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:15:54,714 INFO MainThread:3101594 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:15:54,714 INFO MainThread:3101594 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:15:54,714 INFO MainThread:3101594 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:15:54,716 INFO MainThread:3101594 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:15:54,716 INFO MainThread:3101594 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-28 16:19:25,470 INFO MainThread:3101594 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 2048, 'intermediate_size': 8192, 'num_hidden_layers': 16, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-1B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-1B/babylm_reverse_control_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-1B/babylm_reverse_control_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False} +2024-11-28 16:19:25,472 INFO MainThread:3101594 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1235814400 - > +2024-11-28 16:19:25,472 INFO MainThread:3101594 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 1235814400 None +2024-11-29 06:59:15,375 INFO MainThread:3101594 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/9wf9o0ou +2024-11-29 06:59:15,381 INFO MainThread:3101594 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 06:59:15,382 INFO MainThread:3101594 [wandb_run.py:_restore():2408] restore +2024-11-29 06:59:15,382 INFO MainThread:3101594 [wandb_run.py:_restore():2414] restore done +2024-11-29 06:59:17,068 INFO MainThread:3101594 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 06:59:17,069 INFO MainThread:3101594 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 06:59:17,073 INFO MainThread:3101594 [wandb_run.py:_footer_sync_info():3934] logging synced files diff --git a/wandb/run-20241128_161638-ky5jdi3b/files/config.yaml b/wandb/run-20241128_161638-ky5jdi3b/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5feb2a98b43e2e1c45b273f0fd6ee4e8b6deba5d --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/files/config.yaml @@ -0,0 +1,50 @@ +_wandb: + value: + cli_version: 0.18.5 + m: [] + python_version: 3.9.19 + t: + "1": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "2": + - 1 + - 5 + - 11 + - 49 + - 51 + - 53 + - 55 + - 71 + - 98 + "3": + - 2 + - 13 + - 23 + - 55 + "4": 3.9.19 + "5": 0.18.5 + "6": 4.45.1 + "8": + - 5 + "12": 0.18.5 + "13": linux-x86_64 +batch_size: + value: 3 +epoch: + value: 3 +lr: + value: 5e-06 +perturbation: + value: reverse_full +seed: + value: 0 +train_set: + value: 10M diff --git a/wandb/run-20241128_161638-ky5jdi3b/files/output.log b/wandb/run-20241128_161638-ky5jdi3b/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..58fee43535e81db85cacb661f1b3bf530360b78c --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/files/output.log @@ -0,0 +1,19 @@ +model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.47G/2.47G [00:58<00:00, 42.0MB/s] +generation_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 57.0kB/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:48<00:00, 361.07 examples/s] +Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:49<00:00, 364.85 examples/s] +tokenized_valid: Dataset({ + features: ['input_ids', 'attention_mask'], + num_rows: 1000 +}) +/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( +[2024-11-28 16:19:19,451] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-11-28 16:19:25,128] [INFO] [comm.py:652:init_distributed] cdb=None +Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination +Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root... +Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja... +Building extension module cpu_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +Loading extension module cpu_adam... +Time to load cpu_adam op: 3.7673845291137695 seconds diff --git a/wandb/run-20241128_161638-ky5jdi3b/files/wandb-metadata.json b/wandb/run-20241128_161638-ky5jdi3b/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..2369624a0bfdd406a7323d89739a6ceb92b8c1e1 --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/files/wandb-metadata.json @@ -0,0 +1,97 @@ +{ + "os": "Linux-5.4.0-162-generic-x86_64-with-glibc2.31", + "python": "3.9.19", + "startedAt": "2024-11-28T21:16:38.426161Z", + "args": [ + "--perturbation", + "reverse_full", + "--train_set", + "10M", + "--batch_size", + "3", + "--epoch", + "3", + "--seed", + "0" + ], + "program": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py", + "codePath": "train/train_llama_1B.py", + "git": { + "remote": "git@hf.co:Yaning1001/Impossible_llm.git", + "commit": "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4" + }, + "email": "yaning1001@gmail.com", + "root": "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train", + "host": "mms-large-2", + "username": "chunhui", + "executable": "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python", + "codePathLocal": "train_llama_1B.py", + "cpu_count": 32, + "cpu_count_logical": 64, + "gpu": "NVIDIA RTX A6000", + "gpu_count": 8, + "disk": { + "/": { + "total": "1888559353856", + "used": "1723122212864" + } + }, + "memory": { + "total": "202617098240" + }, + "cpu": { + "count": 32, + "countLogical": 64 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + }, + { + "name": "NVIDIA RTX A6000", + "memoryTotal": "51527024640", + "cudaCores": 10752, + "architecture": "Ampere" + } + ], + "cudaVersion": "11.8" +} \ No newline at end of file diff --git a/wandb/run-20241128_161638-ky5jdi3b/files/wandb-summary.json b/wandb/run-20241128_161638-ky5jdi3b/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..b47b0ad94e10d6a9a6ab27e63a37a56e1daa465e --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":48559}} \ No newline at end of file diff --git a/wandb/run-20241128_161638-ky5jdi3b/logs/debug-internal.log b/wandb/run-20241128_161638-ky5jdi3b/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e0a815f6bd199744bd25f6955d94e24cf99bf22f --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/logs/debug-internal.log @@ -0,0 +1,18 @@ +{"time":"2024-11-28T16:16:38.42841522-05:00","level":"INFO","msg":"using version","core version":"0.18.5"} +{"time":"2024-11-28T16:16:38.42843487-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-ky5jdi3b/logs/debug-core.log"} +{"time":"2024-11-28T16:16:38.534827381-05:00","level":"INFO","msg":"created new stream","id":"ky5jdi3b"} +{"time":"2024-11-28T16:16:38.534861882-05:00","level":"INFO","msg":"stream: started","id":"ky5jdi3b"} +{"time":"2024-11-28T16:16:38.534974843-05:00","level":"INFO","msg":"sender: started","stream_id":"ky5jdi3b"} +{"time":"2024-11-28T16:16:38.534931932-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ky5jdi3b"}} +{"time":"2024-11-28T16:16:38.534931952-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ky5jdi3b"}} +{"time":"2024-11-28T16:16:38.740441285-05:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2024-11-28T21:26:09.121035493-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/ky5jdi3b/file_stream"} +{"time":"2024-11-29T05:45:58.401161164-05:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2024-11-29T05:45:58.405236761-05:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2024-11-29T05:45:59.401866829-05:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.718168328}],"total_operations":1}} +{"time":"2024-11-29T05:45:59.967984122-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2024-11-29T05:46:01.172219798-05:00","level":"INFO","msg":"stream: closing","id":"ky5jdi3b"} +{"time":"2024-11-29T05:46:01.172252958-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"ky5jdi3b"}} +{"time":"2024-11-29T05:46:01.172277168-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"ky5jdi3b"}} +{"time":"2024-11-29T05:46:01.172411549-05:00","level":"INFO","msg":"sender: closed","stream_id":"ky5jdi3b"} +{"time":"2024-11-29T05:46:01.17245432-05:00","level":"INFO","msg":"stream: closed","id":"ky5jdi3b"} diff --git a/wandb/run-20241128_161638-ky5jdi3b/logs/debug.log b/wandb/run-20241128_161638-ky5jdi3b/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..4bb109b2d0d387c337d20e68ba14cd60bb9f8da3 --- /dev/null +++ b/wandb/run-20241128_161638-ky5jdi3b/logs/debug.log @@ -0,0 +1,33 @@ +2024-11-28 16:16:38,422 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5 +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Configure stats pid to 3102258 +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None} +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'} +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_setup.py:_flush():79] Applying login settings: {} +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-ky5jdi3b/logs/debug.log +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161638-ky5jdi3b/logs/debug-internal.log +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:init():621] calling init triggers +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:init():628] wandb.init called with sweep_config: {} +config: {} +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:init():671] starting backend +2024-11-28 16:16:38,423 INFO MainThread:3102258 [wandb_init.py:init():675] sending inform_init request +2024-11-28 16:16:38,425 INFO MainThread:3102258 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2024-11-28 16:16:38,425 INFO MainThread:3102258 [wandb_init.py:init():688] backend started and connected +2024-11-28 16:16:38,430 INFO MainThread:3102258 [wandb_init.py:init():783] updated telemetry +2024-11-28 16:16:38,461 INFO MainThread:3102258 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout +2024-11-28 16:16:38,737 INFO MainThread:3102258 [wandb_init.py:init():867] starting run threads in backend +2024-11-28 16:16:38,826 INFO MainThread:3102258 [wandb_run.py:_console_start():2463] atexit reg +2024-11-28 16:16:38,826 INFO MainThread:3102258 [wandb_run.py:_redirect():2311] redirect: wrap_raw +2024-11-28 16:16:38,826 INFO MainThread:3102258 [wandb_run.py:_redirect():2376] Wrapping output streams. +2024-11-28 16:16:38,826 INFO MainThread:3102258 [wandb_run.py:_redirect():2401] Redirects installed. +2024-11-28 16:16:38,827 INFO MainThread:3102258 [wandb_init.py:init():911] run started, returning control to user process +2024-11-28 16:16:38,828 INFO MainThread:3102258 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06} +2024-11-29 05:45:58,292 INFO MainThread:3102258 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/ky5jdi3b +2024-11-29 05:45:58,399 INFO MainThread:3102258 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0 +2024-11-29 05:45:58,400 INFO MainThread:3102258 [wandb_run.py:_restore():2408] restore +2024-11-29 05:45:58,400 INFO MainThread:3102258 [wandb_run.py:_restore():2414] restore done +2024-11-29 05:46:01,161 INFO MainThread:3102258 [wandb_run.py:_footer_history_summary_info():3975] rendering history +2024-11-29 05:46:01,162 INFO MainThread:3102258 [wandb_run.py:_footer_history_summary_info():4007] rendering summary +2024-11-29 05:46:01,171 INFO MainThread:3102258 [wandb_run.py:_footer_sync_info():3934] logging synced files