diff --git a/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json b/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1a29128a90880d67e34a4f37ea8c990e90750d0
--- /dev/null
+++ b/wandb/run-20241030_010759-dim9v1es/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T05:07:59.121382Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1719200272384"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json b/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf
--- /dev/null
+++ b/wandb/run-20241030_010759-dim9v1es/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":0}}
\ No newline at end of file
diff --git a/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log b/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..f09d4cd23477aa305709b7457bafeb273e57072b
--- /dev/null
+++ b/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log
@@ -0,0 +1,16 @@
+{"time":"2024-10-30T01:07:59.123018178-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-30T01:07:59.123029468-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-core.log"}
+{"time":"2024-10-30T01:07:59.228528967-04:00","level":"INFO","msg":"created new stream","id":"dim9v1es"}
+{"time":"2024-10-30T01:07:59.228567837-04:00","level":"INFO","msg":"stream: started","id":"dim9v1es"}
+{"time":"2024-10-30T01:07:59.228581067-04:00","level":"INFO","msg":"sender: started","stream_id":"dim9v1es"}
+{"time":"2024-10-30T01:07:59.228568237-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"dim9v1es"}}
+{"time":"2024-10-30T01:07:59.228568187-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"dim9v1es"}}
+{"time":"2024-10-30T01:07:59.441316995-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-30T01:07:59.536719185-04:00","level":"INFO","msg":"stream: closing","id":"dim9v1es"}
+{"time":"2024-10-30T01:07:59.536770865-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-30T01:07:59.53739974-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-10-30T01:08:00.081295733-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-10-30T01:08:00.206167083-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"dim9v1es"}}
+{"time":"2024-10-30T01:08:00.206226274-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"dim9v1es"}}
+{"time":"2024-10-30T01:08:00.206291324-04:00","level":"INFO","msg":"sender: closed","stream_id":"dim9v1es"}
+{"time":"2024-10-30T01:08:00.206325864-04:00","level":"INFO","msg":"stream: closed","id":"dim9v1es"}
diff --git a/wandb/run-20241030_010759-dim9v1es/logs/debug.log b/wandb/run-20241030_010759-dim9v1es/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..834e7bac02fbbec4cd3d5b338bde8b88b91b6a7b
--- /dev/null
+++ b/wandb/run-20241030_010759-dim9v1es/logs/debug.log
@@ -0,0 +1,27 @@
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Configure stats pid to 322462
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug.log
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_010759-dim9v1es/logs/debug-internal.log
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:init():621] calling init triggers
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:init():671] starting backend
+2024-10-30 01:07:59,119 INFO    MainThread:322462 [wandb_init.py:init():675] sending inform_init request
+2024-10-30 01:07:59,120 INFO    MainThread:322462 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-30 01:07:59,121 INFO    MainThread:322462 [wandb_init.py:init():688] backend started and connected
+2024-10-30 01:07:59,124 INFO    MainThread:322462 [wandb_init.py:init():783] updated telemetry
+2024-10-30 01:07:59,156 INFO    MainThread:322462 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-30 01:07:59,438 INFO    MainThread:322462 [wandb_init.py:init():867] starting run threads in backend
+2024-10-30 01:07:59,533 INFO    MainThread:322462 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-30 01:07:59,533 INFO    MainThread:322462 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-30 01:07:59,533 INFO    MainThread:322462 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-30 01:07:59,533 INFO    MainThread:322462 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-30 01:07:59,535 INFO    MainThread:322462 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-30 01:07:59,535 INFO    MainThread:322462 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
+2024-10-30 01:07:59,536 WARNING MsgRouterThr:322462 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/output.log b/wandb/run-20241030_012617-yt7vh1dq/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..e53ae3d6909872986257e38503a288ad80db241f
--- /dev/null
+++ b/wandb/run-20241030_012617-yt7vh1dq/files/output.log
@@ -0,0 +1,2 @@
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.36s/it]
+Map:  11%|██████████████                                                                                                                  | 2000/18140 [00:06<00:52, 308.60 examples/s]
diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt b/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241030_012617-yt7vh1dq/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json b/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c54bd4724db32a1cda3272082e3d60eb81487a5
--- /dev/null
+++ b/wandb/run-20241030_012617-yt7vh1dq/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T05:26:17.324794Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1709772775424"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log b/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..70a031dec399cd9ca958fbd5469b4158a7dc6646
--- /dev/null
+++ b/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-10-30T01:26:17.327161166-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-30T01:26:17.327175976-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-core.log"}
+{"time":"2024-10-30T01:26:17.435118413-04:00","level":"INFO","msg":"created new stream","id":"yt7vh1dq"}
+{"time":"2024-10-30T01:26:17.435165823-04:00","level":"INFO","msg":"stream: started","id":"yt7vh1dq"}
+{"time":"2024-10-30T01:26:17.435237323-04:00","level":"INFO","msg":"sender: started","stream_id":"yt7vh1dq"}
+{"time":"2024-10-30T01:26:17.435183773-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"yt7vh1dq"}}
+{"time":"2024-10-30T01:26:17.435252003-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"yt7vh1dq"}}
+{"time":"2024-10-30T01:26:17.695977809-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log b/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..7ed3b0a4b1d0f4e8ed1af0fead97f3848cd4ecd9
--- /dev/null
+++ b/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log
@@ -0,0 +1,26 @@
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Configure stats pid to 332624
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug.log
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-yt7vh1dq/logs/debug-internal.log
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:init():621] calling init triggers
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:init():671] starting backend
+2024-10-30 01:26:17,323 INFO    MainThread:332624 [wandb_init.py:init():675] sending inform_init request
+2024-10-30 01:26:17,324 INFO    MainThread:332624 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-30 01:26:17,324 INFO    MainThread:332624 [wandb_init.py:init():688] backend started and connected
+2024-10-30 01:26:17,328 INFO    MainThread:332624 [wandb_init.py:init():783] updated telemetry
+2024-10-30 01:26:17,385 INFO    MainThread:332624 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-30 01:26:17,692 INFO    MainThread:332624 [wandb_init.py:init():867] starting run threads in backend
+2024-10-30 01:26:17,844 INFO    MainThread:332624 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-30 01:26:17,844 INFO    MainThread:332624 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-30 01:26:17,844 INFO    MainThread:332624 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-30 01:26:17,844 INFO    MainThread:332624 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-30 01:26:17,849 INFO    MainThread:332624 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-30 01:26:17,850 INFO    MainThread:332624 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
diff --git a/wandb/run-20241030_013141-bkcoggdw/files/config.yaml b/wandb/run-20241030_013141-bkcoggdw/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d471293c627dbaeaf5be47c709016b3dfbf28c6c
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/files/config.yaml
@@ -0,0 +1,47 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 7
+perturbation:
+    value: reverse_full
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241030_013141-bkcoggdw/files/output.log b/wandb/run-20241030_013141-bkcoggdw/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..d0ecc4c27edb1cb376892301d35b62359512b1a3
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/files/output.log
@@ -0,0 +1,62 @@
+[34m[1mwandb[0m: 500 encountered ({"errors":[{"message":"An internal error occurred. Please contact support.","path":["upsertBucket"]}],"data":{"upsertBucket":null}}), retrying request
+model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 11.2MB/s]
+Downloading shards:   0%|                                                                                                                                        | 0/2 [01:04<?, ?it/s]
+Traceback (most recent call last): 54%|██████████████████████████████████████████████████████████▎                                                | 2.71G/4.97G [01:04<00:53, 42.2MB/s]
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 541, in http_get
+    for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 820, in generate
+    yield from self.raw.stream(chunk_size, decode_content=True)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 934, in stream
+    data = self.read(amt=amt, decode_content=decode_content)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 877, in read
+    data = self._raw_read(amt)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 812, in _raw_read
+    data = self._fp_read(amt) if not fp_closed else b""
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/urllib3/response.py", line 789, in _fp_read
+    data = self._fp.read(chunk_amt)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/http/client.py", line 463, in read
+    n = self.readinto(b)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/http/client.py", line 507, in readinto
+    n = self.fp.readinto(b)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/socket.py", line 704, in readinto
+    return self._sock.recv_into(b)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/ssl.py", line 1275, in recv_into
+    return self.read(nbytes, buffer)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/ssl.py", line 1133, in read
+    return self._sslobj.read(len, buffer)
+KeyboardInterrupt
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 172, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+    return model_class.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+    cached_filename = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1381, in _hf_hub_download_to_cache_dir
+    _download_to_tmp_and_move(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1915, in _download_to_tmp_and_move
+    http_get(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 558, in http_get
+    return http_get(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1140, in __exit__
+    self.close()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1275, in close
+    self._decr_instances(self)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 696, in _decr_instances
+    with cls._lock:
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 110, in __enter__
+    def __enter__(self):
+KeyboardInterrupt
diff --git a/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt b/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json b/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..384dc5d9c3ccca87869fb92ab5b8c911825746ed
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T05:31:41.693480Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1709824425984"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json b/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..b155621703e444311fe2da9b782a2b70b5491169
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":94}}
\ No newline at end of file
diff --git a/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log b/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d69bd8925e81bf8fd31ec3529e2edc6cf85e29a7
--- /dev/null
+++ b/wandb/run-20241030_013141-bkcoggdw/logs/debug-internal.log
@@ -0,0 +1,12 @@
+{"time":"2024-10-30T01:31:41.69578299-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-30T01:31:41.6958064-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_013141-bkcoggdw/logs/debug-core.log"}
+{"time":"2024-10-30T01:31:41.804007808-04:00","level":"INFO","msg":"created new stream","id":"bkcoggdw"}
+{"time":"2024-10-30T01:31:41.804049448-04:00","level":"INFO","msg":"stream: started","id":"bkcoggdw"}
+{"time":"2024-10-30T01:31:41.804096128-04:00","level":"INFO","msg":"sender: started","stream_id":"bkcoggdw"}
+{"time":"2024-10-30T01:31:41.804097678-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"bkcoggdw"}}
+{"time":"2024-10-30T01:31:41.804074928-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"bkcoggdw"}}
+{"time":"2024-10-30T01:31:41.90989171-04:00","level":"INFO","msg":"api: retrying HTTP error","status":500,"url":"https://api.wandb.ai/graphql"}
+{"time":"2024-10-30T01:31:44.314369215-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-30T01:33:16.64084496-04:00","level":"INFO","msg":"stream: closing","id":"bkcoggdw"}
+{"time":"2024-10-30T01:33:16.64087647-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-30T01:33:16.641390294-04:00","level":"INFO","msg":"Stopped system monitor"}
diff --git a/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json b/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..4cf16452ad5800bb82814bb30ae5db662ec9588f
--- /dev/null
+++ b/wandb/run-20241030_112852-av3r7rx8/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T15:28:52.925806Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1710831611904"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log b/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d44c542ac2782d7a17ea50d32c95ef95ea8035f8
--- /dev/null
+++ b/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-10-30T22:58:33.52447176-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-30T22:58:33.52448387-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug-core.log"}
+{"time":"2024-10-30T22:58:33.631043427-04:00","level":"INFO","msg":"created new stream","id":"giupspdj"}
+{"time":"2024-10-30T22:58:33.631075407-04:00","level":"INFO","msg":"stream: started","id":"giupspdj"}
+{"time":"2024-10-30T22:58:33.631121257-04:00","level":"INFO","msg":"sender: started","stream_id":"giupspdj"}
+{"time":"2024-10-30T22:58:33.631092947-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"giupspdj"}}
+{"time":"2024-10-30T22:58:33.631105957-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"giupspdj"}}
+{"time":"2024-10-30T22:58:33.831702761-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241031_001055-32u9qnul/files/output.log b/wandb/run-20241031_001055-32u9qnul/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..d6ca833b35444b28034c5e68bd2d8c659b61d6e5
--- /dev/null
+++ b/wandb/run-20241031_001055-32u9qnul/files/output.log
@@ -0,0 +1,13 @@
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.84s/it]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-10-31 00:11:03,787] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-10-31 00:11:12,645] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 5.372655630111694 seconds
diff --git a/wandb/run-20241031_001055-32u9qnul/files/requirements.txt b/wandb/run-20241031_001055-32u9qnul/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241031_001055-32u9qnul/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json b/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6ab8a9f5a3a84eedc7d3300c02e6d025fff21f3
--- /dev/null
+++ b/wandb/run-20241031_001055-32u9qnul/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T04:10:55.973455Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1728856920064"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json b/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2dd5d1779d154c5240ad450a940c656286731617
--- /dev/null
+++ b/wandb/run-20241031_001055-sr4xke8e/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T04:10:55.613835Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1728850759680"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log b/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..5e0ff061b071aa5ae140dd99663db4ac6ac4da04
--- /dev/null
+++ b/wandb/run-20241031_001055-sr4xke8e/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-10-31T00:10:55.615453654-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-31T00:10:55.615464774-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_001055-sr4xke8e/logs/debug-core.log"}
+{"time":"2024-10-31T00:10:55.72181439-04:00","level":"INFO","msg":"created new stream","id":"sr4xke8e"}
+{"time":"2024-10-31T00:10:55.7218437-04:00","level":"INFO","msg":"stream: started","id":"sr4xke8e"}
+{"time":"2024-10-31T00:10:55.721915701-04:00","level":"INFO","msg":"sender: started","stream_id":"sr4xke8e"}
+{"time":"2024-10-31T00:10:55.721914011-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sr4xke8e"}}
+{"time":"2024-10-31T00:10:55.721899881-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sr4xke8e"}}
+{"time":"2024-10-31T00:10:55.919527304-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt b/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241031_114700-3cqkhntc/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json b/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8fd0987a40149c07002f44e26d4340781782d10
--- /dev/null
+++ b/wandb/run-20241031_114700-3cqkhntc/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T15:47:00.289124Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753158594560"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt b/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241031_114700-q0d78n2b/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json b/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..446eee014fa68da60f0b4df9a048fe33cc073063
--- /dev/null
+++ b/wandb/run-20241031_114700-q0d78n2b/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T15:47:00.243502Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753158594560"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log b/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..fd446dc1e65013e9fdccc866f9587219ddfb6e68
--- /dev/null
+++ b/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-10-31T11:47:00.246260836-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-31T11:47:00.246281016-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-core.log"}
+{"time":"2024-10-31T11:47:00.352833535-04:00","level":"INFO","msg":"created new stream","id":"q0d78n2b"}
+{"time":"2024-10-31T11:47:00.352859865-04:00","level":"INFO","msg":"stream: started","id":"q0d78n2b"}
+{"time":"2024-10-31T11:47:00.352931156-04:00","level":"INFO","msg":"sender: started","stream_id":"q0d78n2b"}
+{"time":"2024-10-31T11:47:00.352897086-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q0d78n2b"}}
+{"time":"2024-10-31T11:47:00.352894256-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q0d78n2b"}}
+{"time":"2024-10-31T11:47:00.611011859-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241031_114700-q0d78n2b/logs/debug.log b/wandb/run-20241031_114700-q0d78n2b/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..a408b9f0693fe259c47f5be0304df1a0969ed1b9
--- /dev/null
+++ b/wandb/run-20241031_114700-q0d78n2b/logs/debug.log
@@ -0,0 +1,26 @@
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Configure stats pid to 554145
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug.log
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_114700-q0d78n2b/logs/debug-internal.log
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:init():621] calling init triggers
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:init():671] starting backend
+2024-10-31 11:47:00,241 INFO    MainThread:554145 [wandb_init.py:init():675] sending inform_init request
+2024-10-31 11:47:00,243 INFO    MainThread:554145 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-31 11:47:00,243 INFO    MainThread:554145 [wandb_init.py:init():688] backend started and connected
+2024-10-31 11:47:00,247 INFO    MainThread:554145 [wandb_init.py:init():783] updated telemetry
+2024-10-31 11:47:00,278 INFO    MainThread:554145 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-31 11:47:00,608 INFO    MainThread:554145 [wandb_init.py:init():867] starting run threads in backend
+2024-10-31 11:47:00,695 INFO    MainThread:554145 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-31 11:47:00,695 INFO    MainThread:554145 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-31 11:47:00,695 INFO    MainThread:554145 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-31 11:47:00,695 INFO    MainThread:554145 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-31 11:47:00,696 INFO    MainThread:554145 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-31 11:47:00,697 INFO    MainThread:554145 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 0.0001}
diff --git a/wandb/run-20241031_122005-nip14lm6/files/config.yaml b/wandb/run-20241031_122005-nip14lm6/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..725c8381c5f9fe81efa0c182e9fe88850f0f19e9
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/files/config.yaml
@@ -0,0 +1,49 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 6
+lr:
+    value: 5e-06
+perturbation:
+    value: reverse_full
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241031_122005-nip14lm6/files/output.log b/wandb/run-20241031_122005-nip14lm6/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..7932a49196018ff934fd534bc05f01d11a0e95d8
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/files/output.log
@@ -0,0 +1,35 @@
+model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 8.51MB/s]
+Downloading shards:   0%|                                                                                                                                        | 0/2 [00:22<?, ?it/s]
+Error in sys.excepthook:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/exit_hooks.py", line 41, in exc_handler
+    def exc_handler(
+KeyboardInterrupt
+
+Original exception was:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 173, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+    return model_class.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+    cached_filename = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+    with WeakFileLock(lock_path):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+    return next(self.gen)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+    lock.acquire()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+    time.sleep(poll_interval)
+KeyboardInterrupt
diff --git a/wandb/run-20241031_122005-nip14lm6/files/requirements.txt b/wandb/run-20241031_122005-nip14lm6/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json b/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..56b499dd5cc4956cb199e725267386b547157f1d
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T16:20:05.846194Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753159847936"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json b/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..15f6b8e9049a55292dab131278b3f2fc1f52e50d
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":23}}
\ No newline at end of file
diff --git a/wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb b/wandb/run-20241031_122005-nip14lm6/run-nip14lm6.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/run-20241101_012613-k6o0lha8/files/output.log b/wandb/run-20241101_012613-k6o0lha8/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..5824b3c1b0e7410256b3374a200db5bd3cc11c9d
--- /dev/null
+++ b/wandb/run-20241101_012613-k6o0lha8/files/output.log
@@ -0,0 +1,12 @@
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 164, in <module>
+    dataset = load_dataset('babylm_dataset_test.py', name=dataset_name, trust_remote_code=True)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 2074, in load_dataset
+    builder_instance = load_dataset_builder(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/load.py", line 1832, in load_dataset_builder
+    builder_instance: DatasetBuilder = builder_cls(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 342, in __init__
+    self.config, self.config_id = self._create_builder_config(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/builder.py", line 569, in _create_builder_config
+    raise ValueError(
+ValueError: BuilderConfig 'babylm_shuffle_nodeterministic_10M_seed0' not found. Available: ['babylm_hop_control_10M_seed0', 'babylm_hop_tokens4_10M_seed0', 'babylm_hop_words4_10M_seed0', 'babylm_reverse_control_10M_seed0', 'babylm_reverse_partial_10M_seed0', 'babylm_reverse_full_10M_seed0', 'babylm_shuffle_control_10M_seed0', 'babylm_shuffle_nondeterministic_10M_seed0', 'babylm_shuffle_deterministic21_10M_seed0', 'babylm_shuffle_deterministic57_10M_seed0', 'babylm_shuffle_deterministic84_10M_seed0', 'babylm_shuffle_local3_10M_seed0', 'babylm_shuffle_local5_10M_seed0', 'babylm_shuffle_local10_10M_seed0', 'babylm_shuffle_even_odd_10M_seed0']
diff --git a/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json b/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..6e96a1dfccdea4e071278636df0097986c8a027a
--- /dev/null
+++ b/wandb/run-20241101_012613-k6o0lha8/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-01T05:26:13.051361Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_nodeterministic",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753992237056"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb b/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..22c8b57f9fb7484be6fc7d6cdbf69f3414a83821
Binary files /dev/null and b/wandb/run-20241101_012613-k6o0lha8/run-k6o0lha8.wandb differ
diff --git a/wandb/run-20241101_012733-9v55tr72/files/output.log b/wandb/run-20241101_012733-9v55tr72/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..1274fd7be7f3cdff563854e73f6319af33003741
--- /dev/null
+++ b/wandb/run-20241101_012733-9v55tr72/files/output.log
@@ -0,0 +1,196 @@
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:04<00:00, 225385.84it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098312/1098312 [00:00<00:00, 2536102.07it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16426/16426 [00:00<00:00, 27033.16it/s]
+Generating train split: 16425 examples [00:08, 1830.77 examples/s]█████████████████████████████████████████████████████████                   | 14150/16426 [00:00<00:00, 29025.68it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:05<00:00, 206269.15it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1085967/1085967 [00:00<00:00, 2626307.34it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17014/17014 [00:00<00:00, 24718.45it/s]
+Generating validation split: 17013 examples [00:10, 1685.09 examples/s]█████████████████████████████████▏                                     | 12320/17014 [00:00<00:00, 26810.27it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:04<00:00, 250719.52it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031242/1031242 [00:00<00:00, 3139247.02it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15439/15439 [00:00<00:00, 27030.75it/s]
+Generating test split: 15438 examples [00:08, 1840.00 examples/s]█████████████████████████████████████████████████████████████▉               | 13736/15439 [00:00<00:00, 34826.46it/s]
+Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.36s/it]
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.45s/it]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:54<00:00, 303.11 examples/s]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:57<00:00, 297.31 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-01 01:32:35,310] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-01 01:32:46,120] [INFO] [comm.py:652:init_distributed] cdb=None
+[2024-11-01 01:32:46,120] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 5.5455732345581055 seconds
+[34m[1mwandb[0m: [33mWARNING[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
+                                                                                                                                                                                       
+{'loss': 3.0928, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0}
+{'loss': 3.0737, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.0}
+{'loss': 3.1154, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
+{'loss': 3.1109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
+{'loss': 3.1179, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
+{'loss': 3.089, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.01}
+{'loss': 3.1042, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
+{'loss': 3.109, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
+{'loss': 3.097, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
+{'loss': 3.1119, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
+                                                                                                                                                                                       
+{'eval_loss': 3.1238040924072266, 'eval_runtime': 12.4211, 'eval_samples_per_second': 48.305, 'eval_steps_per_second': 1.047, 'epoch': 0.02}
+{'loss': 3.0899, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}
+{'loss': 3.1001, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
+{'loss': 3.118, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
+{'loss': 3.1069, 'grad_norm': 0.0, 'learning_rate': 5e-06, 'epoch': 0.03}
+{'loss': 3.0758, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.03}
+{'loss': 3.0993, 'grad_norm': 6.654611587524414, 'learning_rate': 4.998172514619883e-06, 'epoch': 0.04}
+{'loss': 3.0696, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04}
+{'loss': 3.1161, 'grad_norm': 9.038572311401367, 'learning_rate': 4.996345029239767e-06, 'epoch': 0.04}
+{'loss': 3.0935, 'grad_norm': 8.783886909484863, 'learning_rate': 4.9945175438596495e-06, 'epoch': 0.04}
+{'loss': 3.0074, 'grad_norm': 5.36458158493042, 'learning_rate': 4.992690058479532e-06, 'epoch': 0.04}
+{'eval_loss': 2.966599225997925, 'eval_runtime': 11.5387, 'eval_samples_per_second': 51.999, 'eval_steps_per_second': 1.127, 'epoch': 0.04}
+{'loss': 2.9491, 'grad_norm': 4.384786605834961, 'learning_rate': 4.990862573099415e-06, 'epoch': 0.05}
+{'loss': 2.919, 'grad_norm': 5.37711238861084, 'learning_rate': 4.989035087719299e-06, 'epoch': 0.05}
+{'loss': 2.8708, 'grad_norm': 4.1505208015441895, 'learning_rate': 4.987207602339182e-06, 'epoch': 0.05}
+{'loss': 2.8378, 'grad_norm': 2.6863813400268555, 'learning_rate': 4.985380116959065e-06, 'epoch': 0.05}
+{'loss': 2.8356, 'grad_norm': 2.7242753505706787, 'learning_rate': 4.983552631578948e-06, 'epoch': 0.05}
+{'loss': 2.7966, 'grad_norm': 2.6609349250793457, 'learning_rate': 4.9817251461988304e-06, 'epoch': 0.06}
+{'loss': 2.7441, 'grad_norm': 2.2204103469848633, 'learning_rate': 4.979897660818714e-06, 'epoch': 0.06}
+{'loss': 2.7614, 'grad_norm': 2.374406099319458, 'learning_rate': 4.978070175438597e-06, 'epoch': 0.06}
+{'loss': 2.7582, 'grad_norm': 2.696918249130249, 'learning_rate': 4.9762426900584795e-06, 'epoch': 0.06}
+{'loss': 2.6845, 'grad_norm': 1.8222397565841675, 'learning_rate': 4.974415204678363e-06, 'epoch': 0.07}
+{'eval_loss': 2.758892297744751, 'eval_runtime': 11.5398, 'eval_samples_per_second': 51.994, 'eval_steps_per_second': 1.127, 'epoch': 0.07}
+{'loss': 2.6874, 'grad_norm': 2.076284408569336, 'learning_rate': 4.972587719298246e-06, 'epoch': 0.07}
+{'loss': 2.693, 'grad_norm': 2.0412065982818604, 'learning_rate': 4.970760233918129e-06, 'epoch': 0.07}
+{'loss': 2.6601, 'grad_norm': 1.8842229843139648, 'learning_rate': 4.968932748538012e-06, 'epoch': 0.07}
+{'loss': 2.6749, 'grad_norm': 1.756975531578064, 'learning_rate': 4.967105263157895e-06, 'epoch': 0.07}
+{'loss': 2.6141, 'grad_norm': 2.0640175342559814, 'learning_rate': 4.9652777777777786e-06, 'epoch': 0.08}
+{'loss': 2.608, 'grad_norm': 1.5173723697662354, 'learning_rate': 4.963450292397661e-06, 'epoch': 0.08}
+{'loss': 2.5623, 'grad_norm': 1.8280211687088013, 'learning_rate': 4.961622807017544e-06, 'epoch': 0.08}
+{'loss': 2.6046, 'grad_norm': 1.990080714225769, 'learning_rate': 4.959795321637428e-06, 'epoch': 0.08}
+{'loss': 2.5329, 'grad_norm': 1.4346381425857544, 'learning_rate': 4.95796783625731e-06, 'epoch': 0.09}
+{'loss': 2.5307, 'grad_norm': 1.45533287525177, 'learning_rate': 4.956140350877193e-06, 'epoch': 0.09}
+{'eval_loss': 2.629573345184326, 'eval_runtime': 11.5669, 'eval_samples_per_second': 51.872, 'eval_steps_per_second': 1.124, 'epoch': 0.09}
+{'loss': 2.5591, 'grad_norm': 1.5484964847564697, 'learning_rate': 4.954312865497076e-06, 'epoch': 0.09}
+{'loss': 2.5403, 'grad_norm': 1.3261419534683228, 'learning_rate': 4.9524853801169595e-06, 'epoch': 0.09}
+{'loss': 2.5176, 'grad_norm': 1.4234470129013062, 'learning_rate': 4.950657894736843e-06, 'epoch': 0.09}
+{'loss': 2.4846, 'grad_norm': 1.5438008308410645, 'learning_rate': 4.948830409356726e-06, 'epoch': 0.1}
+{'loss': 2.4678, 'grad_norm': 1.4391041994094849, 'learning_rate': 4.947002923976609e-06, 'epoch': 0.1}
+{'loss': 2.5105, 'grad_norm': 1.7514405250549316, 'learning_rate': 4.945175438596491e-06, 'epoch': 0.1}
+{'loss': 2.5033, 'grad_norm': 1.2241393327713013, 'learning_rate': 4.943347953216375e-06, 'epoch': 0.1}
+{'loss': 2.4888, 'grad_norm': 1.4796929359436035, 'learning_rate': 4.941520467836258e-06, 'epoch': 0.11}
+{'loss': 2.497, 'grad_norm': 1.3036240339279175, 'learning_rate': 4.9396929824561404e-06, 'epoch': 0.11}
+{'loss': 2.4566, 'grad_norm': 1.309809923171997, 'learning_rate': 4.937865497076024e-06, 'epoch': 0.11}
+{'eval_loss': 2.558666944503784, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.11}
+{'loss': 2.4567, 'grad_norm': 1.414117455482483, 'learning_rate': 4.936038011695907e-06, 'epoch': 0.11}
+{'loss': 2.49, 'grad_norm': 1.4788432121276855, 'learning_rate': 4.9342105263157895e-06, 'epoch': 0.11}
+{'loss': 2.4243, 'grad_norm': 1.4120174646377563, 'learning_rate': 4.932383040935672e-06, 'epoch': 0.12}
+{'loss': 2.4309, 'grad_norm': 1.6317367553710938, 'learning_rate': 4.930555555555556e-06, 'epoch': 0.12}
+{'loss': 2.4456, 'grad_norm': 1.1397351026535034, 'learning_rate': 4.9287280701754395e-06, 'epoch': 0.12}
+{'loss': 2.4707, 'grad_norm': 1.6220897436141968, 'learning_rate': 4.926900584795322e-06, 'epoch': 0.12}
+{'loss': 2.4705, 'grad_norm': 1.2757837772369385, 'learning_rate': 4.925073099415205e-06, 'epoch': 0.12}
+{'loss': 2.4143, 'grad_norm': 1.3141602277755737, 'learning_rate': 4.9232456140350886e-06, 'epoch': 0.13}
+{'loss': 2.4199, 'grad_norm': 1.5668749809265137, 'learning_rate': 4.921418128654971e-06, 'epoch': 0.13}
+{'loss': 2.4307, 'grad_norm': 1.4094359874725342, 'learning_rate': 4.919590643274854e-06, 'epoch': 0.13}
+{'eval_loss': 2.517282247543335, 'eval_runtime': 11.5691, 'eval_samples_per_second': 51.862, 'eval_steps_per_second': 1.124, 'epoch': 0.13}
+{'loss': 2.3747, 'grad_norm': 1.752899169921875, 'learning_rate': 4.917763157894737e-06, 'epoch': 0.13}
+{'loss': 2.4056, 'grad_norm': 1.739943027496338, 'learning_rate': 4.91593567251462e-06, 'epoch': 0.14}
+{'loss': 2.4286, 'grad_norm': 1.6286025047302246, 'learning_rate': 4.914108187134503e-06, 'epoch': 0.14}
+{'loss': 2.4063, 'grad_norm': 1.542277455329895, 'learning_rate': 4.912280701754386e-06, 'epoch': 0.14}
+{'loss': 2.412, 'grad_norm': 1.8128482103347778, 'learning_rate': 4.9104532163742695e-06, 'epoch': 0.14}
+{'loss': 2.4342, 'grad_norm': 1.3743454217910767, 'learning_rate': 4.908625730994152e-06, 'epoch': 0.14}
+{'loss': 2.3785, 'grad_norm': 2.225510835647583, 'learning_rate': 4.906798245614036e-06, 'epoch': 0.15}
+{'loss': 2.4023, 'grad_norm': 1.531154990196228, 'learning_rate': 4.904970760233919e-06, 'epoch': 0.15}
+{'loss': 2.4038, 'grad_norm': 1.983007788658142, 'learning_rate': 4.903143274853801e-06, 'epoch': 0.15}
+{'loss': 2.3977, 'grad_norm': 1.4333405494689941, 'learning_rate': 4.901315789473685e-06, 'epoch': 0.15}
+{'eval_loss': 2.488751173019409, 'eval_runtime': 11.6055, 'eval_samples_per_second': 51.7, 'eval_steps_per_second': 1.12, 'epoch': 0.15}
+{'loss': 2.381, 'grad_norm': 1.7076454162597656, 'learning_rate': 4.899488304093568e-06, 'epoch': 0.16}
+{'loss': 2.3719, 'grad_norm': 1.627768874168396, 'learning_rate': 4.8976608187134504e-06, 'epoch': 0.16}
+{'loss': 2.3685, 'grad_norm': 1.3088836669921875, 'learning_rate': 4.895833333333333e-06, 'epoch': 0.16}
+{'loss': 2.3684, 'grad_norm': 1.7792292833328247, 'learning_rate': 4.894005847953217e-06, 'epoch': 0.16}
+{'loss': 2.377, 'grad_norm': 1.4323128461837769, 'learning_rate': 4.8921783625731e-06, 'epoch': 0.16}
+{'loss': 2.366, 'grad_norm': 1.5406019687652588, 'learning_rate': 4.890350877192983e-06, 'epoch': 0.17}
+{'loss': 2.3304, 'grad_norm': 1.864188313484192, 'learning_rate': 4.888523391812866e-06, 'epoch': 0.17}
+{'loss': 2.3666, 'grad_norm': 1.6635836362838745, 'learning_rate': 4.886695906432749e-06, 'epoch': 0.17}
+{'loss': 2.3656, 'grad_norm': 1.360572099685669, 'learning_rate': 4.884868421052632e-06, 'epoch': 0.17}
+{'loss': 2.3807, 'grad_norm': 1.5489475727081299, 'learning_rate': 4.883040935672515e-06, 'epoch': 0.18}
+{'eval_loss': 2.464202880859375, 'eval_runtime': 11.5982, 'eval_samples_per_second': 51.732, 'eval_steps_per_second': 1.121, 'epoch': 0.18}
+{'loss': 2.351, 'grad_norm': 1.4594776630401611, 'learning_rate': 4.881213450292398e-06, 'epoch': 0.18}
+{'loss': 2.3653, 'grad_norm': 1.4087573289871216, 'learning_rate': 4.879385964912281e-06, 'epoch': 0.18}
+{'loss': 2.3573, 'grad_norm': 2.222598075866699, 'learning_rate': 4.877558479532164e-06, 'epoch': 0.18}
+{'loss': 2.4051, 'grad_norm': 1.8786218166351318, 'learning_rate': 4.875730994152047e-06, 'epoch': 0.18}
+{'loss': 2.3461, 'grad_norm': 1.4465943574905396, 'learning_rate': 4.8739035087719296e-06, 'epoch': 0.19}
+{'loss': 2.3144, 'grad_norm': 1.9490894079208374, 'learning_rate': 4.872076023391813e-06, 'epoch': 0.19}
+{'loss': 2.3444, 'grad_norm': 1.7288326025009155, 'learning_rate': 4.870248538011697e-06, 'epoch': 0.19}
+{'loss': 2.3569, 'grad_norm': 1.7530410289764404, 'learning_rate': 4.8684210526315795e-06, 'epoch': 0.19}
+{'loss': 2.3743, 'grad_norm': 1.4135267734527588, 'learning_rate': 4.866593567251462e-06, 'epoch': 0.19}
+{'loss': 2.3217, 'grad_norm': 1.8368803262710571, 'learning_rate': 4.864766081871346e-06, 'epoch': 0.2}
+{'eval_loss': 2.450228452682495, 'eval_runtime': 11.5967, 'eval_samples_per_second': 51.739, 'eval_steps_per_second': 1.121, 'epoch': 0.2}
+{'loss': 2.3678, 'grad_norm': 1.3603743314743042, 'learning_rate': 4.862938596491229e-06, 'epoch': 0.2}
+{'loss': 2.3663, 'grad_norm': 1.9931479692459106, 'learning_rate': 4.861111111111111e-06, 'epoch': 0.2}
+{'loss': 2.3366, 'grad_norm': 1.4983241558074951, 'learning_rate': 4.859283625730994e-06, 'epoch': 0.2}
+{'loss': 2.3388, 'grad_norm': 1.9140528440475464, 'learning_rate': 4.857456140350878e-06, 'epoch': 0.21}
+{'loss': 2.3373, 'grad_norm': 1.4306626319885254, 'learning_rate': 4.8556286549707604e-06, 'epoch': 0.21}
+{'loss': 2.3393, 'grad_norm': 1.8524028062820435, 'learning_rate': 4.853801169590643e-06, 'epoch': 0.21}
+{'loss': 2.3076, 'grad_norm': 1.4418741464614868, 'learning_rate': 4.851973684210527e-06, 'epoch': 0.21}
+{'loss': 2.3428, 'grad_norm': 1.6648645401000977, 'learning_rate': 4.8501461988304095e-06, 'epoch': 0.21}
+{'loss': 2.3321, 'grad_norm': 1.887403130531311, 'learning_rate': 4.848318713450293e-06, 'epoch': 0.22}
+{'loss': 2.3366, 'grad_norm': 1.9936954975128174, 'learning_rate': 4.846491228070176e-06, 'epoch': 0.22}
+{'eval_loss': 2.4382259845733643, 'eval_runtime': 11.6173, 'eval_samples_per_second': 51.647, 'eval_steps_per_second': 1.119, 'epoch': 0.22}
+{'loss': 2.3472, 'grad_norm': 1.8773130178451538, 'learning_rate': 4.844663742690059e-06, 'epoch': 0.22}
+{'loss': 2.3142, 'grad_norm': 1.8776212930679321, 'learning_rate': 4.842836257309942e-06, 'epoch': 0.22}
+{'loss': 2.3106, 'grad_norm': 2.305266857147217, 'learning_rate': 4.841008771929825e-06, 'epoch': 0.23}
+{'loss': 2.3276, 'grad_norm': 2.13682222366333, 'learning_rate': 4.839181286549708e-06, 'epoch': 0.23}
+{'loss': 2.3762, 'grad_norm': 1.4358876943588257, 'learning_rate': 4.8373538011695905e-06, 'epoch': 0.23}
+{'loss': 2.3149, 'grad_norm': 1.7932581901550293, 'learning_rate': 4.835526315789474e-06, 'epoch': 0.23}
+{'loss': 2.275, 'grad_norm': 1.6192528009414673, 'learning_rate': 4.833698830409358e-06, 'epoch': 0.23}
+{'loss': 2.2949, 'grad_norm': 2.0717737674713135, 'learning_rate': 4.83187134502924e-06, 'epoch': 0.24}
+{'loss': 2.3171, 'grad_norm': 1.6378692388534546, 'learning_rate': 4.830043859649123e-06, 'epoch': 0.24}
+{'loss': 2.3051, 'grad_norm': 1.669114112854004, 'learning_rate': 4.828216374269007e-06, 'epoch': 0.24}
+{'eval_loss': 2.4258029460906982, 'eval_runtime': 11.6018, 'eval_samples_per_second': 51.716, 'eval_steps_per_second': 1.121, 'epoch': 0.24}
+{'loss': 2.3045, 'grad_norm': 1.3886950016021729, 'learning_rate': 4.8263888888888895e-06, 'epoch': 0.24}
+{'loss': 2.3076, 'grad_norm': 1.7219699621200562, 'learning_rate': 4.824561403508772e-06, 'epoch': 0.25}
+{'loss': 2.2857, 'grad_norm': 1.4992568492889404, 'learning_rate': 4.822733918128655e-06, 'epoch': 0.25}
+{'loss': 2.3096, 'grad_norm': 1.7140436172485352, 'learning_rate': 4.820906432748539e-06, 'epoch': 0.25}
+{'loss': 2.3194, 'grad_norm': 1.6086301803588867, 'learning_rate': 4.819078947368421e-06, 'epoch': 0.25}
+{'loss': 2.3498, 'grad_norm': 1.5028151273727417, 'learning_rate': 4.817251461988304e-06, 'epoch': 0.25}
+{'loss': 2.2867, 'grad_norm': 1.7474697828292847, 'learning_rate': 4.815423976608188e-06, 'epoch': 0.26}
+{'loss': 2.2662, 'grad_norm': 1.6730782985687256, 'learning_rate': 4.8135964912280704e-06, 'epoch': 0.26}
+{'loss': 2.2926, 'grad_norm': 2.4538962841033936, 'learning_rate': 4.811769005847954e-06, 'epoch': 0.26}
+{'loss': 2.3217, 'grad_norm': 1.6457512378692627, 'learning_rate': 4.809941520467837e-06, 'epoch': 0.26}
+{'eval_loss': 2.408306121826172, 'eval_runtime': 11.6092, 'eval_samples_per_second': 51.683, 'eval_steps_per_second': 1.12, 'epoch': 0.26}
+{'loss': 2.2943, 'grad_norm': 1.8019167184829712, 'learning_rate': 4.8081140350877195e-06, 'epoch': 0.27}
+{'loss': 2.2945, 'grad_norm': 1.3667467832565308, 'learning_rate': 4.806286549707603e-06, 'epoch': 0.27}
+{'loss': 2.2925, 'grad_norm': 1.6296675205230713, 'learning_rate': 4.804459064327486e-06, 'epoch': 0.27}
+{'loss': 2.2902, 'grad_norm': 1.9956955909729004, 'learning_rate': 4.802631578947369e-06, 'epoch': 0.27}
+{'loss': 2.2738, 'grad_norm': 1.850484848022461, 'learning_rate': 4.800804093567251e-06, 'epoch': 0.27}
+{'loss': 2.2809, 'grad_norm': 1.682741403579712, 'learning_rate': 4.798976608187135e-06, 'epoch': 0.28}
+{'loss': 2.2944, 'grad_norm': 1.5462265014648438, 'learning_rate': 4.797149122807018e-06, 'epoch': 0.28}
+{'loss': 2.3055, 'grad_norm': 1.6992024183273315, 'learning_rate': 4.7953216374269005e-06, 'epoch': 0.28}
+{'loss': 2.2811, 'grad_norm': 2.0903217792510986, 'learning_rate': 4.793494152046784e-06, 'epoch': 0.28}
+{'loss': 2.3058, 'grad_norm': 1.9676622152328491, 'learning_rate': 4.791666666666668e-06, 'epoch': 0.28}
+{'eval_loss': 2.40899658203125, 'eval_runtime': 11.5988, 'eval_samples_per_second': 51.729, 'eval_steps_per_second': 1.121, 'epoch': 0.28}
+{'loss': 2.3126, 'grad_norm': 1.6649582386016846, 'learning_rate': 4.78983918128655e-06, 'epoch': 0.29}
+{'loss': 2.2789, 'grad_norm': 2.4453353881835938, 'learning_rate': 4.788011695906433e-06, 'epoch': 0.29}
+{'loss': 2.3292, 'grad_norm': 2.011908769607544, 'learning_rate': 4.786184210526316e-06, 'epoch': 0.29}
+{'loss': 2.2913, 'grad_norm': 1.4906234741210938, 'learning_rate': 4.7843567251461995e-06, 'epoch': 0.29}
+{'loss': 2.2931, 'grad_norm': 2.1027095317840576, 'learning_rate': 4.782529239766082e-06, 'epoch': 0.3}
+{'loss': 2.3031, 'grad_norm': 1.4204366207122803, 'learning_rate': 4.780701754385965e-06, 'epoch': 0.3}
+{'loss': 2.3196, 'grad_norm': 2.1822638511657715, 'learning_rate': 4.778874269005848e-06, 'epoch': 0.3}
+{'loss': 2.2743, 'grad_norm': 1.7422493696212769, 'learning_rate': 4.777046783625731e-06, 'epoch': 0.3}
+{'loss': 2.285, 'grad_norm': 1.6661350727081299, 'learning_rate': 4.775219298245615e-06, 'epoch': 0.3}
+{'loss': 2.295, 'grad_norm': 1.7462584972381592, 'learning_rate': 4.773391812865498e-06, 'epoch': 0.31}
+{'eval_loss': 2.4030065536499023, 'eval_runtime': 11.608, 'eval_samples_per_second': 51.688, 'eval_steps_per_second': 1.12, 'epoch': 0.31}
+{'loss': 2.2568, 'grad_norm': 1.5412518978118896, 'learning_rate': 4.7715643274853804e-06, 'epoch': 0.31}
+{'loss': 2.2638, 'grad_norm': 1.5836228132247925, 'learning_rate': 4.769736842105264e-06, 'epoch': 0.31}
+{'loss': 2.2783, 'grad_norm': 1.7100133895874023, 'learning_rate': 4.767909356725147e-06, 'epoch': 0.31}
+{'loss': 2.3331, 'grad_norm': 1.7988075017929077, 'learning_rate': 4.7660818713450295e-06, 'epoch': 0.32}
+{'loss': 2.297, 'grad_norm': 1.66475510597229, 'learning_rate': 4.764254385964912e-06, 'epoch': 0.32}
+{'loss': 2.2671, 'grad_norm': 1.811797857284546, 'learning_rate': 4.762426900584796e-06, 'epoch': 0.32}
+{'loss': 2.2555, 'grad_norm': 1.4660041332244873, 'learning_rate': 4.760599415204679e-06, 'epoch': 0.32}
+{'loss': 2.2699, 'grad_norm': 2.041257381439209, 'learning_rate': 4.758771929824561e-06, 'epoch': 0.32}
+{'loss': 2.2772, 'grad_norm': 1.6798973083496094, 'learning_rate': 4.756944444444445e-06, 'epoch': 0.33}
diff --git a/wandb/run-20241101_012733-9v55tr72/files/requirements.txt b/wandb/run-20241101_012733-9v55tr72/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241101_012733-9v55tr72/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json b/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..81f4e96e0da386e846c29c764f9b5648d64b82ed
--- /dev/null
+++ b/wandb/run-20241101_012733-9v55tr72/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-01T05:27:33.891704Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_nondeterministic",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753992261632"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log b/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..e82bcf5af30ba4419f4145672e1330c5342eee52
--- /dev/null
+++ b/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-01T01:27:33.89478546-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T01:27:33.89479698-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-core.log"}
+{"time":"2024-11-01T01:27:34.000399053-04:00","level":"INFO","msg":"created new stream","id":"9v55tr72"}
+{"time":"2024-11-01T01:27:34.000437843-04:00","level":"INFO","msg":"stream: started","id":"9v55tr72"}
+{"time":"2024-11-01T01:27:34.000488243-04:00","level":"INFO","msg":"sender: started","stream_id":"9v55tr72"}
+{"time":"2024-11-01T01:27:34.000483083-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"9v55tr72"}}
+{"time":"2024-11-01T01:27:34.000469223-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"9v55tr72"}}
+{"time":"2024-11-01T01:27:34.182959789-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241101_012733-9v55tr72/logs/debug.log b/wandb/run-20241101_012733-9v55tr72/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..db741336f5379f9f7eb42dda8ab3cf65ebec06ff
--- /dev/null
+++ b/wandb/run-20241101_012733-9v55tr72/logs/debug.log
@@ -0,0 +1,29 @@
+2024-11-01 01:27:33,888 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 01:27:33,888 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Configure stats pid to 678552
+2024-11-01 01:27:33,888 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 01:27:33,888 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 01:27:33,888 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug.log
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012733-9v55tr72/logs/debug-internal.log
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:init():621] calling init triggers
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:init():671] starting backend
+2024-11-01 01:27:33,889 INFO    MainThread:678552 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 01:27:33,891 INFO    MainThread:678552 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 01:27:33,891 INFO    MainThread:678552 [wandb_init.py:init():688] backend started and connected
+2024-11-01 01:27:33,894 INFO    MainThread:678552 [wandb_init.py:init():783] updated telemetry
+2024-11-01 01:27:33,912 INFO    MainThread:678552 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 01:27:34,178 INFO    MainThread:678552 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 01:27:34,264 INFO    MainThread:678552 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 01:27:34,264 INFO    MainThread:678552 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 01:27:34,264 INFO    MainThread:678552 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 01:27:34,264 INFO    MainThread:678552 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 01:27:34,266 INFO    MainThread:678552 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 01:27:34,266 INFO    MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
+2024-11-01 01:33:19,616 INFO    MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 150, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_shuffle_nondeterministic_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False}
+2024-11-01 01:33:19,623 INFO    MainThread:678552 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f574a0c3fa0>>
+2024-11-01 01:33:19,623 INFO    MainThread:678552 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None
diff --git a/wandb/run-20241101_094656-ae4hctp0/files/output.log b/wandb/run-20241101_094656-ae4hctp0/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..127126551236cb66b04fdb9d1bbf00e4210b9038
--- /dev/null
+++ b/wandb/run-20241101_094656-ae4hctp0/files/output.log
@@ -0,0 +1,13 @@
+Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.71s/it]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-01 09:47:04,113] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-01 09:47:14,377] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 4.517135381698608 seconds
diff --git a/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt b/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241101_094656-ae4hctp0/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json b/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddfb439dc3cbb516110be7211d21973744242988
--- /dev/null
+++ b/wandb/run-20241101_094656-ae4hctp0/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-01T13:46:56.380225Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1754695659520"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_094656-ae4hctp0/logs/debug.log b/wandb/run-20241101_094656-ae4hctp0/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..fc8b27166a694bd466abb7311225057a611a3c3a
--- /dev/null
+++ b/wandb/run-20241101_094656-ae4hctp0/logs/debug.log
@@ -0,0 +1,26 @@
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Configure stats pid to 786690
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug.log
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:init():621] calling init triggers
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:init():671] starting backend
+2024-11-01 09:46:56,378 INFO    MainThread:786690 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 09:46:56,379 INFO    MainThread:786690 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 09:46:56,380 INFO    MainThread:786690 [wandb_init.py:init():688] backend started and connected
+2024-11-01 09:46:56,383 INFO    MainThread:786690 [wandb_init.py:init():783] updated telemetry
+2024-11-01 09:46:56,411 INFO    MainThread:786690 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 09:46:56,711 INFO    MainThread:786690 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 09:46:56,846 INFO    MainThread:786690 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 09:46:56,846 INFO    MainThread:786690 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 09:46:56,846 INFO    MainThread:786690 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 09:46:56,847 INFO    MainThread:786690 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 09:46:56,849 INFO    MainThread:786690 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 09:46:56,849 INFO    MainThread:786690 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06}
diff --git a/wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb b/wandb/run-20241101_094656-ae4hctp0/run-ae4hctp0.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/run-20241101_200517-77b12390/files/output.log b/wandb/run-20241101_200517-77b12390/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..73134a1b145f9b9eae01e20f4bdcd927010d9ab3
--- /dev/null
+++ b/wandb/run-20241101_200517-77b12390/files/output.log
@@ -0,0 +1,57 @@
+Downloading shards:   0%|                                                                                                                                                               | 0/2 [00:00<?, ?it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7f1a1c684d60>
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__
+    self.close()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close
+    self.display(pos=0)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display
+    self.sp(self.__str__() if msg is None else msg)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 459, in print_status
+    fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 452, in fp_write
+    fp.write(str(s))
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner
+    return func(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write
+    cb(data)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in <lambda>
+    lambda data: self._console_raw_callback("stderr", data),
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 403, in wrapper_fn
+    return func(self, *args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1547, in _console_raw_callback
+    self._backend.interface.publish_output_raw(name, data)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 721, in publish_output_raw
+    self._publish_output_raw(o)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 79, in _publish_output_raw
+    self._publish(rec)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 50, in _publish
+    self._assign(record)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/interface/interface_sock.py", line 45, in _assign
+    def _assign(self, record: Any) -> None:
+KeyboardInterrupt:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 173, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+    return model_class.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+    cached_filename = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+    with WeakFileLock(lock_path):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+    return next(self.gen)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+    lock.acquire()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+    time.sleep(poll_interval)
+KeyboardInterrupt
diff --git a/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json b/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..37f79f63e00320deea3302d6743afafd21b3338e
--- /dev/null
+++ b/wandb/run-20241101_200517-77b12390/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-02T00:05:17.462510Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_nondeterministic",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1754801557504"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_200517-77b12390/files/wandb-summary.json b/wandb/run-20241101_200517-77b12390/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..c437ff1a48b0e53a8cdd36dcd584a8e6b22b4bc2
--- /dev/null
+++ b/wandb/run-20241101_200517-77b12390/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":7}}
\ No newline at end of file
diff --git a/wandb/run-20241101_200517-77b12390/logs/debug-internal.log b/wandb/run-20241101_200517-77b12390/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..6ac4efda0e718cdf43d1a08e2192d5cc5b75c282
--- /dev/null
+++ b/wandb/run-20241101_200517-77b12390/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2024-11-01T20:05:17.45897271-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T20:05:17.45899154-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug-core.log"}
+{"time":"2024-11-01T20:05:17.566867501-04:00","level":"INFO","msg":"created new stream","id":"77b12390"}
+{"time":"2024-11-01T20:05:17.566910622-04:00","level":"INFO","msg":"stream: started","id":"77b12390"}
+{"time":"2024-11-01T20:05:17.567008312-04:00","level":"INFO","msg":"sender: started","stream_id":"77b12390"}
+{"time":"2024-11-01T20:05:17.566939652-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"77b12390"}}
+{"time":"2024-11-01T20:05:17.566955952-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"77b12390"}}
+{"time":"2024-11-01T20:05:17.736070962-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-01T20:05:25.266366833-04:00","level":"INFO","msg":"stream: closing","id":"77b12390"}
+{"time":"2024-11-01T20:05:25.266402524-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-01T20:05:25.266957688-04:00","level":"INFO","msg":"Stopped system monitor"}
diff --git a/wandb/run-20241101_200517-77b12390/logs/debug.log b/wandb/run-20241101_200517-77b12390/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..71947adb65477bd25fab337dc910ef1e17f6f6d1
--- /dev/null
+++ b/wandb/run-20241101_200517-77b12390/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Configure stats pid to 870381
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 20:05:17,453 INFO    MainThread:870381 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug.log
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-77b12390/logs/debug-internal.log
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:init():621] calling init triggers
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:init():671] starting backend
+2024-11-01 20:05:17,454 INFO    MainThread:870381 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 20:05:17,456 INFO    MainThread:870381 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 20:05:17,456 INFO    MainThread:870381 [wandb_init.py:init():688] backend started and connected
+2024-11-01 20:05:17,465 INFO    MainThread:870381 [wandb_init.py:init():783] updated telemetry
+2024-11-01 20:05:17,492 INFO    MainThread:870381 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 20:05:17,733 INFO    MainThread:870381 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 20:05:17,821 INFO    MainThread:870381 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 20:05:17,821 INFO    MainThread:870381 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 20:05:17,821 INFO    MainThread:870381 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 20:05:17,821 INFO    MainThread:870381 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 20:05:17,822 INFO    MainThread:870381 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 20:05:17,822 INFO    MainThread:870381 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-01 20:05:25,266 WARNING MsgRouterThr:870381 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_200517-iopieyi0/run-iopieyi0.wandb b/wandb/run-20241101_200517-iopieyi0/run-iopieyi0.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb b/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..84712ce5952c0d074021247a0bfdf12463bdccbb
Binary files /dev/null and b/wandb/run-20241101_201707-qparlnlu/run-qparlnlu.wandb differ
diff --git a/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log b/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..3bc4d3693d6886135e94299dfb271daaf90d3682
--- /dev/null
+++ b/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-05T16:18:32.541242362-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-05T16:18:32.541252362-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug-core.log"}
+{"time":"2024-11-05T16:18:32.64870613-05:00","level":"INFO","msg":"created new stream","id":"sl992h9i"}
+{"time":"2024-11-05T16:18:32.648925761-05:00","level":"INFO","msg":"stream: started","id":"sl992h9i"}
+{"time":"2024-11-05T16:18:32.648950381-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"sl992h9i"}}
+{"time":"2024-11-05T16:18:32.649202603-05:00","level":"INFO","msg":"sender: started","stream_id":"sl992h9i"}
+{"time":"2024-11-05T16:18:32.649258273-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"sl992h9i"}}
+{"time":"2024-11-05T16:18:32.866729655-05:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241105_161832-sl992h9i/logs/debug.log b/wandb/run-20241105_161832-sl992h9i/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..919a7094c2ccc3c0af70ed62eb8a0c87e1a32bb3
--- /dev/null
+++ b/wandb/run-20241105_161832-sl992h9i/logs/debug.log
@@ -0,0 +1,29 @@
+2024-11-05 16:18:32,537 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-05 16:18:32,537 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Configure stats pid to 1773597
+2024-11-05 16:18:32,537 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-05 16:18:32,537 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-05 16:18:32,537 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug.log
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_161832-sl992h9i/logs/debug-internal.log
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:init():621] calling init triggers
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:init():671] starting backend
+2024-11-05 16:18:32,538 INFO    MainThread:1773597 [wandb_init.py:init():675] sending inform_init request
+2024-11-05 16:18:32,539 INFO    MainThread:1773597 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-05 16:18:32,539 INFO    MainThread:1773597 [wandb_init.py:init():688] backend started and connected
+2024-11-05 16:18:32,541 INFO    MainThread:1773597 [wandb_init.py:init():783] updated telemetry
+2024-11-05 16:18:32,575 INFO    MainThread:1773597 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-05 16:18:32,863 INFO    MainThread:1773597 [wandb_init.py:init():867] starting run threads in backend
+2024-11-05 16:18:32,949 INFO    MainThread:1773597 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-05 16:18:32,949 INFO    MainThread:1773597 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-05 16:18:32,950 INFO    MainThread:1773597 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-05 16:18:32,950 INFO    MainThread:1773597 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-05 16:18:32,951 INFO    MainThread:1773597 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-05 16:18:32,951 INFO    MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-05 16:24:36,566 INFO    MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_shuffle_deterministic21_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False}
+2024-11-05 16:24:36,569 INFO    MainThread:1773597 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f1cd15c93a0>>
+2024-11-05 16:24:36,569 INFO    MainThread:1773597 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None
diff --git a/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt b/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241105_162824-fa9ep6qh/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log b/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..5a35297ade645a0097ec42bed9de6cd507afa7bc
--- /dev/null
+++ b/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log
@@ -0,0 +1,26 @@
+2024-11-05 16:28:24,422 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-05 16:28:24,422 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Configure stats pid to 1777856
+2024-11-05 16:28:24,422 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-05 16:28:24,422 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug.log
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:init():621] calling init triggers
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:init():671] starting backend
+2024-11-05 16:28:24,423 INFO    MainThread:1777856 [wandb_init.py:init():675] sending inform_init request
+2024-11-05 16:28:24,425 INFO    MainThread:1777856 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-05 16:28:24,425 INFO    MainThread:1777856 [wandb_init.py:init():688] backend started and connected
+2024-11-05 16:28:24,428 INFO    MainThread:1777856 [wandb_init.py:init():783] updated telemetry
+2024-11-05 16:28:24,454 INFO    MainThread:1777856 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-05 16:28:24,752 INFO    MainThread:1777856 [wandb_init.py:init():867] starting run threads in backend
+2024-11-05 16:28:24,846 INFO    MainThread:1777856 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-05 16:28:24,846 INFO    MainThread:1777856 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-05 16:28:24,847 INFO    MainThread:1777856 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-05 16:28:24,847 INFO    MainThread:1777856 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-05 16:28:24,849 INFO    MainThread:1777856 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-05 16:28:24,850 INFO    MainThread:1777856 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic21', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
diff --git a/wandb/run-20241105_162858-6py0unak/run-6py0unak.wandb b/wandb/run-20241105_162858-6py0unak/run-6py0unak.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml b/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..45c9885a80c6c6551af0939982b4765da8541cf4
--- /dev/null
+++ b/wandb/run-20241106_234111-v7pqfaqj/files/config.yaml
@@ -0,0 +1,49 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: shuffle_even_odd
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/output.log b/wandb/run-20241106_234111-v7pqfaqj/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..67dc001e59e5b45c00aba7db4d17418174736f80
--- /dev/null
+++ b/wandb/run-20241106_234111-v7pqfaqj/files/output.log
@@ -0,0 +1,43 @@
+Downloading shards:   0%|                                                                                                                                                                                                                                                                           | 0/2 [00:00<?, ?it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7f76e0736cf0>
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__
+    self.close()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1302, in close
+    self.display(pos=0)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1495, in display
+    self.sp(self.__str__() if msg is None else msg)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 458, in print_status
+    len_s = disp_len(s)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 383, in disp_len
+    return _text_width(RE_ANSI.sub('', data))
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 375, in _text_width
+    return sum(2 if east_asian_width(ch) in 'FW' else 1 for ch in str(s))
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 375, in <genexpr>
+    return sum(2 if east_asian_width(ch) in 'FW' else 1 for ch in str(s))
+KeyboardInterrupt:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+    return model_class.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+    cached_filename = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+    with WeakFileLock(lock_path):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+    return next(self.gen)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+    lock.acquire()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+    time.sleep(poll_interval)
+KeyboardInterrupt
diff --git a/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt b/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241106_234111-v7pqfaqj/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log b/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..9200cd1a9604def4926562b9dfa5f33ba3955450
--- /dev/null
+++ b/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2024-11-06T23:41:11.203613171-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-06T23:41:11.203623461-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug-core.log"}
+{"time":"2024-11-06T23:41:11.309763781-05:00","level":"INFO","msg":"created new stream","id":"v7pqfaqj"}
+{"time":"2024-11-06T23:41:11.309797181-05:00","level":"INFO","msg":"stream: started","id":"v7pqfaqj"}
+{"time":"2024-11-06T23:41:11.309856822-05:00","level":"INFO","msg":"sender: started","stream_id":"v7pqfaqj"}
+{"time":"2024-11-06T23:41:11.309818252-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"v7pqfaqj"}}
+{"time":"2024-11-06T23:41:11.309881832-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"v7pqfaqj"}}
+{"time":"2024-11-06T23:41:11.514969656-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-06T23:43:02.304539765-05:00","level":"INFO","msg":"stream: closing","id":"v7pqfaqj"}
+{"time":"2024-11-06T23:43:02.304582275-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-06T23:43:02.305136719-05:00","level":"INFO","msg":"Stopped system monitor"}
diff --git a/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log b/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..80b6cf6e464cfbe74e794b683479d1a6701f6942
--- /dev/null
+++ b/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-06 23:41:11,199 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-06 23:41:11,199 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Configure stats pid to 1997395
+2024-11-06 23:41:11,199 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-06 23:41:11,199 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug.log
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_234111-v7pqfaqj/logs/debug-internal.log
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:init():621] calling init triggers
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:init():671] starting backend
+2024-11-06 23:41:11,200 INFO    MainThread:1997395 [wandb_init.py:init():675] sending inform_init request
+2024-11-06 23:41:11,201 INFO    MainThread:1997395 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-06 23:41:11,201 INFO    MainThread:1997395 [wandb_init.py:init():688] backend started and connected
+2024-11-06 23:41:11,204 INFO    MainThread:1997395 [wandb_init.py:init():783] updated telemetry
+2024-11-06 23:41:11,227 INFO    MainThread:1997395 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-06 23:41:11,511 INFO    MainThread:1997395 [wandb_init.py:init():867] starting run threads in backend
+2024-11-06 23:41:11,624 INFO    MainThread:1997395 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-06 23:41:11,624 INFO    MainThread:1997395 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-06 23:41:11,624 INFO    MainThread:1997395 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-06 23:41:11,624 INFO    MainThread:1997395 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-06 23:41:11,626 INFO    MainThread:1997395 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-06 23:41:11,626 INFO    MainThread:1997395 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_even_odd', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-06 23:43:02,304 WARNING MsgRouterThr:1997395 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb b/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..bae7aa0efd8cc8a9f486b401d7153ee3eb875b8e
Binary files /dev/null and b/wandb/run-20241107_160818-1kyoikt6/run-1kyoikt6.wandb differ
diff --git a/wandb/run-20241113_180154-y8wvn5hq/files/output.log b/wandb/run-20241113_180154-y8wvn5hq/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..ab4a8210003ba08325765d147693863e16b8737c
--- /dev/null
+++ b/wandb/run-20241113_180154-y8wvn5hq/files/output.log
@@ -0,0 +1,29 @@
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:04<00:00, 269079.17it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1098360/1098360 [00:00<00:00, 3537253.85it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17520/17520 [00:00<00:00, 38005.50it/s]
+Generating train split: 17519 examples [00:07, 2284.97 examples/s]██████████████████████████████████████████████████████████████████████████▊                                         | 13440/17520 [00:00<00:00, 38164.07it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:04<00:00, 251453.58it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086121/1086121 [00:00<00:00, 3190550.09it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18141/18141 [00:00<00:00, 36805.97it/s]
+Generating validation split: 18140 examples [00:08, 2245.37 examples/s]████████████████████████████████████████████████████████████████████████████████████▏                          | 15397/18141 [00:00<00:00, 38602.13it/s]
+100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:03<00:00, 287511.18it/s]
+100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1031323/1031323 [00:00<00:00, 3701774.08it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16483/16483 [00:00<00:00, 37332.58it/s]
+Generating test split: 16482 examples [00:06, 2366.18 examples/s]████████████████████████████████████████████████████████████████                                                     | 11548/16483 [00:00<00:00, 36946.41it/s]
+config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 589kB/s]
+Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.15s/it]
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.03s/it]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:49<00:00, 353.64 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:50<00:00, 358.38 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-13 18:06:38,687] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-13 18:06:49,762] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 12.725266218185425 seconds
diff --git a/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json b/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d488cac5a4f0dab8caee1a17cc3c0fff6cb57cf
--- /dev/null
+++ b/wandb/run-20241113_180154-y8wvn5hq/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-13T23:01:54.134835Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py",
+  "codePath":  "train/train_ftp.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_ftp.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1744802353152"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log b/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..926e2cb335a135cbb4c32f6d1692e15bda6950fb
--- /dev/null
+++ b/wandb/run-20241113_180154-y8wvn5hq/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-13T18:01:54.13856869-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-13T18:01:54.13860319-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241113_180154-y8wvn5hq/logs/debug-core.log"}
+{"time":"2024-11-13T18:01:54.252154361-05:00","level":"INFO","msg":"created new stream","id":"y8wvn5hq"}
+{"time":"2024-11-13T18:01:54.252214081-05:00","level":"INFO","msg":"stream: started","id":"y8wvn5hq"}
+{"time":"2024-11-13T18:01:54.252242311-05:00","level":"INFO","msg":"sender: started","stream_id":"y8wvn5hq"}
+{"time":"2024-11-13T18:01:54.252224001-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"y8wvn5hq"}}
+{"time":"2024-11-13T18:01:54.252245641-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"y8wvn5hq"}}
+{"time":"2024-11-13T18:01:54.482362144-05:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241115_125218-rrve0rbk/files/output.log b/wandb/run-20241115_125218-rrve0rbk/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..5aa223281be6a039d3f567b0f3eb7240c3890288
--- /dev/null
+++ b/wandb/run-20241115_125218-rrve0rbk/files/output.log
@@ -0,0 +1,23 @@
+config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 258kB/s]
+model.safetensors.index.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 18.4MB/s]
+model-00001-of-00002.safetensors: 100%|████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.0MB/s]
+Downloading shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:33<00:00, 76.60s/it]
+Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.92s/it]
+generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 54.9kB/s]
+Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16425/16425 [00:43<00:00, 377.79 examples/s]
+Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:44<00:00, 380.17 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-15 12:56:26,083] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-15 12:56:31,912] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
+Building extension module cpu_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 5.30732536315918 seconds
diff --git a/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt b/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241115_125218-rrve0rbk/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json b/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..35ad49ba614cfe26b6352dac7279e1f68c441251
--- /dev/null
+++ b/wandb/run-20241115_125218-rrve0rbk/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-15T17:52:18.165441Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_deterministic84",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py",
+  "codePath":  "train/train_ftp.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_ftp.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1762493665280"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log b/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..a89ebf414e2306400790ea6a8eb6b033b98fd456
--- /dev/null
+++ b/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log
@@ -0,0 +1,10 @@
+{"time":"2024-11-15T12:52:18.166300035-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-15T12:52:18.166310145-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug-core.log"}
+{"time":"2024-11-15T12:52:18.275265339-05:00","level":"INFO","msg":"created new stream","id":"rrve0rbk"}
+{"time":"2024-11-15T12:52:18.275299929-05:00","level":"INFO","msg":"stream: started","id":"rrve0rbk"}
+{"time":"2024-11-15T12:52:18.275345179-05:00","level":"INFO","msg":"sender: started","stream_id":"rrve0rbk"}
+{"time":"2024-11-15T12:52:18.275322849-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"rrve0rbk"}}
+{"time":"2024-11-15T12:52:18.275333929-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"rrve0rbk"}}
+{"time":"2024-11-15T12:52:18.531175062-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-15T14:35:03.896023097-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/rrve0rbk/file_stream"}
+{"time":"2024-11-15T14:52:44.59452379-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": read tcp 10.81.0.110:36132->35.186.228.49:443: read: connection reset by peer"}
diff --git a/wandb/run-20241115_125218-rrve0rbk/logs/debug.log b/wandb/run-20241115_125218-rrve0rbk/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..3914487e1e37f02bfb67fef6a9685bd9d8d63767
--- /dev/null
+++ b/wandb/run-20241115_125218-rrve0rbk/logs/debug.log
@@ -0,0 +1,26 @@
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Configure stats pid to 2609855
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'}
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-15 12:52:18,162 INFO    MainThread:2609855 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug.log
+2024-11-15 12:52:18,163 INFO    MainThread:2609855 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241115_125218-rrve0rbk/logs/debug-internal.log
+2024-11-15 12:52:18,163 INFO    MainThread:2609855 [wandb_init.py:init():621] calling init triggers
+2024-11-15 12:52:18,163 INFO    MainThread:2609855 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-15 12:52:18,163 INFO    MainThread:2609855 [wandb_init.py:init():671] starting backend
+2024-11-15 12:52:18,163 INFO    MainThread:2609855 [wandb_init.py:init():675] sending inform_init request
+2024-11-15 12:52:18,164 INFO    MainThread:2609855 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-15 12:52:18,165 INFO    MainThread:2609855 [wandb_init.py:init():688] backend started and connected
+2024-11-15 12:52:18,167 INFO    MainThread:2609855 [wandb_init.py:init():783] updated telemetry
+2024-11-15 12:52:18,184 INFO    MainThread:2609855 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-15 12:52:18,528 INFO    MainThread:2609855 [wandb_init.py:init():867] starting run threads in backend
+2024-11-15 12:52:18,618 INFO    MainThread:2609855 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-15 12:52:18,618 INFO    MainThread:2609855 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-15 12:52:18,618 INFO    MainThread:2609855 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-15 12:52:18,618 INFO    MainThread:2609855 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-15 12:52:18,620 INFO    MainThread:2609855 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-15 12:52:18,620 INFO    MainThread:2609855 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
diff --git a/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log b/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..8d2dc365ac0331eccaf0b9f80876093d91786fd3
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log
@@ -0,0 +1,19 @@
+{"time":"2024-11-28T16:15:54.215655506-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-28T16:15:54.215678516-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug-core.log"}
+{"time":"2024-11-28T16:15:54.426700134-05:00","level":"INFO","msg":"created new stream","id":"907lsb28"}
+{"time":"2024-11-28T16:15:54.426746584-05:00","level":"INFO","msg":"stream: started","id":"907lsb28"}
+{"time":"2024-11-28T16:15:54.426791414-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"907lsb28"}}
+{"time":"2024-11-28T16:15:54.427138797-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"907lsb28"}}
+{"time":"2024-11-28T16:15:54.427186257-05:00","level":"INFO","msg":"sender: started","stream_id":"907lsb28"}
+{"time":"2024-11-28T16:15:54.603855087-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-28T22:30:24.964340473-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/907lsb28/file_stream"}
+{"time":"2024-11-29T03:53:49.678920958-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/907lsb28/file_stream"}
+{"time":"2024-11-29T06:59:15.382511638-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-29T06:59:15.388445668-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-29T06:59:15.845898959-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-29T06:59:16.007005818-05:00","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2024-11-29T06:59:17.020469811-05:00","level":"INFO","msg":"stream: closing","id":"907lsb28"}
+{"time":"2024-11-29T06:59:17.020502291-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"907lsb28"}}
+{"time":"2024-11-29T06:59:17.020523081-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"907lsb28"}}
+{"time":"2024-11-29T06:59:17.020579451-05:00","level":"INFO","msg":"sender: closed","stream_id":"907lsb28"}
+{"time":"2024-11-29T06:59:17.020631992-05:00","level":"INFO","msg":"stream: closed","id":"907lsb28"}
diff --git a/wandb/run-20241128_161554-907lsb28/logs/debug.log b/wandb/run-20241128_161554-907lsb28/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..8379c6a57899a14a89219105461c35b590bd8360
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/logs/debug.log
@@ -0,0 +1,33 @@
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Configure stats pid to 3101596
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_llama_1B.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py'}
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug.log
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241128_161554-907lsb28/logs/debug-internal.log
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:init():621] calling init triggers
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:init():671] starting backend
+2024-11-28 16:15:54,208 INFO    MainThread:3101596 [wandb_init.py:init():675] sending inform_init request
+2024-11-28 16:15:54,210 INFO    MainThread:3101596 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-28 16:15:54,210 INFO    MainThread:3101596 [wandb_init.py:init():688] backend started and connected
+2024-11-28 16:15:54,216 INFO    MainThread:3101596 [wandb_init.py:init():783] updated telemetry
+2024-11-28 16:15:54,251 INFO    MainThread:3101596 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-28 16:15:54,600 INFO    MainThread:3101596 [wandb_init.py:init():867] starting run threads in backend
+2024-11-28 16:15:54,714 INFO    MainThread:3101596 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-28 16:15:54,714 INFO    MainThread:3101596 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-28 16:15:54,714 INFO    MainThread:3101596 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-28 16:15:54,714 INFO    MainThread:3101596 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-28 16:15:54,716 INFO    MainThread:3101596 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-28 16:15:54,716 INFO    MainThread:3101596 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-29 06:59:15,374 INFO    MainThread:3101596 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/907lsb28
+2024-11-29 06:59:15,381 INFO    MainThread:3101596 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-11-29 06:59:15,381 INFO    MainThread:3101596 [wandb_run.py:_restore():2408] restore
+2024-11-29 06:59:15,382 INFO    MainThread:3101596 [wandb_run.py:_restore():2414] restore done
+2024-11-29 06:59:17,010 INFO    MainThread:3101596 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-11-29 06:59:17,010 INFO    MainThread:3101596 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-11-29 06:59:17,019 INFO    MainThread:3101596 [wandb_run.py:_footer_sync_info():3934] logging synced files