diff --git a/wandb/run-20241030_012617-0h15y3p4/files/config.yaml b/wandb/run-20241030_012617-0h15y3p4/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f8546482f67cbb9041af9d756594322c485d67a9
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/files/config.yaml
@@ -0,0 +1,47 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 7
+perturbation:
+    value: reverse_control
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241030_012617-0h15y3p4/files/output.log b/wandb/run-20241030_012617-0h15y3p4/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..c78613b68fdc40d0d678a230555c23d17949c02e
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/files/output.log
@@ -0,0 +1,24 @@
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.96s/it]
+Map:  11%|██████████████                                                                                                                  | 2000/18140 [00:09<01:18, 206.29 examples/s]
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 181, in <module>
+    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 560, in wrapper
+    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3035, in map
+    for rank, done, content in Dataset._map_single(**dataset_kwargs):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3438, in _map_single
+    batch = apply_function_on_filtered_inputs(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/datasets/arrow_dataset.py", line 3300, in apply_function_on_filtered_inputs
+    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 179, in tokenize_function
+    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3024, in __call__
+    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3112, in _call_one
+    return self.batch_encode_plus(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3314, in batch_encode_plus
+    return self._batch_encode_plus(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py", line 529, in _batch_encode_plus
+    encodings = self._tokenizer.encode_batch(
+KeyboardInterrupt
diff --git a/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt b/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json b/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..de14e93a5b60db7fa9a88b096e6ab0f55d17cbff
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T05:26:17.391721Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1709772775424"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json b/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..1476d70fd36aa7b2a81bf4776ad653af3dd34436
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":14}}
\ No newline at end of file
diff --git a/wandb/run-20241030_012617-0h15y3p4/logs/debug.log b/wandb/run-20241030_012617-0h15y3p4/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..b771b9b8af92083fe681715e0944a8477a493ec7
--- /dev/null
+++ b/wandb/run-20241030_012617-0h15y3p4/logs/debug.log
@@ -0,0 +1,27 @@
+2024-10-30 01:26:17,385 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-30 01:26:17,385 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Configure stats pid to 332626
+2024-10-30 01:26:17,385 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-0h15y3p4/logs/debug.log
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_012617-0h15y3p4/logs/debug-internal.log
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:init():621] calling init triggers
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:init():671] starting backend
+2024-10-30 01:26:17,386 INFO    MainThread:332626 [wandb_init.py:init():675] sending inform_init request
+2024-10-30 01:26:17,391 INFO    MainThread:332626 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-30 01:26:17,391 INFO    MainThread:332626 [wandb_init.py:init():688] backend started and connected
+2024-10-30 01:26:17,395 INFO    MainThread:332626 [wandb_init.py:init():783] updated telemetry
+2024-10-30 01:26:17,456 INFO    MainThread:332626 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-30 01:26:17,693 INFO    MainThread:332626 [wandb_init.py:init():867] starting run threads in backend
+2024-10-30 01:26:17,829 INFO    MainThread:332626 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-30 01:26:17,829 INFO    MainThread:332626 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-30 01:26:17,829 INFO    MainThread:332626 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-30 01:26:17,829 INFO    MainThread:332626 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-30 01:26:17,831 INFO    MainThread:332626 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-30 01:26:17,831 INFO    MainThread:332626 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0}
+2024-10-30 01:26:32,054 WARNING MsgRouterThr:332626 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241030_112700-j5l8vh9z/files/output.log b/wandb/run-20241030_112700-j5l8vh9z/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..b2f6de40700ad07202d2c98a78053598b93b2a49
--- /dev/null
+++ b/wandb/run-20241030_112700-j5l8vh9z/files/output.log
@@ -0,0 +1,43 @@
+Downloading shards:   0%|                                                                                                                                        | 0/2 [01:32<?, ?it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7fc104557cf0>
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1196, in __iter__
+    self.close()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1303, in close
+    fp_write('\n')
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/std.py", line 1287, in fp_write
+    self.fp.write(str(s))
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/tqdm/utils.py", line 196, in inner
+    return func(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/lib/redirect.py", line 648, in write
+    cb(data)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 2386, in <lambda>
+    lambda data: self._console_raw_callback("stderr", data),
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 400, in wrapper_fn
+    @functools.wraps(func)
+KeyboardInterrupt:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 172, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
+    return model_class.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
+    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
+    cached_filename = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1380, in _hf_hub_download_to_cache_dir
+    with WeakFileLock(lock_path):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/contextlib.py", line 119, in __enter__
+    return next(self.gen)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_fixes.py", line 98, in WeakFileLock
+    lock.acquire()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/filelock/_api.py", line 225, in acquire
+    time.sleep(poll_interval)
+KeyboardInterrupt
diff --git a/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json b/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2368b469a7622ce8a9fd6763085f395c896aa6a4
--- /dev/null
+++ b/wandb/run-20241030_112700-j5l8vh9z/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-30T15:27:00.728207Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1710831083520"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log b/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..61d86e3ac061b1c203b67badd2555b9c9aee5e58
--- /dev/null
+++ b/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2024-10-30T11:27:00.732261133-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-30T11:27:00.732288613-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug-core.log"}
+{"time":"2024-10-30T11:27:00.843951357-04:00","level":"INFO","msg":"created new stream","id":"j5l8vh9z"}
+{"time":"2024-10-30T11:27:00.844003227-04:00","level":"INFO","msg":"stream: started","id":"j5l8vh9z"}
+{"time":"2024-10-30T11:27:00.844048658-04:00","level":"INFO","msg":"sender: started","stream_id":"j5l8vh9z"}
+{"time":"2024-10-30T11:27:00.844032848-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"j5l8vh9z"}}
+{"time":"2024-10-30T11:27:00.844096438-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"j5l8vh9z"}}
+{"time":"2024-10-30T11:27:01.06475121-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-30T11:28:34.163673047-04:00","level":"INFO","msg":"stream: closing","id":"j5l8vh9z"}
+{"time":"2024-10-30T11:28:34.163727308-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-30T11:28:34.164260991-04:00","level":"INFO","msg":"Stopped system monitor"}
diff --git a/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log b/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..70fa6af18e3ddda9ce755d4c5568b446e21235b4
--- /dev/null
+++ b/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log
@@ -0,0 +1,27 @@
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Configure stats pid to 366801
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-30 11:27:00,724 INFO    MainThread:366801 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug.log
+2024-10-30 11:27:00,725 INFO    MainThread:366801 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_112700-j5l8vh9z/logs/debug-internal.log
+2024-10-30 11:27:00,725 INFO    MainThread:366801 [wandb_init.py:init():621] calling init triggers
+2024-10-30 11:27:00,725 INFO    MainThread:366801 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-30 11:27:00,725 INFO    MainThread:366801 [wandb_init.py:init():671] starting backend
+2024-10-30 11:27:00,725 INFO    MainThread:366801 [wandb_init.py:init():675] sending inform_init request
+2024-10-30 11:27:00,727 INFO    MainThread:366801 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-30 11:27:00,727 INFO    MainThread:366801 [wandb_init.py:init():688] backend started and connected
+2024-10-30 11:27:00,734 INFO    MainThread:366801 [wandb_init.py:init():783] updated telemetry
+2024-10-30 11:27:00,767 INFO    MainThread:366801 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-30 11:27:01,061 INFO    MainThread:366801 [wandb_init.py:init():867] starting run threads in backend
+2024-10-30 11:27:01,152 INFO    MainThread:366801 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-30 11:27:01,152 INFO    MainThread:366801 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-30 11:27:01,152 INFO    MainThread:366801 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-30 11:27:01,152 INFO    MainThread:366801 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-30 11:27:01,153 INFO    MainThread:366801 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-30 11:27:01,153 INFO    MainThread:366801 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0}
+2024-10-30 11:28:34,163 WARNING MsgRouterThr:366801 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb b/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..9272ba814e28998a29a5ccaa03e341c983bd7cdc
Binary files /dev/null and b/wandb/run-20241030_112700-j5l8vh9z/run-j5l8vh9z.wandb differ
diff --git a/wandb/run-20241030_225833-frh96rd1/files/output.log b/wandb/run-20241030_225833-frh96rd1/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..32fc2696987cc845b2d6bb810e450ed1bebb1433
--- /dev/null
+++ b/wandb/run-20241030_225833-frh96rd1/files/output.log
@@ -0,0 +1,47 @@
+model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 7.08MB/s]
+model-00001-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.1MB/s]
+model-00002-of-00002.safetensors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s]
+Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.29s/it]
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.44s/it]
+generation_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 112kB/s]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:57<00:00, 317.00 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-10-30 23:02:10,716] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-10-30 23:02:19,960] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
+Building extension module cpu_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 4.80836296081543 seconds
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in <module>
+    trainer.train()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train
+    return inner_training_loop(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward
+    self.deepspeed_engine_wrapped.backward(loss, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 195, in backward
+    self.engine.step()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2217, in step
+    self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/timer.py", line 256, in stop
+    get_accelerator().synchronize()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 79, in synchronize
+    return torch.cuda.synchronize(device_index)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/cuda/__init__.py", line 688, in synchronize
+    return torch._C._cuda_synchronize()
+KeyboardInterrupt
+Error in atexit._run_exitfuncs:
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 27, in is_nfs_path
diff --git a/wandb/run-20241030_225833-giupspdj/files/output.log b/wandb/run-20241030_225833-giupspdj/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..fc8996c4f6d6f3b055b93f5a6c6c673f8962b7f2
--- /dev/null
+++ b/wandb/run-20241030_225833-giupspdj/files/output.log
@@ -0,0 +1,37 @@
+Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.29s/it]
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.56s/it]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:55<00:00, 325.18 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-10-30 23:02:09,367] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-10-30 23:02:20,085] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 4.662875652313232 seconds
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 219, in <module>
+    trainer.train()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train
+    return inner_training_loop(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3518, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/accelerator.py", line 2238, in backward
+    self.deepspeed_engine_wrapped.backward(loss, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/accelerate/utils/deepspeed.py", line 195, in backward
+    self.engine.step()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 2217, in step
+    self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/timer.py", line 256, in stop
+    get_accelerator().synchronize()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/accelerator/cuda_accelerator.py", line 79, in synchronize
+    return torch.cuda.synchronize(device_index)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/cuda/__init__.py", line 688, in synchronize
+    return torch._C._cuda_synchronize()
+KeyboardInterrupt
diff --git a/wandb/run-20241030_225833-giupspdj/files/requirements.txt b/wandb/run-20241030_225833-giupspdj/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241030_225833-giupspdj/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json b/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..737cd9119ccc1dbf8e6c47dd60dc03514c886d6f
--- /dev/null
+++ b/wandb/run-20241030_225833-giupspdj/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-10-31T02:58:33.522570Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1710970511360"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241030_225833-giupspdj/logs/debug.log b/wandb/run-20241030_225833-giupspdj/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..774a295762a125dc603fcaf2ccd0d7ab13b7e0a1
--- /dev/null
+++ b/wandb/run-20241030_225833-giupspdj/logs/debug.log
@@ -0,0 +1,26 @@
+2024-10-30 22:58:33,520 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-30 22:58:33,520 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Configure stats pid to 451914
+2024-10-30 22:58:33,520 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-30 22:58:33,520 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-30 22:58:33,520 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug.log
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241030_225833-giupspdj/logs/debug-internal.log
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:init():621] calling init triggers
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:init():671] starting backend
+2024-10-30 22:58:33,521 INFO    MainThread:451914 [wandb_init.py:init():675] sending inform_init request
+2024-10-30 22:58:33,522 INFO    MainThread:451914 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-30 22:58:33,522 INFO    MainThread:451914 [wandb_init.py:init():688] backend started and connected
+2024-10-30 22:58:33,525 INFO    MainThread:451914 [wandb_init.py:init():783] updated telemetry
+2024-10-30 22:58:33,563 INFO    MainThread:451914 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-30 22:58:33,828 INFO    MainThread:451914 [wandb_init.py:init():867] starting run threads in backend
+2024-10-30 22:58:33,940 INFO    MainThread:451914 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-30 22:58:33,940 INFO    MainThread:451914 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-30 22:58:33,940 INFO    MainThread:451914 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-30 22:58:33,940 INFO    MainThread:451914 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-30 22:58:33,942 INFO    MainThread:451914 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-30 22:58:33,942 INFO    MainThread:451914 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0}
diff --git a/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log b/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..02fd60fc3b50a5bbd53fd5b95611efab4510664e
--- /dev/null
+++ b/wandb/run-20241031_002020-q6ot1vz6/logs/debug-internal.log
@@ -0,0 +1,17 @@
+{"time":"2024-10-31T00:20:20.452039426-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-31T00:20:20.452050226-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-q6ot1vz6/logs/debug-core.log"}
+{"time":"2024-10-31T00:20:20.559317297-04:00","level":"INFO","msg":"created new stream","id":"q6ot1vz6"}
+{"time":"2024-10-31T00:20:20.559364417-04:00","level":"INFO","msg":"stream: started","id":"q6ot1vz6"}
+{"time":"2024-10-31T00:20:20.559412138-04:00","level":"INFO","msg":"sender: started","stream_id":"q6ot1vz6"}
+{"time":"2024-10-31T00:20:20.559390118-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q6ot1vz6"}}
+{"time":"2024-10-31T00:20:20.559429398-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q6ot1vz6"}}
+{"time":"2024-10-31T00:20:21.383758901-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-31T09:17:19.826891967-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-31T09:17:19.904241229-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-10-31T09:17:20.764796755-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.618214398,"subtasks":[{"desc":"wandb-job.json","runtime_seconds":0.005705472,"progress":"563B/563B"}]}],"total_operations":1}}
+{"time":"2024-10-31T09:17:22.016866722-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-10-31T09:17:23.132121347-04:00","level":"INFO","msg":"stream: closing","id":"q6ot1vz6"}
+{"time":"2024-10-31T09:17:23.132161377-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"q6ot1vz6"}}
+{"time":"2024-10-31T09:17:23.132197758-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"q6ot1vz6"}}
+{"time":"2024-10-31T09:17:23.132240638-04:00","level":"INFO","msg":"sender: closed","stream_id":"q6ot1vz6"}
+{"time":"2024-10-31T09:17:23.132261438-04:00","level":"INFO","msg":"stream: closed","id":"q6ot1vz6"}
diff --git a/wandb/run-20241031_002020-u516mysu/files/config.yaml b/wandb/run-20241031_002020-u516mysu/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c0d8a2ed7dfb36d28050f2045c26df3a861ac3cb
--- /dev/null
+++ b/wandb/run-20241031_002020-u516mysu/files/config.yaml
@@ -0,0 +1,531 @@
+_name_or_path:
+    value: meta-llama/Llama-3.2-3B
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m:
+            - "1": train/learning_rate
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "1": eval/loss
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/grad_norm
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/epoch
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/runtime
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/samples_per_second
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/steps_per_second
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/loss
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 7
+                - 13
+                - 19
+                - 23
+                - 55
+                - 62
+                - 66
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "9":
+                "1": transformers_trainer
+            "12": 0.18.5
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adafactor:
+    value: false
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.999
+adam_epsilon:
+    value: 1e-08
+add_cross_attention:
+    value: false
+architectures:
+    value:
+        - LlamaForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+bad_words_ids:
+    value: null
+batch_eval_metrics:
+    value: false
+batch_size:
+    value: 3
+begin_suppress_tokens:
+    value: null
+bf16:
+    value: false
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: 128000
+chunk_size_feed_forward:
+    value: 0
+cross_attention_hidden_size:
+    value: null
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 1800
+debug:
+    value: []
+decoder_start_token_id:
+    value: null
+deepspeed:
+    value: deepspeed_config/train_dp_config.json
+disable_tqdm:
+    value: false
+dispatch_batches:
+    value: null
+diversity_penalty:
+    value: 0
+do_eval:
+    value: true
+do_predict:
+    value: false
+do_sample:
+    value: false
+do_train:
+    value: false
+early_stopping:
+    value: false
+encoder_no_repeat_ngram_size:
+    value: 0
+eos_token_id:
+    value: 128001
+epoch:
+    value: 6
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: 10
+eval_strategy:
+    value: steps
+eval_use_gather_object:
+    value: false
+evaluation_strategy:
+    value: steps
+exponential_decay_length_penalty:
+    value: null
+finetuning_task:
+    value: null
+forced_bos_token_id:
+    value: null
+forced_eos_token_id:
+    value: null
+fp16:
+    value: true
+fp16_backend:
+    value: auto
+fp16_full_eval:
+    value: false
+fp16_opt_level:
+    value: O1
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+fsdp_min_num_params:
+    value: 0
+fsdp_transformer_layer_cls_to_wrap:
+    value: null
+full_determinism:
+    value: false
+gradient_accumulation_steps:
+    value: 2
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+half_precision_backend:
+    value: auto
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 3072
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: false
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_inputs_for_metrics:
+    value: false
+include_num_input_tokens_seen:
+    value: false
+include_tokens_per_second:
+    value: false
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 8192
+is_decoder:
+    value: false
+is_encoder_decoder:
+    value: false
+jit_mode_eval:
+    value: false
+label_names:
+    value: null
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+learning_rate:
+    value: 1e-05
+length_column_name:
+    value: length
+length_penalty:
+    value: 1
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: 0
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: ./logs
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr:
+    value: 1e-05
+lr_scheduler_type:
+    value: linear
+max_grad_norm:
+    value: 1
+max_length:
+    value: 20
+max_position_embeddings:
+    value: 131072
+max_steps:
+    value: -1
+metric_for_best_model:
+    value: null
+min_length:
+    value: 0
+mlp_bias:
+    value: false
+model/num_parameters:
+    value: 3212749824
+model_type:
+    value: llama
+mp_parameters:
+    value: ""
+neftune_noise_alpha:
+    value: null
+no_cuda:
+    value: false
+no_repeat_ngram_size:
+    value: 0
+num_attention_heads:
+    value: 24
+num_beam_groups:
+    value: 1
+num_beams:
+    value: 1
+num_hidden_layers:
+    value: 28
+num_key_value_heads:
+    value: 8
+num_return_sequences:
+    value: 1
+num_train_epochs:
+    value: 6
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: ./checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs
+output_hidden_states:
+    value: false
+output_scores:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: null
+past_index:
+    value: -1
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 3
+per_gpu_eval_batch_size:
+    value: null
+per_gpu_train_batch_size:
+    value: null
+perturbation:
+    value: reverse_full
+prediction_loss_only:
+    value: false
+prefix:
+    value: null
+pretraining_tp:
+    value: 1
+problem_type:
+    value: null
+push_to_hub:
+    value: false
+push_to_hub_model_id:
+    value: null
+push_to_hub_organization:
+    value: null
+push_to_hub_token:
+    value: <PUSH_TO_HUB_TOKEN>
+ray_scope:
+    value: last
+remove_invalid_values:
+    value: false
+remove_unused_columns:
+    value: true
+repetition_penalty:
+    value: 1
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+return_dict_in_generate:
+    value: false
+rms_norm_eps:
+    value: 1e-05
+rope_scaling:
+    value:
+        factor: 32
+        high_freq_factor: 4
+        low_freq_factor: 1
+        original_max_position_embeddings: 8192
+        rope_type: llama3
+rope_theta:
+    value: 500000
+run_name:
+    value: ./checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs
+save_on_each_node:
+    value: false
+save_only_model:
+    value: false
+save_safetensors:
+    value: true
+save_steps:
+    value: 150
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 0
+sep_token_id:
+    value: null
+skip_memory_metrics:
+    value: true
+split_batches:
+    value: null
+suppress_tokens:
+    value: null
+task_specific_params:
+    value: null
+temperature:
+    value: 1
+tf_legacy_loss:
+    value: false
+tf32:
+    value: null
+tie_encoder_decoder:
+    value: false
+tie_word_embeddings:
+    value: true
+tokenizer_class:
+    value: null
+top_k:
+    value: 50
+top_p:
+    value: 1
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_dtype:
+    value: bfloat16
+torch_empty_cache_steps:
+    value: null
+torchdynamo:
+    value: null
+torchscript:
+    value: false
+tpu_metrics_debug:
+    value: false
+tpu_num_cores:
+    value: null
+train_set:
+    value: 10M
+transformers_version:
+    value: 4.45.1
+typical_p:
+    value: 1
+use_bfloat16:
+    value: false
+use_cache:
+    value: true
+use_cpu:
+    value: false
+use_ipex:
+    value: false
+use_legacy_prediction_loop:
+    value: false
+use_liger_kernel:
+    value: false
+use_mps_device:
+    value: false
+vocab_size:
+    value: 128256
+warmup_ratio:
+    value: 0
+warmup_steps:
+    value: 0
+weight_decay:
+    value: 0
diff --git a/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log b/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..68b903deeeb62d45d492756eced75ade0a58fa90
--- /dev/null
+++ b/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log
@@ -0,0 +1,18 @@
+{"time":"2024-10-31T00:20:20.449414915-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-31T00:20:20.449431546-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug-core.log"}
+{"time":"2024-10-31T00:20:20.559027395-04:00","level":"INFO","msg":"created new stream","id":"u516mysu"}
+{"time":"2024-10-31T00:20:20.559094905-04:00","level":"INFO","msg":"stream: started","id":"u516mysu"}
+{"time":"2024-10-31T00:20:20.559173576-04:00","level":"INFO","msg":"sender: started","stream_id":"u516mysu"}
+{"time":"2024-10-31T00:20:20.559167146-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"u516mysu"}}
+{"time":"2024-10-31T00:20:20.559122936-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"u516mysu"}}
+{"time":"2024-10-31T00:20:21.390564264-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-31T06:03:54.968293904-04:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/impossible_llm_reverse/u516mysu/file_stream"}
+{"time":"2024-10-31T09:17:19.827018178-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-31T09:17:19.904181869-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-10-31T09:17:20.764779975-04:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.617401592}],"total_operations":1}}
+{"time":"2024-10-31T09:17:22.149262664-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-10-31T09:17:23.303472881-04:00","level":"INFO","msg":"stream: closing","id":"u516mysu"}
+{"time":"2024-10-31T09:17:23.303503961-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"u516mysu"}}
+{"time":"2024-10-31T09:17:23.303534061-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"u516mysu"}}
+{"time":"2024-10-31T09:17:23.303602892-04:00","level":"INFO","msg":"sender: closed","stream_id":"u516mysu"}
+{"time":"2024-10-31T09:17:23.303611352-04:00","level":"INFO","msg":"stream: closed","id":"u516mysu"}
diff --git a/wandb/run-20241031_002020-u516mysu/logs/debug.log b/wandb/run-20241031_002020-u516mysu/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..1afa6b7ef9c09c53c0777549acfff35de17b1d71
--- /dev/null
+++ b/wandb/run-20241031_002020-u516mysu/logs/debug.log
@@ -0,0 +1,36 @@
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Configure stats pid to 484455
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug.log
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_002020-u516mysu/logs/debug-internal.log
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:init():621] calling init triggers
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:init():671] starting backend
+2024-10-31 00:20:20,445 INFO    MainThread:484455 [wandb_init.py:init():675] sending inform_init request
+2024-10-31 00:20:20,446 INFO    MainThread:484455 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-31 00:20:20,446 INFO    MainThread:484455 [wandb_init.py:init():688] backend started and connected
+2024-10-31 00:20:20,449 INFO    MainThread:484455 [wandb_init.py:init():783] updated telemetry
+2024-10-31 00:20:20,478 INFO    MainThread:484455 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-31 00:20:21,385 INFO    MainThread:484455 [wandb_init.py:init():867] starting run threads in backend
+2024-10-31 00:20:21,499 INFO    MainThread:484455 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-31 00:20:21,499 INFO    MainThread:484455 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-31 00:20:21,499 INFO    MainThread:484455 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-31 00:20:21,499 INFO    MainThread:484455 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-31 00:20:21,501 INFO    MainThread:484455 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-31 00:20:21,502 INFO    MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 1e-05}
+2024-10-31 00:23:47,389 INFO    MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 1e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 6, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 150, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False}
+2024-10-31 00:23:47,392 INFO    MainThread:484455 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f17a2be4dc0>>
+2024-10-31 00:23:47,392 INFO    MainThread:484455 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None
+2024-10-31 09:17:19,686 INFO    MainThread:484455 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/impossible_llm_reverse/u516mysu
+2024-10-31 09:17:19,700 INFO    MainThread:484455 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-10-31 09:17:19,748 INFO    MainThread:484455 [wandb_run.py:_restore():2408] restore
+2024-10-31 09:17:19,749 INFO    MainThread:484455 [wandb_run.py:_restore():2414] restore done
+2024-10-31 09:17:23,296 INFO    MainThread:484455 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-10-31 09:17:23,297 INFO    MainThread:484455 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-10-31 09:17:23,302 INFO    MainThread:484455 [wandb_run.py:_footer_sync_info():3934] logging synced files
diff --git a/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log b/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..449e7db43958d1811f55c5624677b32ca276915c
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log
@@ -0,0 +1,11 @@
+{"time":"2024-10-31T12:20:05.848495974-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-10-31T12:20:05.848507284-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug-core.log"}
+{"time":"2024-10-31T12:20:05.957242925-04:00","level":"INFO","msg":"created new stream","id":"nip14lm6"}
+{"time":"2024-10-31T12:20:05.957288975-04:00","level":"INFO","msg":"stream: started","id":"nip14lm6"}
+{"time":"2024-10-31T12:20:05.957371865-04:00","level":"INFO","msg":"sender: started","stream_id":"nip14lm6"}
+{"time":"2024-10-31T12:20:05.957351625-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"nip14lm6"}}
+{"time":"2024-10-31T12:20:05.957317185-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"nip14lm6"}}
+{"time":"2024-10-31T12:20:06.183349635-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-10-31T12:20:29.345034007-04:00","level":"INFO","msg":"stream: closing","id":"nip14lm6"}
+{"time":"2024-10-31T12:20:29.345114157-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-10-31T12:20:29.345967457-04:00","level":"INFO","msg":"Stopped system monitor"}
diff --git a/wandb/run-20241031_122005-nip14lm6/logs/debug.log b/wandb/run-20241031_122005-nip14lm6/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..4448025aeae355f0d8cccc7100ff8fd3dda627af
--- /dev/null
+++ b/wandb/run-20241031_122005-nip14lm6/logs/debug.log
@@ -0,0 +1,27 @@
+2024-10-31 12:20:05,843 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-10-31 12:20:05,843 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Configure stats pid to 557184
+2024-10-31 12:20:05,843 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-10-31 12:20:05,843 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-10-31 12:20:05,843 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug.log
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241031_122005-nip14lm6/logs/debug-internal.log
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:init():621] calling init triggers
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:init():671] starting backend
+2024-10-31 12:20:05,844 INFO    MainThread:557184 [wandb_init.py:init():675] sending inform_init request
+2024-10-31 12:20:05,845 INFO    MainThread:557184 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-10-31 12:20:05,846 INFO    MainThread:557184 [wandb_init.py:init():688] backend started and connected
+2024-10-31 12:20:05,848 INFO    MainThread:557184 [wandb_init.py:init():783] updated telemetry
+2024-10-31 12:20:05,876 INFO    MainThread:557184 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-10-31 12:20:06,180 INFO    MainThread:557184 [wandb_init.py:init():867] starting run threads in backend
+2024-10-31 12:20:06,275 INFO    MainThread:557184 [wandb_run.py:_console_start():2463] atexit reg
+2024-10-31 12:20:06,275 INFO    MainThread:557184 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-10-31 12:20:06,275 INFO    MainThread:557184 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-10-31 12:20:06,275 INFO    MainThread:557184 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-10-31 12:20:06,277 INFO    MainThread:557184 [wandb_init.py:init():911] run started, returning control to user process
+2024-10-31 12:20:06,278 INFO    MainThread:557184 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
+2024-10-31 12:20:29,345 WARNING MsgRouterThr:557184 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log b/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..c39014a4072f99cd273db3609f2747a7e4996e57
--- /dev/null
+++ b/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log
@@ -0,0 +1,16 @@
+{"time":"2024-11-01T01:24:38.163791445-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T01:24:38.163804515-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug-core.log"}
+{"time":"2024-11-01T01:24:38.271328113-04:00","level":"INFO","msg":"created new stream","id":"61w48leq"}
+{"time":"2024-11-01T01:24:38.271368763-04:00","level":"INFO","msg":"stream: started","id":"61w48leq"}
+{"time":"2024-11-01T01:24:38.271434224-04:00","level":"INFO","msg":"sender: started","stream_id":"61w48leq"}
+{"time":"2024-11-01T01:24:38.271418254-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"61w48leq"}}
+{"time":"2024-11-01T01:24:38.271399604-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"61w48leq"}}
+{"time":"2024-11-01T01:24:38.451315095-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-01T01:24:38.692246251-04:00","level":"INFO","msg":"stream: closing","id":"61w48leq"}
+{"time":"2024-11-01T01:24:38.692349652-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-01T01:24:38.693222148-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-01T01:24:39.149304751-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-01T01:24:39.271906984-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"61w48leq"}}
+{"time":"2024-11-01T01:24:39.271942114-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"61w48leq"}}
+{"time":"2024-11-01T01:24:39.271971665-04:00","level":"INFO","msg":"sender: closed","stream_id":"61w48leq"}
+{"time":"2024-11-01T01:24:39.272008615-04:00","level":"INFO","msg":"stream: closed","id":"61w48leq"}
diff --git a/wandb/run-20241101_012438-61w48leq/logs/debug.log b/wandb/run-20241101_012438-61w48leq/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..68de204bf74879370b4bc4117222ad3bf0a8b263
--- /dev/null
+++ b/wandb/run-20241101_012438-61w48leq/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Configure stats pid to 676353
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug.log
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012438-61w48leq/logs/debug-internal.log
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:init():621] calling init triggers
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:init():671] starting backend
+2024-11-01 01:24:38,159 INFO    MainThread:676353 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 01:24:38,160 INFO    MainThread:676353 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 01:24:38,161 INFO    MainThread:676353 [wandb_init.py:init():688] backend started and connected
+2024-11-01 01:24:38,164 INFO    MainThread:676353 [wandb_init.py:init():783] updated telemetry
+2024-11-01 01:24:38,193 INFO    MainThread:676353 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 01:24:38,446 INFO    MainThread:676353 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 01:24:38,577 INFO    MainThread:676353 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 01:24:38,577 INFO    MainThread:676353 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 01:24:38,577 INFO    MainThread:676353 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 01:24:38,577 INFO    MainThread:676353 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 01:24:38,579 INFO    MainThread:676353 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 01:24:38,579 INFO    MainThread:676353 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
+2024-11-01 01:24:38,692 WARNING MsgRouterThr:676353 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_012612-q08jbqqf/files/config.yaml b/wandb/run-20241101_012612-q08jbqqf/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..16d25ee0e4092474360045faaf0959cfc1b3e91d
--- /dev/null
+++ b/wandb/run-20241101_012612-q08jbqqf/files/config.yaml
@@ -0,0 +1,49 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 6
+lr:
+    value: 5e-06
+perturbation:
+    value: shuffle_nodeterministic
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json b/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ca844f44cfc77ae610b2b54bf343f13cb912b30
--- /dev/null
+++ b/wandb/run-20241101_012612-q08jbqqf/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-01T05:26:12.824647Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_nodeterministic",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "6",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1753992228864"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json b/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf
--- /dev/null
+++ b/wandb/run-20241101_012612-q08jbqqf/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":0}}
\ No newline at end of file
diff --git a/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log b/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..fb44c1f3d5d613485f01b7bc8632a31a11de550a
--- /dev/null
+++ b/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log
@@ -0,0 +1,16 @@
+{"time":"2024-11-01T01:26:12.826583242-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T01:26:12.826594312-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug-core.log"}
+{"time":"2024-11-01T01:26:12.933324283-04:00","level":"INFO","msg":"created new stream","id":"q08jbqqf"}
+{"time":"2024-11-01T01:26:12.933362294-04:00","level":"INFO","msg":"stream: started","id":"q08jbqqf"}
+{"time":"2024-11-01T01:26:12.933399464-04:00","level":"INFO","msg":"sender: started","stream_id":"q08jbqqf"}
+{"time":"2024-11-01T01:26:12.933367414-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"q08jbqqf"}}
+{"time":"2024-11-01T01:26:12.933399204-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"q08jbqqf"}}
+{"time":"2024-11-01T01:26:13.132185735-04:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-01T01:26:13.242452211-04:00","level":"INFO","msg":"stream: closing","id":"q08jbqqf"}
+{"time":"2024-11-01T01:26:13.242504401-04:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-01T01:26:13.242989514-04:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-01T01:26:13.986483698-04:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-01T01:26:14.10924044-04:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"q08jbqqf"}}
+{"time":"2024-11-01T01:26:14.109339001-04:00","level":"INFO","msg":"sender: closed","stream_id":"q08jbqqf"}
+{"time":"2024-11-01T01:26:14.109321291-04:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"q08jbqqf"}}
+{"time":"2024-11-01T01:26:14.109424031-04:00","level":"INFO","msg":"stream: closed","id":"q08jbqqf"}
diff --git a/wandb/run-20241101_012612-q08jbqqf/logs/debug.log b/wandb/run-20241101_012612-q08jbqqf/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..0ff3c089ad4b805d3b13ab890c328411db76ec11
--- /dev/null
+++ b/wandb/run-20241101_012612-q08jbqqf/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-01 01:26:12,822 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 01:26:12,822 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Configure stats pid to 677636
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug.log
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_012612-q08jbqqf/logs/debug-internal.log
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:init():621] calling init triggers
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:init():671] starting backend
+2024-11-01 01:26:12,823 INFO    MainThread:677636 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 01:26:12,824 INFO    MainThread:677636 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 01:26:12,824 INFO    MainThread:677636 [wandb_init.py:init():688] backend started and connected
+2024-11-01 01:26:12,827 INFO    MainThread:677636 [wandb_init.py:init():783] updated telemetry
+2024-11-01 01:26:12,846 INFO    MainThread:677636 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 01:26:13,128 INFO    MainThread:677636 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 01:26:13,232 INFO    MainThread:677636 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 01:26:13,232 INFO    MainThread:677636 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 01:26:13,232 INFO    MainThread:677636 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 01:26:13,232 INFO    MainThread:677636 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 01:26:13,234 INFO    MainThread:677636 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 01:26:13,234 INFO    MainThread:677636 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nodeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 6, 'seed': 0, 'lr': 5e-06}
+2024-11-01 01:26:13,242 WARNING MsgRouterThr:677636 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/output.log b/wandb/run-20241101_094656-1l5gkwzn/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..13fabfc08270592f775992de99610ab2e814da93
--- /dev/null
+++ b/wandb/run-20241101_094656-1l5gkwzn/files/output.log
@@ -0,0 +1,13 @@
+Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.60s/it]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-01 09:47:03,375] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-01 09:47:12,454] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 4.764644622802734 seconds
diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt b/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..95a931302e269cc9e4fa5b719b6511f176ee2416
--- /dev/null
+++ b/wandb/run-20241101_094656-1l5gkwzn/files/requirements.txt
@@ -0,0 +1,147 @@
+funcsigs==1.0.2
+sentry-sdk==2.17.0
+multiprocess==0.70.16
+numpy==1.26.2
+pluralizer==1.2.0
+debugpy==1.6.7
+nvidia-cudnn-cu11==8.5.0.96
+deepspeed==0.15.2
+data==0.4
+pandas==2.1.3
+tomli==2.0.1
+charset-normalizer==3.3.2
+attrs==24.2.0
+aiosignal==1.3.1
+fsspec==2023.10.0
+nvidia-cusparse-cu11==11.7.4.91
+zipp==3.12.0
+mypy-extensions==1.0.0
+datasets==3.0.1
+joblib==1.3.2
+hjson==3.1.0
+traitlets==5.7.1
+stack-data==0.6.0
+transformers==4.45.1
+sympy==1.11.1
+Pygments==2.15.0
+docker-pycreds==0.4.0
+dill==0.3.8
+wheel==0.44.0
+prompt-toolkit==3.0.30
+parso==0.8.3
+ipykernel==6.23.1
+pyarrow==17.0.0
+certifi==2023.11.17
+nvidia-cufft-cu11==10.9.0.58
+six==1.16.0
+pydantic==2.9.2
+click==8.1.7
+nest-asyncio==1.5.6
+gmpy2==2.1.0
+matplotlib==3.8.2
+scipy==1.11.4
+typing_extensions==4.12.2
+statsmodels==0.14.0
+huggingface-hub==0.25.0
+frozenlist==1.4.1
+gpustat==1.1.1
+nvidia-nvtx-cu11==11.7.91
+safetensors==0.4.5
+stanza==1.9.2
+decorator==5.1.1
+seaborn==0.13.0
+sentencepiece==0.2.0
+PyYAML==6.0.1
+black==24.8.0
+protobuf==4.25.1
+pickleshare==0.7.5
+peft==0.13.0
+triton==2.0.0
+nvidia-cuda-runtime-cu11==11.7.99
+Jinja2==3.1.2
+nvidia-cusolver-cu11==11.4.0.1
+executing==1.2.0
+jupyter_client==8.1.0
+pluggy==1.3.0
+cmake==3.30.3
+pytz==2023.3.post1
+aiohappyeyeballs==2.4.2
+kiwisolver==1.4.5
+py-cpuinfo==9.0.0
+Pillow==10.1.0
+ptyprocess==0.7.0
+importlib_resources==6.4.5
+GitPython==3.1.43
+importlib-metadata==6.0.0
+iniconfig==2.0.0
+scikit-learn==1.3.2
+exceptiongroup==1.1.0
+networkx==2.8.6
+accelerate==1.0.0
+nltk==3.8.1
+shutilwhich==1.1.0
+fonttools==4.45.1
+future==0.18.3
+aiohttp==3.10.6
+wcwidth==0.2.5
+idna==3.6
+filelock==3.12.2
+pathspec==0.12.1
+jupyter_core==5.1.0
+lit==18.1.8
+nvidia-curand-cu11==10.2.10.91
+nvidia-cublas-cu11==11.10.3.66
+nvidia-ml-py==12.560.30
+msgpack==1.1.0
+python-dateutil==2.8.2
+blessed==1.20.0
+packaging==23.0
+gitdb==4.0.11
+yarl==1.13.0
+emoji==2.8.0
+tzdata==2023.3
+cycler==0.12.1
+tornado==6.2
+backcall==0.2.0
+plotnine==0.12.4
+ninja==1.11.1.1
+latex==0.7.0
+wandb==0.18.5
+setproctitle==1.3.3
+threadpoolctl==3.2.0
+requests==2.32.3
+pyparsing==3.1.1
+smmap==5.0.1
+pyzmq==23.0.0
+async-timeout==4.0.3
+annotated-types==0.7.0
+matplotlib-inline==0.1.6
+latexcodec==1.0.0
+ipython==8.0.0
+patsy==0.5.3
+contourpy==1.2.0
+multidict==6.1.0
+mizani==0.9.3
+urllib3==2.1.0
+tokenizers==0.20.0
+MarkupSafe==2.1.2
+pip==24.2
+pexpect==4.8.0
+tqdm==4.66.5
+jedi==0.18.2
+pydantic_core==2.23.4
+tempdir==0.7.1
+mpmath==1.2.1
+setuptools==72.1.0
+pytest==7.4.3
+pure-eval==0.2.2
+psutil==5.9.1
+comm==0.1.2
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+regex==2023.10.3
+platformdirs==2.5.2
+asttokens==2.2.1
+torch==2.0.0
+nvidia-nccl-cu11==2.14.3
+xxhash==3.5.0
diff --git a/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json b/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..452b0524d4317efa0dbb2f9a90d1a4d7fe1e83eb
--- /dev/null
+++ b/wandb/run-20241101_094656-1l5gkwzn/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-01T13:46:56.156739Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "7",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1754695659520"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log b/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d829038c9d5b9554d19a65234536a15a272d2620
--- /dev/null
+++ b/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-01T09:46:56.158847121-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T09:46:56.158862091-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug-core.log"}
+{"time":"2024-11-01T09:46:56.26479573-04:00","level":"INFO","msg":"created new stream","id":"1l5gkwzn"}
+{"time":"2024-11-01T09:46:56.264831431-04:00","level":"INFO","msg":"stream: started","id":"1l5gkwzn"}
+{"time":"2024-11-01T09:46:56.264866161-04:00","level":"INFO","msg":"sender: started","stream_id":"1l5gkwzn"}
+{"time":"2024-11-01T09:46:56.264860761-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"1l5gkwzn"}}
+{"time":"2024-11-01T09:46:56.264880331-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"1l5gkwzn"}}
+{"time":"2024-11-01T09:46:56.464351323-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log b/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..5de6a2c78b672edc0b540519172bf6aca0bc2149
--- /dev/null
+++ b/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log
@@ -0,0 +1,26 @@
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Configure stats pid to 786691
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 09:46:56,154 INFO    MainThread:786691 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug.log
+2024-11-01 09:46:56,155 INFO    MainThread:786691 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-1l5gkwzn/logs/debug-internal.log
+2024-11-01 09:46:56,155 INFO    MainThread:786691 [wandb_init.py:init():621] calling init triggers
+2024-11-01 09:46:56,155 INFO    MainThread:786691 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 09:46:56,155 INFO    MainThread:786691 [wandb_init.py:init():671] starting backend
+2024-11-01 09:46:56,155 INFO    MainThread:786691 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 09:46:56,156 INFO    MainThread:786691 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 09:46:56,156 INFO    MainThread:786691 [wandb_init.py:init():688] backend started and connected
+2024-11-01 09:46:56,159 INFO    MainThread:786691 [wandb_init.py:init():783] updated telemetry
+2024-11-01 09:46:56,188 INFO    MainThread:786691 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 09:46:56,460 INFO    MainThread:786691 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 09:46:56,562 INFO    MainThread:786691 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 09:46:56,562 INFO    MainThread:786691 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 09:46:56,562 INFO    MainThread:786691 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 09:46:56,562 INFO    MainThread:786691 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 09:46:56,563 INFO    MainThread:786691 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 09:46:56,563 INFO    MainThread:786691 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_control', 'train_set': '10M', 'batch_size': 3, 'epoch': 7, 'seed': 0, 'lr': 5e-06}
diff --git a/wandb/run-20241101_094656-1l5gkwzn/run-1l5gkwzn.wandb b/wandb/run-20241101_094656-1l5gkwzn/run-1l5gkwzn.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log b/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..ede4f7b5feabb3a899069ab919316ac6ad8850f8
--- /dev/null
+++ b/wandb/run-20241101_094656-ae4hctp0/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-01T09:46:56.382128939-04:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-01T09:46:56.382144359-04:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_094656-ae4hctp0/logs/debug-core.log"}
+{"time":"2024-11-01T09:46:56.488019778-04:00","level":"INFO","msg":"created new stream","id":"ae4hctp0"}
+{"time":"2024-11-01T09:46:56.488068848-04:00","level":"INFO","msg":"stream: started","id":"ae4hctp0"}
+{"time":"2024-11-01T09:46:56.488137609-04:00","level":"INFO","msg":"sender: started","stream_id":"ae4hctp0"}
+{"time":"2024-11-01T09:46:56.488116569-04:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"ae4hctp0"}}
+{"time":"2024-11-01T09:46:56.488165299-04:00","level":"INFO","msg":"handler: started","stream_id":{"value":"ae4hctp0"}}
+{"time":"2024-11-01T09:46:56.721095514-04:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241101_200517-iopieyi0/logs/debug.log b/wandb/run-20241101_200517-iopieyi0/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..bd086c650cc4e3715cb37ddfc82c8428eb23a42c
--- /dev/null
+++ b/wandb/run-20241101_200517-iopieyi0/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Configure stats pid to 870382
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-iopieyi0/logs/debug.log
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_200517-iopieyi0/logs/debug-internal.log
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_init.py:init():621] calling init triggers
+2024-11-01 20:05:17,138 INFO    MainThread:870382 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 20:05:17,139 INFO    MainThread:870382 [wandb_init.py:init():671] starting backend
+2024-11-01 20:05:17,139 INFO    MainThread:870382 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 20:05:17,140 INFO    MainThread:870382 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 20:05:17,140 INFO    MainThread:870382 [wandb_init.py:init():688] backend started and connected
+2024-11-01 20:05:17,144 INFO    MainThread:870382 [wandb_init.py:init():783] updated telemetry
+2024-11-01 20:05:17,174 INFO    MainThread:870382 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 20:05:17,485 INFO    MainThread:870382 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 20:05:17,574 INFO    MainThread:870382 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 20:05:17,574 INFO    MainThread:870382 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 20:05:17,574 INFO    MainThread:870382 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 20:05:17,574 INFO    MainThread:870382 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 20:05:17,575 INFO    MainThread:870382 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 20:05:17,575 INFO    MainThread:870382 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-01 20:05:25,263 WARNING MsgRouterThr:870382 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml b/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4e34fec43368c51fb4bd10a24a21dd490ecdba44
--- /dev/null
+++ b/wandb/run-20241101_201927-8tmqrwpx/files/config.yaml
@@ -0,0 +1,49 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: shuffle_nondeterministic
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/output.log b/wandb/run-20241101_201927-8tmqrwpx/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..8f5d7632122e3fe769565f1f46c07735f03f81d0
--- /dev/null
+++ b/wandb/run-20241101_201927-8tmqrwpx/files/output.log
@@ -0,0 +1,49 @@
+Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.57s/it]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 600
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-01 20:19:34,442] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-01 20:19:43,897] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 5.528482675552368 seconds
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 220, in <module>
+    trainer.train()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2052, in train
+    return inner_training_loop(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3485, in training_step
+    loss = self.compute_loss(model, inputs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/trainer.py", line 3532, in compute_loss
+    outputs = model(**inputs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
+    ret_val = func(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1899, in forward
+    loss = self.module(*inputs, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1189, in forward
+    outputs = self.model(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1000, in forward
+    layer_outputs = decoder_layer(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 729, in forward
+    hidden_states, self_attn_weights, present_key_value = self.self_attn(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 426, in forward
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/torch/nn/functional.py", line 1845, in softmax
+    ret = input.softmax(dim, dtype=dtype)
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 288.00 MiB (GPU 2; 47.54 GiB total capacity; 11.61 GiB already allocated; 228.56 MiB free; 11.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..941e95de56bd4bf8bcbb80b8ce46b7475cf41903
--- /dev/null
+++ b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-02T00:19:27.013147Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_nondeterministic",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1754803679232"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..e676be4c6d48d550ccacb029fbc772ebc6173ea8
--- /dev/null
+++ b/wandb/run-20241101_201927-8tmqrwpx/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":58}}
\ No newline at end of file
diff --git a/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log b/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..5ca8a8fa7f4c232e41f9332954c80c234f523ad8
--- /dev/null
+++ b/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-01 20:19:27,009 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Configure stats pid to 878463
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201927-8tmqrwpx/logs/debug.log
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241101_201927-8tmqrwpx/logs/debug-internal.log
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:init():621] calling init triggers
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:init():671] starting backend
+2024-11-01 20:19:27,010 INFO    MainThread:878463 [wandb_init.py:init():675] sending inform_init request
+2024-11-01 20:19:27,012 INFO    MainThread:878463 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-01 20:19:27,012 INFO    MainThread:878463 [wandb_init.py:init():688] backend started and connected
+2024-11-01 20:19:27,015 INFO    MainThread:878463 [wandb_init.py:init():783] updated telemetry
+2024-11-01 20:19:27,055 INFO    MainThread:878463 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-01 20:19:27,353 INFO    MainThread:878463 [wandb_init.py:init():867] starting run threads in backend
+2024-11-01 20:19:27,444 INFO    MainThread:878463 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-01 20:19:27,444 INFO    MainThread:878463 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-01 20:19:27,444 INFO    MainThread:878463 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-01 20:19:27,444 INFO    MainThread:878463 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-01 20:19:27,446 INFO    MainThread:878463 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-01 20:19:27,446 INFO    MainThread:878463 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_nondeterministic', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-01 20:20:25,213 WARNING MsgRouterThr:878463 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb b/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..6c33c8d31fbc4c042628ce6336ba26bc8ebe9380
Binary files /dev/null and b/wandb/run-20241101_201927-8tmqrwpx/run-8tmqrwpx.wandb differ
diff --git a/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log b/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..12ec1e4850e033e139424f3816b844015877ffd5
--- /dev/null
+++ b/wandb/run-20241105_162824-fa9ep6qh/logs/debug-internal.log
@@ -0,0 +1,8 @@
+{"time":"2024-11-05T16:28:24.428721708-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-05T16:28:24.428748458-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_162824-fa9ep6qh/logs/debug-core.log"}
+{"time":"2024-11-05T16:28:24.539378288-05:00","level":"INFO","msg":"created new stream","id":"fa9ep6qh"}
+{"time":"2024-11-05T16:28:24.539423788-05:00","level":"INFO","msg":"stream: started","id":"fa9ep6qh"}
+{"time":"2024-11-05T16:28:24.539487428-05:00","level":"INFO","msg":"sender: started","stream_id":"fa9ep6qh"}
+{"time":"2024-11-05T16:28:24.539483048-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"fa9ep6qh"}}
+{"time":"2024-11-05T16:28:24.539488928-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"fa9ep6qh"}}
+{"time":"2024-11-05T16:28:24.757184161-05:00","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/run-20241105_223842-16dt45ha/files/output.log b/wandb/run-20241105_223842-16dt45ha/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..bbd8e1b0b103d08d71a90dac9d33fbbce0e4dc71
--- /dev/null
+++ b/wandb/run-20241105_223842-16dt45ha/files/output.log
@@ -0,0 +1,23 @@
+config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 844/844 [00:00<00:00, 337kB/s]
+model.safetensors.index.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 19.0MB/s]
+model-00001-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:58<00:00, 42.0MB/s]
+model-00002-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.1MB/s]
+Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.48s/it]
+Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.35s/it]
+generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 74.9kB/s]
+Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:48<00:00, 349.55 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-05 22:42:10,933] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-05 22:42:18,255] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
+Building extension module cpu_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 4.65723180770874 seconds
diff --git a/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json b/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..e359928bdc06be0020ae82591e31d49883ba162e
--- /dev/null
+++ b/wandb/run-20241105_223842-16dt45ha/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-06T03:38:42.422757Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_deterministic57",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1785078448128"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log b/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..917126591d74988aa62006d8a8850f844897946b
--- /dev/null
+++ b/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log
@@ -0,0 +1,20 @@
+{"time":"2024-11-05T22:38:42.4258052-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-05T22:38:42.42582231-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug-core.log"}
+{"time":"2024-11-05T22:38:42.533615549-05:00","level":"INFO","msg":"created new stream","id":"16dt45ha"}
+{"time":"2024-11-05T22:38:42.533662479-05:00","level":"INFO","msg":"stream: started","id":"16dt45ha"}
+{"time":"2024-11-05T22:38:42.53368456-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"16dt45ha"}}
+{"time":"2024-11-05T22:38:42.53379328-05:00","level":"INFO","msg":"sender: started","stream_id":"16dt45ha"}
+{"time":"2024-11-05T22:38:42.53375215-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"16dt45ha"}}
+{"time":"2024-11-05T22:38:42.702252167-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-06T13:13:43.051966319-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha/file_stream"}
+{"time":"2024-11-06T13:17:11.729410268-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2024-11-06T13:59:59.091926096-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha/file_stream"}
+{"time":"2024-11-06T20:18:24.44115812-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-06T20:18:24.467757337-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-06T20:18:24.881758243-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-06T20:18:25.015661128-05:00","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2024-11-06T20:18:26.027043225-05:00","level":"INFO","msg":"stream: closing","id":"16dt45ha"}
+{"time":"2024-11-06T20:18:26.027121786-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"16dt45ha"}}
+{"time":"2024-11-06T20:18:26.027163536-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"16dt45ha"}}
+{"time":"2024-11-06T20:18:26.027225756-05:00","level":"INFO","msg":"sender: closed","stream_id":"16dt45ha"}
+{"time":"2024-11-06T20:18:26.027289227-05:00","level":"INFO","msg":"stream: closed","id":"16dt45ha"}
diff --git a/wandb/run-20241105_223842-16dt45ha/logs/debug.log b/wandb/run-20241105_223842-16dt45ha/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..2faa777ded1e5752843579ddac954fa3f6522010
--- /dev/null
+++ b/wandb/run-20241105_223842-16dt45ha/logs/debug.log
@@ -0,0 +1,33 @@
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Configure stats pid to 1803214
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-05 22:38:42,419 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug.log
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241105_223842-16dt45ha/logs/debug-internal.log
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:init():621] calling init triggers
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:init():671] starting backend
+2024-11-05 22:38:42,420 INFO    MainThread:1803214 [wandb_init.py:init():675] sending inform_init request
+2024-11-05 22:38:42,422 INFO    MainThread:1803214 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-05 22:38:42,422 INFO    MainThread:1803214 [wandb_init.py:init():688] backend started and connected
+2024-11-05 22:38:42,426 INFO    MainThread:1803214 [wandb_init.py:init():783] updated telemetry
+2024-11-05 22:38:42,465 INFO    MainThread:1803214 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-05 22:38:42,698 INFO    MainThread:1803214 [wandb_init.py:init():867] starting run threads in backend
+2024-11-05 22:38:42,788 INFO    MainThread:1803214 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-05 22:38:42,788 INFO    MainThread:1803214 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-05 22:38:42,788 INFO    MainThread:1803214 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-05 22:38:42,788 INFO    MainThread:1803214 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-05 22:38:42,790 INFO    MainThread:1803214 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-05 22:38:42,790 INFO    MainThread:1803214 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic57', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-06 20:18:24,430 INFO    MainThread:1803214 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/16dt45ha
+2024-11-06 20:18:24,439 INFO    MainThread:1803214 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-11-06 20:18:24,440 INFO    MainThread:1803214 [wandb_run.py:_restore():2408] restore
+2024-11-06 20:18:24,440 INFO    MainThread:1803214 [wandb_run.py:_restore():2414] restore done
+2024-11-06 20:18:26,018 INFO    MainThread:1803214 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-11-06 20:18:26,018 INFO    MainThread:1803214 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-11-06 20:18:26,026 INFO    MainThread:1803214 [wandb_run.py:_footer_sync_info():3934] logging synced files
diff --git a/wandb/run-20241106_224236-lcylopmq/files/config.yaml b/wandb/run-20241106_224236-lcylopmq/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..507bc548a43d1c7afb495d052831f04f348b9c17
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/files/config.yaml
@@ -0,0 +1,49 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: shuffle_deterministic84
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241106_224236-lcylopmq/files/output.log b/wandb/run-20241106_224236-lcylopmq/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..16507ca71b603951c4fc600d8f40d9a32af49231
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/files/output.log
@@ -0,0 +1,60 @@
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 406, in hf_raise_for_status
+    response.raise_for_status()
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/requests/models.py", line 1024, in raise_for_status
+    raise HTTPError(http_error_msg, response=self)
+requests.exceptions.HTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json
+
+The above exception was the direct cause of the following exception:
+
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 403, in cached_file
+    resolved_file = hf_hub_download(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py", line 101, in inner_f
+    return f(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1232, in hf_hub_download
+    return _hf_hub_download_to_cache_dir(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1339, in _hf_hub_download_to_cache_dir
+    _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1854, in _raise_on_head_call_error
+    raise head_call_error
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1746, in _get_metadata_or_catch_error
+    metadata = get_hf_file_metadata(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
+    return fn(*args, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 1666, in get_hf_file_metadata
+    r = _request_wrapper(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 364, in _request_wrapper
+    response = _request_wrapper(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/file_download.py", line 388, in _request_wrapper
+    hf_raise_for_status(response)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/huggingface_hub/utils/_http.py", line 423, in hf_raise_for_status
+    raise _format(GatedRepoError, message, response) from e
+huggingface_hub.errors.GatedRepoError: 401 Client Error. (Request ID: Root=1-672c372c-7865a00f7bd975bd3318bf53;0f1d0061-5d18-4db3-bc77-04ef93856428)
+
+Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json.
+Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in.
+
+The above exception was the direct cause of the following exception:
+
+Traceback (most recent call last):
+  File "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py", line 174, in <module>
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 526, in from_pretrained
+    config, kwargs = AutoConfig.from_pretrained(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 1006, in from_pretrained
+    config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 567, in get_config_dict
+    config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/configuration_utils.py", line 626, in _get_config_dict
+    resolved_config_file = cached_file(
+  File "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/utils/hub.py", line 421, in cached_file
+    raise EnvironmentError(
+OSError: You are trying to access a gated repo.
+Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B.
+401 Client Error. (Request ID: Root=1-672c372c-7865a00f7bd975bd3318bf53;0f1d0061-5d18-4db3-bc77-04ef93856428)
+
+Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B/resolve/main/config.json.
+Access to model meta-llama/Llama-3.2-3B is restricted. You must have access to it and be authenticated to access it. Please log in.
diff --git a/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json b/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a85a7b990920f54d8240bf89ecd28bd11efd74c
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-07T03:42:36.158647Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_deterministic84",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py",
+  "codePath":  "train/train_deep_wandb.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_deep_wandb.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1774852591616"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json b/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c37fe1cbbb8aed86fd461a79642cb991e4d35cf
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":0}}
\ No newline at end of file
diff --git a/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log b/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d9a15cc25de3c0b6d83a401177ddded8076d147f
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log
@@ -0,0 +1,16 @@
+{"time":"2024-11-06T22:42:36.160514293-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-06T22:42:36.160525653-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug-core.log"}
+{"time":"2024-11-06T22:42:36.266257449-05:00","level":"INFO","msg":"created new stream","id":"lcylopmq"}
+{"time":"2024-11-06T22:42:36.266285829-05:00","level":"INFO","msg":"stream: started","id":"lcylopmq"}
+{"time":"2024-11-06T22:42:36.266359779-05:00","level":"INFO","msg":"sender: started","stream_id":"lcylopmq"}
+{"time":"2024-11-06T22:42:36.266358289-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"lcylopmq"}}
+{"time":"2024-11-06T22:42:36.266325909-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"lcylopmq"}}
+{"time":"2024-11-06T22:42:36.431323953-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-06T22:42:36.726762468-05:00","level":"INFO","msg":"stream: closing","id":"lcylopmq"}
+{"time":"2024-11-06T22:42:36.726788588-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-06T22:42:36.727316972-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-06T22:42:37.048793203-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-06T22:42:37.163097395-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"lcylopmq"}}
+{"time":"2024-11-06T22:42:37.163124896-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"lcylopmq"}}
+{"time":"2024-11-06T22:42:37.163158956-05:00","level":"INFO","msg":"sender: closed","stream_id":"lcylopmq"}
+{"time":"2024-11-06T22:42:37.163174736-05:00","level":"INFO","msg":"stream: closed","id":"lcylopmq"}
diff --git a/wandb/run-20241106_224236-lcylopmq/logs/debug.log b/wandb/run-20241106_224236-lcylopmq/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..2bf5eaa78c8405ca2ab279ed85d234f4b6f0ed3f
--- /dev/null
+++ b/wandb/run-20241106_224236-lcylopmq/logs/debug.log
@@ -0,0 +1,27 @@
+2024-11-06 22:42:36,156 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Configure stats pid to 1982053
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_deep_wandb.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_deep_wandb.py'}
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug.log
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241106_224236-lcylopmq/logs/debug-internal.log
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:init():621] calling init triggers
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:init():671] starting backend
+2024-11-06 22:42:36,157 INFO    MainThread:1982053 [wandb_init.py:init():675] sending inform_init request
+2024-11-06 22:42:36,158 INFO    MainThread:1982053 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-06 22:42:36,158 INFO    MainThread:1982053 [wandb_init.py:init():688] backend started and connected
+2024-11-06 22:42:36,161 INFO    MainThread:1982053 [wandb_init.py:init():783] updated telemetry
+2024-11-06 22:42:36,183 INFO    MainThread:1982053 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-06 22:42:36,427 INFO    MainThread:1982053 [wandb_init.py:init():867] starting run threads in backend
+2024-11-06 22:42:36,525 INFO    MainThread:1982053 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-06 22:42:36,525 INFO    MainThread:1982053 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-06 22:42:36,525 INFO    MainThread:1982053 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-06 22:42:36,525 INFO    MainThread:1982053 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-06 22:42:36,527 INFO    MainThread:1982053 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-06 22:42:36,527 INFO    MainThread:1982053 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-06 22:42:36,726 WARNING MsgRouterThr:1982053 [router.py:message_loop():77] message_loop has been closed
diff --git a/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb b/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..9301f449e750273f5e3ea7b83966928498540c52
Binary files /dev/null and b/wandb/run-20241106_224236-lcylopmq/run-lcylopmq.wandb differ
diff --git a/wandb/run-20241114_090201-6a5c399u/files/config.yaml b/wandb/run-20241114_090201-6a5c399u/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5feb2a98b43e2e1c45b273f0fd6ee4e8b6deba5d
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/files/config.yaml
@@ -0,0 +1,50 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: reverse_full
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241114_090201-6a5c399u/files/output.log b/wandb/run-20241114_090201-6a5c399u/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..70e7e2006d8e04a5a709325c3593e4406af7261a
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/files/output.log
@@ -0,0 +1,14 @@
+Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.17s/it]
+Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:48<00:00, 373.53 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-14 09:03:10,872] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-14 09:03:18,323] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 6.166570425033569 seconds
diff --git a/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json b/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f322f8cf1f65f386cf648753131cf6686b755c9
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-14T14:02:01.488483Z",
+  "args":  [
+    "--perturbation",
+    "reverse_full",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py",
+  "codePath":  "train/train_ftp.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_ftp.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1745683763200"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json b/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..0853048d5f5f5f9c30a84b53b581e5ac26c7f5b3
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":46317}}
\ No newline at end of file
diff --git a/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log b/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..8b8629587b752e76907b56bc91a413d044bd8bba
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log
@@ -0,0 +1,19 @@
+{"time":"2024-11-14T09:02:01.491584174-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-14T09:02:01.491603844-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug-core.log"}
+{"time":"2024-11-14T09:02:01.604859038-05:00","level":"INFO","msg":"created new stream","id":"6a5c399u"}
+{"time":"2024-11-14T09:02:01.604928089-05:00","level":"INFO","msg":"stream: started","id":"6a5c399u"}
+{"time":"2024-11-14T09:02:01.60499237-05:00","level":"INFO","msg":"sender: started","stream_id":"6a5c399u"}
+{"time":"2024-11-14T09:02:01.60496991-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"6a5c399u"}}
+{"time":"2024-11-14T09:02:01.60499665-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"6a5c399u"}}
+{"time":"2024-11-14T09:02:01.846552564-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-14T12:42:47.253040258-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-reverse/6a5c399u/file_stream"}
+{"time":"2024-11-14T19:05:46.838454977-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2024-11-14T21:53:58.50116399-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-14T21:53:58.654316298-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-14T21:53:59.303756025-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-14T21:53:59.431028349-05:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading summary","runtime_seconds":0.127079923}],"total_operations":1}}
+{"time":"2024-11-14T21:54:00.531902324-05:00","level":"INFO","msg":"stream: closing","id":"6a5c399u"}
+{"time":"2024-11-14T21:54:00.532112326-05:00","level":"INFO","msg":"sender: closed","stream_id":"6a5c399u"}
+{"time":"2024-11-14T21:54:00.532030445-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"6a5c399u"}}
+{"time":"2024-11-14T21:54:00.531933455-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"6a5c399u"}}
+{"time":"2024-11-14T21:54:00.545435925-05:00","level":"INFO","msg":"stream: closed","id":"6a5c399u"}
diff --git a/wandb/run-20241114_090201-6a5c399u/logs/debug.log b/wandb/run-20241114_090201-6a5c399u/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..29a27c6f7b3e2acb721f46c139b225ec508026f6
--- /dev/null
+++ b/wandb/run-20241114_090201-6a5c399u/logs/debug.log
@@ -0,0 +1,33 @@
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Configure stats pid to 2573813
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'}
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug.log
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241114_090201-6a5c399u/logs/debug-internal.log
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:init():621] calling init triggers
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:init():671] starting backend
+2024-11-14 09:02:01,486 INFO    MainThread:2573813 [wandb_init.py:init():675] sending inform_init request
+2024-11-14 09:02:01,487 INFO    MainThread:2573813 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-14 09:02:01,488 INFO    MainThread:2573813 [wandb_init.py:init():688] backend started and connected
+2024-11-14 09:02:01,492 INFO    MainThread:2573813 [wandb_init.py:init():783] updated telemetry
+2024-11-14 09:02:01,509 INFO    MainThread:2573813 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-14 09:02:01,842 INFO    MainThread:2573813 [wandb_init.py:init():867] starting run threads in backend
+2024-11-14 09:02:01,965 INFO    MainThread:2573813 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-14 09:02:01,966 INFO    MainThread:2573813 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-14 09:02:01,966 INFO    MainThread:2573813 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-14 09:02:01,966 INFO    MainThread:2573813 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-14 09:02:01,968 INFO    MainThread:2573813 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-14 09:02:01,968 INFO    MainThread:2573813 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'reverse_full', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-14 21:53:58,405 INFO    MainThread:2573813 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-reverse/6a5c399u
+2024-11-14 21:53:58,429 INFO    MainThread:2573813 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-11-14 21:53:58,429 INFO    MainThread:2573813 [wandb_run.py:_restore():2408] restore
+2024-11-14 21:53:58,429 INFO    MainThread:2573813 [wandb_run.py:_restore():2414] restore done
+2024-11-14 21:54:00,466 INFO    MainThread:2573813 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-11-14 21:54:00,466 INFO    MainThread:2573813 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-11-14 21:54:00,529 INFO    MainThread:2573813 [wandb_run.py:_footer_sync_info():3934] logging synced files
diff --git a/wandb/run-20241116_005740-xzrt3mur/files/config.yaml b/wandb/run-20241116_005740-xzrt3mur/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..148d339cca30f16130a38470cf8f83ec34286da5
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/files/config.yaml
@@ -0,0 +1,50 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: shuffle_deterministic84
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241116_005740-xzrt3mur/files/output.log b/wandb/run-20241116_005740-xzrt3mur/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..bbbdce9efc36511659213433eeda5a62c2d8ca8f
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/files/output.log
@@ -0,0 +1,21 @@
+model.safetensors.index.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20.9k/20.9k [00:00<00:00, 6.15MB/s]
+model-00001-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.97G/4.97G [01:57<00:00, 42.2MB/s]
+model-00002-of-00002.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.46G/1.46G [00:34<00:00, 42.5MB/s]
+Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:32<00:00, 76.13s/it]
+Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.08s/it]
+Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17013/17013 [00:46<00:00, 362.58 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-16 01:01:06,549] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-16 01:01:12,683] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Emitting ninja build file /home/chunhui/.cache/torch_extensions/py39_cu117/cpu_adam/build.ninja...
+Building extension module cpu_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 6.444216966629028 seconds
diff --git a/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json b/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..475ade252b36f3a7600db203f506ffaaf4bd3396
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-16T05:57:40.644226Z",
+  "args":  [
+    "--perturbation",
+    "shuffle_deterministic84",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py",
+  "codePath":  "train/train_ftp.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_ftp.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1787098271744"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json b/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..1039a6654b28cb6be3fbe374ec28aed4bec8bde7
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":42763}}
\ No newline at end of file
diff --git a/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log b/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..dd9c789da92cbc791aad18b3afaffed125ac5e50
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log
@@ -0,0 +1,18 @@
+{"time":"2024-11-16T00:57:40.647062341-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-16T00:57:40.647076591-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug-core.log"}
+{"time":"2024-11-16T00:57:40.759097257-05:00","level":"INFO","msg":"created new stream","id":"xzrt3mur"}
+{"time":"2024-11-16T00:57:40.759169428-05:00","level":"INFO","msg":"stream: started","id":"xzrt3mur"}
+{"time":"2024-11-16T00:57:40.759194328-05:00","level":"INFO","msg":"sender: started","stream_id":"xzrt3mur"}
+{"time":"2024-11-16T00:57:40.759168828-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"xzrt3mur"}}
+{"time":"2024-11-16T00:57:40.759191958-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"xzrt3mur"}}
+{"time":"2024-11-16T00:57:40.947337481-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-16T11:20:49.097984756-05:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2024-11-16T12:50:24.495304118-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-16T12:50:24.691016764-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-16T12:50:25.291907192-05:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.434485007}],"total_operations":1}}
+{"time":"2024-11-16T12:50:27.435115904-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-16T12:50:28.779835771-05:00","level":"INFO","msg":"stream: closing","id":"xzrt3mur"}
+{"time":"2024-11-16T12:50:28.779862421-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"xzrt3mur"}}
+{"time":"2024-11-16T12:50:28.779884771-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"xzrt3mur"}}
+{"time":"2024-11-16T12:50:28.779924172-05:00","level":"INFO","msg":"sender: closed","stream_id":"xzrt3mur"}
+{"time":"2024-11-16T12:50:28.779995232-05:00","level":"INFO","msg":"stream: closed","id":"xzrt3mur"}
diff --git a/wandb/run-20241116_005740-xzrt3mur/logs/debug.log b/wandb/run-20241116_005740-xzrt3mur/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..8179713f1f89794a44b4895f4fce5a68c0b6c1d3
--- /dev/null
+++ b/wandb/run-20241116_005740-xzrt3mur/logs/debug.log
@@ -0,0 +1,33 @@
+2024-11-16 00:57:40,640 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-16 00:57:40,640 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Configure stats pid to 2657484
+2024-11-16 00:57:40,640 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-16 00:57:40,640 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'}
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug.log
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241116_005740-xzrt3mur/logs/debug-internal.log
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:init():621] calling init triggers
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:init():671] starting backend
+2024-11-16 00:57:40,641 INFO    MainThread:2657484 [wandb_init.py:init():675] sending inform_init request
+2024-11-16 00:57:40,643 INFO    MainThread:2657484 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-16 00:57:40,643 INFO    MainThread:2657484 [wandb_init.py:init():688] backend started and connected
+2024-11-16 00:57:40,646 INFO    MainThread:2657484 [wandb_init.py:init():783] updated telemetry
+2024-11-16 00:57:40,665 INFO    MainThread:2657484 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-16 00:57:40,943 INFO    MainThread:2657484 [wandb_init.py:init():867] starting run threads in backend
+2024-11-16 00:57:41,079 INFO    MainThread:2657484 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-16 00:57:41,079 INFO    MainThread:2657484 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-16 00:57:41,079 INFO    MainThread:2657484 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-16 00:57:41,079 INFO    MainThread:2657484 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-16 00:57:41,081 INFO    MainThread:2657484 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-16 00:57:41,082 INFO    MainThread:2657484 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_deterministic84', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-16 12:50:24,091 INFO    MainThread:2657484 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/xzrt3mur
+2024-11-16 12:50:24,199 INFO    MainThread:2657484 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-11-16 12:50:24,289 INFO    MainThread:2657484 [wandb_run.py:_restore():2408] restore
+2024-11-16 12:50:24,290 INFO    MainThread:2657484 [wandb_run.py:_restore():2414] restore done
+2024-11-16 12:50:28,579 INFO    MainThread:2657484 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-11-16 12:50:28,579 INFO    MainThread:2657484 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-11-16 12:50:28,736 INFO    MainThread:2657484 [wandb_run.py:_footer_sync_info():3934] logging synced files
diff --git a/wandb/run-20241119_135256-i48f8k8i/files/config.yaml b/wandb/run-20241119_135256-i48f8k8i/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2697b22f1922754efc3a49feb62c57c7fea4c220
--- /dev/null
+++ b/wandb/run-20241119_135256-i48f8k8i/files/config.yaml
@@ -0,0 +1,531 @@
+_name_or_path:
+    value: meta-llama/Llama-3.2-3B
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m:
+            - "1": train/epoch
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "1": eval/samples_per_second
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/steps_per_second
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/loss
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/grad_norm
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/loss
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": train/learning_rate
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+            - "1": eval/runtime
+              "5": 2
+              "6":
+                - 1
+                - 3
+              "7": []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 7
+                - 13
+                - 19
+                - 23
+                - 55
+                - 62
+                - 66
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "9":
+                "1": transformers_trainer
+            "12": 0.18.5
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adafactor:
+    value: false
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.999
+adam_epsilon:
+    value: 1e-08
+add_cross_attention:
+    value: false
+architectures:
+    value:
+        - LlamaForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+bad_words_ids:
+    value: null
+batch_eval_metrics:
+    value: false
+batch_size:
+    value: 3
+begin_suppress_tokens:
+    value: null
+bf16:
+    value: false
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: 128000
+chunk_size_feed_forward:
+    value: 0
+cross_attention_hidden_size:
+    value: null
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 1800
+debug:
+    value: []
+decoder_start_token_id:
+    value: null
+deepspeed:
+    value: deepspeed_config/train_dp_config.json
+disable_tqdm:
+    value: false
+dispatch_batches:
+    value: null
+diversity_penalty:
+    value: 0
+do_eval:
+    value: true
+do_predict:
+    value: false
+do_sample:
+    value: false
+do_train:
+    value: false
+early_stopping:
+    value: false
+encoder_no_repeat_ngram_size:
+    value: 0
+eos_token_id:
+    value: 128001
+epoch:
+    value: 3
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: 10
+eval_strategy:
+    value: steps
+eval_use_gather_object:
+    value: false
+evaluation_strategy:
+    value: steps
+exponential_decay_length_penalty:
+    value: null
+finetuning_task:
+    value: null
+forced_bos_token_id:
+    value: null
+forced_eos_token_id:
+    value: null
+fp16:
+    value: true
+fp16_backend:
+    value: auto
+fp16_full_eval:
+    value: false
+fp16_opt_level:
+    value: O1
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+fsdp_min_num_params:
+    value: 0
+fsdp_transformer_layer_cls_to_wrap:
+    value: null
+full_determinism:
+    value: false
+gradient_accumulation_steps:
+    value: 2
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+half_precision_backend:
+    value: auto
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 3072
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: false
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_inputs_for_metrics:
+    value: false
+include_num_input_tokens_seen:
+    value: false
+include_tokens_per_second:
+    value: false
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 8192
+is_decoder:
+    value: false
+is_encoder_decoder:
+    value: false
+jit_mode_eval:
+    value: false
+label_names:
+    value: null
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+learning_rate:
+    value: 5e-06
+length_column_name:
+    value: length
+length_penalty:
+    value: 1
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: 0
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: ./logs
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr:
+    value: 5e-06
+lr_scheduler_type:
+    value: linear
+max_grad_norm:
+    value: 1
+max_length:
+    value: 20
+max_position_embeddings:
+    value: 131072
+max_steps:
+    value: -1
+metric_for_best_model:
+    value: null
+min_length:
+    value: 0
+mlp_bias:
+    value: false
+model/num_parameters:
+    value: 3212749824
+model_type:
+    value: llama
+mp_parameters:
+    value: ""
+neftune_noise_alpha:
+    value: null
+no_cuda:
+    value: false
+no_repeat_ngram_size:
+    value: 0
+num_attention_heads:
+    value: 24
+num_beam_groups:
+    value: 1
+num_beams:
+    value: 1
+num_hidden_layers:
+    value: 28
+num_key_value_heads:
+    value: 8
+num_return_sequences:
+    value: 1
+num_train_epochs:
+    value: 3
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: ./checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs
+output_hidden_states:
+    value: false
+output_scores:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: null
+past_index:
+    value: -1
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 3
+per_gpu_eval_batch_size:
+    value: null
+per_gpu_train_batch_size:
+    value: null
+perturbation:
+    value: shuffle_local10
+prediction_loss_only:
+    value: false
+prefix:
+    value: null
+pretraining_tp:
+    value: 1
+problem_type:
+    value: null
+push_to_hub:
+    value: false
+push_to_hub_model_id:
+    value: null
+push_to_hub_organization:
+    value: null
+push_to_hub_token:
+    value: <PUSH_TO_HUB_TOKEN>
+ray_scope:
+    value: last
+remove_invalid_values:
+    value: false
+remove_unused_columns:
+    value: true
+repetition_penalty:
+    value: 1
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+return_dict_in_generate:
+    value: false
+rms_norm_eps:
+    value: 1e-05
+rope_scaling:
+    value:
+        factor: 32
+        high_freq_factor: 4
+        low_freq_factor: 1
+        original_max_position_embeddings: 8192
+        rope_type: llama3
+rope_theta:
+    value: 500000
+run_name:
+    value: ./checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs
+save_on_each_node:
+    value: false
+save_only_model:
+    value: false
+save_safetensors:
+    value: true
+save_steps:
+    value: 100
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 0
+sep_token_id:
+    value: null
+skip_memory_metrics:
+    value: true
+split_batches:
+    value: null
+suppress_tokens:
+    value: null
+task_specific_params:
+    value: null
+temperature:
+    value: 1
+tf_legacy_loss:
+    value: false
+tf32:
+    value: null
+tie_encoder_decoder:
+    value: false
+tie_word_embeddings:
+    value: true
+tokenizer_class:
+    value: null
+top_k:
+    value: 50
+top_p:
+    value: 1
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_dtype:
+    value: bfloat16
+torch_empty_cache_steps:
+    value: null
+torchdynamo:
+    value: null
+torchscript:
+    value: false
+tpu_metrics_debug:
+    value: false
+tpu_num_cores:
+    value: null
+train_set:
+    value: 10M
+transformers_version:
+    value: 4.45.1
+typical_p:
+    value: 1
+use_bfloat16:
+    value: false
+use_cache:
+    value: true
+use_cpu:
+    value: false
+use_ipex:
+    value: false
+use_legacy_prediction_loop:
+    value: false
+use_liger_kernel:
+    value: false
+use_mps_device:
+    value: false
+vocab_size:
+    value: 128256
+warmup_ratio:
+    value: 0.1
+warmup_steps:
+    value: 0
+weight_decay:
+    value: 0
diff --git a/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json b/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c61b3a89be0969169461b29e05eebf1bb0a0a96
--- /dev/null
+++ b/wandb/run-20241119_135256-i48f8k8i/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":42875},"_step":3009,"train/global_step":2736,"eval/samples_per_second":26.579,"train_steps_per_second":0.064,"train/learning_rate":3.6555645816409427e-08,"train_runtime":42556.4051,"train/epoch":2.9983561643835617,"eval/runtime":37.6238,"train/grad_norm":2.390791416168213,"eval/loss":1.7726256847381592,"eval/steps_per_second":1.116,"total_flos":8.528958544014213e+17,"train_samples_per_second":1.158,"_timestamp":1.732085251819263e+09,"train/loss":1.5547,"train_loss":1.673318871542027,"_runtime":42874.926636984}
\ No newline at end of file
diff --git a/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log b/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..3156e143d1bd2b974261fcc2ee7c568cd27bdb60
--- /dev/null
+++ b/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log
@@ -0,0 +1,18 @@
+{"time":"2024-11-19T13:52:56.896325342-05:00","level":"INFO","msg":"using version","core version":"0.18.5"}
+{"time":"2024-11-19T13:52:56.896346472-05:00","level":"INFO","msg":"created symlink","path":"/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug-core.log"}
+{"time":"2024-11-19T13:52:57.008353535-05:00","level":"INFO","msg":"created new stream","id":"i48f8k8i"}
+{"time":"2024-11-19T13:52:57.008423955-05:00","level":"INFO","msg":"stream: started","id":"i48f8k8i"}
+{"time":"2024-11-19T13:52:57.008468935-05:00","level":"INFO","msg":"sender: started","stream_id":"i48f8k8i"}
+{"time":"2024-11-19T13:52:57.008462395-05:00","level":"INFO","msg":"handler: started","stream_id":{"value":"i48f8k8i"}}
+{"time":"2024-11-19T13:52:57.008440745-05:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"i48f8k8i"}}
+{"time":"2024-11-19T13:52:57.256723715-05:00","level":"INFO","msg":"Starting system monitor"}
+{"time":"2024-11-19T14:26:42.599960136-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/yaning1001-dartmouth-college/exp-impo-shuffle/i48f8k8i/file_stream"}
+{"time":"2024-11-20T01:47:31.968427857-05:00","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2024-11-20T01:47:31.978326214-05:00","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2024-11-20T01:47:32.609111028-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2024-11-20T01:47:32.771175491-05:00","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2024-11-20T01:47:33.880918495-05:00","level":"INFO","msg":"stream: closing","id":"i48f8k8i"}
+{"time":"2024-11-20T01:47:33.880953885-05:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"i48f8k8i"}}
+{"time":"2024-11-20T01:47:33.880983775-05:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"i48f8k8i"}}
+{"time":"2024-11-20T01:47:33.881025656-05:00","level":"INFO","msg":"sender: closed","stream_id":"i48f8k8i"}
+{"time":"2024-11-20T01:47:33.881098396-05:00","level":"INFO","msg":"stream: closed","id":"i48f8k8i"}
diff --git a/wandb/run-20241119_135256-i48f8k8i/logs/debug.log b/wandb/run-20241119_135256-i48f8k8i/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..55a55a989b74da64647f1b55754e0f060dd3f069
--- /dev/null
+++ b/wandb/run-20241119_135256-i48f8k8i/logs/debug.log
@@ -0,0 +1,36 @@
+2024-11-19 13:52:56,889 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Current SDK version is 0.18.5
+2024-11-19 13:52:56,889 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Configure stats pid to 2719620
+2024-11-19 13:52:56,889 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from /home/chunhui/.config/wandb/settings
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/settings
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Loading settings from environment variables: {}
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': None, '_disable_service': None}
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'train/train_ftp.py', 'program_abspath': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py', 'program': '/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_ftp.py'}
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_setup.py:_flush():79] Applying login settings: {}
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:_log_setup():534] Logging user logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug.log
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:_log_setup():535] Logging internal logs to /mnt/ssd3/chunhui/yaning/project/impossible_llm/train/wandb/run-20241119_135256-i48f8k8i/logs/debug-internal.log
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:init():621] calling init triggers
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:init():628] wandb.init called with sweep_config: {}
+config: {}
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:init():671] starting backend
+2024-11-19 13:52:56,890 INFO    MainThread:2719620 [wandb_init.py:init():675] sending inform_init request
+2024-11-19 13:52:56,892 INFO    MainThread:2719620 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2024-11-19 13:52:56,892 INFO    MainThread:2719620 [wandb_init.py:init():688] backend started and connected
+2024-11-19 13:52:56,896 INFO    MainThread:2719620 [wandb_init.py:init():783] updated telemetry
+2024-11-19 13:52:56,926 INFO    MainThread:2719620 [wandb_init.py:init():816] communicating run to backend with 90.0 second timeout
+2024-11-19 13:52:57,253 INFO    MainThread:2719620 [wandb_init.py:init():867] starting run threads in backend
+2024-11-19 13:52:57,358 INFO    MainThread:2719620 [wandb_run.py:_console_start():2463] atexit reg
+2024-11-19 13:52:57,358 INFO    MainThread:2719620 [wandb_run.py:_redirect():2311] redirect: wrap_raw
+2024-11-19 13:52:57,358 INFO    MainThread:2719620 [wandb_run.py:_redirect():2376] Wrapping output streams.
+2024-11-19 13:52:57,358 INFO    MainThread:2719620 [wandb_run.py:_redirect():2401] Redirects installed.
+2024-11-19 13:52:57,359 INFO    MainThread:2719620 [wandb_init.py:init():911] run started, returning control to user process
+2024-11-19 13:52:57,360 INFO    MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb None None {'perturbation': 'shuffle_local10', 'train_set': '10M', 'batch_size': 3, 'epoch': 3, 'seed': 0, 'lr': 5e-06}
+2024-11-19 13:58:15,416 INFO    MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb None None {'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 28, 'num_attention_heads': 24, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': 128001, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-3B', 'transformers_version': '4.45.1', 'model_type': 'llama', 'output_dir': './checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 3, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 2, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './logs', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 0, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': False, 'fp16': True, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './checkpoints/Llama-3.2-3B-FTP/babylm_shuffle_local10_10M_seed0/runs', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': 'deepspeed_config/train_dp_config.json', 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': 'steps', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False}
+2024-11-19 13:58:15,418 INFO    MainThread:2719620 [wandb_config.py:__setitem__():154] config set model/num_parameters = 3212749824 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f422fbdc0a0>>
+2024-11-19 13:58:15,418 INFO    MainThread:2719620 [wandb_run.py:_config_callback():1390] config_cb model/num_parameters 3212749824 None
+2024-11-20 01:47:31,947 INFO    MainThread:2719620 [wandb_run.py:_finish():2158] finishing run yaning1001-dartmouth-college/exp-impo-shuffle/i48f8k8i
+2024-11-20 01:47:31,967 INFO    MainThread:2719620 [wandb_run.py:_atexit_cleanup():2426] got exitcode: 0
+2024-11-20 01:47:31,967 INFO    MainThread:2719620 [wandb_run.py:_restore():2408] restore
+2024-11-20 01:47:31,968 INFO    MainThread:2719620 [wandb_run.py:_restore():2414] restore done
+2024-11-20 01:47:33,869 INFO    MainThread:2719620 [wandb_run.py:_footer_history_summary_info():3975] rendering history
+2024-11-20 01:47:33,870 INFO    MainThread:2719620 [wandb_run.py:_footer_history_summary_info():4007] rendering summary
+2024-11-20 01:47:33,880 INFO    MainThread:2719620 [wandb_run.py:_footer_sync_info():3934] logging synced files
diff --git a/wandb/run-20241128_161554-907lsb28/files/config.yaml b/wandb/run-20241128_161554-907lsb28/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8d07c4054eb6cea6e4e93d3fb630156b4b6a6798
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/files/config.yaml
@@ -0,0 +1,50 @@
+_wandb:
+    value:
+        cli_version: 0.18.5
+        m: []
+        python_version: 3.9.19
+        t:
+            "1":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "2":
+                - 1
+                - 5
+                - 11
+                - 49
+                - 51
+                - 53
+                - 55
+                - 71
+                - 98
+            "3":
+                - 2
+                - 13
+                - 23
+                - 55
+            "4": 3.9.19
+            "5": 0.18.5
+            "6": 4.45.1
+            "8":
+                - 5
+            "12": 0.18.5
+            "13": linux-x86_64
+batch_size:
+    value: 3
+epoch:
+    value: 3
+lr:
+    value: 5e-06
+perturbation:
+    value: reverse_control
+seed:
+    value: 0
+train_set:
+    value: 10M
diff --git a/wandb/run-20241128_161554-907lsb28/files/output.log b/wandb/run-20241128_161554-907lsb28/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..c1baedaa5b22a4efcb2d4e2039878d4396e90496
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/files/output.log
@@ -0,0 +1,14 @@
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17519/17519 [00:48<00:00, 362.25 examples/s]
+Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18140/18140 [00:49<00:00, 364.14 examples/s]
+tokenized_valid: Dataset({
+    features: ['input_ids', 'attention_mask'],
+    num_rows: 1000
+})
+/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/lib/python3.9/site-packages/transformers/training_args.py:1545: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
+  warnings.warn(
+[2024-11-28 16:18:35,555] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-11-28 16:18:41,080] [INFO] [comm.py:652:init_distributed] cdb=None
+Installed CUDA version 11.8 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
+Using /home/chunhui/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
+Loading extension module cpu_adam...
+Time to load cpu_adam op: 28.010013341903687 seconds
diff --git a/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json b/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d4ed4c529877634500143f7e6f6bc93590246ad
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/files/wandb-metadata.json
@@ -0,0 +1,97 @@
+{
+  "os":  "Linux-5.4.0-162-generic-x86_64-with-glibc2.31",
+  "python":  "3.9.19",
+  "startedAt":  "2024-11-28T21:15:54.211208Z",
+  "args":  [
+    "--perturbation",
+    "reverse_control",
+    "--train_set",
+    "10M",
+    "--batch_size",
+    "3",
+    "--epoch",
+    "3",
+    "--seed",
+    "0"
+  ],
+  "program":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train/train_llama_1B.py",
+  "codePath":  "train/train_llama_1B.py",
+  "git":  {
+    "remote":  "git@hf.co:Yaning1001/Impossible_llm.git",
+    "commit":  "ed716cdcfcdea02b67f7ed0f3504c2b1c8b737c4"
+  },
+  "email":  "yaning1001@gmail.com",
+  "root":  "/mnt/ssd3/chunhui/yaning/project/impossible_llm/train",
+  "host":  "mms-large-2",
+  "username":  "chunhui",
+  "executable":  "/mnt/ssd3/chunhui/miniconda/envs/impossible_llm/bin/python",
+  "codePathLocal":  "train_llama_1B.py",
+  "cpu_count":  32,
+  "cpu_count_logical":  64,
+  "gpu":  "NVIDIA RTX A6000",
+  "gpu_count":  8,
+  "disk":  {
+    "/":  {
+      "total":  "1888559353856",
+      "used":  "1723122159616"
+    }
+  },
+  "memory":  {
+    "total":  "202617098240"
+  },
+  "cpu":  {
+    "count":  32,
+    "countLogical":  64
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    },
+    {
+      "name":  "NVIDIA RTX A6000",
+      "memoryTotal":  "51527024640",
+      "cudaCores":  10752,
+      "architecture":  "Ampere"
+    }
+  ],
+  "cudaVersion":  "11.8"
+}
\ No newline at end of file
diff --git a/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json b/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f6aab003b4b8541a72d07e8d6bea8280b1dfec1
--- /dev/null
+++ b/wandb/run-20241128_161554-907lsb28/files/wandb-summary.json
@@ -0,0 +1 @@
+{"_wandb":{"runtime":53001}}
\ No newline at end of file