diff --git a/.gitattributes b/.gitattributes index 8159f434d185a74a7c644036038fb4cb7e0b4653..f23a47b49f720ee4b287d9ca2167aa837325feb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -109,3 +109,6 @@ train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint- train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1500/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1500/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-750/model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-150/model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text +train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..6322735693d8737c4acee67dae7cc87eb41b2fa8 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c87508c63b8f72b11057081ab3386857da4630cfce970b64edf6631a5843f73a +size 20738833 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..4654f6395e447f5608fc6ab92296dda69ba826ab --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db0f4def8606febfcd5d2fc4cbd94ecb3a83ecaa5c6fac787332e689c04e8207 +size 10344982 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..678f23fa6faa9716b733a76cbb2eee0fcc9c359f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53dea73f4c03f44ede7de69d72a1d2009dc4c6a0a1a5f70825f7bd2673f53119 +size 900008 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..944204b5c03b770f2121a22bf9a341fdb2c0e3d9 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6880fe2c3b3f907cd2f0ab1640a4b660a1d1a300d9162987fe12ab055dd3622 +size 7261687 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..37d183567ea1a3fcddaeb05f6766183b8eb6963e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53e2ff95011848bdc4005561cc2a9dfe66345e251f2e304fcb5480b21bc1d7e6 +size 20182660 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..f88cc66dbad25ee89d48ed9b80bcb43acf2c66b6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:642090437ebe3d53342ddb073be5b18858a8884c5656392a6fa0b898434174c1 +size 18046847 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..fb393d7f34f2688ab890809d05f8fbfe4084e5d8 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf0d1cc1b9243ca5273d0fdc9bc070a8c423f310fdd22ac2a0eb07c43fe4e22 +size 14219956 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..1bb90615c3c22dd48318f09be5b0ccb66917125c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de11012e2af2eb1e1d9a964cae140e009333f8e0490dd7cda4362163d1258218 +size 9949452 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..309ca5466c5a4efaeff47894310d6f033f723189 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d4cf5e7608002b6712881194b6ae72d26edd50e086bd21e726d8f92508ae87 +size 969814 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/bnc_spoken_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/bnc_spoken_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..f3ed9eeb0d22c95fc93e9bd9c1345d23641d91d7 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/bnc_spoken_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb1eba6d158ddd0bfac1380397c69dd6b22ba3d4bc6a416adf2ba7785d2b042 +size 2444742 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/childes_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/childes_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..a63482cf911dbba344ea0ebf7917706263616f87 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/childes_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64c1a13da60782ef82919dae3ade1017581f7f3d4a0ebac23dfec5412db3b29 +size 13943080 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/gutenberg_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/gutenberg_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..ce57901a4352b9ee8332ef73307ae3060d16b2a4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/gutenberg_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43e12333c6cb1d29e7d4989de1f15e1ef7345849c20149266c6b753711914c9 +size 11577187 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/open_subtitles_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/open_subtitles_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..0e571d6fcbf576ad57b8ac3a36e97726c8d3274c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/open_subtitles_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c8699bfcf94704d6a6a8fd5a945df2b831324460e9afc320ad267d78254a1f +size 9343347 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/simple_wiki_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/simple_wiki_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..f6a1766b053f5905875ad469680e519bea6e6da6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/simple_wiki_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939b6252cebc2e2c7e6e2a269b0ebc23878bfcb598f628803b1261ae638a3c8f +size 4692459 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/switchboard_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/switchboard_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..03c9df4244b9908b6a75f1e0892583fe06ad5e7c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_control/babylm_test_unaffected/switchboard_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7dd8fdd488b0d1c190a256047ef0d6640fe83cc388582c8672167731a483b4 +size 489742 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..2ca5f3002c2a4341f7089dc3290bc865fd152f9e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63703c90273cdbcb545a2e66d526171c2e27ba32895dd5838996845a81c16b47 +size 4826577 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..57251b9cad9d3f6b7dade0279a5ced3dcd3d290a --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c36215eb7e8625d96b09f2b2b6c77498a14ac29368fd7321562b7c3f159022 +size 20314003 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..584ba06a870493e8f0878e87b37c3b8290a33781 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7f47f866353170ae87a8d8dfa3a46e797ab05748bdc23873facd2cd9238e79 +size 16123058 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..46c6775b55e0ca4bfcdebe137ac0d30fbde9461b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9511bfdeacbb950164d067a58f7c35c0baa1b88d51248415ac2cb86009884dc4 +size 12849902 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..591ea586e61020084a931308378f4f9e02f7c9d7 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4cce0cae8f2d0285821ac98526bc7096bb1b1abac0eee368b99355af218936 +size 10048203 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..705ec384e99a7e1ce975593a99cfc655ffd1d06f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b0dc760c3a2fa4e39096e68b926f5072c4a756ed09790cc82ef183df0f8a58 +size 872953 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..07404f5d9283ca1205f554abe68d3cc535d6aaed --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1b923209e36fa0dbdb48260bcb97ad8181bbb7bb6bf6f8b194f231db35fa9f +size 6982835 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..97833f38959f01909570c9766e6244e6e146ff5e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0004528c23a6a9a04648e65aedd1dcf3893fdf5d1f1c29b0ebb1aff937d9933a +size 19754589 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..48d3e4c766b8497d4eb661956dc8dcc6e536768e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4a10692845499613f66c4bf8190e6f6bff1c8179871c2df5956d1324925e50 +size 17782170 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..34695f22eac8f5f38e3f8d13137632a9f8a97e8c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0e757d46d5d57a415a8f63d1f9b25600617daed87877ac6f576d19285a51f3 +size 13916387 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..7359c09e7cdd57593ac215b7f24246bfd0d2c207 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409f76cc21495f9fa0b9298108511d3ac37b92d30274d0ccd13232cd33069302 +size 9658196 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..9f0d16314f89866953f65f15ab32ed4d9bc85241 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ee9293f37e1b323ba6b20c277e40e93673cb23b51e959c12f24b955ab35959 +size 937530 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..e12207b76294023e848915abf5e42a687c104f26 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e83138bbf7edfc57deff9f466fe8d3128e9705243d3a681a1d261f8e02e5749f +size 2432046 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..9bdda69c785e5d91e51bd42e1fc7d9c7b3cdf503 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9dd97dbcc8774050941f5903273524c4599cfa053304365050f9d99d57cdc54 +size 4631468 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..74e0c8ff302a53b6b9d2d60542d5e0841f718f31 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d46015030a08675e785be2056afc22ac1f592c8c3598d4302cd258a02adc619 +size 3520158 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..2e9798fe7eeb5feed1f58c5e7caa6a1b1b178ee5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a690bad0c874c2f167c16067e2d32aaf9224aeb4e0db5b7a42f0536e34faddb7 +size 2917989 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..280725848be36d8f11bc91eedad6c05ac90364f4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70a0a6a01d1ea39dd942794982f005ead2a011bc158d77c4234c4c69876df23 +size 4309876 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..8d516040c18cde426c74f4d32746d4176be3b7e5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbd47f3a8e97f6f4be759eafefab4a0b863e4a25dc67a04c0fe7c7bcc7b4207 +size 495231 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/bnc_spoken_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/bnc_spoken_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..f3ed9eeb0d22c95fc93e9bd9c1345d23641d91d7 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/bnc_spoken_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb1eba6d158ddd0bfac1380397c69dd6b22ba3d4bc6a416adf2ba7785d2b042 +size 2444742 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/childes_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/childes_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..a63482cf911dbba344ea0ebf7917706263616f87 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/childes_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64c1a13da60782ef82919dae3ade1017581f7f3d4a0ebac23dfec5412db3b29 +size 13943080 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/gutenberg_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/gutenberg_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..ce57901a4352b9ee8332ef73307ae3060d16b2a4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/gutenberg_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43e12333c6cb1d29e7d4989de1f15e1ef7345849c20149266c6b753711914c9 +size 11577187 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/open_subtitles_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/open_subtitles_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..0e571d6fcbf576ad57b8ac3a36e97726c8d3274c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/open_subtitles_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55c8699bfcf94704d6a6a8fd5a945df2b831324460e9afc320ad267d78254a1f +size 9343347 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/simple_wiki_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/simple_wiki_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..f6a1766b053f5905875ad469680e519bea6e6da6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/simple_wiki_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939b6252cebc2e2c7e6e2a269b0ebc23878bfcb598f628803b1261ae638a3c8f +size 4692459 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/switchboard_unaffected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/switchboard_unaffected.test new file mode 100644 index 0000000000000000000000000000000000000000..03c9df4244b9908b6a75f1e0892583fe06ad5e7c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected/switchboard_unaffected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7dd8fdd488b0d1c190a256047ef0d6640fe83cc388582c8672167731a483b4 +size 489742 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..2c10b84555b658c267de5fc171dab184e1ed6c93 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/bnc_spoken_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a52471e389f1b9e0ee8e9e81d6f07024a9d0e2bef64b3360bd3eea57b9af0d +size 1936645 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/childes_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/childes_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..7990fc3cd75af52ea7aa3fc7bbc4402ec388c598 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/childes_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e32ba7d98ae9ebefe7f25ba8c14f5ef9c4ea2e18423062253953b7a229f30689 +size 9069635 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..a7cc88918c20d5e09f0eb2ff20624f703dd5a695 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/gutenberg_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff490ad471e7a632f9cce0a79216fab698424b37322073f1d8f9193cec67d61 +size 9908252 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..29a3c997586185972e9fdd3bb8153173a0adff69 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/open_subtitles_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a36df78972cda2a89ce70d2b159b073c9c2cf114e173d9623df3ba1704c0ed62 +size 6614054 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..4d877355edf03d06b1bf5e058071bc2783a94f5f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/simple_wiki_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6656ee5ed666a5879baab086ce842f7162195959330aee09327081b7f54946 +size 3774618 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/switchboard_unaffected_sents.test b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/switchboard_unaffected_sents.test new file mode 100644 index 0000000000000000000000000000000000000000..e0e817412af83f8ec37fd5bd24b613cccca90123 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_hop_words4/babylm_test_unaffected_sents/switchboard_unaffected_sents.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499ba9c7b67d50543f5c7b6977cc7c18ac5da7ef0cc28e9cbeb1816d45038f40 +size 337196 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..7d0159e399ddb1c330166d964c073782e5542fb3 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f3f9361d1fbfb3c54fd03ad0920c2ea56129154a139536f2bd1c9bc386433e +size 6031633 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..031e2a67cc23f25ef819acdb76fbd10a3a80a7e0 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306886f267a45e539626d0f443de14e47857dffbc1c49af2952f695be65abe94 +size 27744065 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..bc883bcc61798e836cecb671e0d170933ba8aea4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f66c8ffbaf735257546581779a045e158d0eb1acce94e1dde82f3a40201ad9 +size 17454115 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..687d849b708c519d915d6accd744016c628ffd86 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65a0ed9f03820ea0094c0907b92158f8dc93be2a2366f78fca32b0f518cc1532 +size 16642763 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..0e0dbff81f7777c75af9ba9785fa7b6fca03be5b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a850d8105b195ae44ef995c19935b293c1f6e656092b939a6653330df4e029a7 +size 10895427 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..a43e95e443bcf2fc68624273583eb7f935ed1869 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3806c15e01d35f8a24490cfe806dad225c805350b1dacd58a8cdb3552f9ad20a +size 1036543 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..1dc621a8636170f1207df6abb980fdf46b9f08c8 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6953bd68db1bcb94de2d0ea0f54ac03ea7d45c616a17ea9a32fe338973b4fdd +size 8429897 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..d204d4f4d30edf979d700ce7446d9c44953dc47f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4768ccf94a28479b0260aaa03d77f253993377dcb254ecaaeddddf116efeb46e +size 26156290 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..b6828a37e2df2fd5c699788afaf2030f7e57135d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60d549c161cf217d8c690259ad4fe8b21e872b19a51790fa608a0de0c9388402 +size 19052763 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..8677b2b547404e7074ba32e9292a3256c8702b00 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dea88c756f5a493984d4ab0f89fb3e745a8c11b7c4bebbdc2cefcff15a5b094a +size 17470300 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..c8bd4d953e7399fff36e6154c7be99b7cf22d269 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59e2840fedc1df35d613802eef9ebc78c610d2ced18d07cdaef9ec50bb2643d +size 10488191 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..02bab549cb4b13e15d14c9cc53a81a6895a70a87 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06df6c0523c28217c76d24d01e93bc6cbe5bfdb237ac7016e97313d615e4b770 +size 1060762 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..1a87e64aaf884b3355e71a7fbcb9a28470ed8714 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf13c3e47d8e089a0750e582834eab118de3617b2dce28193ee67a4d6a0df06 +size 6094837 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..819668c84cb2eacfd8290e340d7b058462d5aee7 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af465b57be513be8baa365a3401153feb12247ea96822dfd2977507069f7775b +size 25972079 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..d686e4fbdd2c0055cd088dc86e7ef35e1f932d49 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa3d79bbdfafa4f5a344578401040708e12f870b5af943bab459ea7ca97d6a0c +size 16240650 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..cd686f4ac24e524cf5d27958ab3fe550129b97e6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e69b3b8f320cff9f67abc35cb59a28d41a0ae8372979fa814d651b1a62b54b4 +size 15905914 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..916454a4a758210ade9977a9aadaa857b003d65c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73cf711b33085aba57c654c9c6c48a8f70baf6a85e812f5771c2017563fa79fd +size 9818194 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..9645e12556f2d8a12107974bd2cf3a1875699ac7 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_reverse_control/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7da756acf98d137ce3fddb708f4178da37b24d1ef6513c1e4376c9f47d43659 +size 1169117 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..c08d6902261044c3047afc22aa5e33762f262c9f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1f2e020c2f3b576acd74212cf03e7ff1538e990b3320fe6e735eb2a02c5fbd5 +size 5509282 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..d41c4a0071df79ab35d1336c863b60e544a29aff --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7467401b7e8ec6b4a7696c8aa3bea4b4e4b45343f37b63465173b407134506e9 +size 24490419 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..a8941c72e88481d43a7acee4d016f377350f8db3 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b30f7ff9cdd69b95edefe489566bd712342a894220b2b8be27aad691a7a20d5 +size 16364206 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..b5c0eab87473872f3d9d6a9733166ab36cb0aa57 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6b94a21b145050dbb648cc4726fd4f56d248c374091048712204b69ed0eff5 +size 14492108 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..0eaaff7d1385d81d412cc2ef43924b0f52b96498 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ae4ac60df87db4d5b98c1f578e04814dd87bda59ba1c6dd91b205018fe6e80 +size 10199712 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..007b57de63225cb8686347bac60137b3f5a30202 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28cfa2c7b178d42e6a97db0ae816ab8105e5ea4eb4f3604d1a1f2c9d71d48872 +size 961566 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..c219a6ce51f4a7b9f703238f96ef98c152977b52 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3c89e160e2b1497eab193067e21ba7f3414d998801b01c3a3272ff6a73a9e60 +size 7706841 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..10831bfb15aca135a80bf56355bf776022ef317a --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad565411527ae77e37e9d9602c7e642ceb5b77e1427b3099b43612754cbcb78f +size 15254169 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..325640c4ae027baaf56bba2f526245087fbe7cc4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9722c922c0d83c0283c0dd64c345dbb19a5d2e7c052debfcdfdeb473783cc5c +size 988060 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..1b332929365affe86ab003fae38fe0792fc70d5c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ba99f5b084d0c980dab94caff6301e6006d974930ba2fe9fe88561d74dc79b +size 5546587 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..2bf5d0845b67486a8a72c3da2b285dd4c66d55e4 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d932657dda267b8a6de54ace2661087081c511bbb52a0beebb60f65d696f70fe +size 22923367 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..e9a7bf2e60b1e662bbba2eba3b4fb02c326d2d4e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddcc1007238d39894d7858b4d4b5b4318ccb3c50d6cd43f32ca2921730ad6b4e +size 15240278 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..6875b6cc603241a45bb5e2d95594094e8938f60b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f7c793ea84cac7df48693400d22240818f18629d184a92b4ebdfea68dd76f3c +size 13869412 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..707f8335f9a0c1b62451b105b9757dd93efd83b5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ffbe7b319051486071c8ee553799ad86f9978c497c2926384c8ee6ebaa4fb5 +size 9142615 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..af19f78865387e5676a8bbd0801c1ca7e0b18224 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic21/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a591d630a481e37502cc0cf7ec9ab20b343a4dde306e23ecea50740874205d49 +size 1087539 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..0675928bec0516426a3e4ed1b47ee33508aec3cb --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b35bf64b862a415f2b568bdc3436d3e12867f648c58f3defee4f76a0cd910a9 +size 5509282 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..d2a8ca3c416285999d7ba3659846a2d97120dddb --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59756693057ff9ef15f0233414fbb7f0cc2bf8c31bcca604deb18337808ac5ce +size 24490419 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..77168ede0c3f607f51b429baaf16ac9078fa51d6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ba8bf0227f754809de2c52adeaf9c415b7a8504935ecf3889b0d7ba60f5ef05 +size 16364206 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..7fdde5307d97b4e82b5a3f330c0da8d4a8b5b4ea --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17af50252818161ce363c511007d85e3005a3eb9a26b9a519d338a909df2dfad +size 14492108 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..180c6022cb0348241ea7c94fb04f29912bbd395d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e39f508d256cb05322b36b49b75893f515146b446be8afd647efcb62751a8f +size 10199712 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..8cea701c78aaa560a53b1edc8700e39737ba327b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81c240d2e9ad3292b15e106a54cce4f1788a66e16a6c5c6672222940b6fda6b6 +size 961566 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..6cfe15212e1d9cccf4638eec3d50565c96628dec --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d8ca47eb14ecde2af5bcf2114477056d608f8a328065eaabe9db930a518eab +size 7706841 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..ef733704cb598d904e3a390efa767dd6e86c7d8b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:209fc73494686fa7e79cfe5b51614bcda4720d43ab6800fc0996fea5d45e9403 +size 23046068 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..45eff0eae41fde13f36f495c963f73874a417608 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c37d0d8b902ae10e974df940956aa66a6c4836b42f702c75914f59060685b5 +size 17909917 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..8219d85c842addc5e304d22b98fd31ad06933c91 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e40b05f726b2216b679f310081ecb5bcebde5108ce6783b5a1607c1f581d9c4d +size 15254169 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..785b9a7b02afb0a002e85019d1efdf0a12d250a1 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5accc6ca1391a706cfc54a12a305fe43da40b78ba20c416e7ca711b8ccf5c7c2 +size 9832550 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..2da5f15df69ddb780c536cde55b33556e85949ca --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f9e816f7d587b8e7a5ccdaa1e245b4eb419ff29e9f2555943c3344211b7a90d +size 988060 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..a41a0735315993dd21e0f54e92b9811e240c9a16 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae1952db0aba0d5fa599de67f37ce7f21a84e92042b46cded101e3ffc2ea4f1f +size 5546587 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..1ed0d0cd3fbd0f72d7472c63a791d98806c72591 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8989c51ea9e3047d12f0e25c1c180f05aacbef065acb0fc854d89f2b60bf47f8 +size 22923367 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..d5af960c45bbfe0acef951b3269e3d0a77b2caf5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f9ae6f04ee398fe071b54a7f27830699b3e58e25ff9d87ea7f1603fbf319326 +size 15240278 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..0a8ffcadb706f997d381815970055e1280f53bd9 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6e3f5d4dfe0439c709c84495336e1ae5b61fcbbfd49a1e49f336133530f92c +size 13869412 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..c5bbefb477287b56d538344e4bb6bfcc14fb38d5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a03ae4860fe48b63f59abb37c5d18e2f8b3c1d15d226482eafca765ad6e3472 +size 9142615 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..47d40ebfad701f219667e00826695f25df3fcdfa --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic57/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9158f62dba1ce72bcdfdc3b6158a4cf4ba4a5cb00e0d9a6546f32aa561376b93 +size 1087539 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..c16372e340292871fe54e848552774fe858c6d45 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ec322ded30cf669c1c640af0fbea47a6e574e9249ed53db9b7315f059a0348 +size 5509282 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..616469b9a58b26321f49a026df9d57cb6931599f --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c6ac4b91c67aad8941fd0b715002e10ace6ca3e9b3356d6890e4b10084f37b7 +size 24490419 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..99e81736c603ce9cb73a97a4056df33a7c6a7f5c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b90482da8f1031aa65a4c1b768bcf9c450904d5d99de7379a10fd4c1cfa43a2 +size 10199712 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..5ea5dbb7333e14ae2ea3cd586941a09bbffb0218 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:685c0e63c429d8956c2a2db905722ec92f741fcd2b7a987c959858078e030bfd +size 23046068 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..aeb94e2c059bd586101273db97a098acc4fc26e1 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ca26d4e9390206ecb1ed97ef2abcaebf4bfa913c75c9e00910e4b61e2073333 +size 17909917 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..86014203b3e22b59071c79550cc12230997a0b10 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4559a93f68dc74c6e085ca6cb6da38d09897c2539bc19a481d53ae0b51e09f6 +size 9832550 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..c416d88d6febe7789f9e6538478ee8cca38c791b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62428d1dbe7b45843209c68a6eb416cad1ca06f583ceecc7fb4cfdcc5bb7039b +size 988060 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..5227f5f9bef86ac104da028b6e390cb461b0a17d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c515793f2a32f8d11864d487bc713e296c8cd7440dad6d6ad5091790781b8882 +size 5546587 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..f53ebcafa13ad710195a7b70f1c7d462382c60ee --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf3ac4e7784b78cddb8e85b7e3cac3ddc9ca97f374efe1528e47e6a17ba7ac0 +size 15240278 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..e380c16006493a40b0c6d7bb025da387a2731446 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9226c84960927f0af67a5481da2ede9de5e16b1ed069f9e4e119ea702ea51e56 +size 13869412 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..c28d20f008430301c2d418e5dcefa0c082691e47 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f74703851d4066c1548042a5e0c9a4f92a57c1926951f801f5f96f95973ca884 +size 9142615 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..900debf31ae52672369b338ef6170a18aecb04d2 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_deterministic84/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08589042abe86e69ad5ff7d0bfa62c40c5dc0894cec3d78b07d137f65cdbbe8 +size 1087539 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..e441d4853206bb104ef215102b5c631498d4263d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:686f11726c20257b461310cb58739502d0b17bb5be935c9531f2645a9a8cd304 +size 5509282 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..14010a00c313b977197d98e95c014797a3f4eab6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5affe047705a421f9347123516f8ef45971c4e5bbc58ec4482e4f2850566bd82 +size 24490419 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..613aedae5c6cb10578cd9790f3dded17d1ddea77 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e376f68571accd38b8808792516b27103e2c862e3b3d40ad27f7173ef20d62c6 +size 16364206 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..ff4c0e62e0c98ed2c35d50a5e05c9fbc1c264944 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2a25d92554237a720a0bbc6aed65711851e37ba2686f4368dcd5655f355fc3 +size 14492108 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..9f2b1b5ee0761e4d081e7ee3952be0485b4cabcc --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62757fedfa4e784d519d30dbf408f8beedfca267169a7195c93dd0e481fde428 +size 10199712 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..28a4dd24fc5f969c851b3973ac3f594401e6ce37 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d3897b07220273eb3f0d91db4565fc9bb75d20ba642cf187e8b65ea44afe99a +size 961566 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..66b44aa8050909fac0150771ec0ab3306b0f75f2 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e0c1aa2c4988e0ea0595d208b7994dc21a5974d3d79dd8e026d61625d16897 +size 7706841 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..7576793a289260f7f09f062d5362b4e92185273b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78bdaee3f4f54dd8b66e2fc669a64fe3d3e349877bd8b45fca08f731095ff1e +size 23046068 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..df6a6e62acd859efa08a9c532adb337bab89f12d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8c001a82c73ae80a565355bf52780755199fc254dc1f618e666ae5a36c6512 +size 17909917 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..20155f7fdd38d20df18b970bedc96fa691480a5b --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:360d390fc147c127d5977ce46544d6452fb1aa57fedeae30302a67a6d51a18a5 +size 15254169 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..596637a4c3ddb93f36ea0cc3dd08d8a52bd23968 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a990551113c06fa260eb2c41f5c19f04ec9ffd59a1d171403550dbdf5f48c6a +size 9832550 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..c46485566f3512a8e7b2c4004cd34ebaf68efb6d --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f9fd297ae429ddebfd1fce90d7dca024a35ed2db5e9c3aaa27fa7f6b366a43d +size 988060 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..43ebd0d321eef982b3030da050e06aeeeecc0ed8 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a63b2516cf4f060b253007493c4029ba0bb7d2105f9a9e369bb0472b7cf0aa +size 5546587 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..71d8f3e6e50597f16d526101a4dc1f0eab6a5241 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3165ef0aedb66539a7d3f0e9b2eec22ab4c50a61b91c03ab94dadfdda8a9eb7e +size 22923367 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..5d45cf1059d0e5fa38f79952a1d89702bdbf23d3 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f55a970db612555fa670b963f136451d9a9686d1c2a395563839b8a044c3f5 +size 15240278 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..f0644dc32aa8a8c5b6ae32592d56b028fda79ac9 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5daa79ee2d43c160ff558f9326cc459e4f05217c1a8b8104ab2b55ff567749 +size 13869412 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..0cacfa37817c4b53b72391c370f945ec8e09e984 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd74cbe1736301b5614fa9f9a196707c2c1f62dd707568a75adced5e92e34b4f +size 9142615 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..00429a989ccb719d9e3793feed93e33e1fb1366c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_even_odd/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea694b34ca74b59b0e6f44c7a7d8ebe6c563814bda83337a400f10af7aef89c6 +size 1087539 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/bnc_spoken.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/bnc_spoken.train new file mode 100644 index 0000000000000000000000000000000000000000..776705b0246ebb52fc34920f43c4d9ca8d8c35c5 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/bnc_spoken.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c594c02e391ed682cd9b73e275df84952536356e8c601259fc950a18cd38257 +size 5509282 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/childes.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/childes.train new file mode 100644 index 0000000000000000000000000000000000000000..06662664c274f787ca800582f97a22f9f9a853f6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/childes.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0418a9634000933b5b16849391ca06c339b09846f608fda7dbf39fa8b4bda0 +size 24490419 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/gutenberg.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/gutenberg.train new file mode 100644 index 0000000000000000000000000000000000000000..103e20ca0329ebd25529286f2398dc6d3916f690 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/gutenberg.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75545c10bb580f7b15070cc35dd10189659822ec331f428fe15f72d20bd7c05a +size 16364206 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/open_subtitles.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/open_subtitles.train new file mode 100644 index 0000000000000000000000000000000000000000..71a0f36892454f12be7345662fe9760437a102ac --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/open_subtitles.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44b595c7891c1298b034d8ca1a8da41837d42fac3e565c0204d141208d5b25c +size 14492108 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/simple_wiki.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/simple_wiki.train new file mode 100644 index 0000000000000000000000000000000000000000..c117d2ad075660944640a68d1291d0d656a6a394 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/simple_wiki.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5a3ff8defb228b335680110969c2bbb319334adbca520b5648ee975b2bf011 +size 10199712 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/switchboard.train b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/switchboard.train new file mode 100644 index 0000000000000000000000000000000000000000..c24ce7033753ae9dd32f380473ff33456f885a5c --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_10M/switchboard.train @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe40d60683a72ff783a007336b70a775fe4f32884f5f67b7f524d239fc4b709b +size 961566 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/bnc_spoken.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/bnc_spoken.dev new file mode 100644 index 0000000000000000000000000000000000000000..d6051e430ce389142c1ad99958bfca1cfcdddff0 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/bnc_spoken.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e11304fc8ff3e9dccd325e9d1f0848ed45e1ff35e31d8ebae6cffa50d6335e6 +size 7706841 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/childes.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/childes.dev new file mode 100644 index 0000000000000000000000000000000000000000..a67e8cc8d32345dbc9ab2c51809b77142320fc01 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/childes.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1061634929992e6a878dcfecb75dd539e05f9607681062c411fb20f07cd9f7e +size 23046068 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/gutenberg.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/gutenberg.dev new file mode 100644 index 0000000000000000000000000000000000000000..19a8f1747a73fe3bbed63a1fa60e30a0740eec90 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/gutenberg.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ae49a6c6a93fb58873c8a71f8ae2ef7677acfd37a12da73a9026831464a28d +size 17909917 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/open_subtitles.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/open_subtitles.dev new file mode 100644 index 0000000000000000000000000000000000000000..9f8dfcbacae0c1671aa54ea68fa1093148dec0dc --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/open_subtitles.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c1949c203a65c8c214a4b7c734c55b9995cf15ce53f80c50985ce3769c4de8 +size 15254169 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/simple_wiki.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/simple_wiki.dev new file mode 100644 index 0000000000000000000000000000000000000000..03dc688baf948de0ee19705f5111a2e08ba3ad03 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/simple_wiki.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28028094f46c2b42202a019c6244c38b6ade2c3a6b43429eb02fb7596a289353 +size 9832550 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/switchboard.dev b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/switchboard.dev new file mode 100644 index 0000000000000000000000000000000000000000..a80b50d954bd93863fe23c479f0d95e83b3e98a6 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_dev/switchboard.dev @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38766bdb362a79677197d9f6cd90679bc3ce423a2cace379e4f8d24ceff14630 +size 988060 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/bnc_spoken_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/bnc_spoken_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..67aa8c801b1db5fb77bc7ac01040893606d284cf --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/bnc_spoken_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff645fa6b083988c828b0247282a8491107aba6881ac8b6dbd602f059bfb2818 +size 5546587 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/childes_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/childes_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..31524dcc049f51dbda7776ae8fdc3cb285ce0698 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/childes_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:276736b11ecf52b312716501103d5f9aa5f045de88f7e3343b85efc08d08984f +size 22923367 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/gutenberg_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/gutenberg_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..152bfdafd92c2dfcccc5b63743dfd177fe79198e --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/gutenberg_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2acf92fba7fac35210793cfe2a38b40e7be015352b840d6c398a54e32a654c53 +size 15240278 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/open_subtitles_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/open_subtitles_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..70c4ecf475e6408fa6f00a2b82ffa565e8296691 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/open_subtitles_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1674b3b27baa7e7f6f4d5f04f87ad46887d0a861457367a98ddd3e2b2bebdd +size 13869412 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/simple_wiki_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/simple_wiki_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..6780cc02a8a507c6e3bc7536cea6cb99f1fc29a1 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/simple_wiki_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499191588efb085a480f5ccf19a556493214e7782f35867a07b9a84b127e3683 +size 9142615 diff --git a/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/switchboard_affected.test b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/switchboard_affected.test new file mode 100644 index 0000000000000000000000000000000000000000..af84c05075cf72291b7303b381aba44eed62a123 --- /dev/null +++ b/data/Perturbed_data/Llama-3.2-3B/babylm_shuffle_local5/babylm_test_affected/switchboard_affected.test @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a773adf3bf8e213d99e80e4e237b4188d7137846c7b39c87bf76b4966bb24175 +size 1087539 diff --git a/edge_probing/edge_probing.py b/edge_probing/edge_probing.py new file mode 100644 index 0000000000000000000000000000000000000000..506416eb1aad1058521e39230281c6369cf477fe --- /dev/null +++ b/edge_probing/edge_probing.py @@ -0,0 +1,222 @@ +# edge_probing.py +# Author: Julie Kallini + +# For importing utils +import sys +sys.path.append("..") + +from utils import CHECKPOINT_READ_PATH, PERTURBATIONS, PAREN_MODELS, get_gpt2_tokenizer_with_markers +from gpt2_no_positional_encoding_model import GPT2NoPositionalEncodingLMHeadModel +from transformers import GPT2LMHeadModel +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from itertools import zip_longest +import torch +import tqdm +import argparse +import pandas as pd +import os + + +MAX_TRAINING_STEPS = 3000 +CHECKPOINTS = list(range(200, MAX_TRAINING_STEPS+1, 200)) +LAYERS = [1, 3, 6, 9, 12, "Avg Last 4"] + + +def get_layer_embedding(model, token_sequences, indices, layer=None): + + # Pad input token sequences + input_ids = zip(*zip_longest(*token_sequences, + fillvalue=gpt2_tokenizer.eos_token_id)) + input_ids = torch.tensor(list(input_ids)).to(device) + + # Get GPT2 model's output + with torch.no_grad(): + output = model(input_ids) + + # Either get the hidden state of the specified layer or + # get the average of the last 4 hidden states + if layer is not None: + hidden_states = output.hidden_states[layer] + else: + hidden_states = output.hidden_states[-4:] + hidden_states = sum(hidden_states) / 4 + + # Create mask using start and end indices + batch_size, seq_length = input_ids.shape + mask = torch.full((batch_size, seq_length), 0).to(device) + for i, (start_idx, end_idx) in enumerate(indices): + mask[i, start_idx:end_idx] = 1 + + # Mask out embeddings of tokens outside indices + mask_expanded = mask.unsqueeze(-1).expand(hidden_states.size()) + hidden_states = hidden_states * mask_expanded + + return hidden_states + + +def max_pooling(tensor, index_tuples): + pooled_results = [] + for i, (start, end) in enumerate(index_tuples): + # Extracting the embeddings corresponding to the specified range + embeddings = tensor[i, start:end, :] + + # Performing max pooling + max_pooled = torch.max(embeddings, dim=0)[0] + + pooled_results.append(max_pooled) + return torch.stack(pooled_results) + + +def mean_pooling(tensor, index_tuples): + batch_size, seq_len, embedding_size = tensor.shape + output = torch.empty(batch_size, embedding_size, + device=tensor.device, dtype=tensor.dtype) + + for i, (start, end) in enumerate(index_tuples): + embeddings = tensor[i, start:end, :] + output[i, :] = torch.mean(embeddings, dim=0) + + return output + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='Edge probing', + description='Edge probing experiments') + parser.add_argument('perturbation_type', + default='all', + const='all', + nargs='?', + choices=PERTURBATIONS.keys(), + help='Perturbation function used to transform BabyLM dataset') + parser.add_argument('train_set', + default='all', + const='all', + nargs='?', + choices=["100M", "10M"], + help='BabyLM train set') + parser.add_argument('random_seed', type=int, help="Random seed") + parser.add_argument('paren_model', + default='all', + const='all', + nargs='?', + choices=list(PAREN_MODELS.keys()) + ["randinit"], + help='Parenthesis model') + parser.add_argument('pooling_operation', + default='all', + const='all', + nargs='?', + choices=["mean", "max"], + help='Pooling operation to compute on embeddings') + parser.add_argument('-np', '--no_pos_encodings', action='store_true', + help="Train GPT-2 with no positional encodings") + + # Get args + args = parser.parse_args() + + if args.pooling_operation == "mean": + pooling_function = mean_pooling + elif args.pooling_operation == "max": + pooling_function = max_pooling + else: + raise Exception("Pooling operation undefined") + + # Init tokenizer + gpt2_tokenizer = get_gpt2_tokenizer_with_markers([]) + gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token + + # Get path to model + no_pos_encodings_underscore = "_no_positional_encodings" if args.no_pos_encodings else "" + model = f"babylm_{args.perturbation_type}_{args.train_set}_{args.paren_model}{no_pos_encodings_underscore}_seed{args.random_seed}" + model_path = f"{CHECKPOINT_READ_PATH}/babylm_{args.perturbation_type}_{args.train_set}_{args.paren_model}{no_pos_encodings_underscore}/{model}/runs/{model}/checkpoint-" + + # Get constituency parse data + if "hop" in args.perturbation_type: + phrase_df = pd.read_csv("phrase_data/hop_phrase_data.csv") + elif "reverse" in args.perturbation_type: + phrase_df = pd.read_csv("phrase_data/reverse_phrase_data.csv") + else: + raise Exception("Phrase data not found") + + token_sequences = list(phrase_df["Sentence Tokens"]) + if args.perturbation_type == "reverse_full": + indices = list( + zip(phrase_df["Rev Start Index"], phrase_df["Rev End Index"])) + else: + indices = list(zip(phrase_df["Start Index"], phrase_df["End Index"])) + labels = list(phrase_df["Category"]) + + BATCH_SIZE = 32 + device = "cuda" + + edge_probing_df = pd.DataFrame(LAYERS, columns=["GPT-2 Layer"]) + for ckpt in CHECKPOINTS: + + # Load model + if args.no_pos_encodings: + model = GPT2LMHeadModel.from_pretrained( + model_path + str(ckpt), output_hidden_states=True).to(device) + else: + model = GPT2NoPositionalEncodingLMHeadModel.from_pretrained( + model_path + str(ckpt), output_hidden_states=True).to(device) + + layer_accuracies = [] + for layer in LAYERS: + print(f"Checkpoint: {ckpt}, Layer: {layer}") + print("Computing span embeddings...") + + # Iterate over token sequences and indices to get embeddings + spans = [] + for i in tqdm.tqdm(list(range(0, len(token_sequences), BATCH_SIZE))): + + tokens_batch = [[int(tok) for tok in seq.split()] + for seq in token_sequences[i:i+BATCH_SIZE]] + if args.perturbation_type == "reverse_full": + tokens_batch = [toks[::-1] for toks in tokens_batch] + + index_batch = indices[i:i+BATCH_SIZE] + + # Extract embeddings + if layer == "Avg Last 4": + embeddings = get_layer_embedding( + model, tokens_batch, index_batch, None) + else: + embeddings = get_layer_embedding( + model, tokens_batch, index_batch, layer) + pooled_results = pooling_function(embeddings, index_batch) + spans.extend(list(pooled_results)) + + # Get features and ground truth + X = torch.vstack(spans).detach().cpu().numpy() + y = labels + + # Split the data; since we pass random seed, it + # will be the same split every time + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=args.random_seed) + + # Fit L2-regularized linear classifier + clf = LogisticRegression(max_iter=10, + random_state=args.random_seed).fit(X_train, y_train) + + # Get probe accuracy + y_test_pred = clf.predict(X_test) + acc = accuracy_score(y_test, y_test_pred) + layer_accuracies.append(acc) + print(f"Accuracy: {acc}") + + edge_probing_df[f"Accuracy (ckpt {ckpt})"] = layer_accuracies + + # Write results to CSV + nps = '_no_pos_encodings' if args.no_pos_encodings else '' + directory = f"edge_probing_results/{args.perturbation_type}_{args.train_set}{nps}" + if not os.path.exists(directory): + os.makedirs(directory) + + file = directory + \ + f"/{args.paren_model}_{args.pooling_operation}_pooling_seed{args.random_seed}.csv" + print(f"Writing results to CSV: {file}") + edge_probing_df.to_csv(file) diff --git a/edge_probing/edge_probing.sh b/edge_probing/edge_probing.sh new file mode 100644 index 0000000000000000000000000000000000000000..24f45aadbd2587552c46445605f4c00a0714801d --- /dev/null +++ b/edge_probing/edge_probing.sh @@ -0,0 +1,60 @@ +#!/bin/sh +# edge_probing.sh +# author: Julie Kallini + +echo " +------------------------------------------------------------------------------- +Arguments +------------------------------------------------------------------------------- +" +echo "Random seed: $1" +NO_POS_ENCODINGS=${2:-''} +echo "No pos encodings: $NO_POS_ENCODINGS" + +echo " +------------------------------------------------------------------------------- +Run edge probing for each perturbation type +------------------------------------------------------------------------------- +" + +COMMAND="python3 edge_probing.py hop_control 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +COMMAND="python3 edge_probing.py hop_tokens4 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +COMMAND="python3 edge_probing.py hop_words4 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +COMMAND="python3 edge_probing.py reverse_control 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +COMMAND="python3 edge_probing.py reverse_full 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +COMMAND="python3 edge_probing.py reverse_partial 100M $1 randinit mean $NO_POS_ENCODINGS" +echo $COMMAND +eval $COMMAND +echo " +" + +echo " +------------------------------------------------------------------------------- +Done! +------------------------------------------------------------------------------- +" diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/rng_state_6.pth b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9ca4382c9c6e570c4214df002a61397040d7349 Binary files /dev/null and b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/rng_state_6.pth differ diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/scheduler.pt b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dd990ada9398f5cf76fb68bc676694c6f73f05f Binary files /dev/null and b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/scheduler.pt differ diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/tokenizer_config.json b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0b782d4602b361f7577abb52b710a8c0e088588d --- /dev/null +++ b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1350/tokenizer_config.json @@ -0,0 +1,2078 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128256": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128257": { + "content": "🅁", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "[PAD]", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1800/rng_state_2.pth b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1800/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1f3b48b55bfc351cb4cae01832b5adfffdf8a53 Binary files /dev/null and b/train/checkpoints/Llama-3.2-3B/babylm_reverse_full_10M_seed0/runs/checkpoint-1800/rng_state_2.pth differ diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00001-of-00002.safetensors b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e362b81bf93cef8c6d96b08171bab63416808c70 --- /dev/null +++ b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/artifacts/models--meta-llama--Llama-3.2-3B/snapshots/5cc0ffe09ee49f7be6ca7c794ee6bd7245e84e60/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:584d8d3e3f82f7964955174dfe5e3b1cf117a9d859f022cfdf7fcb884856e002 +size 4965799096 diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-150/model-00002-of-00002.safetensors b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-150/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aca2798c1a2f92955571dc9c395c26cf805971df --- /dev/null +++ b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-150/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47dbfb13ca2fb7dcb87c9eabb5f29f3c134bb5dcbbff67038d7453957bba043a +size 2247734920 diff --git a/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-300/tokenizer.json b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d0af2aebd22a29e048ac5f32c95a99f6ee0f465b --- /dev/null +++ b/train/checkpoints/Llama-3.2-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fba7b84ac8c089d417b794dfc0527040604b90e2cdfe6e9df5b55afe9eab61a +size 17210282 diff --git a/train/deepspeed_config/train_dp_config.json b/train/deepspeed_config/train_dp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..526c8e8fe5e92fbf8125b574726494130ccdd019 --- /dev/null +++ b/train/deepspeed_config/train_dp_config.json @@ -0,0 +1,19 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "reduce_bucket_size": 1e8 + }, + "fp16": { + "enabled": true + }, + "gradient_accumulation_steps": "auto", + "wall_clock_breakdown": false + } + \ No newline at end of file diff --git a/training/babylm_dataset.py b/training/babylm_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2503c8d992321f8aaa799969d19c57fce051f42f --- /dev/null +++ b/training/babylm_dataset.py @@ -0,0 +1,140 @@ +# babylm_dataset.py +# author: Julie Kallini + +import datasets +import os +import glob +import tqdm +from numpy.random import default_rng +from itertools import product + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """\ + Pre-tokenized BabyLM HuggingFace dataset for verb perturbations. +""" +_PERTURBED_DATA_PATH = "../data/Qwen_perturbed_data/Qwen2.5-7B" +_PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4", + "reverse_control", "reverse_partial", "reverse_full", + "shuffle_control", "shuffle_nondeterministic", + "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84", + "shuffle_local3", "shuffle_local5", "shuffle_local10", + "shuffle_even_odd"] +# _RANDOM_SEEDS = [0, 14, 41, 53, 96] +_RANDOM_SEEDS = [0] +# _TRAIN_SETS = ["100M", "10M"] +_TRAIN_SETS = ["10M"] +_EOS_TOKEN_ID = 50256 + + +class BabyConfig(datasets.BuilderConfig): + + def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs): + """BuilderConfig for IzParens + + Args: + data_dir: path to directory of tokenized, perturbed BabyLM dataset + """ + super(BabyConfig, self).__init__( + **kwargs, + ) + self.data_dir = data_dir + self.babylm_train_set = babylm_train_set + self.random_seed = random_seed + + +class BabyLMCorpus(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + BabyConfig( + name=f"babylm_{perturbation}_{train_set}_seed{random_seed}", + data_dir=os.path.join( + _PERTURBED_DATA_PATH, "babylm_" + perturbation), + babylm_train_set=train_set, + random_seed=random_seed, + ) for perturbation, train_set, random_seed in list(product(_PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS)) + ] + + def _info(self): + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # datasets.features.FeatureConnectors + features=datasets.Features( + { + "text": datasets.Value("string") + # These are the features of your dataset like images, labels ... + } + ), + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + ) + + def _split_generators(self, dl_manager): + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"data_dir": os.path.join( + self.config.data_dir, "babylm_" + self.config.babylm_train_set), "random_seed": self.config.random_seed, "split": "train"}, + ), + # datasets.SplitGenerator( + # name=datasets.Split.VALIDATION, + # gen_kwargs={"data_dir": os.path.join( + # self.config.data_dir, "babylm_dev"), "random_seed": self.config.random_seed, "split": "valid"}, + # + ] + + def __chunk(self, sentences, eos_token): + + # Tokenize each sentence + logger.info("Loading pre-tokenized data") + tokenized_sentences = [] + for sent in tqdm.tqdm(sentences): + tokenized_sentences.append([int(tok) for tok in sent.split()]) + + # Concatenate the tokenized sentences using the EOS token + logger.info("Concatenating tokenized data using EOS token") + all_tokens = [] + for tokens in tqdm.tqdm(tokenized_sentences): + all_tokens.extend(tokens) + all_tokens.append(eos_token) + + # Chunk the tokens into sublists of max_seq_len tokens each + logger.info("Chunking tokens into sublists of 1024") + max_seq_len = 1024 + chunked_tokens = [] + for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)): + chunked_tokens.append(all_tokens[i:i + max_seq_len]) + + # Drop last line if not a multiple of max_seq_len + if len(chunked_tokens[-1]) < max_seq_len: + chunked_tokens.pop() + + return chunked_tokens + + def _generate_examples(self, data_dir, random_seed, split): + """This function returns the BabyLM text in the discretized, tokenized form.""" + + logger.info("Generating examples from = %s", data_dir) + infiles = sorted(glob.glob(os.path.join(data_dir, "*"))) + + # Extend sentences + all_sentences = [] + for infile in infiles: + f = open(infile, encoding="utf-8") + all_sentences.extend(f.readlines()) + logger.info("Total sentences: {}".format(len(all_sentences))) + + # Shuffle because we are pre-tokenizing + rng = default_rng(seed=random_seed) + rng.shuffle(all_sentences) + + # Tokenize and chunk + tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID) + + # Generate data + logger.info("Writing dataset as space-separated sequences of tokens") + for idx, line in enumerate(tokenized_lines): + l = " ".join([str(tok) for tok in line]) + "\n" + yield idx, {"text": l} \ No newline at end of file diff --git a/training/conf/template/babylm_dataset_template.yaml b/training/conf/template/babylm_dataset_template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de98aa530c9a73686b754dfb84ad86270cd3913c --- /dev/null +++ b/training/conf/template/babylm_dataset_template.yaml @@ -0,0 +1,13 @@ +# dataset_{{ perturbation }}_{{ train_set }}_seed{{ seed }}.yaml +# Configuration for altered babylm-dataset +--- +dataset: + id: /nlp/scr/kallini/llms-in-llms/training/babylm_dataset.py + name: babylm_{{ perturbation }}_{{ train_set }}_seed{{ seed }} + validation_ratio: null + + # Number of Preprocessing Workers + num_proc: 4 + + # Number of Evaluation Preprocessing Workers + eval_num_proc: 4 \ No newline at end of file diff --git a/training/conf/template/babylm_train_template.yaml b/training/conf/template/babylm_train_template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9a7404d67135f379d8b9f64445377bcce0608ad --- /dev/null +++ b/training/conf/template/babylm_train_template.yaml @@ -0,0 +1,49 @@ +# train_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}_seed{{ seed }}.yaml +# Based on mistral-small.yaml. +--- +# Inherit Dataset, Tokenization, Model, and Training Details +inherit: + - datasets/dataset_{{ perturbation }}_{{ train_set }}_seed{{ seed }}.yaml + - models/gpt2{{ no_pos_encodings }}-small-{{ perturbation }}-{{ paren_model }}.yaml + - trainers/gpt2-small.yaml + +# Run ID -- make sure to override! +run_id: babylm_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}_seed{{ seed }} + +# Weights & Biases +wandb: kallini +group: babylm-perturbation-experiments + +# Artifacts & Caching +artifacts: + cache_dir: {{ ckpt_path }}/babylm_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}/babylm_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}_seed{{ seed }}/artifacts + run_dir: {{ ckpt_path }}/babylm_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}/babylm_{{ perturbation }}_{{ train_set }}_{{ paren_model }}{{ no_pos_encodings_underscore }}_seed{{ seed }}/runs + +# Save Effective Batch Size for Easy Handling ==> Main Code asserts infra + training_config results in this! +effective_bsz: 512 + +# Resume from Checkpoint +resume: false +resume_checkpoint: null + +# List of frequencies at which to save checkpoints, provided as a list of two-element tuples: +# - Frequency (`freq`) at which to save checkpoints (# steps) +# - Bound (`until`) on global step for given frequency (checkpoint every `freq` steps until global step = `until`) +checkpoint_frequency: + - [100, 3000] + +# `torch.distributed` Default Infra Parameters -- to be overwritten by call to `torch.distributed.launch` +local_rank: -1 +nnodes: -1 +nproc_per_node: -1 + +# DeepSpeed Default Infra Parameters -- to be overwritten by call to `DeepSpeed` +num_gpus: -1 +num_nodes: -1 +world_size: -1 + +# Logging Parameters -- 10 = DEBUG, 20 = INFO, 30 = WARNING, 40 = ERROR, 50 = CRITICAL +log_level: 20 + +# Random Seed +seed: {{ seed }} \ No newline at end of file diff --git a/training/conf/template/gpt2-small-template.yaml b/training/conf/template/gpt2-small-template.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9e54a22abe0d562ac81387c1d86d41d08d7b450 --- /dev/null +++ b/training/conf/template/gpt2-small-template.yaml @@ -0,0 +1,22 @@ +# gpt2{{ no_pos_encodings }}-small-{{ perturbation }}-{{ paren_model }}.yaml +# Configuration for the GPT-2 Small Model. +--- +model: + id: "gpt2{{ no_pos_encodings }}-small" + + # Boolean whether to use the pre-existing Hugging Face AutoTokenizer (or train a new one from scratch) + pretrained_tokenizer: false + passthrough_tokenizer: true + + # Sequence Length + seq_len: 1024 + + # Stability + reorder_and_upcast_attn: true + scale_attn_by_inverse_layer_idx: true + + # Initialize Weights from File + initial_weights: {{ paren_model_path }} + + # Configure Model From File + config_path: /nlp/scr/kallini/mistral/conf/models/gpt2-small-{{ vocab_size }}.json \ No newline at end of file diff --git a/training/generate_yaml.py b/training/generate_yaml.py new file mode 100644 index 0000000000000000000000000000000000000000..da1878517c622823bdd4dfbd454e935fd1f14ce2 --- /dev/null +++ b/training/generate_yaml.py @@ -0,0 +1,131 @@ +# generate_yaml.py +# Author: Julie Kallini + +# For importing utils +import sys +sys.path.append("..") + +from jinja2 import Template +from utils import PERTURBATIONS, CHECKPOINT_WRITE_PATH, \ + PAREN_MODELS, PAREN_MODEL_PATH +import argparse +import os + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + prog='Generate yaml for training', + description='Generate train and dataset yaml configs for mistral training') + parser.add_argument('perturbation_type', + default='all', + const='all', + nargs='?', + choices=PERTURBATIONS.keys(), + help='Perturbation function used to transform BabyLM dataset') + parser.add_argument('train_set', + default='all', + const='all', + nargs='?', + choices=["100M", "10M"], + help='BabyLM train set') + parser.add_argument('random_seed', type=int, help="Random seed") + parser.add_argument('paren_model', + default='all', + const='all', + nargs='?', + choices=list(PAREN_MODELS.keys()) + ["randinit"], + help='Parenthesis model') + parser.add_argument('-np', '--no_pos_encodings', action='store_true', + help="Train GPT-2 with no positional encodings") + + # Get args + args = parser.parse_args() + if args.paren_model != "randinit": + paren_model_path = PAREN_MODEL_PATH + PAREN_MODELS[args.paren_model] + "/checkpoint-5000" + else: + paren_model_path = "null" + paren_model_name = args.paren_model + no_pos_encodings_str = "-no-positional-encodings" if args.no_pos_encodings else "" + no_pos_encodings_underscore = "_no_positional_encodings" if args.no_pos_encodings else "" + + # Create directory for yaml + yaml_directory = f"conf/babylm_{args.perturbation_type}_{args.train_set}_{paren_model_name}{no_pos_encodings_underscore}/seed{args.random_seed}" + if not os.path.exists(yaml_directory): + os.makedirs(yaml_directory) + + print("Generating GPT-2 model yaml file...") + + # Get model template, which varies due to changes in vocab size + model_temp_file = open("conf/template/gpt2-small-template.yaml") + lines = model_temp_file.readlines() + model_temp_file.close() + + # Fill model template + tokenizer = PERTURBATIONS[args.perturbation_type]["gpt2_tokenizer"] + vocab_size = len(tokenizer) + model_template = Template("".join(lines)) + model_conf = model_template.render( + perturbation=args.perturbation_type, + vocab_size=vocab_size, + paren_model=paren_model_name, + paren_model_path=paren_model_path, + no_pos_encodings=no_pos_encodings_str, + ) + + # Write model yaml to file + model_file = open( + f"conf/babylm_{args.perturbation_type}_{args.train_set}_{paren_model_name}{no_pos_encodings_underscore}/gpt2{no_pos_encodings_str}-small-{args.perturbation_type}-{paren_model_name}.yaml", "w") + model_file.write(model_conf) + model_file.close() + + print("Generating train yaml file...") + + # Get train template file + train_temp_file = open("conf/template/babylm_train_template.yaml") + lines = train_temp_file.readlines() + train_temp_file.close() + + # Fill train template file + train_template = Template("".join(lines)) + train_conf = train_template.render( + perturbation=args.perturbation_type, + seed=args.random_seed, + ckpt_path=CHECKPOINT_WRITE_PATH, + train_set=args.train_set, + paren_model=paren_model_name, + no_pos_encodings=no_pos_encodings_str, + no_pos_encodings_underscore=no_pos_encodings_underscore, + ) + + # Write train yaml to file + train_file = open(yaml_directory + \ + f"/train_{args.perturbation_type}_{args.train_set}_{paren_model_name}{no_pos_encodings_underscore}_seed{args.random_seed}.yaml", "w") + train_file.write(train_conf) + train_file.close() + + print("Generating dataset yaml file...") + + # Get dataset temp file + dataset_temp_file = open("conf/template/babylm_dataset_template.yaml") + lines = dataset_temp_file.readlines() + dataset_temp_file.close() + + # Fill dataset template file + dataset_template = Template("".join(lines)) + dataset_conf = dataset_template.render( + perturbation=args.perturbation_type, + train_set=args.train_set, + seed=args.random_seed, + ) + + # Write dataset yaml to file + dataset_file = open(yaml_directory + \ + f"/dataset_{args.perturbation_type}_{args.train_set}_seed{args.random_seed}.yaml", "w") + dataset_file.write(dataset_conf) + dataset_file.close() + + # Create directory for model checkpoints + ckpt_directory = CHECKPOINT_WRITE_PATH + f"/babylm_{args.perturbation_type}_{args.train_set}_{paren_model_name}{no_pos_encodings_underscore}" + if not os.path.exists(ckpt_directory): + os.makedirs(ckpt_directory) \ No newline at end of file diff --git a/training/prepare_training.sh b/training/prepare_training.sh new file mode 100644 index 0000000000000000000000000000000000000000..efb59c82b1512a68cf161887139960a4797c4349 --- /dev/null +++ b/training/prepare_training.sh @@ -0,0 +1,63 @@ +#!/bin/sh +# prepare_training.sh +# author: Julie Kallini + +readonly MISTRAL_PATH=/nlp/scr/kallini/mistral + +echo " +------------------------------------------------------------------------------- +Arguments +------------------------------------------------------------------------------- +" +echo "Perturbation type: $1" +echo "Train set: $2" +echo "Random seed: $3" +echo "Paren pretrained model: $4" +NO_POS_ENCODINGS=${5:-''} +echo "No pos encodings: $NO_POS_ENCODINGS" +echo "Mistral path: $MISTRAL_PATH" + +if [ -z "$NO_POS_ENCODINGS" ] +then + NPS="" + NPSunderscore="" +else + NPS="-no-positional-encodings" + NPSunderscore="_no_positional_encodings" +fi + +# Generate yaml files for mistral training +echo " +------------------------------------------------------------------------------- +Generating yaml files for mistral training +------------------------------------------------------------------------------- +" +GENERATE_YAML_COMMAND="python3 generate_yaml.py $1 $2 $3 $4 $NO_POS_ENCODINGS" +echo $GENERATE_YAML_COMMAND +eval $GENERATE_YAML_COMMAND + +# Copy yaml files to mistral directory +echo " +------------------------------------------------------------------------------- +Copying config yaml files to mistral directory +------------------------------------------------------------------------------- +" +COPY_DATASET_COMMAND="cp conf/babylm_$1_$2_$4$NPSunderscore/seed$3/dataset_$1_$2_seed$3.yaml $MISTRAL_PATH/conf/datasets/dataset_$1_$2_seed$3.yaml" +echo $COPY_DATASET_COMMAND +eval $COPY_DATASET_COMMAND +echo "" + +COPY_TRAIN_COMMAND="cp conf/babylm_$1_$2_$4$NPSunderscore/seed$3/train_$1_$2_$4$NPSunderscore"_"seed$3.yaml $MISTRAL_PATH/conf/train_$1_$2_$4$NPSunderscore"_"seed$3.yaml" +echo $COPY_TRAIN_COMMAND +eval $COPY_TRAIN_COMMAND +echo "" + +COPY_MODEL_COMMAND="cp conf/babylm_$1_$2_$4$NPSunderscore/gpt2$NPS-small-$1-$4.yaml $MISTRAL_PATH/conf/models/gpt2$NPS-small-$1-$4.yaml" +echo $COPY_MODEL_COMMAND +eval $COPY_MODEL_COMMAND + +echo " +------------------------------------------------------------------------------- +Done! +------------------------------------------------------------------------------- +"