diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c89528892ae2cb5dfc87cf28f587062a18323d87
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d82be433abe592079dc9ce67ec7e97fe668c8590
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-loglikelihood
@@ -0,0 +1 @@
+ffa6e39a35a16299dcb015f17f986aaa598ad8b4840c4cebe0339a7042232741
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..f217448594199a54d671be7302857509eb6d691f
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arc_easy-v0-res.json
@@ -0,0 +1 @@
+{"results": {"arc_easy": {"acc": 0.2474747474747475, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.008772796145221907, "acc_stderr": 0.008855114414834707}}, "versions": {"arc_easy": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_3da-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_3da-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bbb3eb0c26b177cb739f58d8098b339278fcd84
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/arithmetic_3da-v0-res.json
@@ -0,0 +1 @@
+{"results": {"arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3da": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..07106a905853aad9876257f308e3af5900066253
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood
@@ -0,0 +1 @@
+2a84231e7b79f517427e57e2099c88fed3d60a7efab4ef9506e263b4091d5cfa
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..90dc95da8116c38d2ff3bec041973004b7f5703b
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_causative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_causative": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..67ea47559d248f90cc66870a37fdecd850ba4c79
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..076319f01e4309fae1bebb80834d35ebdebec6ec
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_existential_there_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0289b9dea483e58b56403fdfa30575b61fdfbd1
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4fba717b88b566130bd8dbd52dd0da2d5a65ee17
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_matrix_question_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_matrix_question_npi_licensor_present": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..87b49c5de9f79253e3cfa34ad3e6fb5c8d8a7b06
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood
@@ -0,0 +1 @@
+7c2ed82612af9175052cd44d8e178b6dd084c04eb462a3d88fcacfad2df8be8e
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1bda1a2aa9c1eeee68b3ca88f2de38cbb8e5d67b
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_principle_A_domain_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0a32ca7f971e537ab6fc6d338db3ad1c3d506f64
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood
@@ -0,0 +1 @@
+5bc0441f31e32443cf761bca6e961d504e1e84b15aa4e1d79e5c8ed5b4c2aa3a
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4305bb313c67880a0e4ebf7827c29a2aa2df6d66
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_sentential_negation_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_licensor_present": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_transitive-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_transitive-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..d2c99ab803288212934142c2507a8c316695a34b
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_transitive-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_transitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_transitive": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d27f1316dc96be401dee9392f973e9bbd799a409
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood
@@ -0,0 +1 @@
+91a9e4b60b0f3572a7fdbd7648d0e69f36e5eb34db715315b0082558d7ed8b65
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..c3e6af12f2da0a1857c0f0456bf4052d5558329e
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood
@@ -0,0 +1 @@
+4d4aaa0274ccd485ff8430ed61b8f83806febe18c16616c7d050f637a0463eba
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe6bbf95e5406ad38d4894bf5d4609beeaa05f9a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_wh_questions_subject_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap_long_distance": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..4c15f2283eb93c5ab4b9cdbddf3e91117211918d
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood
@@ -0,0 +1 @@
+d41a9b85e4c31e445bf9b46b8642df02203ccc02b4a9b254bf76066d5c54b4b7
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba386fd6c7e67c5048d2f4a4240e1b308dca7db5
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v0-res.json
@@ -0,0 +1 @@
+{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cff410b2c35a16b457d163d95ac7cbd8eb704e2
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cb-v1-res.json
@@ -0,0 +1 @@
+{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..63749433f1703a4c81965e6c04fec04177631bae
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood
@@ -0,0 +1 @@
+ee3ce1ddb8071d4189e5b06e7f3c618a434221ac52935d0f434c4d183f01458a
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..14510a13a1c390adfbb9c73149b88e5b8a2c4f64
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_english_disability": {"likelihood_difference": 0.3148684792547637, "likelihood_difference_stderr": 0.02800803147051987, "pct_stereotype": 0.36923076923076925, "pct_stereotype_stderr": 0.06032456592830047}}, "versions": {"crows_pairs_english_disability": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..fedfdac52d966f6edcdb229456858da1959b24d1
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood
@@ -0,0 +1 @@
+d1823f5038afafa7a5338e42531720480c8ccf4e177789526caf294d52d56e89
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..75d356522edf08f93f03d3ba37ed323d39f5b35e
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_english_race_color": {"likelihood_difference": 0.3322827903840805, "likelihood_difference_stderr": 0.01019838186372816, "pct_stereotype": 0.4822834645669291, "pct_stereotype_stderr": 0.022191835500120254}}, "versions": {"crows_pairs_english_race_color": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_religion-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_religion-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..b56bc901ca48380f5a188f9c18ef12ba0abe49ca
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_english_religion-v0-loglikelihood
@@ -0,0 +1 @@
+2ed57377174adaf0fb30037eb055eafdd02cd46e57bc32066d5fecd90a14b6e1
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_physical_appearance-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_physical_appearance-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..167b5e3ba055d1d67ca70e4f9cd3879f6b40b179
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/crows_pairs_french_physical_appearance-v0-loglikelihood
@@ -0,0 +1 @@
+ea61eaad64e9292790d4bbef955ffeebed7a595de098bc5ac726a6e51f27f9af
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9068a24ef5af549a13fe5b4362c2b5afc741bd29
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until
@@ -0,0 +1 @@
+eb23f7d5de7528eefd8ed5f8054c402ff947319cccfef7195995946f99389201
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..857af346b47d7ce11ee4192b928608a2111776f4
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-res.json
@@ -0,0 +1 @@
+{"results": {"ethics_utilitarianism": {"acc": 0.49771214642262895, "acc_stderr": 0.007211546310787838}}, "versions": {"ethics_utilitarianism": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..bd3ff6c459c5a5739b233dd86c5434f64bbc1b16
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood
@@ -0,0 +1 @@
+5b42ba1faf5ece6a6ec9a3976ce79c1fac8df5b98272aab85457188c2142693c
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..16940c8f5a7dd9ebb1d73298346ab1d19811ec90
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json
@@ -0,0 +1 @@
+{"results": {"ethics_utilitarianism_original": {"acc": 0.5214226289517471, "acc_stderr": 0.007204999520618661}}, "versions": {"ethics_utilitarianism_original": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hellaswag-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hellaswag-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..6be94a640950b2451775fddccbf80060c4a673b0
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hellaswag-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hellaswag": {"acc": 0.24965146385182235, "acc_norm": 0.24756024696275641, "acc_norm_stderr": 0.004307128573285236, "acc_stderr": 0.004319267432460666}}, "versions": {"hellaswag": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..a0f8b7c09b3b6307123f1328c51c1dcfb797aed2
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
@@ -0,0 +1 @@
+b3b27e9dbad587377d3c8cab1072782de883e245da93a563bd8b3099017b1fc0
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..7f665ef4a1bd06ecfd30d999ae6880c00ba849cf
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
@@ -0,0 +1 @@
+c29e4e67ff91af29b9434884874414d1b1b32ccc32903c6b1639469b19907419
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..6705b9cad27c7f1eb647b513861646faaccad584
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-college_biology": {"acc": 0.24305555555555555, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.03551446610810826, "acc_stderr": 0.03586879280080341}}, "versions": {"hendrycksTest-college_biology": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..695bc8c31592a4c33d70d5d07a8c5b523d9bd3cc
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
@@ -0,0 +1 @@
+4ea26ad780290429ac5a3317559c154848d662bd40532c966458ba6f2a32d0a3
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..60f02eba9cb04602d8b67d67269d8b82e0930721
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-computer_security": {"acc": 0.24, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.042923469599092816}}, "versions": {"hendrycksTest-computer_security": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..84cd983ee9d33f831ee397ffd8b11990b70a4b60
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-elementary_mathematics": {"acc": 0.2724867724867725, "acc_norm": 0.2830687830687831, "acc_norm_stderr": 0.023201392938194978, "acc_stderr": 0.022930973071633345}}, "versions": {"hendrycksTest-elementary_mathematics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c6a21d7dac4cd7b6fa217e8bebf34d959554a7a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-high_school_statistics": {"acc": 0.2962962962962963, "acc_norm": 0.3055555555555556, "acc_norm_stderr": 0.03141554629402544, "acc_stderr": 0.03114144782353604}}, "versions": {"hendrycksTest-high_school_statistics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..228dfe072cd02f94bced495f271c5cc108850719
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood
@@ -0,0 +1 @@
+1c8b994bd9a63ec874fc8d0e3a27077118b7adc472306b2fd6c55635a78b9d52
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..26ea1c2a75ccfb96af880ee30eef11520e9ea39c
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-moral_disputes": {"acc": 0.24855491329479767, "acc_norm": 0.27167630057803466, "acc_norm_stderr": 0.023948512905468365, "acc_stderr": 0.023267528432100174}}, "versions": {"hendrycksTest-moral_disputes": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d5ea0d8156ae4efaa0f7568ae8fd3a8ed3992d37
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
@@ -0,0 +1 @@
+a8e1882e77728b53c8b86312254d08320d8363fb606d746a8dd145b812f62cf5
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec9c1e79c117c88246fa596ca90821025c9786af
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-philosophy": {"acc": 0.26366559485530544, "acc_norm": 0.2733118971061093, "acc_norm_stderr": 0.02531176597542612, "acc_stderr": 0.02502553850053234}}, "versions": {"hendrycksTest-philosophy": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ba711cca75cfa5f22bb2dc52e68839ac3820b88
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-public_relations": {"acc": 0.3090909090909091, "acc_norm": 0.2636363636363636, "acc_norm_stderr": 0.04220224692971987, "acc_stderr": 0.044262946482000985}}, "versions": {"hendrycksTest-public_relations": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..8711cf195e4fa92606a47c1b7c701643f0ef483e
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-sociology": {"acc": 0.23383084577114427, "acc_norm": 0.24875621890547264, "acc_norm_stderr": 0.030567675938916707, "acc_stderr": 0.02992941540834838}}, "versions": {"hendrycksTest-sociology": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0004b194049a5dce0266002b4a19882fbb8c6bfa
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-virology": {"acc": 0.27710843373493976, "acc_norm": 0.2710843373493976, "acc_norm_stderr": 0.03460579907553027, "acc_stderr": 0.034843315926805875}}, "versions": {"hendrycksTest-virology": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a22fa9036c790cb48e142bd05a59da7824a9c83f
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json
@@ -0,0 +1 @@
+{"results": {"iwslt17-en-ar": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-en-ar": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f95957324e138bb424e71ff93f81a0c0a11f2cb
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_mt_es": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_es": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..30fcb907b5dbbabb2af4cf3a156cf18c67d387df
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_openai": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7fdfc9c2d5c6d5d4abb7d6e932454615c095ea1
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_openai_mt_en": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_mt_en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..3c444f66611959e4c13451d306fba403261ecfbb
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-loglikelihood
@@ -0,0 +1 @@
+5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..10d18c2f864117ae56fe56ba1191f6cde4bec7b3
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json
@@ -0,0 +1 @@
+{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ee1d031de8ec7d2af61c83567d433f9116ba24d
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-res.json
@@ -0,0 +1 @@
+{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_geometry-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_geometry-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb6851fc63ff08c657743ef6abf5073ba73144e5
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/math_geometry-v1-res.json
@@ -0,0 +1 @@
+{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/mutual_plus-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/mutual_plus-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..f4ba9d37310a19cc7928fd0d599776d8a9da8dba
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/mutual_plus-v0-loglikelihood
@@ -0,0 +1 @@
+b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_books3-v0-loglikelihood_rolling b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_books3-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..b483d3b45b43abddd6cbd169a8afda8d3f803d9c
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_books3-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-loglikelihood_rolling b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..80272607557f6e0c97220efa30c8b9ad38f52aa8
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b948f0d3691443f50c9f9d5ae24804b0c7e79aaa
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_europarl-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_europarl": {"bits_per_byte": 1.2477664839621123e-05, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_github-v1-loglikelihood_rolling b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_github-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..cf8251e4f68e2e893624142031e80d4d5777f4f2
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_github-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..bd7b15927f717baab5b7ce2e9d659dda6d681769
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea135278b720703540187531afb0ef82e7d6a1ce
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_hackernews": {"bits_per_byte": 0.00014672607267878518, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca433e3c854780d034839c8e4d029cb6b5bfca1a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_openwebtext2": {"bits_per_byte": 0.000184802319359215, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_uspto-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_uspto-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..599ae44ef430af958ab53c57d0b7900928ad243a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pile_uspto-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_uspto": {"bits_per_byte": 0.000174024142670342, "byte_perplexity": 1.00012063161925, "word_perplexity": 1.0007716198916954}}, "versions": {"pile_uspto": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pubmedqa-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pubmedqa-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..97db87ce2be9b3d2c08479ee73c7ba3923817795
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/pubmedqa-v0-loglikelihood
@@ -0,0 +1 @@
+7a04a1fb1d2b19db84fd15c224015d6c0306a41195a4e71fe6abd48fb4d53b9f
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..4844e5393b8358d225f516f1a948f1deccab7840
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-greedy_until
@@ -0,0 +1 @@
+6c48baa6924f3635120f33062251c4b571b3d4e9fe46b14d91f54ddd1c857997
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b5f507f6745120414ba5cfd39fc92eac4e48424
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/random_insertion-v0-res.json
@@ -0,0 +1 @@
+{"results": {"random_insertion": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"random_insertion": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..258d73cd68b190d87670edd3c11210c97e59ab91
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd0a3c44334dc2b7c48aa448d0a2c2ffde3c9a28e6c29d4ed175cbb22334bef3
+size 1805
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..6b3a1d3f57a5a19f012439a4eb611af6a7f22ea7
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9278fa1ee2540397f38cc755be8cad1277c51dc92d91aeea8c4ba1a26eb8490
+size 1773
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..03838a9bcf5b523ebbea661a3265fd5d05867143
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beb2dc04ba4d834a434169fa0fe6dc5b6a20bb2b1144f25caf48703edb821ce5
+size 1911
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..985f64c8e0eb3bc1dd563becf0cdf186baa172cd
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json
@@ -0,0 +1 @@
+{"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 1}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/webqs-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/webqs-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9f0fdc76cab096c80a87295773054510803ba218
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/webqs-v0-res.json
@@ -0,0 +1 @@
+{"results": {"webqs": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"webqs": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/winogrande-v0-loglikelihood b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/winogrande-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..97866f6ce45cb9a213d27310a78b7cdeab23bc9a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/winogrande-v0-loglikelihood
@@ -0,0 +1 @@
+90a3eff49de9173964d46f5ed57bcf9a78a72dd1bfe0e5323b25cebb40b49ea9
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt14-fr-en-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt14-fr-en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5261876f55a69dcaf33b3842690f81c12eb42f3a
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt14-fr-en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt14-fr-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01275083169440515, "chrf_stderr": 8.45474998563806e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt14-fr-en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..45eaaaca8c5892944b1b9c9af0c469e3c63e4881
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until
@@ -0,0 +1 @@
+d71e2074af3770e9b29ac561caf2e1c29ad6b0dc50ec2e7bcc5501747b11f0da
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-res.json b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..88bee7ffa69b1bf7accdd56a3870f61d4c0453da
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt16-en-de": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.010909486120840577, "chrf_stderr": 0.000122611124711072, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-de": 0}}
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..7cb9424082836f0d56afe809cf44c78fc844d993
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until
@@ -0,0 +1 @@
+7f197bc281d6dbf9425900ef0dee7175021c43e355050f149f43b161c52bf0b0
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9777002c79830918a3939ec6978d606ae967ffe6
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until
@@ -0,0 +1 @@
+7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530
\ No newline at end of file
diff --git a/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9777002c79830918a3939ec6978d606ae967ffe6
--- /dev/null
+++ b/scripts/yans/eval/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until
@@ -0,0 +1 @@
+7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530
\ No newline at end of file