diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6f6b350182f2a363b2b247b1145dd5c3b54157e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"anli_r1": {"acc": 0.334, "acc_stderr": 0.014922019523732967}}, "versions": {"anli_r1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..29e447d578ed11f77d962c079e9db9e3f415d801
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-res.json
@@ -0,0 +1 @@
+{"results": {"arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_1dc": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..480cf29a4d403852153ef59fe596bcc6a5cf34df
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_animate_subject_trans": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_trans": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc2dc6e1ed3ad7f38496a2de9610db4d145fc41f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..8caeecf43de4f434ede855dca08c5f62b702a46c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..12a4ebe1d2a83e1a8d5dc85ade8913f31931d8b6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood
@@ -0,0 +1 @@
+ddb24ddfaebe076b3aa7107937d71bf5f4503a78283bc889e39200368603681e
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c04ead457767b4bf390b2ba28f55d7f23c95d4cb
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_irregular_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_irregular_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..853a4d2f92c5c6da8d146a85e120a32dca147c4c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_drop_argument": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_drop_argument": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8500d68b553a66f850ebc39192644c2d138f0a1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_existential_there_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_quantifiers_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..22adb2995e9b5d4173b4ae7096714514022c8e9f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood
@@ -0,0 +1 @@
+6cb36bbdae7754f8832f50872c3dd511ce12547e00fa0771deb747be3355eb85
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..543fdc061433e58041b92ecc9d3f5e34d2427db1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_2-v0-loglikelihood
@@ -0,0 +1 @@
+fdb688ac6259bb65d234ef0a36e9a9ee449f9608f633b12e1943b462aead8e17
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..ce8166c4605ac5d9968da1d3370a73fab286e886
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood
@@ -0,0 +1 @@
+49d2b8ce6667a6166fdc2a2e5dbe7ff07d9b8415e9f33482aef15956b3ebc24a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..8c043857d4845d1bfebf34ede397049c16e981c2
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood
@@ -0,0 +1 @@
+cd68adb65c891d672e22bf53c054b2083ab08bc1da43951732b409c942d14bc7
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9efbffb50fea7f1bca803438e9122ad3c9e953c0
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_principle_A_domain_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..77c4bf916ab761be87f77618e41abe33d550d7c1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_principle_A_domain_3": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_domain_3": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b69d445f3c257608fd5be46aa74bd53cd598042c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_superlative_quantifiers_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..44ea10c1380c3dccdbc8d2ad6a2d84e716e81773
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_tough_vs_raising_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..f83ed1fb7413ddccae66c32078a9a5f7b19eb03e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood
@@ -0,0 +1 @@
+37483dfda688b62ad27161c9fc1e1e7710c5a6e6a7cd3474df119bcafd30e97f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..14befd4ab6450dbb2147d66e5458981756bfc25b
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_wh_vs_that_with_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..6fa6f6dae6c806be8a5cad8416df6766f22ae475
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-loglikelihood
@@ -0,0 +1 @@
+ec3b1bbb9561e39c43c6f77a23b4060b15c606141c5346e3d0791b3e92aaa5d0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..462e5d9401318226da067adcc39b27a09157a127
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cola-v0-res.json
@@ -0,0 +1 @@
+{"results": {"cola": {"mcc": -0.04538802810223175, "mcc_stderr": 0.023100371589225246}}, "versions": {"cola": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..f6e3f64b18a1d7d3ec2702d115c694bbe62cc8ef
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v1-greedy_until
@@ -0,0 +1 @@
+57581470b921435d40da97872bb1cfda6ecf963ccc4b0240a3b04e3fea8c8e3a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..598d2cce10cc3ecefb6eb8d1deb74801e25b11af
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-loglikelihood
@@ -0,0 +1 @@
+de74d2ac7f926f2f486c045d84aae8f71711102f9d77b31f758fd148810d13d3
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e5ce5b3e7d4df6366fc2cb0219a24b92f1fabed
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_english_physical_appearance": {"likelihood_difference": 0.3221673223187262, "likelihood_difference_stderr": 0.026978346460100555, "pct_stereotype": 0.4027777777777778, "pct_stereotype_stderr": 0.05820650942569533}}, "versions": {"crows_pairs_english_physical_appearance": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_sexual_orientation-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_sexual_orientation-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0a58b730c1e43271ba9d287c6b645ab97d10a560
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_sexual_orientation-v0-loglikelihood
@@ -0,0 +1 @@
+e754a309296b157677dfba6e6feef983d1ce38dd0169ae726265621a7b573163
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0336c1ddc64ef089490495a817922f3e7c9bdc73
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-loglikelihood
@@ -0,0 +1 @@
+2ce823fdb93d325aa8fb40db5d335b093b4b69792763532d940a752440ee3a76
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b05a9430e90ec2ce0ddcb49a243be9479d3fad1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-res.json
@@ -0,0 +1 @@
+{"results": {"cycle_letters": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"cycle_letters": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_cm-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_cm-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..69289144e0e3ceb0051596d5768b70667f7d19a8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_cm-v0-loglikelihood
@@ -0,0 +1 @@
+92d136ebb2bd86cd036e61699ad9a1417dbb48651f0a3afa5045cf57cef5a3f6
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..f6d36557909011f0a91489f212de38cac70d0f4a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d672564bf8af93738531c26d02efda9ed844d1fdcdd619a06e37fce7ef6d8485
+size 153
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..2f4c6f9c7102ac9466d1da0d81307088f2fdcdb4
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:590805560ee790d530c075ad76633eb2e9749440083e0bab63489ff920fdfd33
+size 70917
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..adc093cf62c2f807a0f413d0ecc200879931a5b7
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa-v0-res.json
@@ -0,0 +1 @@
+{"results": {"headqa": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..9129d834b6037cda3db655064d6c18bb3dccfb54
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-loglikelihood
@@ -0,0 +1 @@
+767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0964db9bbb8a6b0ca129c3e069151f334558de54
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/headqa_es-v0-res.json
@@ -0,0 +1 @@
+{"results": {"headqa_es": {"acc": 0.23559445660102116, "acc_norm": 0.25018234865062, "acc_norm_stderr": 0.008272783230806014, "acc_stderr": 0.008105688874297972}}, "versions": {"headqa_es": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc2c9a0d7d4d4a18ee7c8cb0e266a29fa5bd48f6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-abstract_algebra": {"acc": 0.32, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235, "acc_stderr": 0.04688261722621504}}, "versions": {"hendrycksTest-abstract_algebra": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..9c9e72efdf98ed9afb4881647929246433e1f857
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood
@@ -0,0 +1 @@
+b9b5d8b8bb02696302ec6bc2a99bf987a5504d3bae0e529d2c8f263538c97518
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..1e2c01e2b19082144373a13ee25e3e68bf8df588
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood
@@ -0,0 +1 @@
+d4dc051f37a49dc75c218741e87bc826fd44f31ee1309b55e0f33bd191c1bc78
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a666d9ce9c969f808ea84909730cee046ccc6294
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-high_school_biology": {"acc": 0.23870967741935484, "acc_norm": 0.2709677419354839, "acc_norm_stderr": 0.025284416114900152, "acc_stderr": 0.024251071262208834}}, "versions": {"hendrycksTest-high_school_biology": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..bbc2dacf5f5ac0b14327f0637b4b1aabea7a6167
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-high_school_computer_science": {"acc": 0.2, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932269, "acc_stderr": 0.04020151261036845}}, "versions": {"hendrycksTest-high_school_computer_science": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5cea9cbe310db37d488984f3ff6aa57921576d9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-high_school_european_history": {"acc": 0.23636363636363636, "acc_norm": 0.24242424242424243, "acc_norm_stderr": 0.03346409881055953, "acc_stderr": 0.033175059300091805}}, "versions": {"hendrycksTest-high_school_european_history": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..c0106d373dcf6136b147bb3787fed6c9c8a3da8f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood
@@ -0,0 +1 @@
+ce4faae2fb6628caa48f6fc74cbc848880db49e6ff51079392778a2322bcefef
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..953fc3be48759378aea33eb767cb7367514a5de9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood
@@ -0,0 +1 @@
+d6ef028022c02b69d1516973e08bebaa14d8debcf2589a2bb124823178202d20
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..2716bebe69e1c3884ba2e88056c87c5a5268b53e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood
@@ -0,0 +1 @@
+19e49d218f55ed5ec4bd1a6cd3f3388c6f620b81484e7abe8b298e5481c3044d
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..4c01847ef594713fee284436be6fe8d20d602554
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood
@@ -0,0 +1 @@
+6983c560a562749f4f702249a3a6ae51fa495acc0643a980bf2cf52c6c5d4b95
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..f15a9b34ff26e1382d04b4d6e41fdae6085b30c8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-professional_law": {"acc": 0.2561929595827901, "acc_norm": 0.2470664928292047, "acc_norm_stderr": 0.011015752255279352, "acc_stderr": 0.011149173153110582}}, "versions": {"hendrycksTest-professional_law": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_medicine-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_medicine-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..801ea2d224b7f4699c3a3defd7cde023e777a29e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_medicine-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-professional_medicine": {"acc": 0.23161764705882354, "acc_norm": 0.2536764705882353, "acc_norm_stderr": 0.02643132987078953, "acc_stderr": 0.025626533803777562}}, "versions": {"hendrycksTest-professional_medicine": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..7267ea739a54f5fd165ef0011d27446faac04689
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_mt_de": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_de": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..637c23500b9c153fe74ad9cb0369bd57f22d80a0
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_mt_fr": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_fr": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0a7b76241f898374a3a75952e16fe15af9a6d48e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-loglikelihood
@@ -0,0 +1 @@
+b604f00bc9f2a77ef41f8cfdb5a8509b3ae9266893b9e90abc665f5399ecba4e
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..9cd40fce0a7062ec6897a119d44b1de88f762d08
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-loglikelihood
@@ -0,0 +1 @@
+12495c50454ba5e1ce0753bd18c09aaca516bebd27648d815e37b15229dbf198
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..ce881a0232cff3f1025b746184ce8a0170e34303
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v0-greedy_until
@@ -0,0 +1 @@
+f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1b25dc283c96c63d30df9f0ce3d04aadb8f93625
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v0-res.json
@@ -0,0 +1 @@
+{"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..699dc5fe38ea411d6d53c9e19d78ba6d96ddfb40
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-res.json
@@ -0,0 +1 @@
+{"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..9f33d79035cc7caf00704a8764aa0adf657c0b78
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-loglikelihood
@@ -0,0 +1 @@
+a45260e49f02c7cb8886b3746db4d388890860b202dd8a9f0267e3c324e0af13
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..b3681ec17595adc4c4541ded263add219912af58
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-loglikelihood
@@ -0,0 +1 @@
+cdb026c027437a8b4653212d0944d36fc16f49921dcb8e4bef899d15a55e9f80
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..87e9c532eb7d7deb7d08635dd955df7a68ab9813
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v0-res.json
@@ -0,0 +1 @@
+{"results": {"multirc": {"acc": 0.07450157397691501, "acc_stderr": 0.008510441526175931}}, "versions": {"multirc": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0022f466d25f3e3a639720e4600732c9c0c1141d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v1-loglikelihood
@@ -0,0 +1 @@
+f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/openbookqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/openbookqa-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..04f4c25442e678a63d3f6213dc9364bfa25b1a7a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/openbookqa-v0-res.json
@@ -0,0 +1 @@
+{"results": {"openbookqa": {"acc": 0.214, "acc_norm": 0.276, "acc_norm_stderr": 0.020011219298073517, "acc_stderr": 0.018359797502387046}}, "versions": {"openbookqa": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_books3-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_books3-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..df19cd0a18f122d695f8aea4a717ab4dde79a987
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_books3-v0-res.json
@@ -0,0 +1 @@
+{"results": {"pile_books3": {"bits_per_byte": 8.942486206275221e-07, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_europarl-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_europarl-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..80272607557f6e0c97220efa30c8b9ad38f52aa8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_europarl-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..22046e440584d0df85ceeed057ad2c0633273782
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..21b6bb451fe376e62899f22ea422b3ce9cada469
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_pubmed-abstracts": {"bits_per_byte": 0.0005417858444030858, "byte_perplexity": 1.0003756078534862, "word_perplexity": 1.0025884332779}}, "versions": {"pile_pubmed-abstracts": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..283109f32e0aac45adcbc90c7c8fb41114e7771f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling
@@ -0,0 +1 @@
+40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..ce041998635643ee17aace3105b227ef0746917e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4f2314e66b3a5dbd9ed3c25d9e9a97c7d1fbff3d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_wikipedia": {"bits_per_byte": 0.00024287370359008176, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_youtubesubtitles-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_youtubesubtitles-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b58ce148f0071707d5da39135aaeb92a2a1457a2
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_youtubesubtitles-v0-res.json
@@ -0,0 +1 @@
+{"results": {"pile_youtubesubtitles": {"bits_per_byte": 2.3447170928931888e-05, "byte_perplexity": 1.000023447445816, "word_perplexity": 1.0001529192262875}}, "versions": {"pile_youtubesubtitles": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff99d83f40a966afe7df30661a3fc4d9dd09c4ca
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-res.json
@@ -0,0 +1 @@
+{"results": {"prost": {"acc": 0.24631725021349274, "acc_norm": 0.2581127241673783, "acc_norm_stderr": 0.00319703079646546, "acc_stderr": 0.003147855968061357}}, "versions": {"prost": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..601c4eb763d97500cfcd4e24ca6602986c49939c
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-res.json
@@ -0,0 +1 @@
+{"results": {"qa4mre_2011": {"acc": 0.225, "acc_norm": 0.23333333333333334, "acc_norm_stderr": 0.03877199986918664, "acc_stderr": 0.0382797091741014}}, "versions": {"qa4mre_2011": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..7071515827af18b10a7b3607e6249ed3e7c1929e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-res.json
@@ -0,0 +1 @@
+{"results": {"sciq": {"acc": 0.234, "acc_norm": 0.239, "acc_norm_stderr": 0.01349300044693758, "acc_stderr": 0.01339490288966001}}, "versions": {"sciq": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..2c970f7583b3d8236d9ca2e802ce6e0403b36074
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-loglikelihood
@@ -0,0 +1 @@
+f5da6173402b274dc89130755c222c6ca6b2a3bacaaa4e4ab07be9322b7bad65
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/swag-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/swag-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1aeee972e83a41dbb7301f5a98ad5c97486402f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/swag-v0-res.json
@@ -0,0 +1 @@
+{"results": {"swag": {"acc": 0.2482255323402979, "acc_norm": 0.24882535239428172, "acc_norm_stderr": 0.00305666959496067, "acc_stderr": 0.003054201832644171}}, "versions": {"swag": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..a1c496a0f45e0b5e23c34a5650fc852349d20ad8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5428228cd48e036420c17844c421eb782a6471cd77075120acde387a4e176ab2
+size 2003
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..94fc64bf12baf1c58b928641626e54d72d45b8f8
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a5612e9590402d33652e1664d1b5fa83cfb0f1fcaa25c782e314e8ddbcce231
+size 1975
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/toxigen-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/toxigen-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..65bb7cf4596c8973ae7dd2efc60e366c65bc4800
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/toxigen-v0-res.json
@@ -0,0 +1 @@
+{"results": {"toxigen": {"acc": 0.5053191489361702, "acc_norm": 0.46808510638297873, "acc_norm_stderr": 0.016283609940023203, "acc_stderr": 0.016315959984563776}}, "versions": {"toxigen": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d576c4977fc769dc56c31340f07558fefc1f1459
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-loglikelihood
@@ -0,0 +1 @@
+f8ec05b306b9f6187c0f8117cae441fb85a7a2e4670f4f9a1a3b632b1978421a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..51303977a9cbb311433a840af6ce636728bdb118
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_mc-v0-loglikelihood
@@ -0,0 +1 @@
+226a6783976177dc9ceda5688623ff37023242eff30ddf270b886bf7b9b32228
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..291492556e5182600291565c640a463da7f00616
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-greedy_until
@@ -0,0 +1 @@
+4be7fdda313394f19b5995b00ada1dfa3bb158ee1f020ef8d07ecea260fa60b2
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..22f042eb4eba6e6e662e46807232679782f7b6b9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt20-en-iu": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.00011803644548940443, "chrf_stderr": 2.175287038623409e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-iu": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..d12348e0aeb8d7feec272059e08eb30cbb1d918d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-res.json
@@ -0,0 +1 @@
+{"results": {"wnli": {"acc": 0.5633802816901409, "acc_stderr": 0.0592793555841297}}, "versions": {"wnli": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d0d2963fe90b29dbbf2527e9a3b559cf9b9c23c7
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-loglikelihood
@@ -0,0 +1 @@
+45865468eff5ca31e6a050947a6b3310d9d5ed19d0f2e578a32ecaf1c768600f
\ No newline at end of file