diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..f0186c4c4b395b6f57e26120975ec0378cd9c0ea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-res.json @@ -0,0 +1 @@ +{"results": {"arc_challenge": {"acc": 0.26621160409556316, "acc_norm": 0.28242320819112626, "acc_norm_stderr": 0.01315545688409722, "acc_stderr": 0.01291577478152323}}, "versions": {"arc_challenge": "2.0"}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..01756b4d47703cc943f7721509af1ead77739d1e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood @@ -0,0 +1 @@ +04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a751332bc6fbae7b680f4412609dcf0695eb972c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood @@ -0,0 +1 @@ +49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..0f959c21f6bb46a40cf1dd83c5525583189d3793 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood @@ -0,0 +1 @@ +2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..47cd3d3be14eedc3d525b408e76abe69c45f8586 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood @@ -0,0 +1 @@ +064c38fcd072b8bd12f54ea4f8e41599ed4e11dc386e93b77e1fc07967d1f960 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..3a6d0875c6aadaf550692275a3ecd0b3ac099d3c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood @@ -0,0 +1 @@ +f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2750fcda2aa5ee2efc6f20faa8932853f0f42ba2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_complex_left_branch": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..72ab237e58550ef3d5f57edcc44f716e0ebece64 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood @@ -0,0 +1 @@ +123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..1fddc2190c85c0161921a5a4026cd518445fc386 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood @@ -0,0 +1 @@ +bf78e2b53c0f3531303c668c96bd3897a0a35e960da37439e63724ecba4e371a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..611211bec04bc5833413b1ea21baf5f216b2cb3b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood @@ -0,0 +1 @@ +d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..4b1a428c4d32831cc6181054631c723408b8382a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood @@ -0,0 +1 @@ +6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..00c913dcd3ba3846464d04067c5b896c7e5c3c19 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_existential_there_subject_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_subject_raising": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..735dc09826d056ed20a40b8bd9ccf54b434d05a8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..bd7f4bd9ea496a4c8cd2c39c519c21caa26bf42e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood @@ -0,0 +1 @@ +7084358b1b7dd7fb5ead1a58f4b499d6f7610eca897bfac25a986d0f9a91aa5d \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..da909529e5ae766814dc24d28e65ef3df4e7109c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood @@ -0,0 +1 @@ +9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..82fbbab07d39f44d560d77f2f93535846b413e8e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..64070cf58dd53d10a9e3b8f3510d3387f2983cfd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..ec8108c88d9554aefbeb34e6e0432e490253d26c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_principle_A_case_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..c37e9364012f74afc7b5dd493344a3d535a7c611 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood @@ -0,0 +1 @@ +38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..6220172936ccbee00cc7d5420c30893109d366b2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood @@ -0,0 +1 @@ +80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2733d251cf90f264f28db48a2b17b520e528f2c7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..13359ac3d2092bb8d38d44f17a125124c034d317 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood @@ -0,0 +1 @@ +a142cc2a6fcd93230b650927b07367cad957b8f3f42cb4072151da53dea301df \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..de9e8007180f265cb7b2aed51e277b93fded9ce6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap_long_distance": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ad7e928fe6a3d79857c3c076c6459d8b6c31897c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood @@ -0,0 +1 @@ +77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..c1a9e165a7e42191745e38b1dd8d6b9e2fe609cb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until @@ -0,0 +1 @@ +4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..1186c252981559fc1e9859252f82aaea27310c4f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood @@ -0,0 +1 @@ +b85bc849811ccfa9971a6ee3fca7342752c314c0cb6f126e10d9ec4d0450c541 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..9feec03298368b126f4c7361084fb894b8170ffd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood @@ -0,0 +1 @@ +0a750596d77cd96502dc414ff699a399b1b91c2078adeec1d3dd982b3d591089 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..bdb9d9c6aff73eac1def51836e15733ad940835c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-res.json @@ -0,0 +1 @@ +{"results": {"crows_pairs_french_race_color": {"likelihood_difference": 0.33233909422443764, "likelihood_difference_stderr": 0.010623405969915857, "pct_stereotype": 0.4782608695652174, "pct_stereotype_stderr": 0.023315932363473738}}, "versions": {"crows_pairs_french_race_color": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..b31daf0e281664ab74ae88a9edd6bb1029f28d57 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood @@ -0,0 +1 @@ +8af6445eeb634dad5f0723e40615afe993e1e3f129a4f314fe4117e633c2efd3 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..2f6455aec029fea8d7ee8fa866e9f7779ac99914 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood @@ -0,0 +1 @@ +8ba0a525c65f795c99f6416e70c998e75e4b6cc43bf9a4bd7ccacd3c3591e9cb \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..48652c4689e2be24972881d0abff497d203ace9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood @@ -0,0 +1 @@ +8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e056fc1afdd78b1d7bec2610bc4e8962ba816bde --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f11de4b3d45d1590ba78935e824ae86ef75bbc370df500f89dde2c397d11c01a +size 1297 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..c679a3e311759f4a00707b7454e0e8be4bcdfff0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood @@ -0,0 +1 @@ +abb808c97d6529eda6c11067837a132c62d25cba0394d720f80cca6df9f7196e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..aea595c09f5baf6d21867c47fd5e42152244f555 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-college_computer_science": {"acc": 0.22, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282, "acc_stderr": 0.041633319989322695}}, "versions": {"hendrycksTest-college_computer_science": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..766b3388ed88d61e2c17ed2a35110879160c5f7f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-college_mathematics": {"acc": 0.18, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036844, "acc_stderr": 0.038612291966536955}}, "versions": {"hendrycksTest-college_mathematics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..2fb96497d12f9b72dbbd38f0d64aa75615bfe14b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood @@ -0,0 +1 @@ +dd6e0a9be1407890e9f8cd4434fb6aa4752ab3d2473837fd465ad99f60ad685e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..524552c9bb99335a9a7bee73076bc633b7eb10e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-college_medicine": {"acc": 0.27167630057803466, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557, "acc_stderr": 0.03391750322321659}}, "versions": {"hendrycksTest-college_medicine": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..97e56f2ae62e6b0012d49c6a7a55614a6d6eaf58 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-college_physics": {"acc": 0.23529411764705882, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453, "acc_stderr": 0.04220773659171452}}, "versions": {"hendrycksTest-college_physics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..d2fff47bcbaaaead17eceef0ca09cd45014c5aac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-global_facts": {"acc": 0.23, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816507, "acc_stderr": 0.04229525846816507}}, "versions": {"hendrycksTest-global_facts": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..fb6835039c9d68b5cf5d52244a349c1b8a964c5c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_macroeconomics": {"acc": 0.2230769230769231, "acc_norm": 0.22564102564102564, "acc_norm_stderr": 0.021193632525148522, "acc_stderr": 0.021107730127244}}, "versions": {"hendrycksTest-high_school_macroeconomics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..061678f2e4f30402e1c44da7c4a23cae0e57bedf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..26be724f2426d0a7b204b2f4dee509597e85ab41 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-machine_learning": {"acc": 0.23214285714285715, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116, "acc_stderr": 0.04007341809755806}}, "versions": {"hendrycksTest-machine_learning": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..9865854da311057c18f8a2571eedac2d02608df5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood @@ -0,0 +1 @@ +92a5fad6e9ec700f84946faeccd399dda3569fb71837c9fb0c5c87f5ec29c43e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..f3f3f931ac7e066cbab7b6ff68732360c764324f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_cloze": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ca3fd80298aa1c565c978b26e992ccd42c7144f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood @@ -0,0 +1 @@ +fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5b3139fce60c4f456d1354e85f86f15657d63b85 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_standard_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_standard_cloze": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..6f49557ecf42758d64d1297c5569f3d4d95dd9c1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until @@ -0,0 +1 @@ +2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..240f7b6b42b77b8e94c1ec2eab2df808181a2cb3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json @@ -0,0 +1 @@ +{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..82febb9f5dfeefbd6dc5d244574ac5666c6b8bba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until @@ -0,0 +1 @@ +b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..b3ada8a6be4d86b71a0c6b92c605d3c8a25a29a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json @@ -0,0 +1 @@ +{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..5200f4cfa9ed3a735661e987791bf1434555db6e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-greedy_until @@ -0,0 +1 @@ +752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..e3869faa8012568df2eae14e3774712960c4a544 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-res.json @@ -0,0 +1 @@ +{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mnli_mismatched-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli_mismatched-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..261deed96275da1af0c8a0616b0af6247cfaf1c0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli_mismatched-v0-res.json @@ -0,0 +1 @@ +{"results": {"mnli_mismatched": {"acc": 0.3360455655004068, "acc_stderr": 0.004763973908606819}}, "versions": {"mnli_mismatched": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..0022f466d25f3e3a639720e4600732c9c0c1141d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual-v0-loglikelihood @@ -0,0 +1 @@ +f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..cdb6c85b65643b2214358d18b057d0737d53b9ba --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v1-res.json @@ -0,0 +1 @@ +{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.26297968397291194, "r@1_stderr": 0.01479889176605113, "r@2": 0.5, "r@2_stderr": 0.01680731613632036}}, "versions": {"mutual_plus": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..b37a91cc2dea829e8dab7bb0fe934442c54b3a26 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling @@ -0,0 +1 @@ +5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..7b5771f4911f3069217d75d12cbdfa1a579b6663 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v0-loglikelihood_rolling @@ -0,0 +1 @@ +d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..5f76588a813eebe7f0958a07253480d30de2ccf3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling @@ -0,0 +1 @@ +520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..4fbbc241ba9487c2513cdf46dbb76e004e401418 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v1-loglikelihood_rolling @@ -0,0 +1 @@ +339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_stackexchange-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_stackexchange-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..76fdd0a6dd2f8ca39611601c5cb514664d5dccbc --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_stackexchange-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_stackexchange": {"bits_per_byte": 0.0002288815898835956, "byte_perplexity": 1.0002289077852733, "word_perplexity": 1.0016993562258851}}, "versions": {"pile_stackexchange": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_uspto-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_uspto-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..4649d3b9b7f1f17e4731644d470fc0a2651a980d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_uspto-v0-loglikelihood_rolling @@ -0,0 +1 @@ +789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..049134c7a1eac7ba79fa86951526a4ca96ddd200 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-loglikelihood @@ -0,0 +1 @@ +0d09f17c65768e797633494d2d218e4e46a26f718cab8b0bf3d156b073a8c437 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..43243706d9b743cec2965545f3f4436a3e5d7551 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-loglikelihood @@ -0,0 +1 @@ +52fc431e94c67f983e28ebc70cf45e6c14116b0ae77dc1bf22347c705a65d054 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a54fa05cd1ac551a973ff8155ddca6d868a49b42 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/record-v0-loglikelihood @@ -0,0 +1 @@ +a3e378fbde4e28f375cac1561bbfc7d7673c2af193628a774ad012d5192393aa \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..25ce988773df6dd27009a4ac47357dfb7d70748e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/sciq-v0-loglikelihood @@ -0,0 +1 @@ +71cbb6e2a7ac4512c3761ea801d420eb3fac49d158c7e4deaa3ab8727bea923c \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl new file mode 100644 index 0000000000000000000000000000000000000000..df0fab88f67ddcaeb75bb3436da9d944aef821f3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0767f0abf685cd948057def299fa0b97dc9ebdad4e356dd708a7d4bde45ba71 +size 1853 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl new file mode 100644 index 0000000000000000000000000000000000000000..09ac12f9334d8d77e64731eb53a258bc826e826b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b9ac677652592e9139c2623ad240c101dd337f7276de84c84062257c4d9b9a +size 2866 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl new file mode 100644 index 0000000000000000000000000000000000000000..12977a6db7e0a4e4a0a0d9da9d352753d396c36c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d66146528d7288a309c4e58664419c5e465b3b97ac968402f1e4baac5dc9cd7a +size 1871 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5e68fa8dc6ace5fd91322aacdc74de3814832d9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v0-res.json @@ -0,0 +1 @@ +{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.8372093023255814, "bleurt_acc_stderr": 0.012923696051772253, "bleurt_diff": 0.13967358205134603, "bleurt_diff_stderr": 0.00532907098769571, "bleurt_max": -1.4402793981454072, "bleurt_max_stderr": 0.0021884846359458963, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..30aa72f2bafd0788837ca50fa9d5c75f954daef0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/truthfulqa_gen-v1-res.json @@ -0,0 +1 @@ +{"results": {"truthfulqa_gen": {"bleu_acc": 0.0, "bleu_acc_stderr": 0.0, "bleu_diff": 0.0, "bleu_diff_stderr": 0.0, "bleu_max": 0.0, "bleu_max_stderr": 0.0, "bleurt_acc": 0.835985312117503, "bleurt_acc_stderr": 0.012962704327492454, "bleurt_diff": 0.14077322143090107, "bleurt_diff_stderr": 0.005459888909582694, "bleurt_max": -1.4399358725752065, "bleurt_max_stderr": 0.0022126992369197133, "rouge1_acc": 0.0, "rouge1_acc_stderr": 0.0, "rouge1_diff": 0.0, "rouge1_diff_stderr": 0.0, "rouge1_max": 0.0, "rouge1_max_stderr": 0.0, "rouge2_acc": 0.0, "rouge2_acc_stderr": 0.0, "rouge2_diff": 0.0, "rouge2_diff_stderr": 0.0, "rouge2_max": 0.0, "rouge2_max_stderr": 0.0, "rougeL_acc": 0.0, "rougeL_acc_stderr": 0.0, "rougeL_diff": 0.0, "rougeL_diff_stderr": 0.0, "rougeL_max": 0.0, "rougeL_max_stderr": 0.0}}, "versions": {"truthfulqa_gen": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac0c37bb5aa8cdde37bf84c61a0d020c8a03900 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-res.json @@ -0,0 +1 @@ +{"results": {"wikitext": {"bits_per_byte": 2.219817611605802e-05, "byte_perplexity": 1.0000221984224973, "word_perplexity": 1.000118710696617}}, "versions": {"wikitext": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..fbcac1b7e3887c6ffa8fd6da6e21595fb0c49a4f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-greedy_until @@ -0,0 +1 @@ +d1b7c50751b0d5d7470b7f49f2bab9d09792c91460fc92cc34f06617013d7c65 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..790424fe4f226224642530ba7fd53a59eec4caa0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006703243310670055, "chrf_stderr": 0.0001292711927988445, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..79a0d12fe6f5750749e56dc3919283f71d021fa0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-de-fr": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.011897164096796364, "chrf_stderr": 0.00010158164726118333, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-de-fr": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2ba9db70d3579ff23ee70c3b16eb92d7d87144e6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-en-cs": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.009879653442394573, "chrf_stderr": 8.210293331159994e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-cs": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..d26bb4f92a03612cf3a4170733973e39870164b7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-greedy_until @@ -0,0 +1 @@ +f5688199890a48f73f2cc04a2152e35190f0e0ddd40e629fa24ee39d423ea389 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..1b40263f154461098d6ee820bc0d003c03a6962c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-greedy_until @@ -0,0 +1 @@ +5fc556fa90bca7f1b1396e97e392eac8080b0ad53488358799b8fc0b21a94cb1 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ta-en-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ta-en-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..f0f65972451ff666399f7b2c81194c4b892ac783 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ta-en-v0-greedy_until @@ -0,0 +1 @@ +111ea3efdc08f1cf536631b9426c3a20e482c575d009d2a8c71f59c027578eec \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..84be59624161779e494896d2618dbcf0f1f4b4b0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc-v0-res.json @@ -0,0 +1 @@ +{"results": {"wsc": {"acc": 0.5480769230769231, "acc_stderr": 0.049038186969314335}}, "versions": {"wsc": 0}} \ No newline at end of file